src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2012 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
   58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
   59   the C level, a coding system is represented by a vector of attributes
   60   stored in the hash table Vcoding_system_hash_table.  The conversion
   61   from a coding system symbol to its attribute vector is done by
   62   looking up the symbol in Vcoding_system_hash_table.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
  122   How the end of a line is encoded depends on the operating system.
  123   For instance, Unix's format is just one byte of LF (line-feed) code,
  124   whereas DOS's format is a two-byte sequence of `carriage-return' and
  125   `line-feed' codes.  MacOS's format is usually one byte of
  126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   int multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
  190   These functions decode a byte sequence specified as a source by
  191   CODING.  The resulting multibyte text goes to a place pointed to by
  192   CODING->charbuf, the length of which should not exceed
  193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   int multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
  254   DST_BYTES zero means that the source area and the destination area
  255   overlap, which means that we can produce encoded text until it
  256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   int multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288 #include <setjmp.h>
 289
 290 #include "lisp.h"
 291 #include "character.h"
 292 #include "buffer.h"
 293 #include "charset.h"
 294 #include "ccl.h"
 295 #include "composite.h"
 296 #include "coding.h"
 297 #include "window.h"
 298 #include "frame.h"
 299 #include "termhooks.h"
 300
  301 Lisp_Object Vcoding_system_hash_table;
  302 /* NOTE(review): the Q... objects are presumably Lisp symbols -- confirm.  */
  303 static Lisp_Object Qcoding_system, Qeol_type;
  304 static Lisp_Object Qcoding_aliases;
  305 Lisp_Object Qunix, Qdos;
  306 Lisp_Object Qbuffer_file_coding_system;
  307 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  308 static Lisp_Object Qdefault_char;
  309 Lisp_Object Qno_conversion, Qundecided;
  310 Lisp_Object Qcharset, Qutf_8;
  311 static Lisp_Object Qiso_2022;
  312 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
  313 static Lisp_Object Qbig, Qlittle;
  314 static Lisp_Object Qcoding_system_history;
  315 static Lisp_Object Qvalid_codes;
  316 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
  317 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
  318 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
  319 static Lisp_Object QCascii_compatible_p;
  320
  321 Lisp_Object Qcall_process, Qcall_process_region;
  322 Lisp_Object Qstart_process, Qopen_network_stream;
  323 static Lisp_Object Qtarget_idx;
  324 /* NOTE(review): presumably symbols naming conversion results -- confirm.  */
  325 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
  326 static Lisp_Object Qinterrupted, Qinsufficient_memory;
  327
  328 /* If a symbol has this property, evaluate the value to define the
  329    symbol as a coding system.  */
  330 static Lisp_Object Qcoding_system_define_form;
  331
  332 /* End-of-line format decided by the system.  This is Qunix on
  333    Unix and Mac, Qdos on DOS/Windows.
  334    This has an effect only for external encoding (i.e. for output to
  335    file and process), not for in-buffer or Lisp string encoding.  */
  336 static Lisp_Object system_eol_type;
 337
  338 #ifdef emacs
  339
  340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
  341
  342 /* Coding systems emacs-mule and raw-text are for converting only
  343    the end-of-line format.  */
  344 Lisp_Object Qemacs_mule, Qraw_text;
  345 Lisp_Object Qutf_8_emacs;
  346
  347 /* Coding systems are handed between Emacs Lisp programs and C internal
  348    routines by the following variable (only one is defined here).  */
  349 /* Coding system to be used to encode text for terminal display when
  350    terminal coding system is nil.  */
  351 struct coding_system safe_terminal_coding;
  352
  353 #endif /* emacs */
 354
  355 Lisp_Object Qtranslation_table;
  356 Lisp_Object Qtranslation_table_id;
  357 static Lisp_Object Qtranslation_table_for_decode;
  358 static Lisp_Object Qtranslation_table_for_encode;
  359
  360 /* Two special coding systems, one for SJIS and one for BIG5.  */
  361 static Lisp_Object Vsjis_coding_system;
  362 static Lisp_Object Vbig5_coding_system;
 363
  364 /* ISO2022 section */
  365 /* Integer at index REG of CODING's coding_attr_iso_initial attribute.  */
  366 #define CODING_ISO_INITIAL(coding, reg)                 \
  367   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
  368                      coding_attr_iso_initial),          \
  369                reg)))
  370 /* Entry of CODING's safe_charsets for CHARSET_ID, or -1 if CHARSET_ID
  371    exceeds max_charset_id or the entry is 255.  */
  372 #define CODING_ISO_REQUEST(coding, charset_id)          \
  373   (((charset_id) <= (coding)->max_charset_id            \
  374     ? ((coding)->safe_charsets[charset_id] != 255       \
  375        ? (coding)->safe_charsets[charset_id]            \
  376        : -1)                                            \
  377     : -1))
  378 /* Accessors for members of CODING->spec.iso_2022.  The invoked charset
  379    of PLANE is the designation of the register invoked to that plane.  */
  380 #define CODING_ISO_FLAGS(coding)        \
  381   ((coding)->spec.iso_2022.flags)
  382 #define CODING_ISO_DESIGNATION(coding, reg)     \
  383   ((coding)->spec.iso_2022.current_designation[reg])
  384 #define CODING_ISO_INVOCATION(coding, plane)    \
  385   ((coding)->spec.iso_2022.current_invocation[plane])
  386 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
  387   ((coding)->spec.iso_2022.single_shifting)
  388 #define CODING_ISO_BOL(coding)  \
  389   ((coding)->spec.iso_2022.bol)
  390 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  391   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
  392 #define CODING_ISO_CMP_STATUS(coding)   \
  393   (&(coding)->spec.iso_2022.cmp_status)
  394 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
  395   ((coding)->spec.iso_2022.ctext_extended_segment_len)
  396 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
  397   ((coding)->spec.iso_2022.embedded_utf_8)
 398
  399 /* Control characters of ISO2022; each has its own iso_code_class_type.  */
  400                         /* code */      /* function */
  401 #define ISO_CODE_SO     0x0E            /* shift-out */
  402 #define ISO_CODE_SI     0x0F            /* shift-in */
  403 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
  404 #define ISO_CODE_ESC    0x1B            /* escape */
  405 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
  406 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
  407 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 408
  409 /* Every 1-byte code of ISO2022 is classified into one of the
  410    following classes.  */
  411 enum iso_code_class_type
  412   {
  413     ISO_control_0,              /* Control codes in the range
  414                                    0x00..0x1F and 0x7F, except for the
  415                                    following 4 codes.  */
  416     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  417     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  418     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  419     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  420     ISO_control_1,              /* Control codes in the range
  421                                    0x80..0x9F, except for the
  422                                    following 3 codes.  */
  423     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  424     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  425     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  426     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  427     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  428     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  429     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  430   };
 431
  432 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
  433     `iso-flags' attribute of an iso2022 coding system.  */
  434
  435 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
  436    instead of the correct short-form sequence (e.g. ESC $ A).  */
  437 #define CODING_ISO_FLAG_LONG_FORM       0x0001
  438
  439 /* If set, reset graphic planes and registers at end-of-line to the
  440    initial state.  */
  441 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
  442
  443 /* If set, reset graphic planes and registers before any control
  444    characters to the initial state.  */
  445 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
  446
  447 /* If set, encode by 7-bit environment.  */
  448 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
  449
  450 /* If set, use locking-shift function.  */
  451 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
  452
  453 /* If set, use single-shift function.  Overwrite
  454    CODING_ISO_FLAG_LOCKING_SHIFT.  */
  455 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
  456
  457 /* If set, use designation escape sequence.  */
  458 #define CODING_ISO_FLAG_DESIGNATION     0x0040
  459
  460 /* If set, produce revision number sequence.  */
  461 #define CODING_ISO_FLAG_REVISION        0x0080
  462
  463 /* If set, produce ISO6429's direction specifying sequence.  */
  464 #define CODING_ISO_FLAG_DIRECTION       0x0100
  465
  466 /* If set, assume designation states are reset at beginning of line on
  467    output.  */
  468 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
  469
  470 /* If set, designation sequence should be placed at beginning of line
  471    on output.  */
  472 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
  473
  474 /* If set, do not encode unsafe characters on output.  */
  475 #define CODING_ISO_FLAG_SAFE            0x0800
  476
  477 /* If set, extra latin codes (128..159) are accepted as a valid code
  478    on input.  */
  479 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
  480 /* NOTE(review): the flags below are not described here; see their uses.  */
  481 #define CODING_ISO_FLAG_COMPOSITION     0x2000
  482
  483 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
  484
  485 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
  486
  487 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
  488 /* Bits 0x20000 through 0x80000 are not assigned here.  */
  489 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
  490
  491 /* A character to be produced on output if encoding of the original
  492    character is prohibited by CODING_ISO_FLAG_SAFE.  */
  493 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 494
  495 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
  496 #define CODING_UTF_8_BOM(coding)        \
  497   ((coding)->spec.utf_8_bom)
  498
  499 /* UTF-16 section: accessors for members of CODING->spec.utf_16.  */
  500 #define CODING_UTF_16_BOM(coding)       \
  501   ((coding)->spec.utf_16.bom)
  502
  503 #define CODING_UTF_16_ENDIAN(coding)    \
  504   ((coding)->spec.utf_16.endian)
  505
  506 #define CODING_UTF_16_SURROGATE(coding) \
  507   ((coding)->spec.utf_16.surrogate)
 508
 509
  510 /* CCL section: look up the CCL-related attributes of CODING.  */
  511 #define CODING_CCL_DECODER(coding)      \
  512   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
  513 #define CODING_CCL_ENCODER(coding)      \
  514   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
  515 #define CODING_CCL_VALIDS(coding)                                          \
  516   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 517
  518 /* Index for each coding category in `coding_categories'.  The values
  519    are also used as bit positions by the CATEGORY_MASK_XXX flags.  */
  520 enum coding_category
  521   {
  522     coding_category_iso_7,
  523     coding_category_iso_7_tight,
  524     coding_category_iso_8_1,
  525     coding_category_iso_8_2,
  526     coding_category_iso_7_else,
  527     coding_category_iso_8_else,
  528     coding_category_utf_8_auto,
  529     coding_category_utf_8_nosig,
  530     coding_category_utf_8_sig,
  531     coding_category_utf_16_auto,
  532     coding_category_utf_16_be,
  533     coding_category_utf_16_le,
  534     coding_category_utf_16_be_nosig,
  535     coding_category_utf_16_le_nosig,
  536     coding_category_charset,
  537     coding_category_sjis,
  538     coding_category_big5,
  539     coding_category_ccl,
  540     coding_category_emacs_mule,
  541     /* All above are targets of code detection.  */
  542     coding_category_raw_text,
  543     coding_category_undecided,
  544     coding_category_max
  545   };
 546
  547 /* Flag bits used in detect_coding_XXXX; bit N stands for category N.  */
  548 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
  549 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
  550 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
  551 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
  552 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
  553 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
  554 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
  555 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
  556 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
  557 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
  558 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
  559 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
  560 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
  561 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
  562 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
  563 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
  564 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
  565 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
  566 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
  567 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 568
  569 /* This value is returned if detect_coding_mask () finds nothing other
  570    than ASCII characters.  */
  571 #define CATEGORY_MASK_ANY               \
  572   (CATEGORY_MASK_ISO_7                  \
  573    | CATEGORY_MASK_ISO_7_TIGHT          \
  574    | CATEGORY_MASK_ISO_8_1              \
  575    | CATEGORY_MASK_ISO_8_2              \
  576    | CATEGORY_MASK_ISO_7_ELSE           \
  577    | CATEGORY_MASK_ISO_8_ELSE           \
  578    | CATEGORY_MASK_UTF_8_AUTO           \
  579    | CATEGORY_MASK_UTF_8_NOSIG          \
  580    | CATEGORY_MASK_UTF_8_SIG            \
  581    | CATEGORY_MASK_UTF_16_AUTO          \
  582    | CATEGORY_MASK_UTF_16_BE            \
  583    | CATEGORY_MASK_UTF_16_LE            \
  584    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  585    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
  586    | CATEGORY_MASK_CHARSET              \
  587    | CATEGORY_MASK_SJIS                 \
  588    | CATEGORY_MASK_BIG5                 \
  589    | CATEGORY_MASK_CCL                  \
  590    | CATEGORY_MASK_EMACS_MULE)
  591 /* CATEGORY_MASK_ANY leaves out CATEGORY_MASK_RAW_TEXT.  */
  592 /* The masks below are unions of related category masks.  */
  593 #define CATEGORY_MASK_ISO_7BIT \
  594   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
  595
  596 #define CATEGORY_MASK_ISO_8BIT \
  597   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
  598
  599 #define CATEGORY_MASK_ISO_ELSE \
  600   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
  601
  602 #define CATEGORY_MASK_ISO_ESCAPE        \
  603   (CATEGORY_MASK_ISO_7                  \
  604    | CATEGORY_MASK_ISO_7_TIGHT          \
  605    | CATEGORY_MASK_ISO_7_ELSE           \
  606    | CATEGORY_MASK_ISO_8_ELSE)
  607
  608 #define CATEGORY_MASK_ISO       \
  609   (  CATEGORY_MASK_ISO_7BIT     \
  610      | CATEGORY_MASK_ISO_8BIT   \
  611      | CATEGORY_MASK_ISO_ELSE)
  612
  613 #define CATEGORY_MASK_UTF_16            \
  614   (CATEGORY_MASK_UTF_16_AUTO            \
  615    | CATEGORY_MASK_UTF_16_BE            \
  616    | CATEGORY_MASK_UTF_16_LE            \
  617    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  618    | CATEGORY_MASK_UTF_16_LE_NOSIG)
  619
  620 #define CATEGORY_MASK_UTF_8     \
  621   (CATEGORY_MASK_UTF_8_AUTO     \
  622    | CATEGORY_MASK_UTF_8_NOSIG  \
  623    | CATEGORY_MASK_UTF_8_SIG)
 624
  625 /* Table of coding categories (Lisp symbols).  This variable is for
  626    internal use only.  */
  627 static Lisp_Object Vcoding_category_table;
  628
  629 /* Coding categories ordered by priority; one slot per category.  */
  630 static enum coding_category coding_priorities[coding_category_max];
  631
  632 /* Nth element is a coding context for the coding system bound to the
  633    Nth coding category.  */
  634 static struct coding_system coding_categories[coding_category_max];
 635
  636 /*** Commonly used macros and functions ***/
  637 /* min and max name each argument twice; avoid side-effecting arguments.  */
  638 #ifndef min
  639 #define min(a, b) ((a) < (b) ? (a) : (b))
  640 #endif
  641 #ifndef max
  642 #define max(a, b) ((a) > (b) ? (a) : (b))
  643 #endif
  644 /* Set ATTRS from CODING's id and CHARSET_LIST from ATTRS' charset list.  */
  645 #define CODING_GET_INFO(coding, attrs, charset_list)    \
  646   do {                                                  \
  647     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
  648     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  649   } while (0)
 650
 651
  652 /* Safely get one byte from the source text pointed by SRC which ends
  653    at SRC_END, set C to that byte, and increment CONSUMED_CHARS.  If
  654    the source is exhausted, jump to `no_more_source', first recording
  655    CODING_RESULT_INSUFFICIENT_SRC if SRC_BASE < SRC.  If multibytep is
  656    nonzero, a two-byte sequence led by 0xC0 or 0xC1 is folded into one
  657    byte; any other multibyte character sets C to the negative of its
  658    code and records CODING_RESULT_INVALID_SRC.  The caller must set up:
  659         coding, src, src_base, src_end, multibytep, consumed_chars */
  660 #define ONE_MORE_BYTE(c)                                \
  661   do {                                                  \
  662     if (src == src_end)                                 \
  663       {                                                 \
  664         if (src_base < src)                             \
  665           record_conversion_result                      \
  666             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  667         goto no_more_source;                            \
  668       }                                                 \
  669     c = *src++;                                         \
  670     if (multibytep && (c & 0x80))                       \
  671       {                                                 \
  672         if ((c & 0xFE) == 0xC0)                         \
  673           c = ((c & 1) << 6) | *src++;                  \
  674         else                                            \
  675           {                                             \
  676             src--;                                      \
  677             c = - string_char (src, &src, NULL);        \
  678             record_conversion_result                    \
  679               (coding, CODING_RESULT_INVALID_SRC);      \
  680           }                                             \
  681       }                                                 \
  682     consumed_chars++;                                   \
  683   } while (0)
 684
  685 /* Safely get two bytes from the source text pointed by SRC which ends
  686    at SRC_END, and set C1 and C2 to those bytes.  If multibytep is
  687    nonzero, multibyte characters found where C1 is expected are skipped
  688    (except a two-byte sequence led by 0xC0 or 0xC1, which is folded
  689    into one byte), and a multibyte character found for C2 sets C2 to
  690    -1.  If there are not enough bytes in the source, it jumps to
  691    `no_more_source'.  The caller should declare and set these
  692    variables appropriately in advance:
  693         src, src_end, multibytep
  694    It is intended that this macro is used in detect_coding_utf_16.  */
  695 #define TWO_MORE_BYTES(c1, c2)                          \
  696   do {                                                  \
  697     do {                                                \
  698       if (src == src_end)                               \
  699         goto no_more_source;                            \
  700       c1 = *src++;                                      \
  701       if (multibytep && (c1 & 0x80))                    \
  702         {                                               \
  703           if ((c1 & 0xFE) == 0xC0)                      \
  704             c1 = ((c1 & 1) << 6) | *src++;              \
  705           else                                          \
  706             {                                           \
  707               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
  708               c1 = -1;                                  \
  709             }                                           \
  710         }                                               \
  711     } while (c1 < 0);                                   \
  712     if (src == src_end)                                 \
  713       goto no_more_source;                              \
  714     c2 = *src++;                                        \
  715     if (multibytep && (c2 & 0x80))                      \
  716       {                                                 \
  717         if ((c2 & 0xFE) == 0xC0)                        \
  718           c2 = ((c2 & 1) << 6) | *src++;                \
  719         else                                            \
  720           c2 = -1;                                      \
  721       }                                                 \
  722   } while (0)
 723
 724
  725 /* Store a byte C in the place pointed by DST and increment DST to the
  726    next free point, and increment PRODUCED_CHARS.  The caller should
  727    assure that C is 0..127, and declare and set the variables `dst'
  728    and `produced_chars' appropriately in advance.  No bounds check is
  729    done on DST.  */
  730
  731
  732 #define EMIT_ONE_ASCII_BYTE(c)  \
  733   do {                          \
  734     produced_chars++;           \
  735     *dst++ = (c);               \
  736   } while (0)
  737
  738
  739 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes, C1 then C2, and add
  740    2 to PRODUCED_CHARS.  */
  741 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
  742   do {                                  \
  743     produced_chars += 2;                \
  744     *dst++ = (c1), *dst++ = (c2);       \
  745   } while (0)
 746
 747
  748 /* Store a byte C in the place pointed by DST and increment DST to the
  749    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
  750    nonzero, store C in multibyte form, first mapping a value of 0x80
  751    or above through BYTE8_TO_CHAR.  The caller should declare and set
  752    the variables `dst', `multibytep' and `produced_chars' appropriately
  753    in advance.  */
  754 #define EMIT_ONE_BYTE(c)                \
  755   do {                                  \
  756     produced_chars++;                   \
  757     if (multibytep)                     \
  758       {                                 \
  759         unsigned ch = (c);              \
  760         if (ch >= 0x80)                 \
  761           ch = BYTE8_TO_CHAR (ch);      \
  762         CHAR_STRING_ADVANCE (ch, dst);  \
  763       }                                 \
  764     else                                \
  765       *dst++ = (c);                     \
  766   } while (0)
  767
  768
  769 /* Like EMIT_ONE_BYTE, but emit two bytes, C1 then C2, and add 2 to
  770    PRODUCED_CHARS.  */
  771 #define EMIT_TWO_BYTES(c1, c2)          \
  772   do {                                  \
  773     produced_chars += 2;                \
  774     if (multibytep)                     \
  775       {                                 \
  776         unsigned ch;                    \
  777                                         \
  778         ch = (c1);                      \
  779         if (ch >= 0x80)                 \
  780           ch = BYTE8_TO_CHAR (ch);      \
  781         CHAR_STRING_ADVANCE (ch, dst);  \
  782         ch = (c2);                      \
  783         if (ch >= 0x80)                 \
  784           ch = BYTE8_TO_CHAR (ch);      \
  785         CHAR_STRING_ADVANCE (ch, dst);  \
  786       }                                 \
  787     else                                \
  788       {                                 \
  789         *dst++ = (c1);                  \
  790         *dst++ = (c2);                  \
  791       }                                 \
  792   } while (0)
 793
 794
 795 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 796   do {                                  \
 797     EMIT_ONE_BYTE (c1);                 \
 798     EMIT_TWO_BYTES (c2, c3);            \
 799   } while (0)
 800
 801
 802 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 803   do {                                          \
 804     EMIT_TWO_BYTES (c1, c2);                    \
 805     EMIT_TWO_BYTES (c3, c4);                    \
 806   } while (0)
 807
 808
 809 /* Prototypes for static functions.  */
 810 static void record_conversion_result (struct coding_system *coding,
 811                                       enum coding_result_code result);
 812 static int detect_coding_utf_8 (struct coding_system *,
 813                                 struct coding_detection_info *info);
 814 static void decode_coding_utf_8 (struct coding_system *);
 815 static int encode_coding_utf_8 (struct coding_system *);
 816
 817 static int detect_coding_utf_16 (struct coding_system *,
 818                                  struct coding_detection_info *info);
 819 static void decode_coding_utf_16 (struct coding_system *);
 820 static int encode_coding_utf_16 (struct coding_system *);
 821
 822 static int detect_coding_iso_2022 (struct coding_system *,
 823                                    struct coding_detection_info *info);
 824 static void decode_coding_iso_2022 (struct coding_system *);
 825 static int encode_coding_iso_2022 (struct coding_system *);
 826
 827 static int detect_coding_emacs_mule (struct coding_system *,
 828                                      struct coding_detection_info *info);
 829 static void decode_coding_emacs_mule (struct coding_system *);
 830 static int encode_coding_emacs_mule (struct coding_system *);
 831
 832 static int detect_coding_sjis (struct coding_system *,
 833                                struct coding_detection_info *info);
 834 static void decode_coding_sjis (struct coding_system *);
 835 static int encode_coding_sjis (struct coding_system *);
 836
 837 static int detect_coding_big5 (struct coding_system *,
 838                                struct coding_detection_info *info);
 839 static void decode_coding_big5 (struct coding_system *);
 840 static int encode_coding_big5 (struct coding_system *);
 841
 842 static int detect_coding_ccl (struct coding_system *,
 843                               struct coding_detection_info *info);
 844 static void decode_coding_ccl (struct coding_system *);
 845 static int encode_coding_ccl (struct coding_system *);
 846
 847 static void decode_coding_raw_text (struct coding_system *);
 848 static int encode_coding_raw_text (struct coding_system *);
 849
 850 static void coding_set_source (struct coding_system *);
 851 static ptrdiff_t coding_change_source (struct coding_system *);
 852 static void coding_set_destination (struct coding_system *);
 853 static ptrdiff_t coding_change_destination (struct coding_system *);
 854 static void coding_alloc_by_realloc (struct coding_system *, ptrdiff_t);
 855 static void coding_alloc_by_making_gap (struct coding_system *,
 856                                         ptrdiff_t, ptrdiff_t);
 857 static unsigned char *alloc_destination (struct coding_system *,
 858                                          ptrdiff_t, unsigned char *);
 859 static void setup_iso_safe_charsets (Lisp_Object);
 860 static ptrdiff_t encode_designation_at_bol (struct coding_system *,
 861                                       int *, int *, unsigned char *);
 862 static int detect_eol (const unsigned char *,
 863                        ptrdiff_t, enum coding_category);
 864 static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
 865 static void decode_eol (struct coding_system *);
 866 static Lisp_Object get_translation_table (Lisp_Object, int, int *);
 867 static Lisp_Object get_translation (Lisp_Object, int *, int *);
 868 static int produce_chars (struct coding_system *, Lisp_Object, int);
 869 static inline void produce_charset (struct coding_system *, int *,
 870                                     ptrdiff_t);
 871 static void produce_annotation (struct coding_system *, ptrdiff_t);
 872 static int decode_coding (struct coding_system *);
 873 static inline int *handle_composition_annotation (ptrdiff_t, ptrdiff_t,
 874                                                   struct coding_system *,
 875                                                   int *, ptrdiff_t *);
 876 static inline int *handle_charset_annotation (ptrdiff_t, ptrdiff_t,
 877                                               struct coding_system *,
 878                                               int *, ptrdiff_t *);
 879 static void consume_chars (struct coding_system *, Lisp_Object, int);
 880 static int encode_coding (struct coding_system *);
 881 static Lisp_Object make_conversion_work_buffer (int);
 882 static Lisp_Object code_conversion_restore (Lisp_Object);
 883 static inline int char_encodable_p (int, Lisp_Object);
 884 static Lisp_Object make_subsidiaries (Lisp_Object);
 885
 886 static void
 887 record_conversion_result (struct coding_system *coding,
 888                           enum coding_result_code result)
 889 {
 890   coding->result = result;
 891   switch (result)
 892     {
 893     case CODING_RESULT_INSUFFICIENT_SRC:
 894       Vlast_code_conversion_error = Qinsufficient_source;
 895       break;
 896     case CODING_RESULT_INCONSISTENT_EOL:
 897       Vlast_code_conversion_error = Qinconsistent_eol;
 898       break;
 899     case CODING_RESULT_INVALID_SRC:
 900       Vlast_code_conversion_error = Qinvalid_source;
 901       break;
 902     case CODING_RESULT_INTERRUPT:
 903       Vlast_code_conversion_error = Qinterrupted;
 904       break;
 905     case CODING_RESULT_INSUFFICIENT_MEM:
 906       Vlast_code_conversion_error = Qinsufficient_memory;
 907       break;
 908     case CODING_RESULT_INSUFFICIENT_DST:
 909       /* Don't record this error in Vlast_code_conversion_error
 910          because it happens just temporarily and is resolved when the
 911          whole conversion is finished.  */
 912       break;
 913     case CODING_RESULT_SUCCESS:
 914       break;
 915     default:
 916       Vlast_code_conversion_error = intern ("Unknown error");
 917     }
 918 }
 919
/* These wrapper macros are used to preserve validity of pointers into
   buffer text across calls to decode_char, encode_char, etc, which
   could cause relocation of buffers if they load a charset map,
   because loading a charset map allocates large structures.  */
 924
 925 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 926   do {                                                                       \
 927     ptrdiff_t offset;                                                        \
 928                                                                              \
 929     charset_map_loaded = 0;                                                  \
 930     c = DECODE_CHAR (charset, code);                                         \
 931     if (charset_map_loaded                                                   \
 932         && (offset = coding_change_source (coding)))                         \
 933       {                                                                      \
 934         src += offset;                                                       \
 935         src_base += offset;                                                  \
 936         src_end += offset;                                                   \
 937       }                                                                      \
 938   } while (0)
 939
 940 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 941   do {                                                                  \
 942     ptrdiff_t offset;                                                   \
 943                                                                         \
 944     charset_map_loaded = 0;                                             \
 945     code = ENCODE_CHAR (charset, c);                                    \
 946     if (charset_map_loaded                                              \
 947         && (offset = coding_change_destination (coding)))               \
 948       {                                                                 \
 949         dst += offset;                                                  \
 950         dst_end += offset;                                              \
 951       }                                                                 \
 952   } while (0)
 953
 954 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 955   do {                                                                  \
 956     ptrdiff_t offset;                                                   \
 957                                                                         \
 958     charset_map_loaded = 0;                                             \
 959     charset = char_charset (c, charset_list, code_return);              \
 960     if (charset_map_loaded                                              \
 961         && (offset = coding_change_destination (coding)))               \
 962       {                                                                 \
 963         dst += offset;                                                  \
 964         dst_end += offset;                                              \
 965       }                                                                 \
 966   } while (0)
 967
 968 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 969   do {                                                                  \
 970     ptrdiff_t offset;                                                   \
 971                                                                         \
 972     charset_map_loaded = 0;                                             \
 973     result = CHAR_CHARSET_P (c, charset);                               \
 974     if (charset_map_loaded                                              \
 975         && (offset = coding_change_destination (coding)))               \
 976       {                                                                 \
 977         dst += offset;                                                  \
 978         dst_end += offset;                                              \
 979       }                                                                 \
 980   } while (0)
 981
 982
 983 /* If there are at least BYTES length of room at dst, allocate memory
 984    for coding->destination and update dst and dst_end.  We don't have
 985    to take care of coding->source which will be relocated.  It is
 986    handled by calling coding_set_source in encode_coding.  */
 987
 988 #define ASSURE_DESTINATION(bytes)                               \
 989   do {                                                          \
 990     if (dst + (bytes) >= dst_end)                               \
 991       {                                                         \
 992         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 993                                                                 \
 994         dst = alloc_destination (coding, more_bytes, dst);      \
 995         dst_end = coding->destination + coding->dst_bytes;      \
 996       }                                                         \
 997   } while (0)
 998
 999
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   The branch is selected by comparing C against MAX_1_BYTE_CHAR
   ... MAX_5_BYTE_CHAR; anything above MAX_5_BYTE_CHAR is stored with
   BYTE8_STRING.  C is evaluated more than once, so it must be free of
   side effects.  Every use of C is parenthesized so that an
   expression argument (e.g. `hi | lo') is shifted and masked as a
   whole.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);          \
  } while (0)
1029
1030
/* Evaluate to the character code whose multibyte form starts at P,
   leaving P just past that form.  The length (1 to 5 bytes) is chosen
   from the high bits of the first byte.  A two-byte form whose first
   byte is below 0xC2 yields a code with 0x3FFF80 or'ed in.  This is
   like STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
  (((p)[0] & 0x80) == 0                                         \
   ? *(p)++                                                     \
   : ((p)[0] & 0x20) == 0                                       \
   ? ((p) += 2,                                                 \
      (((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)        \
       | (((p)[-2] & 0x1F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p)[0] & 0x10) == 0                                       \
   ? ((p) += 3,                                                 \
      (((p)[-1] & 0x3F)                                         \
       | (((p)[-2] & 0x3F) << 6)                                \
       | (((p)[-3] & 0x0F) << 12)))                             \
   : ((p)[0] & 0x08) == 0                                       \
   ? ((p) += 4,                                                 \
      (((p)[-1] & 0x3F)                                         \
       | (((p)[-2] & 0x3F) << 6)                                \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-4] & 0xF) << 18)))                              \
   : ((p) += 5,                                                 \
      (((p)[-1] & 0x3F)                                         \
       | (((p)[-2] & 0x3F) << 6)                                \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-4] & 0x3F) << 18))))
1059
1060
1061 /* Set coding->source from coding->src_object.  */
1062
1063 static void
1064 coding_set_source (struct coding_system *coding)
1065 {
1066   if (BUFFERP (coding->src_object))
1067     {
1068       struct buffer *buf = XBUFFER (coding->src_object);
1069
1070       if (coding->src_pos < 0)
1071         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1072       else
1073         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1074     }
1075   else if (STRINGP (coding->src_object))
1076     {
1077       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1078     }
1079   else
1080     {
1081       /* Otherwise, the source is C string and is never relocated
1082          automatically.  Thus we don't have to update anything.  */
1083     }
1084 }
1085
1086
1087 /* Set coding->source from coding->src_object, and return how many
1088    bytes coding->source was changed.  */
1089
1090 static ptrdiff_t
1091 coding_change_source (struct coding_system *coding)
1092 {
1093   const unsigned char *orig = coding->source;
1094   coding_set_source (coding);
1095   return coding->source - orig;
1096 }
1097
1098
1099 /* Set coding->destination from coding->dst_object.  */
1100
1101 static void
1102 coding_set_destination (struct coding_system *coding)
1103 {
1104   if (BUFFERP (coding->dst_object))
1105     {
1106       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1107         {
1108           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1109           coding->dst_bytes = (GAP_END_ADDR
1110                                - (coding->src_bytes - coding->consumed)
1111                                - coding->destination);
1112         }
1113       else
1114         {
1115           /* We are sure that coding->dst_pos_byte is before the gap
1116              of the buffer. */
1117           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1118                                  + coding->dst_pos_byte - BEG_BYTE);
1119           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1120                                - coding->destination);
1121         }
1122     }
1123   else
1124     {
1125       /* Otherwise, the destination is C string and is never relocated
1126          automatically.  Thus we don't have to update anything.  */
1127     }
1128 }
1129
1130
1131 /* Set coding->destination from coding->dst_object, and return how
1132    many bytes coding->destination was changed.  */
1133
1134 static ptrdiff_t
1135 coding_change_destination (struct coding_system *coding)
1136 {
1137   const unsigned char *orig = coding->destination;
1138   coding_set_destination (coding);
1139   return coding->destination - orig;
1140 }
1141
1142
1143 static void
1144 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1145 {
1146   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1147     string_overflow ();
1148   coding->destination = (unsigned char *) xrealloc (coding->destination,
1149                                                     coding->dst_bytes + bytes);
1150   coding->dst_bytes += bytes;
1151 }
1152
1153 static void
1154 coding_alloc_by_making_gap (struct coding_system *coding,
1155                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1156 {
1157   if (EQ (coding->src_object, coding->dst_object))
1158     {
1159       /* The gap may contain the produced data at the head and not-yet
1160          consumed data at the tail.  To preserve those data, we at
1161          first make the gap size to zero, then increase the gap
1162          size.  */
1163       ptrdiff_t add = GAP_SIZE;
1164
1165       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1166       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1167       make_gap (bytes);
1168       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1169       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1170     }
1171   else
1172     {
1173       Lisp_Object this_buffer;
1174
1175       this_buffer = Fcurrent_buffer ();
1176       set_buffer_internal (XBUFFER (coding->dst_object));
1177       make_gap (bytes);
1178       set_buffer_internal (XBUFFER (this_buffer));
1179     }
1180 }
1181
1182
1183 static unsigned char *
1184 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1185                    unsigned char *dst)
1186 {
1187   ptrdiff_t offset = dst - coding->destination;
1188
1189   if (BUFFERP (coding->dst_object))
1190     {
1191       struct buffer *buf = XBUFFER (coding->dst_object);
1192
1193       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1194     }
1195   else
1196     coding_alloc_by_realloc (coding, nbytes);
1197   coding_set_destination (coding);
1198   dst = coding->destination + offset;
1199   return dst;
1200 }
1201
1202 /** Macros for annotations.  */
1203
1204 /* An annotation data is stored in the array coding->charbuf in this
1205    format:
1206      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1207    LENGTH is the number of elements in the annotation.
1208    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1209    NCHARS is the number of characters in the text annotated.
1210
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1215      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1216
1217    NBYTES is the number of bytes specified in the header part of
1218    old-style emacs-mule encoding, or 0 for the other kind of
1219    composition.
1220
1221    METHOD is one of enum composition_method.
1222
1223    Optional COMPOSITION-COMPONENTS are characters and composition
1224    rules.
1225
1226    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1227    follows.
1228
   If ANNOTATION_MASK is 0, this annotation is just a placeholder to
   recover from an invalid annotation, and should be skipped by
   produce_annotation.  */
1232
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three header elements -LEN, MASK and NCHARS at BUF,
   advancing BUF past each of them, and set coding->annotated.  The
   caller must have a variable `coding' in scope.

   The expansion deliberately does not end in a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the body of an `if' that is followed by `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1243
/* Store a composition annotation at BUF: the common header written by
   ADD_ANNOTATION_DATA with length 5 and
   CODING_ANNOTATE_COMPOSITION_MASK, followed by NBYTES and METHOD.
   BUF is advanced past all five elements.  The arguments are
   parenthesized so that BUF may be any lvalue expression (for
   example `*pp').  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1250
1251
/* Store a charset annotation at BUF: the common header written by
   ADD_ANNOTATION_DATA with length 4 and CODING_ANNOTATE_CHARSET_MASK,
   followed by ID.  BUF is advanced past all four elements.  The
   arguments are parenthesized so that BUF may be any lvalue
   expression (for example `*pp').  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1257
1258 \f
1259 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1260
1261
1262
1263 \f
1264 /*** 3. UTF-8 ***/
1265
1266 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1267    Check if a text is encoded in UTF-8.  If it is, return 1, else
1268    return 0.  */
1269
/* Predicates on a byte value C, decided by its high-order bits:
   UTF_8_1_OCTET_P is true for any value below 0x80,
   UTF_8_EXTRA_OCTET_P for the bit pattern 10xxxxxx, and the
   N_OCTET_LEADING_P predicates for the patterns 110xxxxx, 1110xxxx,
   11110xxx and 111110xx respectively.  Only the bits selected by each
   mask are examined.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three bytes that the UTF-8 functions in this file compare
   against the start of the text to recognize a BOM (signature).  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1280
/* Scan the bytes of CODING->source, after the first
   CODING->head_ascii bytes, for multi-octet sequences of the UTF-8
   shape (a leading octet followed by the right number of extra
   octets).  Only that shape is checked here; the decoded values are
   not examined.

   Return 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
   when a malformed sequence is seen, or when the text ends in the
   middle of a sequence and CODING_MODE_LAST_BLOCK is set.  Otherwise
   return 1 after updating DETECT_INFO->found and
   DETECT_INFO->rejected for the SIG / NOSIG categories.

   NOTE(review): the label `no_more_source' is not the target of any
   goto written in this function; it is presumably reached from inside
   ONE_MORE_BYTE (defined earlier in this file) when SRC reaches
   SRC_END.  `multibytep' and `consumed_chars' are likewise presumably
   used by that macro -- confirm against its definition.  */

static int
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  int bom_found = 0;
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* SRC_BASE marks the start of the sequence being examined.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      /* Negative values and values below 0x80 need no further
         checking.  */
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      /* Every other byte must be followed by at least one extra
         octet; leaving the loop with `break' rejects the text.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* A BOM counts only when it sits at the very beginning of
             the source.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      /* C is not a valid leading octet of any supported length.  */
      break;
    }
  /* Reached only through `break': the text is not UTF-8.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* SRC_BASE < SRC means the text ended inside a sequence; that is an
     error only if no further block will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading EF BB BF (the encoding of U+FEFF) doesn't
         necessarily mean a BOM, so both categories stay possible.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1363
1364
/* Decode UTF-8 text from CODING->source, starting CODING->consumed
   bytes into it, and append the resulting character codes to
   CODING->charbuf after the CODING->charbuf_used elements already
   there.  Decoding stops when the charbuf is full or the source runs
   out.  On exit CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated; bytes of a unit that was only
   partly read are not counted as consumed.

   A sequence that is malformed, overlong, encodes a surrogate, or
   exceeds MAX_CHAR is not decoded as a whole: its first byte is
   emitted alone, CODING->errors is incremented, and decoding resumes
   with the following byte.

   NOTE(review): the label `no_more_source' is presumably also reached
   from inside ONE_MORE_BYTE (defined earlier in this file) when SRC
   reaches SRC_END, and `multibytep', `consumed_chars' and `src_end'
   are presumably used by that macro -- confirm against its
   definition.  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  int multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  int eol_dos =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* A byte read ahead after a CR, or -1 when there is none.  */
  int byte_after_cr = -1;

  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      /* Look at the first three bytes.  They are skipped only when
         they are exactly the BOM; in every other case SRC is rewound
         to SRC_BASE so that they are decoded normally below.  */
      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever was found, mark CODING so that the check above is not
     repeated by a later call on the same CODING.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Remember where this unit starts, both for rewinding on an
         invalid sequence and for reporting how much was consumed.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The charbuf is full.  A byte already read ahead after a
             CR has not been stored yet, so step SRC_BASE back by one
             to leave it unconsumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* Use the byte read ahead in the previous iteration, if any.  */
      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* NOTE(review): a negative value presumably comes from
             ONE_MORE_BYTE for a multibyte source; it is stored with
             its sign flipped -- confirm against the macro.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS end-of-line handling, read the byte after a CR
             now and keep it for the next iteration.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Five-octet forms must be neither overlong
                             nor beyond MAX_CHAR.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Rewind to the start of the bad sequence and emit just its
         first byte: unchanged if ASCII_BYTE_P accepts it, otherwise
         converted with BYTE8_TO_CHAR.  The bytes after it are
         examined again by the next iteration.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Report consumption only up to SRC_BASE, the start of the unit
     that was being processed when decoding stopped.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1514
1515
     /* Encode the characters in CODING->charbuf as UTF-8 and append the
        resulting bytes to CODING->destination, after the first
        CODING->produced bytes.

        If the coding system asks for a BOM (utf_with_bom), the three BOM
        bytes are emitted first and the flag is switched to
        utf_without_bom so that the BOM is written only once.  A
        character satisfying CHAR_BYTE8_P is written as the single raw
        byte given by CHAR_TO_BYTE8; any other character is written as
        the byte sequence produced by CHAR_STRING_ADVANCE_NO_UNIFY.

        On exit CODING->produced and CODING->produced_char are updated and
        the result is recorded as CODING_RESULT_SUCCESS.  Always return 0.

        Locals such as MULTIBYTEP and DST_END are not mentioned in the
        body; they are presumably used by the ASSURE_DESTINATION and
        EMIT_* macros, which are defined elsewhere in this file.  */
1516 static int
1517 encode_coding_utf_8 (struct coding_system *coding)
1518 {
1519   int multibytep = coding->dst_multibyte;
1520   int *charbuf = coding->charbuf;
1521   int *charbuf_end = charbuf + coding->charbuf_used;
1522   unsigned char *dst = coding->destination + coding->produced;
1523   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1524   ptrdiff_t produced_chars = 0;
1525   int c;
1526
1527   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1528     {
1529       ASSURE_DESTINATION (3);
1530       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1531       CODING_UTF_8_BOM (coding) = utf_without_bom;
1532     }
1533
1534   if (multibytep)
1535     {
1536       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1537
1538       while (charbuf < charbuf_end)
1539         {
1540           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1541
1542           ASSURE_DESTINATION (safe_room);
1543           c = *charbuf++;
1544           if (CHAR_BYTE8_P (c))
1545             {
1546               c = CHAR_TO_BYTE8 (c);
1547               EMIT_ONE_BYTE (c);
1548             }
1549           else
1550             {
                   /* Build the UTF-8 form in STR, then hand it over one
                      byte at a time; PRODUCED_CHARS is presumably
                      advanced by EMIT_ONE_BYTE for each of those bytes.  */
1551               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1552               for (p = str; p < pend; p++)
1553                 EMIT_ONE_BYTE (*p);
1554             }
1555         }
1556     }
1557   else
1558     {
1559       int safe_room = MAX_MULTIBYTE_LENGTH;
1560
1561       while (charbuf < charbuf_end)
1562         {
1563           ASSURE_DESTINATION (safe_room);
1564           c = *charbuf++;
1565           if (CHAR_BYTE8_P (c))
1566             *dst++ = CHAR_TO_BYTE8 (c);
1567           else
1568             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1570         }
           /* The bytes were stored directly, so count them here: every
              byte written by this call (including a BOM emitted above)
              is one produced character.  Counting one per source
              character, as was done before, under-counts each character
              whose UTF-8 form is longer than one byte and disagrees with
              the multibyte branch.  DST is relative to
              CODING->destination at this point, exactly as the
              computation of CODING->produced below assumes.  */
           produced_chars = dst - (coding->destination + coding->produced);
1571     }
1572   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1573   coding->produced_char += produced_chars;
1574   coding->produced = dst - coding->destination;
1575   return 0;
1576 }
1577
1578
1579 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1580    Check if a text is encoded in one of UTF-16 based coding systems.
1581    If it is, return 1, else return 0.  */
1582
     /* Nonzero if VAL, a 16-bit code unit (0..0xFFFF), is a UTF-16 high
        surrogate, i.e. lies in the range 0xD800..0xDBFF.  */
1583 #define UTF_16_HIGH_SURROGATE_P(val) \
1584   (((val) & 0xFC00) == 0xD800)
1585
     /* Nonzero if VAL, a 16-bit code unit (0..0xFFFF), is a UTF-16 low
        surrogate, i.e. lies in the range 0xDC00..0xDFFF.  */
1586 #define UTF_16_LOW_SURROGATE_P(val) \
1587   (((val) & 0xFC00) == 0xDC00)
1588
1589
     /* Detector for the UTF-16 family of coding categories.

        DETECT_INFO->checked always gains CATEGORY_MASK_UTF_16.  The first
        two bytes of the source decide what happens next:
          - FF FE: the LE and AUTO categories are marked as found, and the
            BE category and both NOSIG categories are rejected;
          - FE FF: the BE and AUTO categories are marked as found, and the
            LE category and both NOSIG categories are rejected;
          - otherwise the AUTO, BE and LE categories are rejected and the
            rest of the text is scanned to decide whether the two NOSIG
            categories must be rejected too.

        Return 1 when a BOM was found or when control reaches the
        no_more_source label (presumably because TWO_MORE_BYTES, defined
        elsewhere, ran out of input).  Return 0 in every other case.  */
1590 static int
1591 detect_coding_utf_16 (struct coding_system *coding,
1592                       struct coding_detection_info *detect_info)
1593 {
1594   const unsigned char *src = coding->source;
1595   const unsigned char *src_end = coding->source + coding->src_bytes;
1596   int multibytep = coding->src_multibyte;
1597   int c1, c2;
1598
1599   detect_info->checked |= CATEGORY_MASK_UTF_16;
       /* A final block holding an odd number of source characters cannot
          be split into two-byte units, so every UTF-16 category is
          rejected.  */
1600   if (coding->mode & CODING_MODE_LAST_BLOCK
1601       && (coding->src_chars & 1))
1602     {
1603       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1604       return 0;
1605     }
1606
1607   TWO_MORE_BYTES (c1, c2);
1608   if ((c1 == 0xFF) && (c2 == 0xFE))
1609     {
1610       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1611                              | CATEGORY_MASK_UTF_16_AUTO);
1612       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1613                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1614                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1615     }
1616   else if ((c1 == 0xFE) && (c2 == 0xFF))
1617     {
1618       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1619                              | CATEGORY_MASK_UTF_16_AUTO);
1620       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1621                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1622                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1623     }
1624   else if (c2 < 0)
1625     {
1626       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1627       return 0;
1628     }
1629   else
1630     {
1631       /* We check the dispersion of Eth and Oth bytes where E is even and
1632          O is odd.  If both are high, we assume binary data.  */
           /* e[B] (resp. o[B]) is 1 once byte value B has been seen as the
              first (resp. second) byte of a pair; E_NUM and O_NUM count
              the distinct values seen so far, starting with the pair
              already read.  */
1633       unsigned char e[256], o[256];
1634       unsigned e_num = 1, o_num = 1;
1635
1636       memset (e, 0, 256);
1637       memset (o, 0, 256);
1638       e[c1] = 1;
1639       o[c2] = 1;
1640
           /* No BOM: the categories that require one are out.  */
1641       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1642                                 |CATEGORY_MASK_UTF_16_BE
1643                                 | CATEGORY_MASK_UTF_16_LE);
1644
           /* Keep scanning until every UTF-16 category has been rejected,
              a negative C2 is read, or TWO_MORE_BYTES leaves the loop.  */
1645       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1646              != CATEGORY_MASK_UTF_16)
1647         {
1648           TWO_MORE_BYTES (c1, c2);
1649           if (c2 < 0)
1650             break;
1651           if (! e[c1])
1652             {
1653               e[c1] = 1;
1654               e_num++;
                   /* 128 or more distinct first bytes: reject BE without
                      signature.  */
1655               if (e_num >= 128)
1656                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1657             }
1658           if (! o[c2])
1659             {
1660               o[c2] = 1;
1661               o_num++;
                   /* 128 or more distinct second bytes: reject LE without
                      signature.  */
1662               if (o_num >= 128)
1663                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1664             }
1665         }
1666       return 0;
1667     }
1668
1669  no_more_source:
1670   return 1;
1671 }
1672
     /* Decode UTF-16 text from CODING->source, starting at offset
        CODING->consumed, and append the decoded characters to
        CODING->charbuf after the first CODING->charbuf_used entries.

        Bytes are combined in pairs according to the endianness of the
        coding system.  A high surrogate is remembered in the coding
        system (CODING_UTF_16_SURROGATE) until the next code unit is
        read; a following low surrogate completes a character at or above
        0x10000.  If anything else follows, the two bytes of the pending
        surrogate are stored as separate elements, CODING->errors is
        incremented, and the new code unit is handled on its own.

        On exit CODING->consumed_char, CODING->consumed and
        CODING->charbuf_used are updated.  The function returns through
        the no_more_source label, which ONE_MORE_BYTE (defined elsewhere)
        presumably jumps to when the source is exhausted.  */
1673 static void
1674 decode_coding_utf_16 (struct coding_system *coding)
1675 {
1676   const unsigned char *src = coding->source + coding->consumed;
1677   const unsigned char *src_end = coding->source + coding->src_bytes;
1678   const unsigned char *src_base;
1679   int *charbuf = coding->charbuf + coding->charbuf_used;
1680   /* We may produce at most 3 chars in one loop.  */
1681   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1682   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1683   int multibytep = coding->src_multibyte;
1684   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1685   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1686   int surrogate = CODING_UTF_16_SURROGATE (coding);
1687   int eol_dos =
1688     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The two bytes read ahead after a CR when EOL_DOS is set; -1 means
          that no such byte is pending.  */
1689   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1690
1691   if (bom == utf_with_bom)
1692     {
1693       int c, c1, c2;
1694
1695       src_base = src;
1696       ONE_MORE_BYTE (c1);
1697       ONE_MORE_BYTE (c2);
1698       c = (c1 << 8) | c2;
1699
1700       if (endian == utf_16_big_endian
1701           ? c != 0xFEFF : c != 0xFFFE)
1702         {
1703           /* The first two bytes are not BOM.  Treat them as bytes
1704              for a normal character.  */
               /* NOTE(review): SRC is rewound here but CONSUMED_CHARS is
                  not; if ONE_MORE_BYTE counts consumed characters, these
                  two bytes are counted twice -- confirm.  */
1705           src = src_base;
1706           coding->errors++;
1707         }
1708       CODING_UTF_16_BOM (coding) = utf_without_bom;
1709     }
1710   else if (bom == utf_detect_bom)
1711     {
1712       /* We have already tried to detect BOM and failed in
1713          detect_coding.  */
1714       CODING_UTF_16_BOM (coding) = utf_without_bom;
1715     }
1716
1717   while (1)
1718     {
1719       int c, c1, c2;
1720
           /* Remember where this code unit starts so that the exit code
              reports only completely decoded input as consumed.  */
1721       src_base = src;
1722       consumed_chars_base = consumed_chars;
1723
1724       if (charbuf >= charbuf_end)
1725         {
               /* The two bytes read ahead after a CR have not been
                  decoded yet; give them back.  */
1726           if (byte_after_cr1 >= 0)
1727             src_base -= 2;
1728           break;
1729         }
1730
1731       if (byte_after_cr1 >= 0)
1732         c1 = byte_after_cr1, byte_after_cr1 = -1;
1733       else
1734         ONE_MORE_BYTE (c1);
1735       if (c1 < 0)
1736         {
1737           *charbuf++ = -c1;
1738           continue;
1739         }
1740       if (byte_after_cr2 >= 0)
1741         c2 = byte_after_cr2, byte_after_cr2 = -1;
1742       else
1743         ONE_MORE_BYTE (c2);
1744       if (c2 < 0)
1745         {
1746           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1747           *charbuf++ = -c2;
1748           continue;
1749         }
1750       c = (endian == utf_16_big_endian
1751            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1752
1753       if (surrogate)
1754         {
1755           if (! UTF_16_LOW_SURROGATE_P (c))
1756             {
                   /* The pending high surrogate is not followed by a low
                      surrogate: emit its two bytes in source order and
                      count an error.  */
1757               if (endian == utf_16_big_endian)
1758                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1759               else
1760                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1761               *charbuf++ = c1;
1762               *charbuf++ = c2;
1763               coding->errors++;
1764               if (UTF_16_HIGH_SURROGATE_P (c))
1765                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1766               else
                     {
                       /* The stale surrogate has just been flushed;
                          forget it.  Otherwise every following code unit
                          would flush it again and be counted as an
                          error.  */
                       CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1767                   *charbuf++ = c;
                     }
1768             }
1769           else
1770             {
1771               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1772               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1773               *charbuf++ = 0x10000 + c;
1774             }
1775         }
1776       else
1777         {
1778           if (UTF_16_HIGH_SURROGATE_P (c))
1779             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1780           else
1781             {
1782               if (eol_dos && c == '\r')
1783                 {
                       /* Read the next code unit before storing the CR;
                          it is decoded by the next iteration.  If the
                          source ends here, the CR is not stored and is
                          read again on the next call.  */
1784                   ONE_MORE_BYTE (byte_after_cr1);
1785                   ONE_MORE_BYTE (byte_after_cr2);
1786                 }
1787               *charbuf++ = c;
1788             }
1789         }
1790     }
1791
1792  no_more_source:
1793   coding->consumed_char += consumed_chars_base;
1794   coding->consumed = src_base - coding->source;
1795   coding->charbuf_used = charbuf - coding->charbuf;
1796 }
1797
     /* Encode the characters in CODING->charbuf as UTF-16 and append the
        resulting bytes to CODING->destination, after the first
        CODING->produced bytes.

        Unless the coding system is marked utf_without_bom, a BOM in the
        configured byte order is emitted first and the coding system is
        then switched to utf_without_bom.  A character above
        MAX_UNICODE_CHAR is replaced by CODING->default_char.  A character
        below 0x10000 is emitted as one two-byte unit; any other
        character is emitted as a surrogate pair.

        On exit CODING->produced and CODING->produced_char are updated and
        the result is recorded as CODING_RESULT_SUCCESS.  Always return 0.

        PRODUCED_CHARS is never modified directly in this function; it is
        presumably updated by the EMIT_* macros, which are defined
        elsewhere in this file.  */
1798 static int
1799 encode_coding_utf_16 (struct coding_system *coding)
1800 {
1801   int multibytep = coding->dst_multibyte;
1802   int *charbuf = coding->charbuf;
1803   int *charbuf_end = charbuf + coding->charbuf_used;
1804   unsigned char *dst = coding->destination + coding->produced;
1805   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1806   int safe_room = 8;
1807   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1808   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1809   ptrdiff_t produced_chars = 0;
1810   int c;
1811
1812   if (bom != utf_without_bom)
1813     {
1814       ASSURE_DESTINATION (safe_room);
1815       if (big_endian)
1816         EMIT_TWO_BYTES (0xFE, 0xFF);
1817       else
1818         EMIT_TWO_BYTES (0xFF, 0xFE);
1819       CODING_UTF_16_BOM (coding) = utf_without_bom;
1820     }
1821
1822   while (charbuf < charbuf_end)
1823     {
1824       ASSURE_DESTINATION (safe_room);
1825       c = *charbuf++;
1826       if (c > MAX_UNICODE_CHAR)
1827         c = coding->default_char;
1828
1829       if (c < 0x10000)
1830         {
1831           if (big_endian)
1832             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1833           else
1834             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1835         }
1836       else
1837         {
1838           int c1, c2;
1839
               /* Split the 20-bit offset from 0x10000 into a high
                  surrogate C1 (upper 10 bits) and a low surrogate C2
                  (lower 10 bits).  */
1840           c -= 0x10000;
1841           c1 = (c >> 10) + 0xD800;
1842           c2 = (c & 0x3FF) + 0xDC00;
1843           if (big_endian)
1844             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1845           else
1846             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1847         }
1848     }
1849   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1850   coding->produced = dst - coding->destination;
1851   coding->produced_char += produced_chars;
1852   return 0;
1853 }
1854
1855 \f
1856 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1857
1858 /* Emacs' internal format for representation of multiple character
1859    sets is a kind of multi-byte encoding, i.e. characters are
1860    represented by variable-length sequences of one-byte codes.
1861
1862    ASCII characters and control characters (e.g. `tab', `newline') are
1863    represented by one-byte sequences which are their ASCII codes, in
1864    the range 0x00 through 0x7F.
1865
1866    8-bit characters of the range 0x80..0x9F are represented by
1867    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1868    code + 0x20).
1869
1870    8-bit characters of the range 0xA0..0xFF are represented by
1871    one-byte sequences which are their 8-bit code.
1872
1873    The other characters are represented by a sequence of `base
1874    leading-code', optional `extended leading-code', and one or two
1875    `position-code's.  The length of the sequence is determined by the
1876    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1877    whereas extended leading-code and position-code take the range 0xA0
1878    through 0xFF.  See `charset.h' for more details about leading-code
1879    and position-code.
1880
1881    --- CODE RANGE of Emacs' internal format ---
1882    character set        range
1883    -------------        -----
1884    ascii                0x00..0x7F
1885    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1886    eight-bit-graphic    0xA0..0xBF
1887    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1888    ---------------------------------------------
1889
1890    As this is the internal character representation, the format is
1891    usually not used externally (i.e. in a file or in a data sent to a
1892    process).  But, it is possible to have a text externally in this
1893    format (i.e. by encoding by the coding system `emacs-mule').
1894
1895    In that case, a sequence of one-byte codes has a slightly different
1896    form.
1897
1898    At first, all characters in eight-bit-control are represented by
1899    one-byte sequences which are their 8-bit code.
1900
1901    Next, character composition data are represented by the byte
1902    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1903    where,
1904         METHOD is 0xF2 plus one of the composition methods (enum
1905         composition_method),
1906
1907         BYTES is 0xA0 plus a byte length of this composition data,
1908
1909         CHARS is 0xA0 plus a number of characters composed by this
1910         data,
1911
1912         COMPONENTs are characters in multibyte form or composition
1913         rules encoded as two bytes of ASCII codes.
1914
1915    In addition, for backward compatibility, the following formats are
1916    also recognized as composition data on decoding.
1917
1918    0x80 MSEQ ...
1919    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1920
1921    Here,
1922         MSEQ is a multibyte form, but in one of these special formats:
1923           ASCII: 0xA0 ASCII_CODE+0x80,
1924           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1925         RULE is a one byte code of the range 0xA0..0xF0 that
1926         represents a composition rule.
1927   */
1928
     /* Table indexed by a byte value.  In this file the entry for a
        leading byte is used as the total byte length of the emacs-mule
        sequence that the byte starts: detect_coding_emacs_mule reads
        `emacs_mule_bytes[c] - 1' more bytes, and emacs_mule_char handles
        the values 1 through 4 and aborts on any other value.  The table
        is not initialized here; it is presumably filled in elsewhere.  */
1929 char emacs_mule_bytes[256];
1930
1931
1932 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1933    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1934    else return 0.

        DETECT_INFO->checked always gains CATEGORY_MASK_EMACS_MULE.  The
        scan skips the first CODING->head_ascii bytes.  When the scan
        stops on a byte that cannot belong to emacs-mule text, the
        category is added to DETECT_INFO->rejected and 0 is returned.
        When the source runs out (label no_more_source), the category is
        rejected only if a sequence was left unfinished in the last
        block; otherwise FOUND, which is nonzero once at least one
        multibyte or composition sequence has been seen, is merged into
        DETECT_INFO->found and 1 is returned.  */
1935
1936 static int
1937 detect_coding_emacs_mule (struct coding_system *coding,
1938                           struct coding_detection_info *detect_info)
1939 {
1940   const unsigned char *src = coding->source, *src_base;
1941   const unsigned char *src_end = coding->source + coding->src_bytes;
1942   int multibytep = coding->src_multibyte;
1943   ptrdiff_t consumed_chars = 0;
1944   int c;
1945   int found = 0;
1946
1947   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1948   /* A coding system of this category is always ASCII compatible.  */
1949   src += coding->head_ascii;
1950
1951   while (1)
1952     {
           /* Start of the sequence being examined; compared with SRC at
              no_more_source to tell whether a sequence was cut short.  */
1953       src_base = src;
1954       ONE_MORE_BYTE (c);
1955       if (c < 0)
1956         continue;
1957       if (c == 0x80)
1958         {
1959           /* Perhaps the start of composite character.  We simply skip
1960              it because analyzing it is too heavy for detecting.  But,
1961              at least, we check that the composite character
1962              consists of more than 4 bytes.  */
1963           const unsigned char *src_start;
1964
1965         repeat:
1966           src_start = src;
1967           do
1968             {
1969               ONE_MORE_BYTE (c);
1970             }
1971           while (c >= 0xA0);
1972
1973           if (src - src_start <= 4)
1974             break;
1975           found = CATEGORY_MASK_EMACS_MULE;
               /* The byte that ended the run starts another composition.  */
1976           if (c == 0x80)
1977             goto repeat;
1978         }
1979
1980       if (c < 0x80)
1981         {
               /* ESC, SI and SO are taken as evidence against emacs-mule.  */
1982           if (c < 0x20
1983               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1984             break;
1985         }
1986       else
1987         {
               /* C is a leading byte; every following byte of its
                  sequence must be 0xA0 or above.  */
1988           int more_bytes = emacs_mule_bytes[c] - 1;
1989
1990           while (more_bytes > 0)
1991             {
1992               ONE_MORE_BYTE (c);
1993               if (c < 0xA0)
1994                 {
1995                   src--;        /* Unread the last byte.  */
1996                   break;
1997                 }
1998               more_bytes--;
1999             }
2000           if (more_bytes != 0)
2001             break;
2002           found = CATEGORY_MASK_EMACS_MULE;
2003         }
2004     }
2005   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2006   return 0;
2007
2008  no_more_source:
2009   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2010     {
2011       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2012       return 0;
2013     }
2014   detect_info->found |= found;
2015   return 1;
2016 }
2017
2018
2019 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2020    character.  If CMP_STATUS indicates that we must expect MSEQ or
2021    RULE described above, decode it and return the negative value of
2022    the decoded character or rule.  If an invalid byte is found, return
2023    -1.  If SRC is too short, return -2.

        On success, *NBYTES is set to the number of bytes read from SRC
        and *NCHARS to the value of the local counter CONSUMED_CHARS
        (presumably maintained by ONE_MORE_BYTE, which is defined
        elsewhere).  If ID is non-null, *ID is set to the charset ID of
        the decoded character; it is left untouched when a RULE is
        returned.  Nothing is stored through the pointers on the -1 and
        -2 returns.  */
2024
2025 static int
2026 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2027                  int *nbytes, int *nchars, int *id,
2028                  struct composition_status *cmp_status)
2029 {
2030   const unsigned char *src_end = coding->source + coding->src_bytes;
2031   const unsigned char *src_base = src;
2032   int multibytep = coding->src_multibyte;
2033   int charset_ID;
2034   unsigned code;
2035   int c;
2036   int consumed_chars = 0;
2037   int mseq_found = 0;
2038
2039   ONE_MORE_BYTE (c);
2040   if (c < 0)
2041     {
           /* A negative value from ONE_MORE_BYTE is negated and used
              as the character itself, without going through a charset.  */
2042       c = -c;
2043       charset_ID = emacs_mule_charset[0];
2044     }
2045   else
2046     {
2047       if (c >= 0xA0)
2048         {
               /* A first byte of 0xA0 or above is valid only inside an
                  old-form composition.  */
2049           if (cmp_status->state != COMPOSING_NO
2050               && cmp_status->old_form)
2051             {
2052               if (cmp_status->state == COMPOSING_CHAR)
2053                 {
                       /* MSEQ: undo the special encoding to recover the
                          real first byte (0xA0 ASCII_CODE+0x80, or
                          LEADING_CODE+0x20).  */
2054                   if (c == 0xA0)
2055                     {
2056                       ONE_MORE_BYTE (c);
2057                       c -= 0x80;
2058                       if (c < 0)
2059                         goto invalid_code;
2060                     }
2061                   else
2062                     c -= 0x20;
2063                   mseq_found = 1;
2064                 }
2065               else
2066                 {
                       /* Not expecting a character: return the byte,
                          negated, as a RULE.  */
2067                   *nbytes = src - src_base;
2068                   *nchars = consumed_chars;
2069                   return -c;
2070                 }
2071             }
2072           else
2073             goto invalid_code;
2074         }
2075
           /* Dispatch on the total length of the sequence led by C.  */
2076       switch (emacs_mule_bytes[c])
2077         {
2078         case 2:
2079           if ((charset_ID = emacs_mule_charset[c]) < 0)
2080             goto invalid_code;
2081           ONE_MORE_BYTE (c);
2082           if (c < 0xA0)
2083             goto invalid_code;
2084           code = c & 0x7F;
2085           break;
2086
2087         case 3:
2088           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2089               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2090             {
                   /* Private leading code: the next byte selects the
                      charset and one more byte gives the code.  */
2091               ONE_MORE_BYTE (c);
2092               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2093                 goto invalid_code;
2094               ONE_MORE_BYTE (c);
2095               if (c < 0xA0)
2096                 goto invalid_code;
2097               code = c & 0x7F;
2098             }
2099           else
2100             {
                   /* C itself selects the charset; two more bytes give a
                      two-byte code.  */
2101               if ((charset_ID = emacs_mule_charset[c]) < 0)
2102                 goto invalid_code;
2103               ONE_MORE_BYTE (c);
2104               if (c < 0xA0)
2105                 goto invalid_code;
2106               code = (c & 0x7F) << 8;
2107               ONE_MORE_BYTE (c);
2108               if (c < 0xA0)
2109                 goto invalid_code;
2110               code |= c & 0x7F;
2111             }
2112           break;
2113
2114         case 4:
               /* The second byte selects the charset; two more bytes give
                  a two-byte code.  */
2115           ONE_MORE_BYTE (c);
2116           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2117             goto invalid_code;
2118           ONE_MORE_BYTE (c);
2119           if (c < 0xA0)
2120             goto invalid_code;
2121           code = (c & 0x7F) << 8;
2122           ONE_MORE_BYTE (c);
2123           if (c < 0xA0)
2124             goto invalid_code;
2125           code |= c & 0x7F;
2126           break;
2127
2128         case 1:
2129           code = c;
2130           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2131           break;
2132
2133         default:
2134           abort ();
2135         }
2136       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2137                           CHARSET_FROM_ID (charset_ID), code, c);
2138       if (c < 0)
2139         goto invalid_code;
2140     }
2141   *nbytes = src - src_base;
2142   *nchars = consumed_chars;
2143   if (id)
2144     *id = charset_ID;
2145   return (mseq_found ? -c : c);
2146
2147  no_more_source:
2148   return -2;
2149
2150  invalid_code:
2151   return -1;
2152 }
2153
2154
2155 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2156
2157 /* Handle these composition sequences ('|': the end of header elements,
2158    BYTES and CHARS >= 0xA0):
2159
2160    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2161    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2162    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2163
2164    and these old forms:
2165
2166    (4) relative composition: 0x80 | MSEQ ... MSEQ
2167    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2168
2169    When the starter 0x80 and the following header elements are found,
2170    this annotation header is produced.
2171
2172         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2173
2174    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2175    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2176
2177    Then, upon reading the following elements, these codes are produced
2178    until the composition end is found:
2179
2180    (1) CHAR ... CHAR
2181    (2) ALT ... ALT CHAR ... CHAR
2182    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2183    (4) CHAR ... CHAR
2184    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2185
2186    When the composition end is found, LENGTH and NCHARS in the
2187    annotation header are updated as below:
2188
2189    (1) LENGTH: unchanged, NCHARS: unchanged
2190    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2191    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2192    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2193    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2194
2195    If an error is found while composing, the annotation header is
2196    changed to the original composition header (plus filler -1s) as
2197    below:
2198
2199    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2200    (5)          [ 0x80 0xFF -1 -1 -1 ]
2201
2202    and the sequence [ -2 DECODED-RULE ] is changed to the original
2203    byte sequence as below:
2204         o the original byte sequence is B: [ B -1 ]
2205         o the original byte sequence is B1 B2: [ B1 B2 ]
2206
2207    Most of the routines are implemented by macros because many
2208    variables and labels in the caller decode_coding_emacs_mule must be
2209    accessible, and they are usually called just once (thus they don't
2210    increase the size of the compiled object).  */
2211
2212 /* Decode a composition rule represented by C as a component of
2213    composition sequence of Emacs 20 style.  Set RULE to the decoded
2214    rule.

        C is modified: 0xA0 is subtracted from it, and the result must lie
        in 0..80, otherwise control jumps to the `invalid_code' label of
        the enclosing function.  The reduced value is split into
        GREF = C / 9 and NREF = C % 9; a value of 4 in either of them is
        replaced by 10 before the pair is packed with
        COMPOSITION_ENCODE_RULE.  */
2215
2216 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2217   do {                                                  \
2218     int gref, nref;                                     \
2219                                                         \
2220     c -= 0xA0;                                          \
2221     if (c < 0 || c >= 81)                               \
2222       goto invalid_code;                                \
2223     gref = c / 9, nref = c % 9;                         \
2224     if (gref == 4) gref = 10;                           \
2225     if (nref == 4) nref = 10;                           \
2226     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2227   } while (0)
2228
2229
2230 /* Decode a composition rule represented by C and the following byte
2231    at SRC as a component of composition sequence of Emacs 21 style.
2232    Set RULE to the decoded rule.

        GREF is C - 0x20 and NREF is the next byte (read with
        ONE_MORE_BYTE, which overwrites C) minus 0x20.  Each must lie in
        0..80, otherwise control jumps to the `invalid_code' label of the
        enclosing function.  The pair is packed with
        COMPOSITION_ENCODE_RULE.  */
2233
2234 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2235   do {                                                  \
2236     int gref, nref;                                     \
2237                                                         \
2238     gref = c - 0x20;                                    \
2239     if (gref < 0 || gref >= 81)                         \
2240       goto invalid_code;                                \
2241     ONE_MORE_BYTE (c);                                  \
2242     nref = c - 0x20;                                    \
2243     if (nref < 0 || nref >= 81)                         \
2244       goto invalid_code;                                \
2245     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2246   } while (0)
2247
2248
2249 /* Start of Emacs 21 style format.  On entry the variable C of the
2250    enclosing function holds the method byte (0xF2 + METHOD).  The next
2251    two bytes at SRC are (0xA0 + BYTES) and (0xA0 + CHARS), where BYTES
2252    is the byte length of this composition information, CHARS is the
        number of characters composed by this composition.

        Control jumps to the `invalid_code' label of the enclosing
        function when the BYTES byte is negative, BYTES is less than 3,
        BYTES is not 4 for a relative composition, or CHARS is not in
        1..MAX_COMPOSITION_COMPONENTS-1.  Otherwise CMP_STATUS is set up
        for the new composition (relative compositions start in state
        COMPOSING_CHAR, the other methods in COMPOSING_COMPONENT_CHAR)
        and the annotation header is added to CHARBUF with
        ADD_COMPOSITION_DATA.  */
2253
2254 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2255   do {                                                                  \
2256     enum composition_method method = c - 0xF2;                          \
2257     int nbytes, nchars;                                                 \
2258                                                                         \
2259     ONE_MORE_BYTE (c);                                                  \
2260     if (c < 0)                                                          \
2261       goto invalid_code;                                                \
2262     nbytes = c - 0xA0;                                                  \
2263     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2264       goto invalid_code;                                                \
2265     ONE_MORE_BYTE (c);                                                  \
2266     nchars = c - 0xA0;                                                  \
2267     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2268       goto invalid_code;                                                \
2269     cmp_status->old_form = 0;                                           \
2270     cmp_status->method = method;                                        \
2271     if (method == COMPOSITION_RELATIVE)                                 \
2272       cmp_status->state = COMPOSING_CHAR;                               \
2273     else                                                                \
2274       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2275     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2276     cmp_status->nchars = nchars;                                        \
2277     cmp_status->ncomps = nbytes - 4;                                    \
2278     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2279   } while (0)
2280
2281
2282 /* Start of Emacs 20 style format for relative composition.
        CMP_STATUS is marked as old form with method COMPOSITION_RELATIVE
        and state COMPOSING_CHAR, its character and component counts are
        reset to 0, and an annotation header with NCHARS and NBYTES both 0
        is added to CHARBUF with ADD_COMPOSITION_DATA.  */
2283
2284 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2285   do {                                                          \
2286     cmp_status->old_form = 1;                                   \
2287     cmp_status->method = COMPOSITION_RELATIVE;                  \
2288     cmp_status->state = COMPOSING_CHAR;                         \
2289     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2290     cmp_status->nchars = cmp_status->ncomps = 0;                \
2291     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2292   } while (0)
2293
2294
2295 /* Start of Emacs 20 style format for rule-base composition.
        Identical to the relative-composition variant except that the
        method recorded in CMP_STATUS and in the annotation header is
        COMPOSITION_WITH_RULE.  */
2296
2297 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2298   do {                                                          \
2299     cmp_status->old_form = 1;                                   \
2300     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2301     cmp_status->state = COMPOSING_CHAR;                         \
2302     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2303     cmp_status->nchars = cmp_status->ncomps = 0;                \
2304     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2305   } while (0)
2306
2307
     /* Read the byte at SRC (presumably the one following the composition
        starter 0x80) and begin the composition it announces:
          - 0xF2 + a method from COMPOSITION_RELATIVE through
            COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 style header;
          - 0xA0..0xBF: Emacs 20 style relative composition; SRC is moved
            back so that the byte is read again as the first component;
          - 0xFF: Emacs 20 style rule-base composition.
        A negative value or any other byte makes control jump to the
        `invalid_code' label of the enclosing function.  */
2308 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2309   do {                                                  \
2310     const unsigned char *current_src = src;             \
2311                                                         \
2312     ONE_MORE_BYTE (c);                                  \
2313     if (c < 0)                                          \
2314       goto invalid_code;                                \
2315     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2316         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2317       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2318     else if (c < 0xA0)                                  \
2319       goto invalid_code;                                \
2320     else if (c < 0xC0)                                  \
2321       {                                                 \
2322         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2323         /* Re-read C as a composition component.  */    \
2324         src = current_src;                              \
2325       }                                                 \
2326     else if (c == 0xFF)                                 \
2327       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2328     else                                                \
2329       goto invalid_code;                                \
2330   } while (0)
2331
     /* Finish the current composition.  The annotation header is located
        CMP_STATUS->length elements before CHARBUF.  For an old-form
        composition its NCHARS slot (offset 2) is set to
        CMP_STATUS->nchars.  For a new-form composition whose method is
        beyond COMPOSITION_RELATIVE, its first slot is set to NCHARS minus
        CMP_STATUS->length.  In every case the state becomes
        COMPOSING_NO.  */
2332 #define EMACS_MULE_COMPOSITION_END()                            \
2333   do {                                                          \
2334     int idx = - cmp_status->length;                             \
2335                                                                 \
2336     if (cmp_status->old_form)                                   \
2337       charbuf[idx + 2] = cmp_status->nchars;                    \
2338     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2339       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2340     cmp_status->state = COMPOSING_NO;                           \
2341   } while (0)
2342
2343
2344 static int
2345 emacs_mule_finish_composition (int *charbuf,
2346                                struct composition_status *cmp_status)
2347 {
2348   int idx = - cmp_status->length;
2349   int new_chars;
2350
2351   if (cmp_status->old_form && cmp_status->nchars > 0)
2352     {
2353       charbuf[idx + 2] = cmp_status->nchars;
2354       new_chars = 0;
2355       if (cmp_status->method == COMPOSITION_WITH_RULE
2356           && cmp_status->state == COMPOSING_CHAR)
2357         {
2358           /* The last rule was invalid.  */
2359           int rule = charbuf[-1] + 0xA0;
2360
2361           charbuf[-2] = BYTE8_TO_CHAR (rule);
2362           charbuf[-1] = -1;
2363           new_chars = 1;
2364         }
2365     }
2366   else
2367     {
2368       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2369
2370       if (cmp_status->method == COMPOSITION_WITH_RULE)
2371         {
2372           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2373           charbuf[idx++] = -3;
2374           charbuf[idx++] = 0;
2375           new_chars = 1;
2376         }
2377       else
2378         {
2379           int nchars = charbuf[idx + 1] + 0xA0;
2380           int nbytes = charbuf[idx + 2] + 0xA0;
2381
2382           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2383           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2384           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2385           charbuf[idx++] = -1;
2386           new_chars = 4;
2387         }
2388     }
2389   cmp_status->state = COMPOSING_NO;
2390   return new_chars;
2391 }
2392
/* If a composition is in progress (state other than COMPOSING_NO),
   finish it with emacs_mule_finish_composition and add the number of
   characters that call reports to CHAR_OFFSET.  Uses the caller's
   CHARBUF, CMP_STATUS and CHAR_OFFSET.  */
2393 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2394   do {                                                                    \
2395     if (cmp_status->state != COMPOSING_NO)                                \
2396       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2397   } while (0)
2398
2399
/* Decode the emacs-mule encoded bytes of CODING->source, starting at
   offset CODING->consumed, into CODING->charbuf, starting at index
   CODING->charbuf_used.  Decoding stops when the output buffer is
   nearly full, when emacs_mule_char returns -2, or at the
   `no_more_source' label (presumably reached from ONE_MORE_BYTE when
   the input runs out -- that macro is defined elsewhere).

   A byte sequence that cannot be decoded is emitted one byte at a
   time (as itself if ASCII, otherwise through BYTE8_TO_CHAR) and
   counted in CODING->errors.  A composition that is still open at
   the end is finished if CODING_MODE_LAST_BLOCK is set; otherwise its
   charbuf elements are saved in the carryover array of the
   composition status and restored by the next call.

   On return CODING->consumed_char, CODING->consumed and
   CODING->charbuf_used have been updated.  */
2400 static void
2401 decode_coding_emacs_mule (struct coding_system *coding)
2402 {
2403   const unsigned char *src = coding->source + coding->consumed;
2404   const unsigned char *src_end = coding->source + coding->src_bytes;
2405   const unsigned char *src_base;
2406   int *charbuf = coding->charbuf + coding->charbuf_used;
2407   /* We may produce two annotations (charset and composition) in one
2408      loop and one more charset annotation at the end.  */
2409   int *charbuf_end
2410     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2411       /* We can produce up to 2 characters in a loop.  */
2412       - 1;
2413   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2414   int multibytep = coding->src_multibyte;
2415   ptrdiff_t char_offset = coding->produced_char;
2416   ptrdiff_t last_offset = char_offset;
2417   int last_id = charset_ascii;
2418   int eol_dos =
2419     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2420   int byte_after_cr = -1;
2421   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2422
       /* A composition was left open by the previous call: copy its
          saved elements back to the head of the output.  */
2423   if (cmp_status->state != COMPOSING_NO)
2424     {
2425       int i;
2426
2427       if (charbuf_end - charbuf < cmp_status->length)
2428         abort ();
2429       for (i = 0; i < cmp_status->length; i++)
2430         *charbuf++ = cmp_status->carryover[i];
2431       coding->annotated = 1;
2432     }
2433
2434   while (1)
2435     {
2436       int c, id IF_LINT (= 0);
2437
           /* Remember where this iteration started; these values are
              what gets reported as consumed when the loop is left,
              and what invalid_code rewinds to.  */
2438       src_base = src;
2439       consumed_chars_base = consumed_chars;
2440
           /* Output nearly full.  If a byte was pre-read after a CR,
              step SRC_BASE back so that byte is not counted as
              consumed.  */
2441       if (charbuf >= charbuf_end)
2442         {
2443           if (byte_after_cr >= 0)
2444             src_base--;
2445           break;
2446         }
2447
2448       if (byte_after_cr >= 0)
2449         c = byte_after_cr, byte_after_cr = -1;
2450       else
2451         ONE_MORE_BYTE (c);
2452
           /* Either byte ends any composition in progress.  0x80 then
              starts a new composition.  NOTE(review): a negative C is
              presumably how ONE_MORE_BYTE reports a raw byte found in
              multibyte source -- confirm against its definition.  */
2453       if (c < 0 || c == 0x80)
2454         {
2455           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2456           if (c < 0)
2457             {
2458               *charbuf++ = -c;
2459               char_offset++;
2460             }
2461           else
2462             DECODE_EMACS_MULE_COMPOSITION_START ();
2463           continue;
2464         }
2465
2466       if (c < 0x80)
2467         {
               /* With DOS end-of-line conversion, pre-read the byte
                  after a CR; it is handed back as C on the next
                  iteration.  */
2468           if (eol_dos && c == '\r')
2469             ONE_MORE_BYTE (byte_after_cr);
2470           id = charset_ascii;
2471           if (cmp_status->state != COMPOSING_NO)
2472             {
2473               if (cmp_status->old_form)
2474                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2475               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2476                 cmp_status->ncomps--;
2477             }
2478         }
2479       else
2480         {
2481           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2482           /* emacs_mule_char can load a charset map from a file, which
2483              allocates a large structure and might cause buffer text
2484              to be relocated as result.  Thus, we need to remember the
2485              original pointer to buffer text, and fix up all related
2486              pointers after the call.  */
2487           const unsigned char *orig = coding->source;
2488           ptrdiff_t offset;
2489
2490           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2491                                cmp_status);
2492           offset = coding->source - orig;
2493           if (offset)
2494             {
2495               src += offset;
2496               src_base += offset;
2497               src_end += offset;
2498             }
               /* -1 is treated as an invalid sequence and -2 stops
                  decoding; other negative values fall through and are
                  handled by the composition states below.  */
2499           if (c < 0)
2500             {
2501               if (c == -1)
2502                 goto invalid_code;
2503               if (c == -2)
2504                 break;
2505             }
2506           src = src_base + nbytes;
2507           consumed_chars = consumed_chars_base + nchars;
2508           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2509             cmp_status->ncomps -= nchars;
2510         }
2511
2512       /* Now if C >= 0, we found a normally encoded character, if C <
2513          0, we found an old-style composition component character or
2514          rule.  */
2515
2516       if (cmp_status->state == COMPOSING_NO)
2517         {
               /* When the charset changes, close the run of the
                  previous non-ASCII charset with a charset
                  annotation.  */
2518           if (last_id != id)
2519             {
2520               if (last_id != charset_ascii)
2521                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2522                                   last_id);
2523               last_id = id;
2524               last_offset = char_offset;
2525             }
2526           *charbuf++ = c;
2527           char_offset++;
2528         }
2529       else if (cmp_status->state == COMPOSING_CHAR)
2530         {
2531           if (cmp_status->old_form)
2532             {
2533               if (c >= 0)
2534                 {
2535                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2536                   *charbuf++ = c;
2537                   char_offset++;
2538                 }
2539               else
2540                 {
2541                   *charbuf++ = -c;
2542                   cmp_status->nchars++;
2543                   cmp_status->length++;
2544                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2545                     EMACS_MULE_COMPOSITION_END ();
2546                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2547                     cmp_status->state = COMPOSING_RULE;
2548                 }
2549             }
2550           else
2551             {
                   /* New form: NCHARS counts down the characters still
                      expected; the composition ends at zero.  */
2552               *charbuf++ = c;
2553               cmp_status->length++;
2554               cmp_status->nchars--;
2555               if (cmp_status->nchars == 0)
2556                 EMACS_MULE_COMPOSITION_END ();
2557             }
2558         }
2559       else if (cmp_status->state == COMPOSING_RULE)
2560         {
2561           int rule;
2562
2563           if (c >= 0)
2564             {
2565               EMACS_MULE_COMPOSITION_END ();
2566               *charbuf++ = c;
2567               char_offset++;
2568             }
2569           else
2570             {
2571               c = -c;
2572               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2573               if (rule < 0)
2574                 goto invalid_code;
2575               *charbuf++ = -2;
2576               *charbuf++ = rule;
2577               cmp_status->length += 2;
2578               cmp_status->state = COMPOSING_CHAR;
2579             }
2580         }
2581       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2582         {
2583           *charbuf++ = c;
2584           cmp_status->length++;
2585           if (cmp_status->ncomps == 0)
2586             cmp_status->state = COMPOSING_CHAR;
2587           else if (cmp_status->ncomps > 0)
2588             {
2589               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2590                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2591             }
2592           else
2593             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2594         }
2595       else                      /* COMPOSING_COMPONENT_RULE */
2596         {
2597           int rule;
2598
2599           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2600           if (rule < 0)
2601             goto invalid_code;
2602           *charbuf++ = -2;
2603           *charbuf++ = rule;
2604           cmp_status->length += 2;
2605           cmp_status->ncomps--;
2606           if (cmp_status->ncomps > 0)
2607             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2608           else
2609             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2610         }
2611       continue;
2612
           /* Error recovery: close any open composition, rewind to the
              start of this iteration and emit only its first byte.  */
2613     invalid_code:
2614       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2615       src = src_base;
2616       consumed_chars = consumed_chars_base;
2617       ONE_MORE_BYTE (c);
2618       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2619       char_offset++;
2620       coding->errors++;
2621     }
2622
2623  no_more_source:
2624   if (cmp_status->state != COMPOSING_NO)
2625     {
2626       if (coding->mode & CODING_MODE_LAST_BLOCK)
2627         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2628       else
2629         {
2630           int i;
2631
               /* More input is expected: take the unfinished
                  composition back out of the output and save it for
                  the next call.  */
2632           charbuf -= cmp_status->length;
2633           for (i = 0; i < cmp_status->length; i++)
2634             cmp_status->carryover[i] = charbuf[i];
2635         }
2636     }
2637   if (last_id != charset_ascii)
2638     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2639   coding->consumed_char += consumed_chars_base;
2640   coding->consumed = src_base - coding->source;
2641   coding->charbuf_used = charbuf - coding->charbuf;
2642 }
2643
2644
/* Store in CODES[0] and CODES[1] the leading code(s) that introduce a
   character of the charset whose emacs-mule id is ID.  An id below
   0xA0 is its own single leading code, and CODES[1] is set to 0.  Any
   other id is preceded by a prefix byte chosen by range: 0x9A for ids
   below 0xE0, 0x9B below 0xF0, 0x9C below 0xF5, and 0x9D otherwise.

   The expansion is a single statement with no trailing semicolon, so
   it can be used as the body of an unbraced `if ... else'.  Both
   arguments are evaluated more than once; do not pass expressions
   with side effects.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2658
2659
/* Encode the CODING->charbuf_used elements of CODING->charbuf in
   emacs-mule format, writing to CODING->destination from offset
   CODING->produced.  ASCII characters and raw bytes are emitted as
   single bytes; any other character is emitted as the leading code(s)
   of its charset followed by one or two code bytes with the high bit
   set.  Composition annotations are skipped; a charset annotation
   selects a preferred charset for the characters that follow it.

   On completion the result is recorded as CODING_RESULT_SUCCESS,
   CODING->produced_char and CODING->produced are updated, and 0 is
   returned.  NOTE(review): ASSURE_DESTINATION and the EMIT_* macros
   are defined elsewhere and presumably use DST, DST_END,
   PRODUCED_CHARS and MULTIBYTEP, and may leave the loop early --
   confirm against their definitions.  */
2660 static int
2661 encode_coding_emacs_mule (struct coding_system *coding)
2662 {
2663   int multibytep = coding->dst_multibyte;
2664   int *charbuf = coding->charbuf;
2665   int *charbuf_end = charbuf + coding->charbuf_used;
2666   unsigned char *dst = coding->destination + coding->produced;
2667   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2668   int safe_room = 8;
2669   ptrdiff_t produced_chars = 0;
2670   Lisp_Object attrs, charset_list;
2671   int c;
2672   int preferred_charset_id = -1;
2673
2674   CODING_GET_INFO (coding, attrs, charset_list);
       /* This encoder always works with Vemacs_mule_charset_list;
          install it in the attributes if another list is there.  */
2675   if (! EQ (charset_list, Vemacs_mule_charset_list))
2676     {
2677       CODING_ATTR_CHARSET_LIST (attrs)
2678         = charset_list = Vemacs_mule_charset_list;
2679     }
2680
2681   while (charbuf < charbuf_end)
2682     {
2683       ASSURE_DESTINATION (safe_room);
2684       c = *charbuf++;
2685
2686       if (c < 0)
2687         {
2688           /* Handle an annotation.  */
2689           switch (*charbuf)
2690             {
2691             case CODING_ANNOTATE_COMPOSITION_MASK:
2692               /* Not yet implemented.  */
2693               break;
2694             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Honor the requested charset only if it is a
                      member of the charset list; otherwise fall back
                      to having no preference.  */
2695               preferred_charset_id = charbuf[3];
2696               if (preferred_charset_id >= 0
2697                   && NILP (Fmemq (make_number (preferred_charset_id),
2698                                   charset_list)))
2699                 preferred_charset_id = -1;
2700               break;
2701             default:
2702               abort ();
2703             }
               /* Skip the remaining -C - 1 elements of the
                  annotation.  */
2704           charbuf += -c - 1;
2705           continue;
2706         }
2707
2708       if (ASCII_CHAR_P (c))
2709         EMIT_ONE_ASCII_BYTE (c);
2710       else if (CHAR_BYTE8_P (c))
2711         {
2712           c = CHAR_TO_BYTE8 (c);
2713           EMIT_ONE_BYTE (c);
2714         }
2715       else
2716         {
2717           struct charset *charset;
2718           unsigned code;
2719           int dimension;
2720           int emacs_mule_id;
2721           unsigned char leading_codes[2];
2722
               /* Try the preferred charset first, if there is one;
                  otherwise, or if it cannot encode C, search the
                  charset list.  */
2723           if (preferred_charset_id >= 0)
2724             {
2725               int result;
2726
2727               charset = CHARSET_FROM_ID (preferred_charset_id);
2728               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2729               if (result)
2730                 code = ENCODE_CHAR (charset, c);
2731               else
2732                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2733                                      &code, charset);
2734             }
2735           else
2736             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2737                                  &code, charset);
               /* No charset can encode C: substitute the default
                  character of the coding system.  NOTE(review):
                  CHARSET is not re-checked after this second lookup;
                  it is presumably guaranteed non-null for the default
                  character -- confirm.  */
2738           if (! charset)
2739             {
2740               c = coding->default_char;
2741               if (ASCII_CHAR_P (c))
2742                 {
2743                   EMIT_ONE_ASCII_BYTE (c);
2744                   continue;
2745                 }
2746               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2747                                    &code, charset);
2748             }
2749           dimension = CHARSET_DIMENSION (charset);
2750           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2751           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2752           EMIT_ONE_BYTE (leading_codes[0]);
2753           if (leading_codes[1])
2754             EMIT_ONE_BYTE (leading_codes[1]);
               /* Code bytes are emitted with their high bit set: one
                  byte for a dimension-1 charset, two otherwise.  */
2755           if (dimension == 1)
2756             EMIT_ONE_BYTE (code | 0x80);
2757           else
2758             {
2759               code |= 0x8080;
2760               EMIT_ONE_BYTE (code >> 8);
2761               EMIT_ONE_BYTE (code & 0xFF);
2762             }
2763         }
2764     }
2765   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2766   coding->produced_char += produced_chars;
2767   coding->produced = dst - coding->destination;
2768   return 0;
2769 }
2770
2771 \f
2772 /*** 7. ISO2022 handlers ***/
2773
2774 /* The following note describes the coding system ISO2022 briefly.
2775    Since the intention of this note is to help understand the
2776    functions in this file, some parts are NOT ACCURATE or are OVERLY
2777    SIMPLIFIED.  For thorough understanding, please refer to the
2778    original document of ISO2022.  This is equivalent to the standard
2779    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2780
2781    ISO2022 provides many mechanisms to encode several character sets
2782    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2783    is encoded using bytes less than 128.  This may make the encoded
2784    text a little bit longer, but the text passes more easily through
2785    several types of gateway, some of which strip off the MSB (Most
2786    Significant Bit).
2787
2788    There are two kinds of character sets: control character sets and
2789    graphic character sets.  The former contain control characters such
2790    as `newline' and `escape' to provide control functions (control
2791    functions are also provided by escape sequences).  The latter
2792    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2793    two control character sets and many graphic character sets.
2794
2795    Graphic character sets are classified into one of the following
2796    four classes, according to the number of bytes (DIMENSION) and
2797    number of characters in one dimension (CHARS) of the set:
2798    - DIMENSION1_CHARS94
2799    - DIMENSION1_CHARS96
2800    - DIMENSION2_CHARS94
2801    - DIMENSION2_CHARS96
2802
2803    In addition, each character set is assigned an identification tag,
2804    unique for each set, called the "final character" (denoted as <F>
2805    hereafter).  The <F> of each character set is decided by ECMA(*)
2806    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2807    (0x30..0x3F are for private use only).
2808
2809    Note (*): ECMA = European Computer Manufacturers Association
2810
2811    Here are examples of graphic character sets [NAME(<F>)]:
2812         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2813         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2814         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2815         o DIMENSION2_CHARS96 -- none for the moment
2816
2817    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2818         C0 [0x00..0x1F] -- control character plane 0
2819         GL [0x20..0x7F] -- graphic character plane 0
2820         C1 [0x80..0x9F] -- control character plane 1
2821         GR [0xA0..0xFF] -- graphic character plane 1
2822
2823    A control character set is directly designated and invoked to C0 or
2824    C1 by an escape sequence.  The most common case is that:
2825    - ISO646's  control character set is designated/invoked to C0, and
2826    - ISO6429's control character set is designated/invoked to C1,
2827    and usually these designations/invocations are omitted in encoded
2828    text.  In a 7-bit environment, only C0 can be used, and a control
2829    character for C1 is encoded by an appropriate escape sequence to
2830    fit into the environment.  All control characters for C1 are
2831    defined to have corresponding escape sequences.
2832
2833    A graphic character set is at first designated to one of four
2834    graphic registers (G0 through G3), then these graphic registers are
2835    invoked to GL or GR.  These designations and invocations can be
2836    done independently.  The most common case is that G0 is invoked to
2837    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2838    these invocations and designations are omitted in encoded text.
2839    In a 7-bit environment, only GL can be used.
2840
2841    When a graphic character set of CHARS94 is invoked to GL, codes
2842    0x20 and 0x7F of the GL area work as control characters SPACE and
2843    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2844    be used.
2845
2846    There are two ways of invocation: locking-shift and single-shift.
2847    With locking-shift, the invocation lasts until the next different
2848    invocation, whereas with single-shift, the invocation affects the
2849    following character only and doesn't affect the locking-shift
2850    state.  Invocations are done by the following control characters or
2851    escape sequences:
2852
2853    ----------------------------------------------------------------------
2854    abbrev  function                  cntrl escape seq   description
2855    ----------------------------------------------------------------------
2856    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2857    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2858    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2859    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2860    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2861    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2862    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2863    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2864    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2865    ----------------------------------------------------------------------
2866    (*) These are not used by any known coding system.
2867
2868    Control characters for these functions are defined by macros
2869    ISO_CODE_XXX in `coding.h'.
2870
2871    Designations are done by the following escape sequences:
2872    ----------------------------------------------------------------------
2873    escape sequence      description
2874    ----------------------------------------------------------------------
2875    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2876    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2877    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2878    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2879    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2880    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2881    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2882    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2883    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2884    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2885    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2886    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2887    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2888    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2889    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2890    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2891    ----------------------------------------------------------------------
2892
2893    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2894    of dimension 1, chars 94, and final character <F>, etc...
2895
2896    Note (*): Although these designations are not allowed in ISO2022,
2897    Emacs accepts them on decoding, and produces them on encoding
2898    CHARS96 character sets in a coding system which is characterized as
2899    7-bit environment, non-locking-shift, and non-single-shift.
2900
2901    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2902    '(' must be omitted.  We refer to this as "short-form" hereafter.
2903
2904    Now you may notice that there are a lot of ways of encoding the
2905    same multilingual text in ISO2022.  Actually, there exist many
2906    coding systems such as Compound Text (used in X11's inter-client
2907    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2908    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2909    localized platforms), and all of these are variants of ISO2022.
2910
2911    In addition to the above, Emacs handles two more kinds of escape
2912    sequences: ISO6429's direction specification and Emacs' private
2913    sequence for specifying character composition.
2914
2915    ISO6429's direction specification takes the following form:
2916         o CSI ']'      -- end of the current direction
2917         o CSI '0' ']'  -- end of the current direction
2918         o CSI '1' ']'  -- start of left-to-right text
2919         o CSI '2' ']'  -- start of right-to-left text
2920    The control character CSI (0x9B: control sequence introducer) is
2921    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2922
2923    Character composition specification takes the following form:
2924         o ESC '0' -- start relative composition
2925         o ESC '1' -- end composition
2926         o ESC '2' -- start rule-base composition (*)
2927         o ESC '3' -- start relative composition with alternate chars  (**)
2928         o ESC '4' -- start rule-base composition with alternate chars  (**)
2929   Since these are not standard escape sequences of any ISO standard,
2930   the use of them with these meanings is restricted to Emacs only.
2931
2932   (*) This form is used only in Emacs 20.7 and older versions,
2933   but newer versions can safely decode it.
2934   (**) This form is used only in Emacs 21.1 and newer versions,
2935   and older versions can't decode it.
2936
2937   Here's a list of example usages of these composition escape
2938   sequences (categorized by `enum composition_method').
2939
2940   COMPOSITION_RELATIVE:
2941         ESC 0 CHAR [ CHAR ] ESC 1
2942   COMPOSITION_WITH_RULE:
2943         ESC 2 CHAR [ RULE CHAR ] ESC 1
2944   COMPOSITION_WITH_ALTCHARS:
2945         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2946   COMPOSITION_WITH_RULE_ALTCHARS:
2947         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2948
/* Table of 256 values of type enum iso_code_class_type; presumably
   indexed by byte value to classify each byte for ISO-2022
   processing.  It is filled in outside this part of the file.  */
2949 static enum iso_code_class_type iso_code_class[256];
2950
/* Return nonzero if the charset whose id is ID is supported by
   CODING, i.e. ID lies within 0..CODING->max_charset_id and its entry
   in CODING->safe_charsets is not the marker value 255.  A negative
   ID (such as a failed table lookup) is reported as unsupported
   instead of indexing before the start of the array.  ID is evaluated
   more than once; do not pass an expression with side effects.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2954
2955 static void
2956 setup_iso_safe_charsets (Lisp_Object attrs)
2957 {
2958   Lisp_Object charset_list, safe_charsets;
2959   Lisp_Object request;
2960   Lisp_Object reg_usage;
2961   Lisp_Object tail;
2962   EMACS_INT reg94, reg96;
2963   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2964   int max_charset_id;
2965
2966   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2967   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2968       && ! EQ (charset_list, Viso_2022_charset_list))
2969     {
2970       CODING_ATTR_CHARSET_LIST (attrs)
2971         = charset_list = Viso_2022_charset_list;
2972       ASET (attrs, coding_attr_safe_charsets, Qnil);
2973     }
2974
2975   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2976     return;
2977
2978   max_charset_id = 0;
2979   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2980     {
2981       int id = XINT (XCAR (tail));
2982       if (max_charset_id < id)
2983         max_charset_id = id;
2984     }
2985
2986   safe_charsets = make_uninit_string (max_charset_id + 1);
2987   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2988   request = AREF (attrs, coding_attr_iso_request);
2989   reg_usage = AREF (attrs, coding_attr_iso_usage);
2990   reg94 = XINT (XCAR (reg_usage));
2991   reg96 = XINT (XCDR (reg_usage));
2992
2993   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2994     {
2995       Lisp_Object id;
2996       Lisp_Object reg;
2997       struct charset *charset;
2998
2999       id = XCAR (tail);
3000       charset = CHARSET_FROM_ID (XINT (id));
3001       reg = Fcdr (Fassq (id, request));
3002       if (! NILP (reg))
3003         SSET (safe_charsets, XINT (id), XINT (reg));
3004       else if (charset->iso_chars_96)
3005         {
3006           if (reg96 < 4)
3007             SSET (safe_charsets, XINT (id), reg96);
3008         }
3009       else
3010         {
3011           if (reg94 < 4)
3012             SSET (safe_charsets, XINT (id), reg94);
3013         }
3014     }
3015   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3016 }
3017
3018
3019 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3020    Check if a text is encoded in one of ISO-2022 based coding systems.
3021    If it is, return 1, else return 0.  */
3022
3023 static int
3024 detect_coding_iso_2022 (struct coding_system *coding,
3025                         struct coding_detection_info *detect_info)
3026 {
3027   const unsigned char *src = coding->source, *src_base = src;
3028   const unsigned char *src_end = coding->source + coding->src_bytes;
3029   int multibytep = coding->src_multibyte;
3030   int single_shifting = 0;
3031   int id;
3032   int c, c1;
3033   ptrdiff_t consumed_chars = 0;
3034   int i;
3035   int rejected = 0;
3036   int found = 0;
3037   int composition_count = -1;
3038
3039   detect_info->checked |= CATEGORY_MASK_ISO;
3040
3041   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3042     {
3043       struct coding_system *this = &(coding_categories[i]);
3044       Lisp_Object attrs, val;
3045
3046       if (this->id < 0)
3047         continue;
3048       attrs = CODING_ID_ATTRS (this->id);
3049       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3050           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3051         setup_iso_safe_charsets (attrs);
3052       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3053       this->max_charset_id = SCHARS (val) - 1;
3054       this->safe_charsets = SDATA (val);
3055     }
3056
3057   /* A coding system of this category is always ASCII compatible.  */
3058   src += coding->head_ascii;
3059
3060   while (rejected != CATEGORY_MASK_ISO)
3061     {
3062       src_base = src;
3063       ONE_MORE_BYTE (c);
3064       switch (c)
3065         {
3066         case ISO_CODE_ESC:
3067           if (inhibit_iso_escape_detection)
3068             break;
3069           single_shifting = 0;
3070           ONE_MORE_BYTE (c);
3071           if (c == 'N' || c == 'O')
3072             {
3073               /* ESC <Fe> for SS2 or SS3.  */
3074               single_shifting = 1;
3075               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3076             }
3077           else if (c == '1')
3078             {
3079               /* End of composition.  */
3080               if (composition_count < 0
3081                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3082                 /* Invalid */
3083                 break;
3084               composition_count = -1;
3085               found |= CATEGORY_MASK_ISO;
3086             }
3087           else if (c >= '0' && c <= '4')
3088             {
3089               /* ESC <Fp> for start/end composition.  */
3090               composition_count = 0;
3091             }
3092           else
3093             {
3094               if (c >= '(' && c <= '/')
3095                 {
3096                   /* Designation sequence for a charset of dimension 1.  */
3097                   ONE_MORE_BYTE (c1);
3098                   if (c1 < ' ' || c1 >= 0x80
3099                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3100                     /* Invalid designation sequence.  Just ignore.  */
3101                     break;
3102                 }
3103               else if (c == '$')
3104                 {
3105                   /* Designation sequence for a charset of dimension 2.  */
3106                   ONE_MORE_BYTE (c);
3107                   if (c >= '@' && c <= 'B')
3108                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3109                     id = iso_charset_table[1][0][c];
3110                   else if (c >= '(' && c <= '/')
3111                     {
3112                       ONE_MORE_BYTE (c1);
3113                       if (c1 < ' ' || c1 >= 0x80
3114                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3115                         /* Invalid designation sequence.  Just ignore.  */
3116                         break;
3117                     }
3118                   else
3119                     /* Invalid designation sequence.  Just ignore it.  */
3120                     break;
3121                 }
3122               else
3123                 {
3124                   /* Invalid escape sequence.  Just ignore it.  */
3125                   break;
3126                 }
3127
3128               /* We found a valid designation sequence for CHARSET.  */
3129               rejected |= CATEGORY_MASK_ISO_8BIT;
3130               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3131                                   id))
3132                 found |= CATEGORY_MASK_ISO_7;
3133               else
3134                 rejected |= CATEGORY_MASK_ISO_7;
3135               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3136                                   id))
3137                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3138               else
3139                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3140               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3141                                   id))
3142                 found |= CATEGORY_MASK_ISO_7_ELSE;
3143               else
3144                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3145               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3146                                   id))
3147                 found |= CATEGORY_MASK_ISO_8_ELSE;
3148               else
3149                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3150             }
3151           break;
3152
3153         case ISO_CODE_SO:
3154         case ISO_CODE_SI:
3155           /* Locking shift out/in.  */
3156           if (inhibit_iso_escape_detection)
3157             break;
3158           single_shifting = 0;
3159           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3160           break;
3161
3162         case ISO_CODE_CSI:
3163           /* Control sequence introducer.  */
3164           single_shifting = 0;
3165           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3166           found |= CATEGORY_MASK_ISO_8_ELSE;
3167           goto check_extra_latin;
3168
3169         case ISO_CODE_SS2:
3170         case ISO_CODE_SS3:
3171           /* Single shift.   */
3172           if (inhibit_iso_escape_detection)
3173             break;
3174           single_shifting = 0;
3175           rejected |= CATEGORY_MASK_ISO_7BIT;
3176           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3177               & CODING_ISO_FLAG_SINGLE_SHIFT)
3178             {
3179               found |= CATEGORY_MASK_ISO_8_1;
3180               single_shifting = 1;
3181             }
3182           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3183               & CODING_ISO_FLAG_SINGLE_SHIFT)
3184             {
3185               found |= CATEGORY_MASK_ISO_8_2;
3186               single_shifting = 1;
3187             }
3188           if (single_shifting)
3189             break;
3190         check_extra_latin:
3191           if (! VECTORP (Vlatin_extra_code_table)
3192               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3193             {
3194               rejected = CATEGORY_MASK_ISO;
3195               break;
3196             }
3197           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3198               & CODING_ISO_FLAG_LATIN_EXTRA)
3199             found |= CATEGORY_MASK_ISO_8_1;
3200           else
3201             rejected |= CATEGORY_MASK_ISO_8_1;
3202           rejected |= CATEGORY_MASK_ISO_8_2;
3203           break;
3204
3205         default:
3206           if (c < 0)
3207             continue;
3208           if (c < 0x80)
3209             {
3210               if (composition_count >= 0)
3211                 composition_count++;
3212               single_shifting = 0;
3213               break;
3214             }
3215           if (c >= 0xA0)
3216             {
3217               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3218               found |= CATEGORY_MASK_ISO_8_1;
3219               /* Check the length of succeeding codes of the range
3220                  0xA0..0FF.  If the byte length is even, we include
3221                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3222                  only when we are not single shifting.  */
3223               if (! single_shifting
3224                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3225                 {
3226                   int len = 1;
3227                   while (src < src_end)
3228                     {
3229                       src_base = src;
3230                       ONE_MORE_BYTE (c);
3231                       if (c < 0xA0)
3232                         {
3233                           src = src_base;
3234                           break;
3235                         }
3236                       len++;
3237                     }
3238
3239                   if (len & 1 && src < src_end)
3240                     {
3241                       rejected |= CATEGORY_MASK_ISO_8_2;
3242                       if (composition_count >= 0)
3243                         composition_count += len;
3244                     }
3245                   else
3246                     {
3247                       found |= CATEGORY_MASK_ISO_8_2;
3248                       if (composition_count >= 0)
3249                         composition_count += len / 2;
3250                     }
3251                 }
3252               break;
3253             }
3254         }
3255     }
3256   detect_info->rejected |= CATEGORY_MASK_ISO;
3257   return 0;
3258
3259  no_more_source:
3260   detect_info->rejected |= rejected;
3261   detect_info->found |= (found & ~rejected);
3262   return 1;
3263 }
3264
3265
3266 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3267    escape sequence should be kept.

        DIM, CHARS_96 and FINAL are handed to ISO_CHARSET_TABLE to look
        up a charset id, which then becomes the designation of slot REG
        of `coding'.  `coding' is not a parameter: it is the variable of
        that name in the expanding function.  CHARS_96 must be a
        modifiable lvalue because the macro assigns to it.

        The designation is refused -- slot REG is set to -2, CHARS_96 is
        set to -1 and the rest of the macro is skipped -- when FINAL is
        below '0' or not below 128, when the table lookup yields a
        negative id, or when SAFE_CHARSET_P rejects the id for `coding'.
        The `break' in that path only leaves the macro's own do/while
        wrapper, not any loop or switch around the expansion.

        Otherwise, before the id is stored, charset_jisx0201_roman is
        replaced by charset_ascii when CODING_ISO_FLAG_USE_ROMAN is set,
        and charset_jisx0208_1978 by charset_jisx0208 when
        CODING_ISO_FLAG_USE_OLDJIS is set.  */
3268 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3269   do {                                                                  \
3270     int id, prev;                                                       \
3271                                                                         \
3272     if (final < '0' || final >= 128                                     \
3273         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3274         || !SAFE_CHARSET_P (coding, id))                                \
3275       {                                                                 \
3276         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3277         chars_96 = -1;                                                  \
3278         break;                                                          \
3279       }                                                                 \
3280     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3281     if (id == charset_jisx0201_roman)                                   \
3282       {                                                                 \
3283         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3284           id = charset_ascii;                                           \
3285       }                                                                 \
3286     else if (id == charset_jisx0208_1978)                               \
3287       {                                                                 \
3288         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3289           id = charset_jisx0208;                                        \
3290       }                                                                 \
3291     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3292     /* If there was an invalid designation to REG previously, and this  \
3293        designation is ASCII to REG, we should keep this designation     \
3294        sequence.  */                                                    \
3295     if (prev == -2 && id == charset_ascii)                              \
3296       chars_96 = -1;                                                    \
3297   } while (0)
3298
3299
3300 /* Handle these composition sequences (ALT: alternate char):
3301
3302    (1) relative composition: ESC 0 CHAR ... ESC 1
3303    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3304    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3305    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3306
3307    When the start sequence (ESC 0/2/3/4) is found, this annotation
3308    header is produced.
3309
3310         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3311
3312    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3313    produced until the end sequence (ESC 1) is found:
3314
3315    (1) CHAR ... CHAR
3316    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3317    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3318    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3319
3320    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3321    annotation header are updated as below:
3322
3323    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3324    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3325    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3326    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3327
3328    If an error is found while composing, the annotation header is
3329    changed to:
3330
3331         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3332
3333    and the sequence [ -2 DECODED-RULE ] is changed to the original
3334    byte sequence as below:
3335         o the original byte sequence is B: [ B -1 ]
3336         o the original byte sequence is B1 B2: [ B1 B2 ]
3337    and the sequence [ -1 -1 ] is changed to the original byte
3338    sequence:
3339         [ ESC '0' ]
3340 */
3341
3342 /* Decode a composition rule C1 and maybe one more byte from the
3343    source, and set RULE to the encoded composition rule.  If the rule
3344    is invalid, goto invalid_code.

        C1 is not a parameter: it is the variable of that name in the
        expanding function, which must also provide the invalid_code
        label and whatever ONE_MORE_BYTE relies on.  RULE must be a
        modifiable int lvalue.

        C1 - 32 selects the format.  A negative value is invalid.  A
        value below 81 is a complete one-byte rule made of two base-9
        digits (value / 9 and value % 9); a digit equal to 4 is replaced
        by 10 before the pair is given to COMPOSITION_ENCODE_RULE.  Any
        other value starts a two-byte rule: the second byte is fetched
        with ONE_MORE_BYTE, the pair (C1 - 32 - 81, byte - 32) is checked
        with COMPOSITION_ENCODE_RULE_VALID and then encoded, and 0x100 is
        added so the result can be told apart from a one-byte rule.

        NOTE(review): ONE_MORE_BYTE is defined elsewhere; presumably it
        leaves through the no_more_source label when the input runs
        out -- confirm against its definition.  */
3345
3346 #define DECODE_COMPOSITION_RULE(rule)                                   \
3347   do {                                                                  \
3348     rule = c1 - 32;                                                     \
3349     if (rule < 0)                                                       \
3350       goto invalid_code;                                                \
3351     if (rule < 81)              /* old format (before ver.21) */        \
3352       {                                                                 \
3353         int gref = (rule) / 9;                                          \
3354         int nref = (rule) % 9;                                          \
3355         if (gref == 4) gref = 10;                                       \
3356         if (nref == 4) nref = 10;                                       \
3357         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3358       }                                                                 \
3359     else                        /* new format (after ver.21) */         \
3360       {                                                                 \
3361         int b;                                                          \
3362                                                                         \
3363         ONE_MORE_BYTE (b);                                              \
3364         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3365           goto invalid_code;                                            \
3366         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3367         rule += 0x100;   /* Distinguish it from the old format.  */     \
3368       }                                                                 \
3369   } while (0)
3370
     /* Turn the encoded composition rule RULE back into the byte values
        it was decoded from (the reverse of DECODE_COMPOSITION_RULE) and
        store them in charbuf[idx] and charbuf[idx + 1].  charbuf, idx
        and new_chars are not parameters: they are the variables of
        those names in the expanding function.

        A RULE below 0x100 is the one-byte format: the two digits
        (RULE / 12 and RULE % 12, with 10 mapped back to 4) are packed
        into the single value 32 + gref * 9 + nref, the second slot is
        set to -1, and new_chars grows by one.  Otherwise the two values
        taken from RULE % 0x100 are stored as 32 + 81 + gref and
        32 + nref, and new_chars grows by two.

        RULE is expanded more than once and without parentheses.  The
        expansion in finish_composition passes charbuf[idx + 1]; every
        read of RULE happens before that slot is overwritten.

        NOTE(review): the divisor 12 presumably mirrors the way
        COMPOSITION_ENCODE_RULE packs its two arguments -- confirm
        against its definition.  */
3371 #define ENCODE_COMPOSITION_RULE(rule)                           \
3372   do {                                                          \
3373     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3374                                                                 \
3375     if (rule < 0x100)           /* old format */                \
3376       {                                                         \
3377         if (gref == 10) gref = 4;                               \
3378         if (nref == 10) nref = 4;                               \
3379         charbuf[idx] = 32 + gref * 9 + nref;                    \
3380         charbuf[idx + 1] = -1;                                  \
3381         new_chars++;                                            \
3382       }                                                         \
3383     else                                /* new format */        \
3384       {                                                         \
3385         charbuf[idx] = 32 + 81 + gref;                          \
3386         charbuf[idx + 1] = 32 + nref;                           \
3387         new_chars += 2;                                         \
3388       }                                                         \
3389   } while (0)
3390
3391 /* Finish the current composition as invalid.

        CHARBUF points just past the slots stored for the composition
        and CMP_STATUS->length is the number of those slots, so they are
        addressed with negative indices starting at
        - CMP_STATUS->length.  The annotation is turned back, in place,
        into the codes it was built from:

          o the five header slots become ESC, the start character for
            CMP_STATUS->method, -2, 0, -1;
          o for methods at or above COMPOSITION_WITH_RULE, each
            [ -2 RULE ] pair is rewritten by ENCODE_COMPOSITION_RULE and
            each -1, together with the slot after it, becomes ESC '0'.

        CMP_STATUS->state is reset to COMPOSING_NO.

        Return CMP_STATUS->nchars plus the number of codes recovered
        from rules and from [ -1 -1 ] separators; MAYBE_FINISH_COMPOSITION
        adds this value to char_offset.  */
3392
3393 static int finish_composition (int *, struct composition_status *);
3394
3395 static int
3396 finish_composition (int *charbuf, struct composition_status *cmp_status)
3397 {
       /* Index of the first header slot, relative to CHARBUF.  The names
          idx, charbuf and new_chars are used by ENCODE_COMPOSITION_RULE
          below and must not be renamed.  */
3398   int idx = - cmp_status->length;
3399   int new_chars;
3400
3401   /* Recover the original ESC sequence */
3402   charbuf[idx++] = ISO_CODE_ESC;
3403   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3404                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3405                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3406                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3407                     : '4');
       /* Fill the remaining three header slots.  */
3408   charbuf[idx++] = -2;
3409   charbuf[idx++] = 0;
3410   charbuf[idx++] = -1;
       /* The characters already stored count as produced characters.  */
3411   new_chars = cmp_status->nchars;
       /* Walk the slots after the header and undo the markers written by
          STORE_COMPOSITION_RULE (-2) and DECODE_COMPOSITION_START (-1).
          NOTE(review): presumably only COMPOSITION_RELATIVE orders below
          COMPOSITION_WITH_RULE and is skipped here -- confirm against
          the enum.  */
3412   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3413     for (; idx < 0; idx++)
3414       {
3415         int elt = charbuf[idx];
3416
3417         if (elt == -2)
3418           {
                 /* Rewrites charbuf[idx] and charbuf[idx + 1] and bumps
                    new_chars; then step over the second slot.  */
3419             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3420             idx++;
3421           }
3422         else if (elt == -1)
3423           {
                 /* This slot and the next one become ESC '0'.  */
3424             charbuf[idx++] = ISO_CODE_ESC;
3425             charbuf[idx] = '0';
3426             new_chars += 2;
3427           }
3428       }
3429   cmp_status->state = COMPOSING_NO;
3430   return new_chars;
3431 }
3432
3433 /* If characters are under composition, finish the composition.

        That is, when cmp_status->state is not COMPOSING_NO, call
        finish_composition and add its return value to char_offset.
        cmp_status, charbuf and char_offset are the variables of those
        names in the expanding function.  */
3434 #define MAYBE_FINISH_COMPOSITION()                              \
3435   do {                                                          \
3436     if (cmp_status->state != COMPOSING_NO)                      \
3437       char_offset += finish_composition (charbuf, cmp_status);  \
3438   } while (0)
3439
3440 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3441
3442    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3443    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3444    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3445    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3446
3447    Produce this annotation sequence now:
3448
3449    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        NOTE(review): the header is written by ADD_COMPOSITION_DATA,
        which is defined elsewhere.  The five-slot layout given here
        follows the overview comment earlier in this file, the two zero
        arguments passed below, and the five slots finish_composition
        rewrites -- confirm against ADD_COMPOSITION_DATA and
        MAX_ANNOTATION_LENGTH.

        C1 is the byte following ESC.  There are two cases:

        o C1 is '0' while the components of an ESC 3 or ESC 4 composition
          are being read (state COMPOSING_COMPONENT_CHAR with method
          COMPOSITION_WITH_ALTCHARS, or state COMPOSING_COMPONENT_RULE
          with method COMPOSITION_WITH_RULE_ALTCHARS).  ESC 0 then only
          separates the components from the characters: [ -1 -1 ] is
          stored, cmp_status->length grows by two and the state becomes
          COMPOSING_CHAR.

        o Otherwise a new composition starts.  A composition still in
          progress is first finished as invalid.  The method is chosen
          from C1, the state becomes COMPOSING_CHAR for '0' and '2' and
          COMPOSING_COMPONENT_CHAR for '3' and '4', the header is
          stored, cmp_status->length is set to MAX_ANNOTATION_LENGTH,
          the character and component counts are cleared, and
          coding->annotated is set.

        charbuf, cmp_status, char_offset and coding are the variables of
        those names in the expanding function.
3450 */
3451
3452 #define DECODE_COMPOSITION_START(c1)                                       \
3453   do {                                                                     \
3454     if (c1 == '0'                                                          \
3455         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3456              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3457             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3458                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3459       {                                                                    \
3460         *charbuf++ = -1;                                                   \
3461         *charbuf++= -1;                                                    \
3462         cmp_status->state = COMPOSING_CHAR;                                \
3463         cmp_status->length += 2;                                           \
3464       }                                                                    \
3465     else                                                                   \
3466       {                                                                    \
3467         MAYBE_FINISH_COMPOSITION ();                                       \
3468         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3469                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3470                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3471                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3472         cmp_status->state                                                  \
3473           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3474         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3475         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3476         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3477         coding->annotated = 1;                                             \
3478       }                                                                    \
3479   } while (0)
3480
3481
3482 /* Handle composition end sequence ESC 1.

        The end is invalid when no character has been stored
        (cmp_status->nchars is 0), or when the state does not fit the
        method: for COMPOSITION_WITH_RULE the state must not be
        COMPOSING_CHAR, for every other method it must be
        COMPOSING_CHAR.  In that case the composition is finished as
        invalid and control goes to the invalid_code label of the
        expanding function.

        Otherwise the header, whose first slot is
        charbuf[- cmp_status->length], is completed: its first slot is
        decreased by ncomps + 2 for COMPOSITION_WITH_ALTCHARS or by
        ncomps * 3 for COMPOSITION_WITH_RULE_ALTCHARS, its third slot
        receives nchars, char_offset grows by nchars and the state
        becomes COMPOSING_NO.

        charbuf, cmp_status and char_offset are the variables of those
        names in the expanding function.  */
3483
3484 #define DECODE_COMPOSITION_END()                                        \
3485   do {                                                                  \
3486     if (cmp_status->nchars == 0                                         \
3487         || ((cmp_status->state == COMPOSING_CHAR)                       \
3488             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3489       {                                                                 \
3490         MAYBE_FINISH_COMPOSITION ();                                    \
3491         goto invalid_code;                                              \
3492       }                                                                 \
3493     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3494       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3495     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3496       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3497     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3498     char_offset += cmp_status->nchars;                                  \
3499     cmp_status->state = COMPOSING_NO;                                   \
3500   } while (0)
3501
3502 /* Store a composition rule RULE in charbuf, and update cmp_status.

        Two slots are appended, the marker -2 followed by RULE, and
        cmp_status->length grows by two.  The state is then decremented.
        charbuf and cmp_status are the variables of those names in the
        expanding function.

        NOTE(review): the decrement presumably steps from a `rule' state
        back to the `char' state just before it in the enum -- confirm
        against the definition of the COMPOSING_* values.  */
3503
3504 #define STORE_COMPOSITION_RULE(rule)    \
3505   do {                                  \
3506     *charbuf++ = -2;                    \
3507     *charbuf++ = rule;                  \
3508     cmp_status->length += 2;            \
3509     cmp_status->state--;                \
3510   } while (0)
3511
3512 /* Store a composed char or a component char C in charbuf, and update
3513    cmp_status.

        One slot is appended and cmp_status->length grows by one.  C is
        counted in cmp_status->nchars when the state is COMPOSING_CHAR
        and in cmp_status->ncomps otherwise.  The state is incremented
        when the method is COMPOSITION_WITH_RULE, or when it is
        COMPOSITION_WITH_RULE_ALTCHARS and the state is
        COMPOSING_COMPONENT_CHAR.  charbuf and cmp_status are the
        variables of those names in the expanding function.

        NOTE(review): the increment presumably steps to the `rule' state
        that follows the current `char' state in the enum, so that the
        next item read is taken as a rule -- confirm against the
        definition of the COMPOSING_* values.  */
3514
3515 #define STORE_COMPOSITION_CHAR(c)                                       \
3516   do {                                                                  \
3517     *charbuf++ = (c);                                                   \
3518     cmp_status->length++;                                               \
3519     if (cmp_status->state == COMPOSING_CHAR)                            \
3520       cmp_status->nchars++;                                             \
3521     else                                                                \
3522       cmp_status->ncomps++;                                             \
3523     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3524         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3525             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3526       cmp_status->state++;                                              \
3527   } while (0)
3528
3529
3530 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3531
3532 static void
3533 decode_coding_iso_2022 (struct coding_system *coding)
3534 {
3535   const unsigned char *src = coding->source + coding->consumed;
3536   const unsigned char *src_end = coding->source + coding->src_bytes;
3537   const unsigned char *src_base;
3538   int *charbuf = coding->charbuf + coding->charbuf_used;
3539   /* We may produce two annotations (charset and composition) in one
3540      loop and one more charset annotation at the end.  */
3541   int *charbuf_end
3542     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3543   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3544   int multibytep = coding->src_multibyte;
3545   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3546   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3547   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3548   int charset_id_2, charset_id_3;
3549   struct charset *charset;
3550   int c;
3551   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3552   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3553   ptrdiff_t char_offset = coding->produced_char;
3554   ptrdiff_t last_offset = char_offset;
3555   int last_id = charset_ascii;
3556   int eol_dos =
3557     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3558   int byte_after_cr = -1;
3559   int i;
3560
3561   setup_iso_safe_charsets (attrs);
3562   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3563
3564   if (cmp_status->state != COMPOSING_NO)
3565     {
3566       if (charbuf_end - charbuf < cmp_status->length)
3567         abort ();
3568       for (i = 0; i < cmp_status->length; i++)
3569         *charbuf++ = cmp_status->carryover[i];
3570       coding->annotated = 1;
3571     }
3572
3573   while (1)
3574     {
3575       int c1, c2, c3;
3576
3577       src_base = src;
3578       consumed_chars_base = consumed_chars;
3579
3580       if (charbuf >= charbuf_end)
3581         {
3582           if (byte_after_cr >= 0)
3583             src_base--;
3584           break;
3585         }
3586
3587       if (byte_after_cr >= 0)
3588         c1 = byte_after_cr, byte_after_cr = -1;
3589       else
3590         ONE_MORE_BYTE (c1);
3591       if (c1 < 0)
3592         goto invalid_code;
3593
3594       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3595         {
3596           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3597           char_offset++;
3598           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3599           continue;
3600         }
3601
3602       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3603         {
3604           if (c1 == ISO_CODE_ESC)
3605             {
3606               if (src + 1 >= src_end)
3607                 goto no_more_source;
3608               *charbuf++ = ISO_CODE_ESC;
3609               char_offset++;
3610               if (src[0] == '%' && src[1] == '@')
3611                 {
3612                   src += 2;
3613                   consumed_chars += 2;
3614                   char_offset += 2;
3615                   /* We are sure charbuf can contain two more chars. */
3616                   *charbuf++ = '%';
3617                   *charbuf++ = '@';
3618                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3619                 }
3620             }
3621           else
3622             {
3623               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3624               char_offset++;
3625             }
3626           continue;
3627         }
3628
3629       if ((cmp_status->state == COMPOSING_RULE
3630            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3631           && c1 != ISO_CODE_ESC)
3632         {
3633           int rule;
3634
3635           DECODE_COMPOSITION_RULE (rule);
3636           STORE_COMPOSITION_RULE (rule);
3637           continue;
3638         }
3639
3640       /* We produce at most one character.  */
3641       switch (iso_code_class [c1])
3642         {
3643         case ISO_0x20_or_0x7F:
3644           if (charset_id_0 < 0
3645               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3646             /* This is SPACE or DEL.  */
3647             charset = CHARSET_FROM_ID (charset_ascii);
3648           else
3649             charset = CHARSET_FROM_ID (charset_id_0);
3650           break;
3651
3652         case ISO_graphic_plane_0:
3653           if (charset_id_0 < 0)
3654             charset = CHARSET_FROM_ID (charset_ascii);
3655           else
3656             charset = CHARSET_FROM_ID (charset_id_0);
3657           break;
3658
3659         case ISO_0xA0_or_0xFF:
3660           if (charset_id_1 < 0
3661               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3662               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3663             goto invalid_code;
3664           /* This is a graphic character, we fall down ... */
3665
3666         case ISO_graphic_plane_1:
3667           if (charset_id_1 < 0)
3668             goto invalid_code;
3669           charset = CHARSET_FROM_ID (charset_id_1);
3670           break;
3671
3672         case ISO_control_0:
3673           if (eol_dos && c1 == '\r')
3674             ONE_MORE_BYTE (byte_after_cr);
3675           MAYBE_FINISH_COMPOSITION ();
3676           charset = CHARSET_FROM_ID (charset_ascii);
3677           break;
3678
3679         case ISO_control_1:
3680           goto invalid_code;
3681
3682         case ISO_shift_out:
3683           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3684               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3685             goto invalid_code;
3686           CODING_ISO_INVOCATION (coding, 0) = 1;
3687           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3688           continue;
3689
3690         case ISO_shift_in:
3691           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3692             goto invalid_code;
3693           CODING_ISO_INVOCATION (coding, 0) = 0;
3694           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3695           continue;
3696
3697         case ISO_single_shift_2_7:
3698           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3699             goto invalid_code;
3700         case ISO_single_shift_2:
3701           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3702             goto invalid_code;
3703           /* SS2 is handled as an escape sequence of ESC 'N' */
3704           c1 = 'N';
3705           goto label_escape_sequence;
3706
3707         case ISO_single_shift_3:
3708           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3709             goto invalid_code;
3710           /* SS2 is handled as an escape sequence of ESC 'O' */
3711           c1 = 'O';
3712           goto label_escape_sequence;
3713
3714         case ISO_control_sequence_introducer:
3715           /* CSI is handled as an escape sequence of ESC '[' ...  */
3716           c1 = '[';
3717           goto label_escape_sequence;
3718
3719         case ISO_escape:
3720           ONE_MORE_BYTE (c1);
3721         label_escape_sequence:
3722           /* Escape sequences handled here are invocation,
3723              designation, direction specification, and character
3724              composition specification.  */
3725           switch (c1)
3726             {
3727             case '&':           /* revision of following character set */
3728               ONE_MORE_BYTE (c1);
3729               if (!(c1 >= '@' && c1 <= '~'))
3730                 goto invalid_code;
3731               ONE_MORE_BYTE (c1);
3732               if (c1 != ISO_CODE_ESC)
3733                 goto invalid_code;
3734               ONE_MORE_BYTE (c1);
3735               goto label_escape_sequence;
3736
3737             case '$':           /* designation of 2-byte character set */
3738               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3739                 goto invalid_code;
3740               {
3741                 int reg, chars96;
3742
3743                 ONE_MORE_BYTE (c1);
3744                 if (c1 >= '@' && c1 <= 'B')
3745                   {     /* designation of JISX0208.1978, GB2312.1980,
3746                            or JISX0208.1980 */
3747                     reg = 0, chars96 = 0;
3748                   }
3749                 else if (c1 >= 0x28 && c1 <= 0x2B)
3750                   { /* designation of DIMENSION2_CHARS94 character set */
3751                     reg = c1 - 0x28, chars96 = 0;
3752                     ONE_MORE_BYTE (c1);
3753                   }
3754                 else if (c1 >= 0x2C && c1 <= 0x2F)
3755                   { /* designation of DIMENSION2_CHARS96 character set */
3756                     reg = c1 - 0x2C, chars96 = 1;
3757                     ONE_MORE_BYTE (c1);
3758                   }
3759                 else
3760                   goto invalid_code;
3761                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3762                 /* We must update these variables now.  */
3763                 if (reg == 0)
3764                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3765                 else if (reg == 1)
3766                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3767                 if (chars96 < 0)
3768                   goto invalid_code;
3769               }
3770               continue;
3771
3772             case 'n':           /* invocation of locking-shift-2 */
3773               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3774                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3775                 goto invalid_code;
3776               CODING_ISO_INVOCATION (coding, 0) = 2;
3777               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3778               continue;
3779
3780             case 'o':           /* invocation of locking-shift-3 */
3781               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3782                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3783                 goto invalid_code;
3784               CODING_ISO_INVOCATION (coding, 0) = 3;
3785               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3786               continue;
3787
3788             case 'N':           /* invocation of single-shift-2 */
3789               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3790                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3791                 goto invalid_code;
3792               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3793               if (charset_id_2 < 0)
3794                 charset = CHARSET_FROM_ID (charset_ascii);
3795               else
3796                 charset = CHARSET_FROM_ID (charset_id_2);
3797               ONE_MORE_BYTE (c1);
3798               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3799                 goto invalid_code;
3800               break;
3801
3802             case 'O':           /* invocation of single-shift-3 */
3803               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3804                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3805                 goto invalid_code;
3806               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3807               if (charset_id_3 < 0)
3808                 charset = CHARSET_FROM_ID (charset_ascii);
3809               else
3810                 charset = CHARSET_FROM_ID (charset_id_3);
3811               ONE_MORE_BYTE (c1);
3812               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3813                 goto invalid_code;
3814               break;
3815
3816             case '0': case '2': case '3': case '4': /* start composition */
3817               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3818                 goto invalid_code;
3819               if (last_id != charset_ascii)
3820                 {
3821                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3822                   last_id = charset_ascii;
3823                   last_offset = char_offset;
3824                 }
3825               DECODE_COMPOSITION_START (c1);
3826               continue;
3827
3828             case '1':           /* end composition */
3829               if (cmp_status->state == COMPOSING_NO)
3830                 goto invalid_code;
3831               DECODE_COMPOSITION_END ();
3832               continue;
3833
3834             case '[':           /* specification of direction */
3835               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3836                 goto invalid_code;
3837               /* For the moment, nested direction is not supported.
3838                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3839                  left-to-right, and nonzero means right-to-left.  */
3840               ONE_MORE_BYTE (c1);
3841               switch (c1)
3842                 {
3843                 case ']':       /* end of the current direction */
3844                   coding->mode &= ~CODING_MODE_DIRECTION;
3845
3846                 case '0':       /* end of the current direction */
3847                 case '1':       /* start of left-to-right direction */
3848                   ONE_MORE_BYTE (c1);
3849                   if (c1 == ']')
3850                     coding->mode &= ~CODING_MODE_DIRECTION;
3851                   else
3852                     goto invalid_code;
3853                   break;
3854
3855                 case '2':       /* start of right-to-left direction */
3856                   ONE_MORE_BYTE (c1);
3857                   if (c1 == ']')
3858                     coding->mode |= CODING_MODE_DIRECTION;
3859                   else
3860                     goto invalid_code;
3861                   break;
3862
3863                 default:
3864                   goto invalid_code;
3865                 }
3866               continue;
3867
3868             case '%':
3869               ONE_MORE_BYTE (c1);
3870               if (c1 == '/')
3871                 {
3872                   /* CTEXT extended segment:
3873                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3874                      We keep these bytes as is for the moment.
3875                      They may be decoded by post-read-conversion.  */
3876                   int dim, M, L;
3877                   int size;
3878
3879                   ONE_MORE_BYTE (dim);
3880                   if (dim < '0' || dim > '4')
3881                     goto invalid_code;
3882                   ONE_MORE_BYTE (M);
3883                   if (M < 128)
3884                     goto invalid_code;
3885                   ONE_MORE_BYTE (L);
3886                   if (L < 128)
3887                     goto invalid_code;
3888                   size = ((M - 128) * 128) + (L - 128);
3889                   if (charbuf + 6 > charbuf_end)
3890                     goto break_loop;
3891                   *charbuf++ = ISO_CODE_ESC;
3892                   *charbuf++ = '%';
3893                   *charbuf++ = '/';
3894                   *charbuf++ = dim;
3895                   *charbuf++ = BYTE8_TO_CHAR (M);
3896                   *charbuf++ = BYTE8_TO_CHAR (L);
3897                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3898                 }
3899               else if (c1 == 'G')
3900                 {
3901                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3902                      ESC % G --UTF-8-BYTES-- ESC % @
3903                      We keep these bytes as is for the moment.
3904                      They may be decoded by post-read-conversion.  */
3905                   if (charbuf + 3 > charbuf_end)
3906                     goto break_loop;
3907                   *charbuf++ = ISO_CODE_ESC;
3908                   *charbuf++ = '%';
3909                   *charbuf++ = 'G';
3910                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3911                 }
3912               else
3913                 goto invalid_code;
3914               continue;
3915               break;
3916
3917             default:
3918               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3919                 goto invalid_code;
3920               {
3921                 int reg, chars96;
3922
3923                 if (c1 >= 0x28 && c1 <= 0x2B)
3924                   { /* designation of DIMENSION1_CHARS94 character set */
3925                     reg = c1 - 0x28, chars96 = 0;
3926                     ONE_MORE_BYTE (c1);
3927                   }
3928                 else if (c1 >= 0x2C && c1 <= 0x2F)
3929                   { /* designation of DIMENSION1_CHARS96 character set */
3930                     reg = c1 - 0x2C, chars96 = 1;
3931                     ONE_MORE_BYTE (c1);
3932                   }
3933                 else
3934                   goto invalid_code;
3935                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3936                 /* We must update these variables now.  */
3937                 if (reg == 0)
3938                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3939                 else if (reg == 1)
3940                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3941                 if (chars96 < 0)
3942                   goto invalid_code;
3943               }
3944               continue;
3945             }
3946           break;
3947
3948         default:
3949           abort ();
3950         }
3951
3952       if (cmp_status->state == COMPOSING_NO
3953           && charset->id != charset_ascii
3954           && last_id != charset->id)
3955         {
3956           if (last_id != charset_ascii)
3957             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3958           last_id = charset->id;
3959           last_offset = char_offset;
3960         }
3961
3962       /* Now we know CHARSET and 1st position code C1 of a character.
3963          Produce a decoded character while getting 2nd and 3rd
3964          position codes C2, C3 if necessary.  */
3965       if (CHARSET_DIMENSION (charset) > 1)
3966         {
3967           ONE_MORE_BYTE (c2);
3968           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3969               || ((c1 & 0x80) != (c2 & 0x80)))
3970             /* C2 is not in a valid range.  */
3971             goto invalid_code;
3972           if (CHARSET_DIMENSION (charset) == 2)
3973             c1 = (c1 << 8) | c2;
3974           else
3975             {
3976               ONE_MORE_BYTE (c3);
3977               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3978                   || ((c1 & 0x80) != (c3 & 0x80)))
3979                 /* C3 is not in a valid range.  */
3980                 goto invalid_code;
3981               c1 = (c1 << 16) | (c2 << 8) | c2;
3982             }
3983         }
3984       c1 &= 0x7F7F7F;
3985       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3986       if (c < 0)
3987         {
3988           MAYBE_FINISH_COMPOSITION ();
3989           for (; src_base < src; src_base++, char_offset++)
3990             {
3991               if (ASCII_BYTE_P (*src_base))
3992                 *charbuf++ = *src_base;
3993               else
3994                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3995             }
3996         }
3997       else if (cmp_status->state == COMPOSING_NO)
3998         {
3999           *charbuf++ = c;
4000           char_offset++;
4001         }
4002       else if ((cmp_status->state == COMPOSING_CHAR
4003                 ? cmp_status->nchars
4004                 : cmp_status->ncomps)
4005                >= MAX_COMPOSITION_COMPONENTS)
4006         {
4007           /* Too long composition.  */
4008           MAYBE_FINISH_COMPOSITION ();
4009           *charbuf++ = c;
4010           char_offset++;
4011         }
4012       else
4013         STORE_COMPOSITION_CHAR (c);
4014       continue;
4015
4016     invalid_code:
4017       MAYBE_FINISH_COMPOSITION ();
4018       src = src_base;
4019       consumed_chars = consumed_chars_base;
4020       ONE_MORE_BYTE (c);
4021       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4022       char_offset++;
4023       coding->errors++;
4024       continue;
4025
4026     break_loop:
4027       break;
4028     }
4029
4030  no_more_source:
4031   if (cmp_status->state != COMPOSING_NO)
4032     {
4033       if (coding->mode & CODING_MODE_LAST_BLOCK)
4034         MAYBE_FINISH_COMPOSITION ();
4035       else
4036         {
4037           charbuf -= cmp_status->length;
4038           for (i = 0; i < cmp_status->length; i++)
4039             cmp_status->carryover[i] = charbuf[i];
4040         }
4041     }
4042   else if (last_id != charset_ascii)
4043     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4044   coding->consumed_char += consumed_chars_base;
4045   coding->consumed = src_base - coding->source;
4046   coding->charbuf_used = charbuf - coding->charbuf;
4047 }
4048
4049
4050 /* ISO2022 encoding stuff.  */
4051
4052 /*
4053    It is not enough to say just "ISO2022" on encoding, we have to
4054    specify more details.  In Emacs, each coding system of ISO2022
4055    variant has the following specifications:
4056         1. Initial designation to G0 thru G3.
4057         2. Allows short-form designation?
4058         3. ASCII should be designated to G0 before control characters?
4059         4. ASCII should be designated to G0 at end of line?
4060         5. 7-bit environment or 8-bit environment?
4061         6. Use locking-shift?
4062         7. Use Single-shift?
4063    And the following two are only for Japanese:
4064         8. Use ASCII in place of JIS0201-1976-Roman?
4065         9. Use JISX0208-1983 in place of JISX0208-1978?
4066    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4067    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4068    details.
4069 */
4070
/* Emit, at DST, the escape sequence that designates CHARSET to graphic
   register REG (incrementing DST), and record the new designation in
   CODING.

   The sequence produced is

     [ESC '&' <'@' + revision>]  ESC  ['$']  [<intermediate>]  <final>

   The revision prefix appears only when CODING has
   CODING_ISO_FLAG_REVISION set and the revision number of CHARSET is
   not negative.  '$' is produced when the dimension of CHARSET is not
   1.  <intermediate> is the REG'th character of "()*+" for a
   94-character set, or of ",-./" for a 96-character set.  In the
   short form the intermediate byte is left out; that happens only for
   a 94-character set of dimension other than 1 that is designated to
   register 0 with a <final> of '@', 'A' or 'B', and only when CODING
   does not have CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    int desig_is_96 = CHARSET_ISO_CHARS_96 (charset) ? 1 : 0;           \
    const char *desig_inter = desig_is_96 ? ",-./" : "()*+";            \
    int desig_rev = -1;                                                 \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      desig_rev = CHARSET_ISO_REVISION (charset);                       \
    if (desig_rev >= 0)                                                 \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_rev);                                \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* The intermediate byte may be dropped only in the short      \
           form described above.  */                                    \
        if (desig_is_96                                                 \
            || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)  \
            || (reg) != 0                                               \
            || desig_final < '@' || desig_final > 'B')                  \
          EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                       \
      }                                                                 \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                           \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4118
4119
4120 /* The following two macros produce codes (control character or escape
4121    sequence) for ISO2022 single-shift functions (single-shift-2 and
4122    single-shift-3).  */
4123
/* Emit the single-shift-2 function: the single byte ISO_CODE_SS2, or
   the escape sequence ESC 'N' when CODING has
   CODING_ISO_FLAG_SEVEN_BITS set.  Then flag CODING as single-shifting
   so that the code producing the next character can tell that a
   single shift is pending.  Expects a variable named `coding' in the
   expanding scope.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4132
4133
/* Emit the single-shift-3 function: the single byte ISO_CODE_SS3, or
   the escape sequence ESC 'O' when CODING has
   CODING_ISO_FLAG_SEVEN_BITS set.  Then flag CODING as single-shifting
   so that the code producing the next character can tell that a
   single shift is pending.  Expects a variable named `coding' in the
   expanding scope.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4142
4143
/* The following four macros produce the code (control character or
   escape sequence) for an ISO2022 locking-shift function, and record
   in CODING which graphic register is now invoked to graphic plane 0:

     ENCODE_SHIFT_IN          SI        graphic register 0
     ENCODE_SHIFT_OUT         SO        graphic register 1
     ENCODE_LOCKING_SHIFT_2   ESC 'n'   graphic register 2
     ENCODE_LOCKING_SHIFT_3   ESC 'o'   graphic register 3

   The final bytes 'n' and 'o' are the ones the ISO2022 decoder of this
   file accepts for locking-shift-2 and locking-shift-3 respectively;
   the two must not be confused, or text shifted to register 3 would
   be decoded with the charset of register 2.

   All four macros expect a variable named `coding' in the expanding
   scope.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4174
4175
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, CHARSET is
   replaced by JISX0201-Roman.  C1 may be any integer expression; it
   is parenthesized wherever it is used.

   The byte is produced as follows:
     - if a single-shift is pending in CODING, C1 is emitted with the
       8th bit cleared (CODING_ISO_FLAG_SEVEN_BITS set) or set
       (otherwise), and the pending single-shift is cleared;
     - else if CHARSET is the charset invoked to graphic plane 0, C1 is
       emitted with the 8th bit cleared;
     - else if CHARSET is the charset invoked to graphic plane 1, C1 is
       emitted with the 8th bit set;
     - otherwise encode_invocation_designation is called and the tests
       above are repeated.

   The macro expects `coding', `dst' and `produced_chars' to be
   variables in the expanding scope.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4218
4219
/* Emit the two bytes encoding position-codes C1 and C2 of the
   DIMENSION2 character set CHARSET, preceded by whatever designation
   and invocation codes are still needed.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is JISX0208, CHARSET is
   replaced by JISX0208-1978.

   Each pass of the loop emits the character and leaves as soon as one
   of these holds, tested in this order: a single-shift is pending in
   CODING; CHARSET is invoked to graphic plane 0; CHARSET is invoked
   to graphic plane 1.  If none holds, encode_invocation_designation
   is called and the tests are repeated.

   The macro expects `coding', `dst' and `produced_chars' to be
   variables in the expanding scope.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* The pending single-shift applies to this character only:    \
           8th bits cleared in a 7-bit environment, set otherwise.  */  \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is invoked to neither graphic plane.  Invoke it (after  \
       designating it to some graphic register if it is not            \
       designated yet), then go round the loop again to actually       \
       produce the character.  */                                       \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4262
4263
/* Produce codes for character C of character set CHARSET.

   CODING_ENCODE_CHAR is expected to store the code of C in CHARSET
   into the local variable `code'.  When the dimension of CHARSET is
   1, `code' is handed to ENCODE_ISO_CHARACTER_DIMENSION1 as the
   single position-code.  Any other dimension is handled as two
   position-codes by ENCODE_ISO_CHARACTER_DIMENSION2: `code >> 8' is
   the first and `code & 0xFF' the second.

   The macro expects `coding', `dst' and `dst_end' to be variables in
   the expanding scope.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
4274
4275
4276 /* Produce designation and invocation codes at a place pointed by DST
4277    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4278    Return new DST.  */
4279
4280 static unsigned char *
4281 encode_invocation_designation (struct charset *charset,
4282                                struct coding_system *coding,
4283                                unsigned char *dst, ptrdiff_t *p_nchars)
4284 {
4285   int multibytep = coding->dst_multibyte;
4286   ptrdiff_t produced_chars = *p_nchars;
4287   int reg;                      /* graphic register number */
4288   int id = CHARSET_ID (charset);
4289
4290   /* At first, check designations.  */
4291   for (reg = 0; reg < 4; reg++)
4292     if (id == CODING_ISO_DESIGNATION (coding, reg))
4293       break;
4294
4295   if (reg >= 4)
4296     {
4297       /* CHARSET is not yet designated to any graphic registers.  */
4298       /* At first check the requested designation.  */
4299       reg = CODING_ISO_REQUEST (coding, id);
4300       if (reg < 0)
4301         /* Since CHARSET requests no special designation, designate it
4302            to graphic register 0.  */
4303         reg = 0;
4304
4305       ENCODE_DESIGNATION (charset, reg, coding);
4306     }
4307
4308   if (CODING_ISO_INVOCATION (coding, 0) != reg
4309       && CODING_ISO_INVOCATION (coding, 1) != reg)
4310     {
4311       /* Since the graphic register REG is not invoked to any graphic
4312          planes, invoke it to graphic plane 0.  */
4313       switch (reg)
4314         {
4315         case 0:                 /* graphic register 0 */
4316           ENCODE_SHIFT_IN;
4317           break;
4318
4319         case 1:                 /* graphic register 1 */
4320           ENCODE_SHIFT_OUT;
4321           break;
4322
4323         case 2:                 /* graphic register 2 */
4324           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4325             ENCODE_SINGLE_SHIFT_2;
4326           else
4327             ENCODE_LOCKING_SHIFT_2;
4328           break;
4329
4330         case 3:                 /* graphic register 3 */
4331           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4332             ENCODE_SINGLE_SHIFT_3;
4333           else
4334             ENCODE_LOCKING_SHIFT_3;
4335           break;
4336         }
4337     }
4338
4339   *p_nchars = produced_chars;
4340   return dst;
4341 }
4342
4343
/* Produce the invocation and designation codes that bring the graphic
   planes and registers back to their initial state: emit a shift-in
   unless graphic register 0 is the one invoked to graphic plane 0,
   then re-designate every graphic register that has an initial
   charset (a non-negative CODING_ISO_INITIAL) different from its
   current designation.  Expects a variable named `coding' in the
   expanding scope.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg = 0;                                                        \
    struct charset *initial_charset;                                    \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
                                                                        \
    while (reg < 4)                                                     \
      {                                                                 \
        if (! (CODING_ISO_INITIAL (coding, reg) < 0                     \
               || (CODING_ISO_DESIGNATION (coding, reg)                 \
                   == CODING_ISO_INITIAL (coding, reg))))               \
          {                                                             \
            initial_charset                                             \
              = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));     \
            ENCODE_DESIGNATION (initial_charset, reg, coding);          \
          }                                                             \
        reg++;                                                          \
      }                                                                 \
  } while (0)
4362
4363
4364 /* Produce designation sequences of charsets in the line started from
4365    CHARBUF to a place pointed by DST, and return the number of
4366    produced bytes.  DST should not directly point a buffer text area
4367    which may be relocated by char_charset call.
4368
4369    If the current block ends before any end-of-line, we may fail to
4370    find all the necessary designations.  */
4371
4372 static ptrdiff_t
4373 encode_designation_at_bol (struct coding_system *coding,
4374                            int *charbuf, int *charbuf_end,
4375                            unsigned char *dst)
4376 {
4377   unsigned char *orig = dst;
4378   struct charset *charset;
4379   /* Table of charsets to be designated to each graphic register.  */
4380   int r[4];
4381   int c, found = 0, reg;
4382   ptrdiff_t produced_chars = 0;
4383   int multibytep = coding->dst_multibyte;
4384   Lisp_Object attrs;
4385   Lisp_Object charset_list;
4386
4387   attrs = CODING_ID_ATTRS (coding->id);
4388   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4389   if (EQ (charset_list, Qiso_2022))
4390     charset_list = Viso_2022_charset_list;
4391
4392   for (reg = 0; reg < 4; reg++)
4393     r[reg] = -1;
4394
4395   while (charbuf < charbuf_end && found < 4)
4396     {
4397       int id;
4398
4399       c = *charbuf++;
4400       if (c == '\n')
4401         break;
4402       charset = char_charset (c, charset_list, NULL);
4403       id = CHARSET_ID (charset);
4404       reg = CODING_ISO_REQUEST (coding, id);
4405       if (reg >= 0 && r[reg] < 0)
4406         {
4407           found++;
4408           r[reg] = id;
4409         }
4410     }
4411
4412   if (found)
4413     {
4414       for (reg = 0; reg < 4; reg++)
4415         if (r[reg] >= 0
4416             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4417           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4418     }
4419
4420   return dst - orig;
4421 }
4422
4423 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4424
4425 static int
4426 encode_coding_iso_2022 (struct coding_system *coding)
4427 {
4428   int multibytep = coding->dst_multibyte;
4429   int *charbuf = coding->charbuf;
4430   int *charbuf_end = charbuf + coding->charbuf_used;
4431   unsigned char *dst = coding->destination + coding->produced;
4432   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4433   int safe_room = 16;
4434   int bol_designation
4435     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4436        && CODING_ISO_BOL (coding));
4437   ptrdiff_t produced_chars = 0;
4438   Lisp_Object attrs, eol_type, charset_list;
4439   int ascii_compatible;
4440   int c;
4441   int preferred_charset_id = -1;
4442
4443   CODING_GET_INFO (coding, attrs, charset_list);
4444   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4445   if (VECTORP (eol_type))
4446     eol_type = Qunix;
4447
4448   setup_iso_safe_charsets (attrs);
4449   /* Charset list may have been changed.  */
4450   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4451   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4452
4453   ascii_compatible
4454     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4455        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4456                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4457
4458   while (charbuf < charbuf_end)
4459     {
4460       ASSURE_DESTINATION (safe_room);
4461
4462       if (bol_designation)
4463         {
4464           /* We have to produce designation sequences if any now.  */
4465           unsigned char desig_buf[16];
4466           int nbytes;
4467           ptrdiff_t offset;
4468
4469           charset_map_loaded = 0;
4470           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4471                                               desig_buf);
4472           if (charset_map_loaded
4473               && (offset = coding_change_destination (coding)))
4474             {
4475               dst += offset;
4476               dst_end += offset;
4477             }
4478           memcpy (dst, desig_buf, nbytes);
4479           dst += nbytes;
4480           /* We are sure that designation sequences are all ASCII bytes.  */
4481           produced_chars += nbytes;
4482           bol_designation = 0;
4483           ASSURE_DESTINATION (safe_room);
4484         }
4485
4486       c = *charbuf++;
4487
4488       if (c < 0)
4489         {
4490           /* Handle an annotation.  */
4491           switch (*charbuf)
4492             {
4493             case CODING_ANNOTATE_COMPOSITION_MASK:
4494               /* Not yet implemented.  */
4495               break;
4496             case CODING_ANNOTATE_CHARSET_MASK:
4497               preferred_charset_id = charbuf[2];
4498               if (preferred_charset_id >= 0
4499                   && NILP (Fmemq (make_number (preferred_charset_id),
4500                                   charset_list)))
4501                 preferred_charset_id = -1;
4502               break;
4503             default:
4504               abort ();
4505             }
4506           charbuf += -c - 1;
4507           continue;
4508         }
4509
4510       /* Now encode the character C.  */
4511       if (c < 0x20 || c == 0x7F)
4512         {
4513           if (c == '\n'
4514               || (c == '\r' && EQ (eol_type, Qmac)))
4515             {
4516               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4517                 ENCODE_RESET_PLANE_AND_REGISTER ();
4518               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4519                 {
4520                   int i;
4521
4522                   for (i = 0; i < 4; i++)
4523                     CODING_ISO_DESIGNATION (coding, i)
4524                       = CODING_ISO_INITIAL (coding, i);
4525                 }
4526               bol_designation
4527                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4528             }
4529           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4530             ENCODE_RESET_PLANE_AND_REGISTER ();
4531           EMIT_ONE_ASCII_BYTE (c);
4532         }
4533       else if (ASCII_CHAR_P (c))
4534         {
4535           if (ascii_compatible)
4536             EMIT_ONE_ASCII_BYTE (c);
4537           else
4538             {
4539               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4540               ENCODE_ISO_CHARACTER (charset, c);
4541             }
4542         }
4543       else if (CHAR_BYTE8_P (c))
4544         {
4545           c = CHAR_TO_BYTE8 (c);
4546           EMIT_ONE_BYTE (c);
4547         }
4548       else
4549         {
4550           struct charset *charset;
4551
4552           if (preferred_charset_id >= 0)
4553             {
4554               int result;
4555
4556               charset = CHARSET_FROM_ID (preferred_charset_id);
4557               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4558               if (! result)
4559                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4560                                      NULL, charset);
4561             }
4562           else
4563             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4564                                  NULL, charset);
4565           if (!charset)
4566             {
4567               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4568                 {
4569                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4570                   charset = CHARSET_FROM_ID (charset_ascii);
4571                 }
4572               else
4573                 {
4574                   c = coding->default_char;
4575                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4576                                        charset_list, NULL, charset);
4577                 }
4578             }
4579           ENCODE_ISO_CHARACTER (charset, c);
4580         }
4581     }
4582
4583   if (coding->mode & CODING_MODE_LAST_BLOCK
4584       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4585     {
4586       ASSURE_DESTINATION (safe_room);
4587       ENCODE_RESET_PLANE_AND_REGISTER ();
4588     }
4589   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4590   CODING_ISO_BOL (coding) = bol_designation;
4591   coding->produced_char += produced_chars;
4592   coding->produced = dst - coding->destination;
4593   return 0;
4594 }
4595
4596 \f
4597 /*** 8. SJIS and BIG5 handlers ***/
4598
4599 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4600    quite widely.  So, for the moment, Emacs supports them in the bare
4601    C code.  But, in the future, they may be supported only by CCL.  */
4602
4603 /* SJIS is a coding system encoding three character sets: ASCII, right
4604    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4605    as is.  A character of charset katakana-jisx0201 is encoded by
4606    "position-code + 0x80".  A character of charset japanese-jisx0208
4607    is encoded in 2 bytes, but the two position-codes are divided and
4608    shifted so that they fit in the ranges below.
4609
4610    --- CODE RANGE of SJIS ---
4611    (character set)      (range)
4612    ASCII                0x00 .. 0x7F
4613    KATAKANA-JISX0201    0xA0 .. 0xDF
4614    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4615             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4616    -------------------------------
4617
4618 */
4619
4620 /* BIG5 is a coding system encoding two character sets: ASCII and
4621    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4622    character set and is encoded in two-byte.
4623
4624    --- CODE RANGE of BIG5 ---
4625    (character set)      (range)
4626    ASCII                0x00 .. 0x7F
4627    Big5 (1st byte)      0xA1 .. 0xFE
4628         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4629    --------------------------
4630
4631   */
4632
4633 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4634    Check if a text is encoded in SJIS.  If it is, return
4635    CATEGORY_MASK_SJIS, else return 0.  */
4636
4637 static int
4638 detect_coding_sjis (struct coding_system *coding,
4639                     struct coding_detection_info *detect_info)
4640 {
4641   const unsigned char *src = coding->source, *src_base;
4642   const unsigned char *src_end = coding->source + coding->src_bytes;
4643   int multibytep = coding->src_multibyte;
4644   ptrdiff_t consumed_chars = 0;
4645   int found = 0;
4646   int c;
4647   Lisp_Object attrs, charset_list;
4648   int max_first_byte_of_2_byte_code;
4649
4650   CODING_GET_INFO (coding, attrs, charset_list);
4651   max_first_byte_of_2_byte_code
4652     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4653
4654   detect_info->checked |= CATEGORY_MASK_SJIS;
4655   /* A coding system of this category is always ASCII compatible.  */
4656   src += coding->head_ascii;
4657
4658   while (1)
4659     {
4660       src_base = src;
4661       ONE_MORE_BYTE (c);
4662       if (c < 0x80)
4663         continue;
4664       if ((c >= 0x81 && c <= 0x9F)
4665           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4666         {
4667           ONE_MORE_BYTE (c);
4668           if (c < 0x40 || c == 0x7F || c > 0xFC)
4669             break;
4670           found = CATEGORY_MASK_SJIS;
4671         }
4672       else if (c >= 0xA0 && c < 0xE0)
4673         found = CATEGORY_MASK_SJIS;
4674       else
4675         break;
4676     }
4677   detect_info->rejected |= CATEGORY_MASK_SJIS;
4678   return 0;
4679
4680  no_more_source:
4681   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4682     {
4683       detect_info->rejected |= CATEGORY_MASK_SJIS;
4684       return 0;
4685     }
4686   detect_info->found |= found;
4687   return 1;
4688 }
4689
4690 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4691    Check if a text is encoded in BIG5.  If it is, return
4692    CATEGORY_MASK_BIG5, else return 0.  */
4693
4694 static int
4695 detect_coding_big5 (struct coding_system *coding,
4696                     struct coding_detection_info *detect_info)
4697 {
4698   const unsigned char *src = coding->source, *src_base;
4699   const unsigned char *src_end = coding->source + coding->src_bytes;
4700   int multibytep = coding->src_multibyte;
4701   ptrdiff_t consumed_chars = 0;
4702   int found = 0;
4703   int c;
4704
4705   detect_info->checked |= CATEGORY_MASK_BIG5;
4706   /* A coding system of this category is always ASCII compatible.  */
4707   src += coding->head_ascii;
4708
4709   while (1)
4710     {
4711       src_base = src;
4712       ONE_MORE_BYTE (c);
4713       if (c < 0x80)
4714         continue;
4715       if (c >= 0xA1)
4716         {
4717           ONE_MORE_BYTE (c);
4718           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4719             return 0;
4720           found = CATEGORY_MASK_BIG5;
4721         }
4722       else
4723         break;
4724     }
4725   detect_info->rejected |= CATEGORY_MASK_BIG5;
4726   return 0;
4727
4728  no_more_source:
4729   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4730     {
4731       detect_info->rejected |= CATEGORY_MASK_BIG5;
4732       return 0;
4733     }
4734   detect_info->found |= found;
4735   return 1;
4736 }
4737
4738 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4739    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4740
4741 static void
4742 decode_coding_sjis (struct coding_system *coding)
4743 {
4744   const unsigned char *src = coding->source + coding->consumed;
4745   const unsigned char *src_end = coding->source + coding->src_bytes;
4746   const unsigned char *src_base;
4747   int *charbuf = coding->charbuf + coding->charbuf_used;
4748   /* We may produce one charset annotation in one loop and one more at
4749      the end.  */
4750   int *charbuf_end
4751     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4752   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4753   int multibytep = coding->src_multibyte;
4754   struct charset *charset_roman, *charset_kanji, *charset_kana;
4755   struct charset *charset_kanji2;
4756   Lisp_Object attrs, charset_list, val;
4757   ptrdiff_t char_offset = coding->produced_char;
4758   ptrdiff_t last_offset = char_offset;
4759   int last_id = charset_ascii;
4760   int eol_dos =
4761     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4762   int byte_after_cr = -1;
4763
4764   CODING_GET_INFO (coding, attrs, charset_list);
4765
4766   val = charset_list;
4767   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4768   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4769   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4770   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4771
4772   while (1)
4773     {
4774       int c, c1;
4775       struct charset *charset;
4776
4777       src_base = src;
4778       consumed_chars_base = consumed_chars;
4779
4780       if (charbuf >= charbuf_end)
4781         {
4782           if (byte_after_cr >= 0)
4783             src_base--;
4784           break;
4785         }
4786
4787       if (byte_after_cr >= 0)
4788         c = byte_after_cr, byte_after_cr = -1;
4789       else
4790         ONE_MORE_BYTE (c);
4791       if (c < 0)
4792         goto invalid_code;
4793       if (c < 0x80)
4794         {
4795           if (eol_dos && c == '\r')
4796             ONE_MORE_BYTE (byte_after_cr);
4797           charset = charset_roman;
4798         }
4799       else if (c == 0x80 || c == 0xA0)
4800         goto invalid_code;
4801       else if (c >= 0xA1 && c <= 0xDF)
4802         {
4803           /* SJIS -> JISX0201-Kana */
4804           c &= 0x7F;
4805           charset = charset_kana;
4806         }
4807       else if (c <= 0xEF)
4808         {
4809           /* SJIS -> JISX0208 */
4810           ONE_MORE_BYTE (c1);
4811           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4812             goto invalid_code;
4813           c = (c << 8) | c1;
4814           SJIS_TO_JIS (c);
4815           charset = charset_kanji;
4816         }
4817       else if (c <= 0xFC && charset_kanji2)
4818         {
4819           /* SJIS -> JISX0213-2 */
4820           ONE_MORE_BYTE (c1);
4821           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4822             goto invalid_code;
4823           c = (c << 8) | c1;
4824           SJIS_TO_JIS2 (c);
4825           charset = charset_kanji2;
4826         }
4827       else
4828         goto invalid_code;
4829       if (charset->id != charset_ascii
4830           && last_id != charset->id)
4831         {
4832           if (last_id != charset_ascii)
4833             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4834           last_id = charset->id;
4835           last_offset = char_offset;
4836         }
4837       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4838       *charbuf++ = c;
4839       char_offset++;
4840       continue;
4841
4842     invalid_code:
4843       src = src_base;
4844       consumed_chars = consumed_chars_base;
4845       ONE_MORE_BYTE (c);
4846       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4847       char_offset++;
4848       coding->errors++;
4849     }
4850
4851  no_more_source:
4852   if (last_id != charset_ascii)
4853     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4854   coding->consumed_char += consumed_chars_base;
4855   coding->consumed = src_base - coding->source;
4856   coding->charbuf_used = charbuf - coding->charbuf;
4857 }
4858
4859 static void
4860 decode_coding_big5 (struct coding_system *coding)
4861 {
4862   const unsigned char *src = coding->source + coding->consumed;
4863   const unsigned char *src_end = coding->source + coding->src_bytes;
4864   const unsigned char *src_base;
4865   int *charbuf = coding->charbuf + coding->charbuf_used;
4866   /* We may produce one charset annotation in one loop and one more at
4867      the end.  */
4868   int *charbuf_end
4869     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4870   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4871   int multibytep = coding->src_multibyte;
4872   struct charset *charset_roman, *charset_big5;
4873   Lisp_Object attrs, charset_list, val;
4874   ptrdiff_t char_offset = coding->produced_char;
4875   ptrdiff_t last_offset = char_offset;
4876   int last_id = charset_ascii;
4877   int eol_dos =
4878     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4879   int byte_after_cr = -1;
4880
4881   CODING_GET_INFO (coding, attrs, charset_list);
4882   val = charset_list;
4883   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4884   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4885
4886   while (1)
4887     {
4888       int c, c1;
4889       struct charset *charset;
4890
4891       src_base = src;
4892       consumed_chars_base = consumed_chars;
4893
4894       if (charbuf >= charbuf_end)
4895         {
4896           if (byte_after_cr >= 0)
4897             src_base--;
4898           break;
4899         }
4900
4901       if (byte_after_cr >= 0)
4902         c = byte_after_cr, byte_after_cr = -1;
4903       else
4904         ONE_MORE_BYTE (c);
4905
4906       if (c < 0)
4907         goto invalid_code;
4908       if (c < 0x80)
4909         {
4910           if (eol_dos && c == '\r')
4911             ONE_MORE_BYTE (byte_after_cr);
4912           charset = charset_roman;
4913         }
4914       else
4915         {
4916           /* BIG5 -> Big5 */
4917           if (c < 0xA1 || c > 0xFE)
4918             goto invalid_code;
4919           ONE_MORE_BYTE (c1);
4920           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4921             goto invalid_code;
4922           c = c << 8 | c1;
4923           charset = charset_big5;
4924         }
4925       if (charset->id != charset_ascii
4926           && last_id != charset->id)
4927         {
4928           if (last_id != charset_ascii)
4929             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4930           last_id = charset->id;
4931           last_offset = char_offset;
4932         }
4933       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4934       *charbuf++ = c;
4935       char_offset++;
4936       continue;
4937
4938     invalid_code:
4939       src = src_base;
4940       consumed_chars = consumed_chars_base;
4941       ONE_MORE_BYTE (c);
4942       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4943       char_offset++;
4944       coding->errors++;
4945     }
4946
4947  no_more_source:
4948   if (last_id != charset_ascii)
4949     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4950   coding->consumed_char += consumed_chars_base;
4951   coding->consumed = src_base - coding->source;
4952   coding->charbuf_used = charbuf - coding->charbuf;
4953 }
4954
4955 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4956    This function can encode charsets `ascii', `katakana-jisx0201',
4957    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4958    are sure that all these charsets are registered as official charset
4959    (i.e. do not have extended leading-codes).  Characters of other
4960    charsets are produced without any encoding.  If SJIS_P is 1, encode
4961    SJIS text, else encode BIG5 text.  */
4962
4963 static int
4964 encode_coding_sjis (struct coding_system *coding)
4965 {
4966   int multibytep = coding->dst_multibyte;
4967   int *charbuf = coding->charbuf;
4968   int *charbuf_end = charbuf + coding->charbuf_used;
4969   unsigned char *dst = coding->destination + coding->produced;
4970   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4971   int safe_room = 4;
4972   ptrdiff_t produced_chars = 0;
4973   Lisp_Object attrs, charset_list, val;
4974   int ascii_compatible;
4975   struct charset *charset_kanji, *charset_kana;
4976   struct charset *charset_kanji2;
4977   int c;
4978
4979   CODING_GET_INFO (coding, attrs, charset_list);
4980   val = XCDR (charset_list);
4981   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4982   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4983   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4984
4985   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4986
4987   while (charbuf < charbuf_end)
4988     {
4989       ASSURE_DESTINATION (safe_room);
4990       c = *charbuf++;
4991       /* Now encode the character C.  */
4992       if (ASCII_CHAR_P (c) && ascii_compatible)
4993         EMIT_ONE_ASCII_BYTE (c);
4994       else if (CHAR_BYTE8_P (c))
4995         {
4996           c = CHAR_TO_BYTE8 (c);
4997           EMIT_ONE_BYTE (c);
4998         }
4999       else
5000         {
5001           unsigned code;
5002           struct charset *charset;
5003           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5004                                &code, charset);
5005
5006           if (!charset)
5007             {
5008               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5009                 {
5010                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5011                   charset = CHARSET_FROM_ID (charset_ascii);
5012                 }
5013               else
5014                 {
5015                   c = coding->default_char;
5016                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5017                                        charset_list, &code, charset);
5018                 }
5019             }
5020           if (code == CHARSET_INVALID_CODE (charset))
5021             abort ();
5022           if (charset == charset_kanji)
5023             {
5024               int c1, c2;
5025               JIS_TO_SJIS (code);
5026               c1 = code >> 8, c2 = code & 0xFF;
5027               EMIT_TWO_BYTES (c1, c2);
5028             }
5029           else if (charset == charset_kana)
5030             EMIT_ONE_BYTE (code | 0x80);
5031           else if (charset_kanji2 && charset == charset_kanji2)
5032             {
5033               int c1, c2;
5034
5035               c1 = code >> 8;
5036               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5037                   || c1 == 0x28
5038                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5039                 {
5040                   JIS_TO_SJIS2 (code);
5041                   c1 = code >> 8, c2 = code & 0xFF;
5042                   EMIT_TWO_BYTES (c1, c2);
5043                 }
5044               else
5045                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5046             }
5047           else
5048             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5049         }
5050     }
5051   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5052   coding->produced_char += produced_chars;
5053   coding->produced = dst - coding->destination;
5054   return 0;
5055 }
5056
5057 static int
5058 encode_coding_big5 (struct coding_system *coding)
5059 {
5060   int multibytep = coding->dst_multibyte;
5061   int *charbuf = coding->charbuf;
5062   int *charbuf_end = charbuf + coding->charbuf_used;
5063   unsigned char *dst = coding->destination + coding->produced;
5064   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5065   int safe_room = 4;
5066   ptrdiff_t produced_chars = 0;
5067   Lisp_Object attrs, charset_list, val;
5068   int ascii_compatible;
5069   struct charset *charset_big5;
5070   int c;
5071
5072   CODING_GET_INFO (coding, attrs, charset_list);
5073   val = XCDR (charset_list);
5074   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5075   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5076
5077   while (charbuf < charbuf_end)
5078     {
5079       ASSURE_DESTINATION (safe_room);
5080       c = *charbuf++;
5081       /* Now encode the character C.  */
5082       if (ASCII_CHAR_P (c) && ascii_compatible)
5083         EMIT_ONE_ASCII_BYTE (c);
5084       else if (CHAR_BYTE8_P (c))
5085         {
5086           c = CHAR_TO_BYTE8 (c);
5087           EMIT_ONE_BYTE (c);
5088         }
5089       else
5090         {
5091           unsigned code;
5092           struct charset *charset;
5093           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5094                                &code, charset);
5095
5096           if (! charset)
5097             {
5098               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5099                 {
5100                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5101                   charset = CHARSET_FROM_ID (charset_ascii);
5102                 }
5103               else
5104                 {
5105                   c = coding->default_char;
5106                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5107                                        charset_list, &code, charset);
5108                 }
5109             }
5110           if (code == CHARSET_INVALID_CODE (charset))
5111             abort ();
5112           if (charset == charset_big5)
5113             {
5114               int c1, c2;
5115
5116               c1 = code >> 8, c2 = code & 0xFF;
5117               EMIT_TWO_BYTES (c1, c2);
5118             }
5119           else
5120             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5121         }
5122     }
5123   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5124   coding->produced_char += produced_chars;
5125   coding->produced = dst - coding->destination;
5126   return 0;
5127 }
5128
5129 \f
5130 /*** 9. CCL handlers ***/
5131
5132 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5133    Check if a text is encoded in a coding system of which
5134    encoder/decoder are written in CCL program.  If it is, return
5135    CATEGORY_MASK_CCL, else return 0.  */
5136
5137 static int
5138 detect_coding_ccl (struct coding_system *coding,
5139                    struct coding_detection_info *detect_info)
5140 {
5141   const unsigned char *src = coding->source, *src_base;
5142   const unsigned char *src_end = coding->source + coding->src_bytes;
5143   int multibytep = coding->src_multibyte;
5144   ptrdiff_t consumed_chars = 0;
5145   int found = 0;
5146   unsigned char *valids;
5147   ptrdiff_t head_ascii = coding->head_ascii;
5148   Lisp_Object attrs;
5149
5150   detect_info->checked |= CATEGORY_MASK_CCL;
5151
5152   coding = &coding_categories[coding_category_ccl];
5153   valids = CODING_CCL_VALIDS (coding);
5154   attrs = CODING_ID_ATTRS (coding->id);
5155   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5156     src += head_ascii;
5157
5158   while (1)
5159     {
5160       int c;
5161
5162       src_base = src;
5163       ONE_MORE_BYTE (c);
5164       if (c < 0 || ! valids[c])
5165         break;
5166       if ((valids[c] > 1))
5167         found = CATEGORY_MASK_CCL;
5168     }
5169   detect_info->rejected |= CATEGORY_MASK_CCL;
5170   return 0;
5171
5172  no_more_source:
5173   detect_info->found |= found;
5174   return 1;
5175 }
5176
5177 static void
5178 decode_coding_ccl (struct coding_system *coding)
5179 {
5180   const unsigned char *src = coding->source + coding->consumed;
5181   const unsigned char *src_end = coding->source + coding->src_bytes;
5182   int *charbuf = coding->charbuf + coding->charbuf_used;
5183   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5184   ptrdiff_t consumed_chars = 0;
5185   int multibytep = coding->src_multibyte;
5186   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5187   int source_charbuf[1024];
5188   int source_byteidx[1025];
5189   Lisp_Object attrs, charset_list;
5190
5191   CODING_GET_INFO (coding, attrs, charset_list);
5192
5193   while (1)
5194     {
5195       const unsigned char *p = src;
5196       int i = 0;
5197
5198       if (multibytep)
5199         {
5200           while (i < 1024 && p < src_end)
5201             {
5202               source_byteidx[i] = p - src;
5203               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5204             }
5205           source_byteidx[i] = p - src;
5206         }
5207       else
5208         while (i < 1024 && p < src_end)
5209           source_charbuf[i++] = *p++;
5210
5211       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5212         ccl->last_block = 1;
5213       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5214                   charset_list);
5215       charbuf += ccl->produced;
5216       if (multibytep)
5217         src += source_byteidx[ccl->consumed];
5218       else
5219         src += ccl->consumed;
5220       consumed_chars += ccl->consumed;
5221       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5222         break;
5223     }
5224
5225   switch (ccl->status)
5226     {
5227     case CCL_STAT_SUSPEND_BY_SRC:
5228       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5229       break;
5230     case CCL_STAT_SUSPEND_BY_DST:
5231       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5232       break;
5233     case CCL_STAT_QUIT:
5234     case CCL_STAT_INVALID_CMD:
5235       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5236       break;
5237     default:
5238       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5239       break;
5240     }
5241   coding->consumed_char += consumed_chars;
5242   coding->consumed = src - coding->source;
5243   coding->charbuf_used = charbuf - coding->charbuf;
5244 }
5245
5246 static int
5247 encode_coding_ccl (struct coding_system *coding)
5248 {
5249   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5250   int multibytep = coding->dst_multibyte;
5251   int *charbuf = coding->charbuf;
5252   int *charbuf_end = charbuf + coding->charbuf_used;
5253   unsigned char *dst = coding->destination + coding->produced;
5254   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5255   int destination_charbuf[1024];
5256   ptrdiff_t produced_chars = 0;
5257   int i;
5258   Lisp_Object attrs, charset_list;
5259
5260   CODING_GET_INFO (coding, attrs, charset_list);
5261   if (coding->consumed_char == coding->src_chars
5262       && coding->mode & CODING_MODE_LAST_BLOCK)
5263     ccl->last_block = 1;
5264
5265   do
5266     {
5267       ccl_driver (ccl, charbuf, destination_charbuf,
5268                   charbuf_end - charbuf, 1024, charset_list);
5269       if (multibytep)
5270         {
5271           ASSURE_DESTINATION (ccl->produced * 2);
5272           for (i = 0; i < ccl->produced; i++)
5273             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5274         }
5275       else
5276         {
5277           ASSURE_DESTINATION (ccl->produced);
5278           for (i = 0; i < ccl->produced; i++)
5279             *dst++ = destination_charbuf[i] & 0xFF;
5280           produced_chars += ccl->produced;
5281         }
5282       charbuf += ccl->consumed;
5283       if (ccl->status == CCL_STAT_QUIT
5284           || ccl->status == CCL_STAT_INVALID_CMD)
5285         break;
5286     }
5287   while (charbuf < charbuf_end);
5288
5289   switch (ccl->status)
5290     {
5291     case CCL_STAT_SUSPEND_BY_SRC:
5292       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5293       break;
5294     case CCL_STAT_SUSPEND_BY_DST:
5295       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5296       break;
5297     case CCL_STAT_QUIT:
5298     case CCL_STAT_INVALID_CMD:
5299       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5300       break;
5301     default:
5302       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5303       break;
5304     }
5305
5306   coding->produced_char += produced_chars;
5307   coding->produced = dst - coding->destination;
5308   return 0;
5309 }
5310
5311
5312 \f
5313 /*** 10, 11. no-conversion handlers ***/
5314
5315 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5316
5317 static void
5318 decode_coding_raw_text (struct coding_system *coding)
5319 {
5320   int eol_dos =
5321     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5322
5323   coding->chars_at_source = 1;
5324   coding->consumed_char = coding->src_chars;
5325   coding->consumed = coding->src_bytes;
5326   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5327     {
5328       coding->consumed_char--;
5329       coding->consumed--;
5330       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5331     }
5332   else
5333     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5334 }
5335
/* Encode the characters held in CODING->charbuf as raw text, writing
   the result at CODING->destination + CODING->produced.

   Four cases are handled, selected by CODING->dst_multibyte and
   CODING->src_multibyte.  On return CODING->produced and
   CODING->produced_char are updated, CODING_RESULT_SUCCESS is
   recorded, and the function always returns 0.

   NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
   elsewhere and appear to use the locals of this function by name
   (dst, dst_end, charbuf, charbuf_end, multibytep, produced_chars);
   confirm against their definitions before renaming anything.  */
static int
encode_coding_raw_text (struct coding_system *coding)
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = coding->charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  ptrdiff_t produced_chars = 0;
  int c;

  if (multibytep)
    {
      /* Multibyte destination: everything goes through the EMIT_*
         macros.  */
      int safe_room = MAX_MULTIBYTE_LENGTH * 2;

      if (coding->src_multibyte)
        while (charbuf < charbuf_end)
          {
            ASSURE_DESTINATION (safe_room);
            c = *charbuf++;
            if (ASCII_CHAR_P (c))
              EMIT_ONE_ASCII_BYTE (c);
            else if (CHAR_BYTE8_P (c))
              {
                c = CHAR_TO_BYTE8 (c);
                EMIT_ONE_BYTE (c);
              }
            else
              {
                /* Any other character: build its multibyte form in
                   STR, then emit each byte of that form
                   individually.  */
                unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;

                CHAR_STRING_ADVANCE (c, p1);
                do
                  {
                    EMIT_ONE_BYTE (*p0);
                    p0++;
                  }
                while (p0 < p1);
              }
          }
      else
        /* Unibyte source: each charbuf element is emitted as one
           byte.  */
        while (charbuf < charbuf_end)
          {
            ASSURE_DESTINATION (safe_room);
            c = *charbuf++;
            EMIT_ONE_BYTE (c);
          }
    }
  else
    {
      /* Unibyte destination: bytes are stored directly through DST.  */
      if (coding->src_multibyte)
        {
          int safe_room = MAX_MULTIBYTE_LENGTH;

          while (charbuf < charbuf_end)
            {
              ASSURE_DESTINATION (safe_room);
              c = *charbuf++;
              if (ASCII_CHAR_P (c))
                *dst++ = c;
              else if (CHAR_BYTE8_P (c))
                *dst++ = CHAR_TO_BYTE8 (c);
              else
                CHAR_STRING_ADVANCE (c, dst);
            }
        }
      else
        {
          /* Unibyte to unibyte: request room for the whole buffer
             once, then copy element by element.  */
          ASSURE_DESTINATION (charbuf_end - charbuf);
          while (charbuf < charbuf_end && dst < dst_end)
            *dst++ = *charbuf++;
        }
      /* In the unibyte-destination case the count added to
         produced_char is the number of bytes written by this call.  */
      produced_chars = dst - (coding->destination + coding->produced);
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5415
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in a charset-based coding system.  If it
   is, return 1, else return 0.

   CATEGORY_MASK_CHARSET is always added to DETECT_INFO->checked.  On
   a 0 return it is also added to DETECT_INFO->rejected; on a 1
   return DETECT_INFO->found is or'ed with CATEGORY_MASK_CHARSET only
   if at least one accepted byte >= 0x80 was seen.

   The source bytes are taken from the CODING passed in, but the
   validity table and name come from the coding system currently
   registered for coding_category_charset.  */

static int
detect_coding_charset (struct coding_system *coding,
                       struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  Lisp_Object attrs, valids, name;
  int found = 0;
  ptrdiff_t head_ascii = coding->head_ascii;
  int check_latin_extra = 0;

  detect_info->checked |= CATEGORY_MASK_CHARSET;

  /* From here on CODING refers to the category's coding system, not
     to the caller's; everything needed from the caller's CODING was
     copied into locals above.  */
  coding = &coding_categories[coding_category_charset];
  attrs = CODING_ID_ATTRS (coding->id);
  valids = AREF (attrs, coding_attr_charset_valids);
  name = CODING_ID_NAME (coding->id);
  /* For coding systems named iso-8859-* or iso-latin-*, bytes in
     0x80..0x9F are additionally checked against
     Vlatin_extra_code_table below.  */
  if (strncmp (SSDATA (SYMBOL_NAME (name)),
               "iso-8859-", sizeof ("iso-8859-") - 1) == 0
      || strncmp (SSDATA (SYMBOL_NAME (name)),
                  "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
    check_latin_extra = 1;

  /* Skip the leading HEAD_ASCII bytes when the coding system is
     ASCII compatible.  */
  if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
    src += head_ascii;

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim, idx;

      src_base = src;
      /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; since this
         function has no explicit `goto no_more_source', the macro
         presumably jumps there when the source is exhausted, and
         presumably yields a negative C for a non-byte character in
         multibyte source -- confirm against the macro.  */
      ONE_MORE_BYTE (c);
      if (c < 0)
        continue;
      val = AREF (valids, c);
      /* No charset accepts C as a leading byte: reject.  */
      if (NILP (val))
        break;
      if (c >= 0x80)
        {
          if (c < 0xA0
              && check_latin_extra
              && (!VECTORP (Vlatin_extra_code_table)
                  || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
            break;
          found = CATEGORY_MASK_CHARSET;
        }
      if (INTEGERP (val))
        {
          /* VAL is a single charset ID: each following byte must lie
             within the charset's code_space range for its
             position.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          for (idx = 1; idx < dim; idx++)
            {
              if (src == src_end)
                goto too_short;
              ONE_MORE_BYTE (c);
              if (c < charset->code_space[(dim - 1 - idx) * 4]
                  || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
                break;
            }
          /* The inner loop stopped early on an out-of-range byte.  */
          if (idx < dim)
            break;
        }
      else
        {
          /* VAL is a list of charset IDs; try them in order.  IDX is
             shared across the candidates, so bytes already accepted
             for one candidate are not re-read for the next.  */
          idx = 1;
          for (; CONSP (val); val = XCDR (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              while (idx < dim)
                {
                  if (src == src_end)
                    goto too_short;
                  ONE_MORE_BYTE (c);
                  if (c < charset->code_space[(dim - 1 - idx) * 4]
                      || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
                    break;
                  idx++;
                }
              if (idx == dim)
                {
                  /* A candidate matched; setting VAL to nil marks
                     success for the CONSP test below.  */
                  val = Qnil;
                  break;
                }
            }
          if (CONSP (val))
            break;
        }
    }
  /* Reached both by `goto too_short' and by any `break' out of the
     loop above: the text is rejected for this category.  */
 too_short:
  detect_info->rejected |= CATEGORY_MASK_CHARSET;
  return 0;

 no_more_source:
  detect_info->found |= found;
  return 1;
}
5522
/* Decode the source bytes of CODING, starting at CODING->source +
   CODING->consumed, with a charset-based coding system, and append
   the resulting characters to CODING->charbuf.

   Each leading byte is looked up in the coding system's
   coding_attr_charset_valids table, which yields either one charset
   ID or a list of candidate charset IDs.  A byte sequence that cannot
   be decoded is re-read one byte at a time, stored in place of a
   decoded character, and counted in CODING->errors.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.

   NOTE(review): ONE_MORE_BYTE is defined elsewhere and presumably
   jumps to `no_more_source' when the source runs out, leaving
   src_base / consumed_chars_base at the start of the unfinished
   sequence -- confirm against the macro.  */
static void
decode_coding_charset (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
  Lisp_Object valids;
  ptrdiff_t char_offset = coding->produced_char;
  /* LAST_ID / LAST_OFFSET track the current run of characters from
     one non-ASCII charset, so that one annotation can be added per
     run.  */
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_dos =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte already read from the source right after a '\r' (DOS eol
     only), or -1 if there is none pending.  */
  int byte_after_cr = -1;

  valids = AREF (attrs, coding_attr_charset_valids);

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      int len = 1;
      unsigned code;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* Out of room.  A pending look-ahead byte has been read
             but not decoded, so step back over it; CODING->consumed
             is computed from src_base below.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        {
          /* Use the byte read ahead in the previous iteration.  */
          c = byte_after_cr;
          byte_after_cr = -1;
        }
      else
        {
          ONE_MORE_BYTE (c);
          /* With DOS eol, read the byte following a CR right away.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
        }
      if (c < 0)
        goto invalid_code;
      code = c;

      val = AREF (valids, c);
      if (! INTEGERP (val) && ! CONSP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          /* Exactly one charset: gather the remaining bytes of the
             code point, most significant byte first.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              /* LEN and CODE carry over between candidates, so only
                 the extra bytes a larger dimension needs are read.  */
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              /* A non-negative C means this charset decoded CODE.  */
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          /* The charset changed: close the previous non-ASCII run
             with an annotation and start a new run here.  */
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the offending sequence and consume
         just one byte of it.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Close the final non-ASCII run, if any.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5650
/* Encode the characters in CODING->charbuf with a charset-based
   coding system, writing the bytes at CODING->destination +
   CODING->produced.

   A character for which no charset is found is replaced by
   CODING_INHIBIT_CHARACTER_SUBSTITUTION when CODING_MODE_SAFE_ENCODING
   is set in CODING->mode, and by CODING->default_char otherwise.

   On return CODING->produced and CODING->produced_char are updated,
   CODING_RESULT_SUCCESS is recorded, and the function always
   returns 0.

   NOTE(review): ASSURE_DESTINATION, the EMIT_* macros and
   CODING_CHAR_CHARSET are defined elsewhere and appear to rely on the
   local names used here -- confirm before renaming anything.  */
static int
encode_coding_charset (struct coding_system *coding)
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  int safe_room = MAX_MULTIBYTE_LENGTH;
  ptrdiff_t produced_chars = 0;
  Lisp_Object attrs, charset_list;
  int ascii_compatible;
  int c;

  CODING_GET_INFO (coding, attrs, charset_list);
  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));

  while (charbuf < charbuf_end)
    {
      struct charset *charset;
      unsigned code;

      ASSURE_DESTINATION (safe_room);
      c = *charbuf++;
      /* ASCII is passed through unchanged only for ASCII-compatible
         coding systems; otherwise it goes through the charset lookup
         like any other character.  */
      if (ascii_compatible && ASCII_CHAR_P (c))
        EMIT_ONE_ASCII_BYTE (c);
      else if (CHAR_BYTE8_P (c))
        {
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
                               &code, charset);

          if (charset)
            {
              /* Emit CODE most significant byte first, using as many
                 bytes as the charset's dimension.  */
              if (CHARSET_DIMENSION (charset) == 1)
                EMIT_ONE_BYTE (code);
              else if (CHARSET_DIMENSION (charset) == 2)
                EMIT_TWO_BYTES (code >> 8, code & 0xFF);
              else if (CHARSET_DIMENSION (charset) == 3)
                EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
              else
                EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
                                 (code >> 8) & 0xFF, code & 0xFF);
            }
          else
            {
              /* No charset was found for C: emit a substitute.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
              else
                c = coding->default_char;
              EMIT_ONE_BYTE (c);
            }
        }
    }

  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5715
5716 \f
/*** 10. C library functions ***/
5718
/* Setup coding context CODING from information about CODING_SYSTEM.
   If CODING_SYSTEM is nil, `undecided' is assumed.  If
   CODING_SYSTEM is invalid, signal an error.

   This fills in CODING->id, mode, head_ascii, common_flags,
   max_charset_id, safe_charsets, default_char and carryover_bytes,
   then installs the detector/decoder/encoder functions and any
   type-specific state according to the coding system's type
   attribute.  When inhibit_eol_conversion is non-zero the eol type is
   treated as `unix' for the purpose of computing the flags.  */

void
setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
{
  Lisp_Object attrs;
  Lisp_Object eol_type;
  Lisp_Object coding_type;
  Lisp_Object val;

  if (NILP (coding_system))
    coding_system = Qundecided;

  CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);

  attrs = CODING_ID_ATTRS (coding->id);
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);

  coding->mode = 0;
  coding->head_ascii = -1;
  /* Base flags from the eol type: a vector eol type requests
     decoding and detection; any eol type other than `unix' requests
     both decoding and encoding.  */
  if (VECTORP (eol_type))
    coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                            | CODING_REQUIRE_DETECTION_MASK);
  else if (! EQ (eol_type, Qunix))
    coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                            | CODING_REQUIRE_ENCODING_MASK);
  else
    coding->common_flags = 0;
  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
  if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
    coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
  if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
    coding->common_flags |= CODING_FOR_UNIBYTE_MASK;

  /* CODING->safe_charsets points directly at the data of the Lisp
     string stored in the attributes.  */
  val = CODING_ATTR_SAFE_CHARSETS (attrs);
  coding->max_charset_id = SCHARS (val) - 1;
  coding->safe_charsets = SDATA (val);
  coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
  coding->carryover_bytes = 0;

  /* Install the handlers for the specific type of coding system.  */
  coding_type = CODING_ATTR_TYPE (attrs);
  if (EQ (coding_type, Qundecided))
    {
      /* `undecided' uses the raw-text handlers and asks for
         detection.  */
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qiso_2022))
    {
      int i;
      int flags = XINT (AREF (attrs, coding_attr_iso_flags));

      /* Invoke graphic register 0 to plane 0.  */
      CODING_ISO_INVOCATION (coding, 0) = 0;
      /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
      CODING_ISO_INVOCATION (coding, 1)
        = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
      /* Setup the initial status of designation.  */
      for (i = 0; i < 4; i++)
        CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
      /* Not single shifting initially.  */
      CODING_ISO_SINGLE_SHIFTING (coding) = 0;
      /* Beginning of buffer should also be regarded as bol. */
      CODING_ISO_BOL (coding) = 1;
      coding->detector = detect_coding_iso_2022;
      coding->decoder = decode_coding_iso_2022;
      coding->encoder = encode_coding_iso_2022;
      if (flags & CODING_ISO_FLAG_SAFE)
        coding->mode |= CODING_MODE_SAFE_ENCODING;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
      if (flags & CODING_ISO_FLAG_COMPOSITION)
        coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
      if (flags & CODING_ISO_FLAG_DESIGNATION)
        coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
      if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
        {
          /* Full support: refresh the safe-charsets attribute and
             reload the values cached from it above.  */
          setup_iso_safe_charsets (attrs);
          val = CODING_ATTR_SAFE_CHARSETS (attrs);
          coding->max_charset_id = SCHARS (val) - 1;
          coding->safe_charsets = SDATA (val);
        }
      CODING_ISO_FLAGS (coding) = flags;
      CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
      CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
      CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
      CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
    }
  else if (EQ (coding_type, Qcharset))
    {
      coding->detector = detect_coding_charset;
      coding->decoder = decode_coding_charset;
      coding->encoder = encode_coding_charset;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qutf_8))
    {
      /* The BOM attribute is a cons (detect), t (with BOM), or
         anything else (without BOM).  */
      val = AREF (attrs, coding_attr_utf_bom);
      CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
                                   : EQ (val, Qt) ? utf_with_bom
                                   : utf_without_bom);
      coding->detector = detect_coding_utf_8;
      coding->decoder = decode_coding_utf_8;
      coding->encoder = encode_coding_utf_8;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
        coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qutf_16))
    {
      val = AREF (attrs, coding_attr_utf_bom);
      CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
                                    : EQ (val, Qt) ? utf_with_bom
                                    : utf_without_bom);
      /* Any endian attribute other than `big' selects little
         endian.  */
      val = AREF (attrs, coding_attr_utf_16_endian);
      CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
                                       : utf_16_little_endian);
      CODING_UTF_16_SURROGATE (coding) = 0;
      coding->detector = detect_coding_utf_16;
      coding->decoder = decode_coding_utf_16;
      coding->encoder = encode_coding_utf_16;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
        coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qccl))
    {
      coding->detector = detect_coding_ccl;
      coding->decoder = decode_coding_ccl;
      coding->encoder = encode_coding_ccl;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
    }
  else if (EQ (coding_type, Qemacs_mule))
    {
      coding->detector = detect_coding_emacs_mule;
      coding->decoder = decode_coding_emacs_mule;
      coding->encoder = encode_coding_emacs_mule;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      coding->spec.emacs_mule.full_support = 1;
      if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
          && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
        {
          /* Build a fresh safe-charsets string covering every ID in
             Vemacs_mule_charset_list: all entries are first set to
             255, then the listed IDs are set to 0.  */
          Lisp_Object tail, safe_charsets;
          int max_charset_id = 0;

          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            if (max_charset_id < XFASTINT (XCAR (tail)))
              max_charset_id = XFASTINT (XCAR (tail));
          safe_charsets = make_uninit_string (max_charset_id + 1);
          memset (SDATA (safe_charsets), 255, max_charset_id + 1);
          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
          coding->max_charset_id = max_charset_id;
          coding->safe_charsets = SDATA (safe_charsets);
          coding->spec.emacs_mule.full_support = 1;
        }
      coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
      coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
    }
  else if (EQ (coding_type, Qshift_jis))
    {
      coding->detector = detect_coding_sjis;
      coding->decoder = decode_coding_sjis;
      coding->encoder = encode_coding_sjis;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qbig5))
    {
      coding->detector = detect_coding_big5;
      coding->decoder = decode_coding_big5;
      coding->encoder = encode_coding_big5;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else                          /* EQ (coding_type, Qraw_text) */
    {
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      /* Raw text needs conversion only for eol handling.  */
      if (! EQ (eol_type, Qunix))
        {
          coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
          if (! VECTORP (eol_type))
            coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
        }

    }

  return;
}
5923
5924 /* Return a list of charsets supported by CODING.  */
5925
5926 Lisp_Object
5927 coding_charset_list (struct coding_system *coding)
5928 {
5929   Lisp_Object attrs, charset_list;
5930
5931   CODING_GET_INFO (coding, attrs, charset_list);
5932   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5933     {
5934       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5935
5936       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5937         charset_list = Viso_2022_charset_list;
5938     }
5939   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5940     {
5941       charset_list = Vemacs_mule_charset_list;
5942     }
5943   return charset_list;
5944 }
5945
5946
5947 /* Return a list of charsets supported by CODING-SYSTEM.  */
5948
5949 Lisp_Object
5950 coding_system_charset_list (Lisp_Object coding_system)
5951 {
5952   ptrdiff_t id;
5953   Lisp_Object attrs, charset_list;
5954
5955   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5956   attrs = CODING_ID_ATTRS (id);
5957
5958   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5959     {
5960       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5961
5962       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5963         charset_list = Viso_2022_charset_list;
5964       else
5965         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5966     }
5967   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5968     {
5969       charset_list = Vemacs_mule_charset_list;
5970     }
5971   else
5972     {
5973       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5974     }
5975   return charset_list;
5976 }
5977
5978
5979 /* Return raw-text or one of its subsidiaries that has the same
5980    eol_type as CODING-SYSTEM.  */
5981
5982 Lisp_Object
5983 raw_text_coding_system (Lisp_Object coding_system)
5984 {
5985   Lisp_Object spec, attrs;
5986   Lisp_Object eol_type, raw_text_eol_type;
5987
5988   if (NILP (coding_system))
5989     return Qraw_text;
5990   spec = CODING_SYSTEM_SPEC (coding_system);
5991   attrs = AREF (spec, 0);
5992
5993   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5994     return coding_system;
5995
5996   eol_type = AREF (spec, 2);
5997   if (VECTORP (eol_type))
5998     return Qraw_text;
5999   spec = CODING_SYSTEM_SPEC (Qraw_text);
6000   raw_text_eol_type = AREF (spec, 2);
6001   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6002           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6003           : AREF (raw_text_eol_type, 2));
6004 }
6005
6006
6007 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6008    the subsidiary that has the same eol-spec as PARENT (if it is not
6009    nil and specifies end-of-line format) or the system's setting
6010    (system_eol_type).  */
6011
6012 Lisp_Object
6013 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6014 {
6015   Lisp_Object spec, eol_type;
6016
6017   if (NILP (coding_system))
6018     coding_system = Qraw_text;
6019   spec = CODING_SYSTEM_SPEC (coding_system);
6020   eol_type = AREF (spec, 2);
6021   if (VECTORP (eol_type))
6022     {
6023       Lisp_Object parent_eol_type;
6024
6025       if (! NILP (parent))
6026         {
6027           Lisp_Object parent_spec;
6028
6029           parent_spec = CODING_SYSTEM_SPEC (parent);
6030           parent_eol_type = AREF (parent_spec, 2);
6031           if (VECTORP (parent_eol_type))
6032             parent_eol_type = system_eol_type;
6033         }
6034       else
6035         parent_eol_type = system_eol_type;
6036       if (EQ (parent_eol_type, Qunix))
6037         coding_system = AREF (eol_type, 0);
6038       else if (EQ (parent_eol_type, Qdos))
6039         coding_system = AREF (eol_type, 1);
6040       else if (EQ (parent_eol_type, Qmac))
6041         coding_system = AREF (eol_type, 2);
6042     }
6043   return coding_system;
6044 }
6045
6046
6047 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6048    decided for writing to a process.  If not, complement them, and
6049    return a new coding system.  */
6050
6051 Lisp_Object
6052 complement_process_encoding_system (Lisp_Object coding_system)
6053 {
6054   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6055   Lisp_Object spec, attrs;
6056   int i;
6057
6058   for (i = 0; i < 3; i++)
6059     {
6060       if (i == 1)
6061         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6062       else if (i == 2)
6063         coding_system = preferred_coding_system ();
6064       spec = CODING_SYSTEM_SPEC (coding_system);
6065       if (NILP (spec))
6066         continue;
6067       attrs = AREF (spec, 0);
6068       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6069         coding_base = CODING_ATTR_BASE_NAME (attrs);
6070       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6071         eol_base = coding_system;
6072       if (! NILP (coding_base) && ! NILP (eol_base))
6073         break;
6074     }
6075
6076   if (i > 0)
6077     /* The original CODING_SYSTEM didn't specify text-conversion or
6078        eol-conversion.  Be sure that we return a fully complemented
6079        coding system.  */
6080     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6081   return coding_system;
6082 }
6083
6084
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are categorized into the following categories:
6090
6091    o coding-category-emacs-mule
6092
6093         The category for a coding system which has the same code range
6094         as Emacs' internal format.  Assigned the coding-system (Lisp
6095         symbol) `emacs-mule' by default.
6096
6097    o coding-category-sjis
6098
6099         The category for a coding system which has the same code range
6100         as SJIS.  Assigned the coding-system (Lisp
6101         symbol) `japanese-shift-jis' by default.
6102
6103    o coding-category-iso-7
6104
6105         The category for a coding system which has the same code range
6106         as ISO2022 of 7-bit environment.  This doesn't use any locking
6107         shift and single shift functions.  This can encode/decode all
6108         charsets.  Assigned the coding-system (Lisp symbol)
6109         `iso-2022-7bit' by default.
6110
6111    o coding-category-iso-7-tight
6112
6113         Same as coding-category-iso-7 except that this can
6114         encode/decode only the specified charsets.
6115
6116    o coding-category-iso-8-1
6117
6118         The category for a coding system which has the same code range
6119         as ISO2022 of 8-bit environment and graphic plane 1 used only
6120         for DIMENSION1 charset.  This doesn't use any locking shift
6121         and single shift functions.  Assigned the coding-system (Lisp
6122         symbol) `iso-latin-1' by default.
6123
6124    o coding-category-iso-8-2
6125
6126         The category for a coding system which has the same code range
6127         as ISO2022 of 8-bit environment and graphic plane 1 used only
6128         for DIMENSION2 charset.  This doesn't use any locking shift
6129         and single shift functions.  Assigned the coding-system (Lisp
6130         symbol) `japanese-iso-8bit' by default.
6131
6132    o coding-category-iso-7-else
6133
6134         The category for a coding system which has the same code range
6135         as ISO2022 of 7-bit environment but uses locking shift or
6136         single shift functions.  Assigned the coding-system (Lisp
6137         symbol) `iso-2022-7bit-lock' by default.
6138
6139    o coding-category-iso-8-else
6140
6141         The category for a coding system which has the same code range
6142         as ISO2022 of 8-bit environment but uses locking shift or
6143         single shift functions.  Assigned the coding-system (Lisp
6144         symbol) `iso-2022-8bit-ss2' by default.
6145
6146    o coding-category-big5
6147
6148         The category for a coding system which has the same code range
6149         as BIG5.  Assigned the coding-system (Lisp symbol)
6150         `cn-big5' by default.
6151
6152    o coding-category-utf-8
6153
6154         The category for a coding system which has the same code range
6155         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6156         symbol) `utf-8' by default.
6157
6158    o coding-category-utf-16-be
6159
6160         The category for a coding system in which a text has an
6161         Unicode signature (cf. Unicode Standard) in the order of BIG
6162         endian at the head.  Assigned the coding-system (Lisp symbol)
6163         `utf-16-be' by default.
6164
6165    o coding-category-utf-16-le
6166
6167         The category for a coding system in which a text has an
6168         Unicode signature (cf. Unicode Standard) in the order of
6169         LITTLE endian at the head.  Assigned the coding-system (Lisp
6170         symbol) `utf-16-le' by default.
6171
6172    o coding-category-ccl
6173
6174         The category for a coding system of which encoder/decoder is
6175         written in CCL programs.  The default value is nil, i.e., no
6176         coding system is assigned.
6177
6178    o coding-category-binary
6179
6180         The category for a coding system not categorized in any of the
6181         above.  Assigned the coding-system (Lisp symbol)
6182         `no-conversion' by default.
6183
   Each of them is a Lisp symbol whose value is the actual
   `coding-system' (also a Lisp symbol) assigned to it by the user.
   What Emacs actually does is detect the category of a coding
   system, and then use the `coding-system' assigned to that
   category.  If Emacs can't narrow the candidates down to a single
   category, it selects the category of the highest priority.
   Priorities of categories are also specified by the user in the
   Lisp variable `coding-category-list'.
6191
6192 */
6193
/* Result codes of end-of-line detection (see detect_eol): no
   end-of-line seen, or the kind of end-of-line that was seen.  */
#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     1
#define EOL_SEEN_CR     2
#define EOL_SEEN_CRLF   4
6198
6199 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6200    SOURCE is encoded.  If CATEGORY is one of
6201    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6202    two-byte, else they are encoded by one-byte.
6203
6204    Return one of EOL_SEEN_XXX.  */
6205
6206 #define MAX_EOL_CHECK_COUNT 3
6207
6208 static int
6209 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6210             enum coding_category category)
6211 {
6212   const unsigned char *src = source, *src_end = src + src_bytes;
6213   unsigned char c;
6214   int total  = 0;
6215   int eol_seen = EOL_SEEN_NONE;
6216
6217   if ((1 << category) & CATEGORY_MASK_UTF_16)
6218     {
6219       int msb, lsb;
6220
6221       msb = category == (coding_category_utf_16_le
6222                          | coding_category_utf_16_le_nosig);
6223       lsb = 1 - msb;
6224
6225       while (src + 1 < src_end)
6226         {
6227           c = src[lsb];
6228           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6229             {
6230               int this_eol;
6231
6232               if (c == '\n')
6233                 this_eol = EOL_SEEN_LF;
6234               else if (src + 3 >= src_end
6235                        || src[msb + 2] != 0
6236                        || src[lsb + 2] != '\n')
6237                 this_eol = EOL_SEEN_CR;
6238               else
6239                 {
6240                   this_eol = EOL_SEEN_CRLF;
6241                   src += 2;
6242                 }
6243
6244               if (eol_seen == EOL_SEEN_NONE)
6245                 /* This is the first end-of-line.  */
6246                 eol_seen = this_eol;
6247               else if (eol_seen != this_eol)
6248                 {
6249                   /* The found type is different from what found before.
6250                      Allow for stray ^M characters in DOS EOL files.  */
6251                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6252                       || (eol_seen == EOL_SEEN_CRLF
6253                           && this_eol == EOL_SEEN_CR))
6254                     eol_seen = EOL_SEEN_CRLF;
6255                   else
6256                     {
6257                       eol_seen = EOL_SEEN_LF;
6258                       break;
6259                     }
6260                 }
6261               if (++total == MAX_EOL_CHECK_COUNT)
6262                 break;
6263             }
6264           src += 2;
6265         }
6266     }
6267   else
6268     while (src < src_end)
6269       {
6270         c = *src++;
6271         if (c == '\n' || c == '\r')
6272           {
6273             int this_eol;
6274
6275             if (c == '\n')
6276               this_eol = EOL_SEEN_LF;
6277             else if (src >= src_end || *src != '\n')
6278               this_eol = EOL_SEEN_CR;
6279             else
6280               this_eol = EOL_SEEN_CRLF, src++;
6281
6282             if (eol_seen == EOL_SEEN_NONE)
6283               /* This is the first end-of-line.  */
6284               eol_seen = this_eol;
6285             else if (eol_seen != this_eol)
6286               {
6287                 /* The found type is different from what found before.
6288                    Allow for stray ^M characters in DOS EOL files.  */
6289                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6290                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6291                   eol_seen = EOL_SEEN_CRLF;
6292                 else
6293                   {
6294                     eol_seen = EOL_SEEN_LF;
6295                     break;
6296                   }
6297               }
6298             if (++total == MAX_EOL_CHECK_COUNT)
6299               break;
6300           }
6301       }
6302   return eol_seen;
6303 }
6304
6305
6306 static Lisp_Object
6307 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6308 {
6309   Lisp_Object eol_type;
6310
6311   eol_type = CODING_ID_EOL_TYPE (coding->id);
6312   if (eol_seen & EOL_SEEN_LF)
6313     {
6314       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6315       eol_type = Qunix;
6316     }
6317   else if (eol_seen & EOL_SEEN_CRLF)
6318     {
6319       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6320       eol_type = Qdos;
6321     }
6322   else if (eol_seen & EOL_SEEN_CR)
6323     {
6324       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6325       eol_type = Qmac;
6326     }
6327   return eol_type;
6328 }
6329
/* Detect how a text specified in CODING is encoded.  If a coding
   system is detected, update fields of CODING by the detected coding
   system.

   Three cases are handled, depending on the coding system CODING
   currently names:
     - its type is `undecided': scan the source bytes and try the
       detectors of the coding categories in priority order;
     - its category is coding_category_utf_8_auto: choose between the
       two coding systems of its coding_attr_utf_bom attribute;
     - its category is coding_category_utf_16_auto: likewise.
   In any other case only the consumed/produced counters and
   CODING->head_ascii are reset.

   CODING->mode is saved on entry and restored on exit.  */

static void
detect_coding (struct coding_system *coding)
{
  const unsigned char *src, *src_end;
  int saved_mode = coding->mode;

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding_set_source (coding);

  src_end = coding->source + coding->src_bytes;
  coding->head_ascii = 0;

  /* If we have not yet decided the text encoding type, detect it
     now.  */
  if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
    {
      int c, i;
      struct coding_detection_info detect_info;
      int null_byte_found = 0, eight_bit_found = 0;

      detect_info.checked = detect_info.found = detect_info.rejected = 0;
      /* Pre-scan.  CODING->head_ascii counts the bytes seen before
         the first byte with the eighth bit set.  The scan stops early
         once both a null byte and an eight-bit byte have been seen,
         or once the ISO-2022 detector has examined the data.  */
      for (src = coding->source; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              /* A control code.  ESC, SI and SO trigger the ISO-2022
                 detector, at most once (DETECT_INFO.checked is still
                 zero only before the first detector call).  */
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_iso_escape_detection
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conforms to
                             ISO-2022.  */
                          src = src_end;
                          coding->head_ascii = src - coding->source;
                        }
                      /* Only the ISO escape categories stay eligible.  */
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_null_byte_detection)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding->head_ascii++;
            }
          else if (! eight_bit_found)
            coding->head_ascii++;
        }

      /* Further work is needed only if the text is not plain 7-bit
         text without null bytes, or if a detector already reported a
         category.  */
      if (null_byte_found || eight_bit_found
          || coding->head_ascii < coding->src_bytes
          || detect_info.found)
        {
          enum coding_category category;
          struct coding_system *this;

          if (coding->head_ascii == coding->src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* With a null byte present, only the UTF-16
                     categories remain candidates.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              /* Try the categories in priority order; leave the loop
                 with I below coding_category_raw_text when one of
                 them is found.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;
                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      /* Already examined by an earlier detector call.  */
                      if (detect_info.found & (1 << category))
                        break;
                    }
                  else if ((*(this->detector)) (coding, &detect_info)
                           && detect_info.found & (1 << category))
                    {
                      /* NOTE(review): CATEGORY is reassigned here but
                         is not read again before the function
                         returns; THIS still refers to the utf-16-auto
                         entry.  Confirm whether THIS was meant to be
                         updated as well.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }

          if (i < coding_category_raw_text)
            /* A category was found; use the coding system assigned
               to it.  */
            setup_coding_system (CODING_ID_NAME (this->id), coding);
          else if (null_byte_found)
            setup_coding_system (Qno_conversion, coding);
          else if ((detect_info.rejected & CATEGORY_MASK_ANY)
                   == CATEGORY_MASK_ANY)
            /* Every category was rejected.  */
            setup_coding_system (Qraw_text, coding);
          else if (detect_info.rejected)
            /* Nothing was positively found; fall back to the highest
               priority category that was not rejected.  */
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  this = coding_categories + coding_priorities[i];
                  setup_coding_system (CODING_ID_NAME (this->id), coding);
                  break;
                }
        }
    }
  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
           == coding_category_utf_8_auto)
    {
      Lisp_Object coding_systems;
      struct coding_detection_info detect_info;

      /* When this attribute is a cons, its car is used if a UTF-8
         signature is found and its cdr otherwise.  */
      coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      /* NOTE(review): unlike the `undecided' branch above,
         DETECT_INFO.checked is not initialized here; confirm that
         detect_coding_utf_8 does not read it before writing it.  */
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
      if (CONSP (coding_systems)
          && detect_coding_utf_8 (coding, &detect_info))
        {
          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            setup_coding_system (XCAR (coding_systems), coding);
          else
            setup_coding_system (XCDR (coding_systems), coding);
        }
    }
  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
           == coding_category_utf_16_auto)
    {
      Lisp_Object coding_systems;
      struct coding_detection_info detect_info;

      /* When this attribute is a cons, its car is used for
         little-endian and its cdr for big-endian text.  */
      coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      /* NOTE(review): DETECT_INFO.checked is not initialized here
         either; confirm against detect_coding_utf_16.  */
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
      if (CONSP (coding_systems)
          && detect_coding_utf_16 (coding, &detect_info))
        {
          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            setup_coding_system (XCAR (coding_systems), coding);
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            setup_coding_system (XCDR (coding_systems), coding);
        }
    }
  coding->mode = saved_mode;
}
6511
6512
/* Convert the end-of-lines of the text just produced by CODING to
   LF, in place.

   The text is the CODING->produced bytes at CODING->destination when
   CODING->dst_object is nil, and otherwise the text at byte position
   CODING->dst_pos_byte as addressed by BYTE_POS_ADDR.

   Nothing is done if the EOL type of CODING is Qunix or if
   inhibit_eol_conversion is nonzero.  If the EOL type is still a
   vector (i.e. not decided yet), it is detected from the produced
   text and CODING is adjusted by adjust_coding_eol_type.

   For Qmac each CR is replaced by LF.  For Qdos CRs are deleted, and
   CODING->produced and CODING->produced_char are decreased by the
   number of deleted bytes.  */

static void
decode_eol (struct coding_system *coding)
{
  Lisp_Object eol_type;
  unsigned char *p, *pbeg, *pend;

  eol_type = CODING_ID_EOL_TYPE (coding->id);
  if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
    return;

  if (NILP (coding->dst_object))
    pbeg = coding->destination;
  else
    pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
  pend = pbeg + coding->produced;

  if (VECTORP (eol_type))
    {
      /* Not decided yet.  Collect, as a bit set, every kind of
         end-of-line occurring in the produced text.  */
      int eol_seen = EOL_SEEN_NONE;

      for (p = pbeg; p < pend; p++)
        {
          if (*p == '\n')
            eol_seen |= EOL_SEEN_LF;
          else if (*p == '\r')
            {
              if (p + 1 < pend && *(p + 1) == '\n')
                {
                  eol_seen |= EOL_SEEN_CRLF;
                  p++;
                }
              else
                eol_seen |= EOL_SEEN_CR;
            }
        }
      /* Handle DOS-style EOLs in a file with stray ^M characters.  */
      if ((eol_seen & EOL_SEEN_CRLF) != 0
          && (eol_seen & EOL_SEEN_CR) != 0
          && (eol_seen & EOL_SEEN_LF) == 0)
        eol_seen = EOL_SEEN_CRLF;
      /* Any other mixture of kinds is treated as LF.  */
      else if (eol_seen != EOL_SEEN_NONE
          && eol_seen != EOL_SEEN_LF
          && eol_seen != EOL_SEEN_CRLF
          && eol_seen != EOL_SEEN_CR)
        eol_seen = EOL_SEEN_LF;
      if (eol_seen != EOL_SEEN_NONE)
        eol_type = adjust_coding_eol_type (coding, eol_seen);
    }

  if (EQ (eol_type, Qmac))
    {
      /* Same length; simply overwrite each CR.  */
      for (p = pbeg; p < pend; p++)
        if (*p == '\r')
          *p = '\n';
    }
  else if (EQ (eol_type, Qdos))
    {
      /* Number of CR bytes deleted.  */
      ptrdiff_t n = 0;

      if (NILP (coding->dst_object))
        {
          /* Start deleting '\r' from the tail to minimize the memory
             movement.  Note that every '\r' except one in the very
             last byte is deleted here, whether or not a '\n' follows
             it, unlike in the buffer case below.  */
          for (p = pend - 2; p >= pbeg; p--)
            if (*p == '\r')
              {
                memmove (p, p + 1, pend-- - p - 1);
                n++;
              }
        }
      else
        {
          ptrdiff_t pos_byte = coding->dst_pos_byte;
          ptrdiff_t pos = coding->dst_pos;
          /* Stop one character early so that P[1] below stays within
             the produced text.  */
          ptrdiff_t pos_end = pos + coding->produced_char - 1;

          while (pos < pos_end)
            {
              p = BYTE_POS_ADDR (pos_byte);
              if (*p == '\r' && p[1] == '\n')
                {
                  /* Delete only the CR of a CR LF pair.  */
                  del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
                  n++;
                  pos_end--;
                }
              pos++;
              if (coding->dst_multibyte)
                pos_byte += BYTES_BY_CHAR_HEAD (*p);
              else
                pos_byte++;
            }
        }
      coding->produced -= n;
      coding->produced_char -= n;
    }
}
6609
6610
6611 /* Return a translation table (or list of them) from coding system
6612    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6613    decoding (ENCODEP is zero). */
6614
6615 static Lisp_Object
6616 get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
6617 {
6618   Lisp_Object standard, translation_table;
6619   Lisp_Object val;
6620
6621   if (NILP (Venable_character_translation))
6622     {
6623       if (max_lookup)
6624         *max_lookup = 0;
6625       return Qnil;
6626     }
6627   if (encodep)
6628     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6629       standard = Vstandard_translation_table_for_encode;
6630   else
6631     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6632       standard = Vstandard_translation_table_for_decode;
6633   if (NILP (translation_table))
6634     translation_table = standard;
6635   else
6636     {
6637       if (SYMBOLP (translation_table))
6638         translation_table = Fget (translation_table, Qtranslation_table);
6639       else if (CONSP (translation_table))
6640         {
6641           translation_table = Fcopy_sequence (translation_table);
6642           for (val = translation_table; CONSP (val); val = XCDR (val))
6643             if (SYMBOLP (XCAR (val)))
6644               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6645         }
6646       if (CHAR_TABLE_P (standard))
6647         {
6648           if (CONSP (translation_table))
6649             translation_table = nconc2 (translation_table,
6650                                         Fcons (standard, Qnil));
6651           else
6652             translation_table = Fcons (translation_table,
6653                                        Fcons (standard, Qnil));
6654         }
6655     }
6656
6657   if (max_lookup)
6658     {
6659       *max_lookup = 1;
6660       if (CHAR_TABLE_P (translation_table)
6661           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6662         {
6663           val = XCHAR_TABLE (translation_table)->extras[1];
6664           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6665             *max_lookup = XFASTINT (val);
6666         }
6667       else if (CONSP (translation_table))
6668         {
6669           Lisp_Object tail;
6670
6671           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6672             if (CHAR_TABLE_P (XCAR (tail))
6673                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6674               {
6675                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6676                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6677                   *max_lookup = XFASTINT (tailval);
6678               }
6679         }
6680     }
6681   return translation_table;
6682 }
6683
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and leave the result in C and TRANS.

   C and TRANS must be modifiable lvalues; both may be assigned, and
   C and TABLE are evaluated more than once.

   If a table maps C to a character, C is replaced by that character
   and TRANS is set to Qnil.  If it maps C to some other non-nil
   value, TRANS is set to that value.  With a list, the tables are
   consulted in order, each with the current (possibly already
   translated) value of C; the search stops at the first table giving
   a non-nil value that is not a character.  Elements of the list
   that are not char-tables are skipped.  If TABLE is neither a
   char-table nor a cons, TRANS is Qnil and C is unchanged.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6708
6709
6710 /* Return a translation of character(s) at BUF according to TRANS.
6711    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6712    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6713    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6714    translation is found, and Qnil if not found..
6715    If BUF is too short to lookup characters in FROM, return Qt.  */
6716
6717 static Lisp_Object
6718 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6719 {
6720
6721   if (INTEGERP (trans))
6722     return trans;
6723   for (; CONSP (trans); trans = XCDR (trans))
6724     {
6725       Lisp_Object val = XCAR (trans);
6726       Lisp_Object from = XCAR (val);
6727       ptrdiff_t len = ASIZE (from);
6728       ptrdiff_t i;
6729
6730       for (i = 0; i < len; i++)
6731         {
6732           if (buf + i == buf_end)
6733             return Qt;
6734           if (XINT (AREF (from, i)) != buf[i])
6735             break;
6736         }
6737       if (i == len)
6738         return val;
6739     }
6740   return Qnil;
6741 }
6742
6743
/* Write decoded characters to the destination of CODING, after the
   CODING->produced bytes already there, and update CODING->produced
   and CODING->produced_char.  When the destination is a buffer and
   at least one character was produced, the new text is made part of
   the buffer by insert_from_gap.

   If CODING->chars_at_source is zero, the characters are taken from
   CODING->charbuf (CODING->charbuf_used elements) and translated
   through TRANSLATION_TABLE; negative elements start annotation data
   and are skipped.  Otherwise the CODING->consumed bytes at
   CODING->source are copied, converting between multibyte and
   unibyte representation if source and destination differ in that
   respect; TRANSLATION_TABLE is not used in this case.

   LAST_BLOCK nonzero means no more source will follow, so a
   multi-character translation that would need more characters than
   are available is not waited for.

   Return the number of elements at the end of CODING->charbuf that
   were left unprocessed (always 0 in the chars_at_source case).  */

static int
produce_chars (struct coding_system *coding, Lisp_Object translation_table,
               int last_block)
{
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  ptrdiff_t produced;
  ptrdiff_t produced_chars = 0;
  int carryover = 0;

  if (! coding->chars_at_source)
    {
      /* Source characters are in coding->charbuf.  */
      int *buf = coding->charbuf;
      int *buf_end = buf + coding->charbuf_used;

      if (EQ (coding->src_object, coding->dst_object))
        {
          /* Source and destination share an object; the writable
             area ends where the already consumed source ends.  */
          coding_set_source (coding);
          dst_end = ((unsigned char *) coding->source) + coding->consumed;
        }

      while (buf < buf_end)
        {
          int c = *buf;
          ptrdiff_t i;

          if (c >= 0)
            {
              /* Number of characters consumed from BUF and produced
                 for them by the translation.  */
              ptrdiff_t from_nchars = 1, to_nchars = 1;
              Lisp_Object trans = Qnil;

              LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
              if (! NILP (trans))
                {
                  trans = get_translation (trans, buf, buf_end);
                  if (INTEGERP (trans))
                    c = XINT (trans);
                  else if (CONSP (trans))
                    {
                      /* TRANS is (FROM . TO).  */
                      from_nchars = ASIZE (XCAR (trans));
                      trans = XCDR (trans);
                      if (INTEGERP (trans))
                        c = XINT (trans);
                      else
                        {
                          to_nchars = ASIZE (trans);
                          c = XINT (AREF (trans, 0));
                        }
                    }
                  else if (EQ (trans, Qt) && ! last_block)
                    /* More characters are needed to decide; leave
                       the rest as carryover.  */
                    break;
                }

              if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
                {
                  /* Not enough room.  Refuse a request whose size
                     computation below would overflow.  */
                  if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
                       / MAX_MULTIBYTE_LENGTH)
                      < to_nchars)
                    memory_full (SIZE_MAX);
                  dst = alloc_destination (coding,
                                           buf_end - buf
                                           + MAX_MULTIBYTE_LENGTH * to_nchars,
                                           dst);
                  if (EQ (coding->src_object, coding->dst_object))
                    {
                      coding_set_source (coding);
                      dst_end = (((unsigned char *) coding->source)
                                 + coding->consumed);
                    }
                  else
                    dst_end = coding->destination + coding->dst_bytes;
                }

              for (i = 0; i < to_nchars; i++)
                {
                  /* C already holds the first character; the others
                     come from the TO vector.  */
                  if (i > 0)
                    c = XINT (AREF (trans, i));
                  if (coding->dst_multibyte
                      || ! CHAR_BYTE8_P (c))
                    CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
                  else
                    *dst++ = CHAR_TO_BYTE8 (c);
                }
              produced_chars += to_nchars;
              buf += from_nchars;
            }
          else
            /* This is an annotation datum.  (-C) is the length.  */
            buf += -c;
        }
      carryover = buf_end - buf;
    }
  else
    {
      /* Source characters are at coding->source.  */
      const unsigned char *src = coding->source;
      const unsigned char *src_end = src + coding->consumed;

      if (EQ (coding->dst_object, coding->src_object))
        dst_end = (unsigned char *) src;
      if (coding->src_multibyte != coding->dst_multibyte)
        {
          if (coding->src_multibyte)
            {
              /* Multibyte source, unibyte destination.
                 NOTE(review): ONE_MORE_BYTE is defined outside this
                 chunk; it presumably uses MULTIBYTEP, CONSUMED_CHARS
                 and SRC_BASE and jumps to no_more_source when the
                 source is exhausted -- confirm at its definition.  */
              int multibytep = 1;
              ptrdiff_t consumed_chars = 0;

              while (1)
                {
                  const unsigned char *src_base = src;
                  int c;

                  ONE_MORE_BYTE (c);
                  if (dst == dst_end)
                    {
                      if (EQ (coding->src_object, coding->dst_object))
                        dst_end = (unsigned char *) src;
                      if (dst == dst_end)
                        {
                          /* Pointers may be invalidated by the
                             allocation; keep SRC as an offset.  */
                          ptrdiff_t offset = src - coding->source;

                          dst = alloc_destination (coding, src_end - src + 1,
                                                   dst);
                          dst_end = coding->destination + coding->dst_bytes;
                          coding_set_source (coding);
                          src = coding->source + offset;
                          src_end = coding->source + coding->consumed;
                          if (EQ (coding->src_object, coding->dst_object))
                            dst_end = (unsigned char *) src;
                        }
                    }
                  *dst++ = c;
                  produced_chars++;
                }
            no_more_source:
              ;
            }
          else
            /* Unibyte source, multibyte destination.  */
            while (src < src_end)
              {
                int multibytep = 1;
                int c = *src++;

                /* NOTE(review): room for two bytes is ensured before
                   EMIT_ONE_BYTE (defined outside this chunk),
                   presumably because it may write a two-byte
                   sequence -- confirm at its definition.  */
                if (dst >= dst_end - 1)
                  {
                    if (EQ (coding->src_object, coding->dst_object))
                      dst_end = (unsigned char *) src;
                    if (dst >= dst_end - 1)
                      {
                        ptrdiff_t offset = src - coding->source;
                        ptrdiff_t more_bytes;

                        if (EQ (coding->src_object, coding->dst_object))
                          more_bytes = ((src_end - src) / 2) + 2;
                        else
                          more_bytes = src_end - src + 2;
                        dst = alloc_destination (coding, more_bytes, dst);
                        dst_end = coding->destination + coding->dst_bytes;
                        coding_set_source (coding);
                        src = coding->source + offset;
                        src_end = coding->source + coding->consumed;
                        if (EQ (coding->src_object, coding->dst_object))
                          dst_end = (unsigned char *) src;
                      }
                  }
                EMIT_ONE_BYTE (c);
              }
        }
      else
        {
          /* Same representation on both sides; a plain byte copy.  */
          if (!EQ (coding->src_object, coding->dst_object))
            {
              ptrdiff_t require = coding->src_bytes - coding->dst_bytes;

              if (require > 0)
                {
                  ptrdiff_t offset = src - coding->source;

                  dst = alloc_destination (coding, require, dst);
                  coding_set_source (coding);
                  src = coding->source + offset;
                  src_end = coding->source + coding->consumed;
                }
            }
          produced_chars = coding->consumed_char;
          while (src < src_end)
            *dst++ = *src++;
        }
    }

  /* Number of bytes written by this call.  */
  produced = dst - (coding->destination + coding->produced);
  if (BUFFERP (coding->dst_object) && produced_chars > 0)
    insert_from_gap (produced_chars, produced);
  coding->produced += produced;
  coding->produced_char += produced_chars;
  return carryover;
}
6942
6943 /* Compose text in CODING->object according to the annotation data at
6944    CHARBUF.  CHARBUF is an array:
6945      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6946  */
6947
6948 static inline void
6949 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6950 {
6951   int len;
6952   ptrdiff_t to;
6953   enum composition_method method;
6954   Lisp_Object components;
6955
6956   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6957   to = pos + charbuf[2];
6958   method = (enum composition_method) (charbuf[4]);
6959
6960   if (method == COMPOSITION_RELATIVE)
6961     components = Qnil;
6962   else
6963     {
6964       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6965       int i, j;
6966
6967       if (method == COMPOSITION_WITH_RULE)
6968         len = charbuf[2] * 3 - 2;
6969       charbuf += MAX_ANNOTATION_LENGTH;
6970       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6971       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6972         {
6973           if (charbuf[i] >= 0)
6974             args[j] = make_number (charbuf[i]);
6975           else
6976             {
6977               i++;
6978               args[j] = make_number (charbuf[i] % 0x100);
6979             }
6980         }
6981       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6982     }
6983   compose_text (pos, to, components, Qnil, coding->dst_object);
6984 }
6985
6986
6987 /* Put `charset' property on text in CODING->object according to
6988    the annotation data at CHARBUF.  CHARBUF is an array:
6989      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6990  */
6991
6992 static inline void
6993 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6994 {
6995   ptrdiff_t from = pos - charbuf[2];
6996   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6997
6998   Fput_text_property (make_number (from), make_number (pos),
6999                       Qcharset, CHARSET_NAME (charset),
7000                       coding->dst_object);
7001 }
7002
7003
/* Number of elements initially tried for CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate a work area for CODING (a pointer to struct
   coding_system) and set CODING->charbuf and CODING->charbuf_size.

   The area is obtained with alloca, so it is only valid within the
   function in which this macro is expanded.  Sizes from CHARBUF_SIZE
   downwards are tried, halving while the size is above 1024
   elements.

   CAUTION: if no area could be obtained, this macro records
   CODING_RESULT_INSUFFICIENT_MEM and executes
   `return coding->result;' in the enclosing function, so it can only
   be used in functions returning a compatible type.  The argument is
   evaluated several times and is not parenthesized in the
   expansion.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    coding->charbuf = NULL;                                             \
    while (size > 1024)                                                 \
      {                                                                 \
        coding->charbuf = (int *) alloca (sizeof (int) * size);         \
        if (coding->charbuf)                                            \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result;                                          \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
7025
7026
7027 static void
7028 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7029 {
7030   int *charbuf = coding->charbuf;
7031   int *charbuf_end = charbuf + coding->charbuf_used;
7032
7033   if (NILP (coding->dst_object))
7034     return;
7035
7036   while (charbuf < charbuf_end)
7037     {
7038       if (*charbuf >= 0)
7039         pos++, charbuf++;
7040       else
7041         {
7042           int len = -*charbuf;
7043
7044           if (len > 2)
7045             switch (charbuf[1])
7046               {
7047               case CODING_ANNOTATE_COMPOSITION_MASK:
7048                 produce_composition (coding, charbuf, pos);
7049                 break;
7050               case CODING_ANNOTATE_CHARSET_MASK:
7051                 produce_charset (coding, charbuf, pos);
7052                 break;
7053               }
7054           charbuf += len;
7055         }
7056     }
7057 }
7058
7059 /* Decode the data at CODING->src_object into CODING->dst_object.
7060    CODING->src_object is a buffer, a string, or nil.
7061    CODING->dst_object is a buffer.
7062
7063    If CODING->src_object is a buffer, it must be the current buffer.
7064    In this case, if CODING->src_pos is positive, it is a position of
7065    the source text in the buffer, otherwise, the source text is in the
7066    gap area of the buffer, and CODING->src_pos specifies the offset of
7067    the text from GPT (which must be the same as PT).  If this is the
7068    same buffer as CODING->dst_object, CODING->src_pos must be
7069    negative.
7070
7071    If CODING->src_object is a string, CODING->src_pos is an index to
7072    that string.
7073
7074    If CODING->src_object is nil, CODING->source must already point to
7075    the non-relocatable memory area.  In this case, CODING->src_pos is
7076    an offset from CODING->source.
7077
7078    The decoded data is inserted at the current point of the buffer
7079    CODING->dst_object.
7080 */
7081
7082 static int
7083 decode_coding (struct coding_system *coding)
7084 {
7085   Lisp_Object attrs;
7086   Lisp_Object undo_list;
7087   Lisp_Object translation_table;
7088   struct ccl_spec cclspec;
7089   int carryover;
7090   int i;
7091
7092   if (BUFFERP (coding->src_object)
7093       && coding->src_pos > 0
7094       && coding->src_pos < GPT
7095       && coding->src_pos + coding->src_chars > GPT)
7096     move_gap_both (coding->src_pos, coding->src_pos_byte);
7097
7098   undo_list = Qt;
7099   if (BUFFERP (coding->dst_object))
7100     {
7101       if (current_buffer != XBUFFER (coding->dst_object))
7102         set_buffer_internal (XBUFFER (coding->dst_object));
7103       if (GPT != PT)
7104         move_gap_both (PT, PT_BYTE);
7105       undo_list = BVAR (current_buffer, undo_list);
7106       BVAR (current_buffer, undo_list) = Qt;
7107     }
7108
7109   coding->consumed = coding->consumed_char = 0;
7110   coding->produced = coding->produced_char = 0;
7111   coding->chars_at_source = 0;
7112   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7113   coding->errors = 0;
7114
7115   ALLOC_CONVERSION_WORK_AREA (coding);
7116
7117   attrs = CODING_ID_ATTRS (coding->id);
7118   translation_table = get_translation_table (attrs, 0, NULL);
7119
7120   carryover = 0;
7121   if (coding->decoder == decode_coding_ccl)
7122     {
7123       coding->spec.ccl = &cclspec;
7124       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7125     }
7126   do
7127     {
7128       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7129
7130       coding_set_source (coding);
7131       coding->annotated = 0;
7132       coding->charbuf_used = carryover;
7133       (*(coding->decoder)) (coding);
7134       coding_set_destination (coding);
7135       carryover = produce_chars (coding, translation_table, 0);
7136       if (coding->annotated)
7137         produce_annotation (coding, pos);
7138       for (i = 0; i < carryover; i++)
7139         coding->charbuf[i]
7140           = coding->charbuf[coding->charbuf_used - carryover + i];
7141     }
7142   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7143          || (coding->consumed < coding->src_bytes
7144              && (coding->result == CODING_RESULT_SUCCESS
7145                  || coding->result == CODING_RESULT_INVALID_SRC)));
7146
7147   if (carryover > 0)
7148     {
7149       coding_set_destination (coding);
7150       coding->charbuf_used = carryover;
7151       produce_chars (coding, translation_table, 1);
7152     }
7153
7154   coding->carryover_bytes = 0;
7155   if (coding->consumed < coding->src_bytes)
7156     {
7157       int nbytes = coding->src_bytes - coding->consumed;
7158       const unsigned char *src;
7159
7160       coding_set_source (coding);
7161       coding_set_destination (coding);
7162       src = coding->source + coding->consumed;
7163
7164       if (coding->mode & CODING_MODE_LAST_BLOCK)
7165         {
7166           /* Flush out unprocessed data as binary chars.  We are sure
7167              that the number of data is less than the size of
7168              coding->charbuf.  */
7169           coding->charbuf_used = 0;
7170           coding->chars_at_source = 0;
7171
7172           while (nbytes-- > 0)
7173             {
7174               int c = *src++;
7175
7176               if (c & 0x80)
7177                 c = BYTE8_TO_CHAR (c);
7178               coding->charbuf[coding->charbuf_used++] = c;
7179             }
7180           produce_chars (coding, Qnil, 1);
7181         }
7182       else
7183         {
7184           /* Record unprocessed bytes in coding->carryover.  We are
7185              sure that the number of data is less than the size of
7186              coding->carryover.  */
7187           unsigned char *p = coding->carryover;
7188
7189           if (nbytes > sizeof coding->carryover)
7190             nbytes = sizeof coding->carryover;
7191           coding->carryover_bytes = nbytes;
7192           while (nbytes-- > 0)
7193             *p++ = *src++;
7194         }
7195       coding->consumed = coding->src_bytes;
7196     }
7197
7198   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7199       && !inhibit_eol_conversion)
7200     decode_eol (coding);
7201   if (BUFFERP (coding->dst_object))
7202     {
7203       BVAR (current_buffer, undo_list) = undo_list;
7204       record_insert (coding->dst_pos, coding->produced_char);
7205     }
7206   return coding->result;
7207 }
7208
7209
7210 /* Extract an annotation datum from a composition starting at POS and
7211    ending before LIMIT of CODING->src_object (buffer or string), store
7212    the data in BUF, set *STOP to a starting position of the next
7213    composition (if any) or to LIMIT, and return the address of the
7214    next element of BUF.
7215
7216    If such an annotation is not found, set *STOP to a starting
7217    position of a composition after POS (if any) or to LIMIT, and
7218    return BUF.  */
7219
7220 static inline int *
7221 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7222                                struct coding_system *coding, int *buf,
7223                                ptrdiff_t *stop)
7224 {
7225   ptrdiff_t start, end;
7226   Lisp_Object prop;
7227
7228   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7229       || end > limit)
7230     *stop = limit;
7231   else if (start > pos)
7232     *stop = start;
7233   else
7234     {
7235       if (start == pos)
7236         {
7237           /* We found a composition.  Store the corresponding
7238              annotation data in BUF.  */
7239           int *head = buf;
7240           enum composition_method method = COMPOSITION_METHOD (prop);
7241           int nchars = COMPOSITION_LENGTH (prop);
7242
7243           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7244           if (method != COMPOSITION_RELATIVE)
7245             {
7246               Lisp_Object components;
7247               ptrdiff_t i, len, i_byte;
7248
7249               components = COMPOSITION_COMPONENTS (prop);
7250               if (VECTORP (components))
7251                 {
7252                   len = ASIZE (components);
7253                   for (i = 0; i < len; i++)
7254                     *buf++ = XINT (AREF (components, i));
7255                 }
7256               else if (STRINGP (components))
7257                 {
7258                   len = SCHARS (components);
7259                   i = i_byte = 0;
7260                   while (i < len)
7261                     {
7262                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7263                       buf++;
7264                     }
7265                 }
7266               else if (INTEGERP (components))
7267                 {
7268                   len = 1;
7269                   *buf++ = XINT (components);
7270                 }
7271               else if (CONSP (components))
7272                 {
7273                   for (len = 0; CONSP (components);
7274                        len++, components = XCDR (components))
7275                     *buf++ = XINT (XCAR (components));
7276                 }
7277               else
7278                 abort ();
7279               *head -= len;
7280             }
7281         }
7282
7283       if (find_composition (end, limit, &start, &end, &prop,
7284                             coding->src_object)
7285           && end <= limit)
7286         *stop = start;
7287       else
7288         *stop = limit;
7289     }
7290   return buf;
7291 }
7292
7293
7294 /* Extract an annotation datum from a text property `charset' at POS of
7295    CODING->src_object (buffer of string), store the data in BUF, set
7296    *STOP to the position where the value of `charset' property changes
7297    (limiting by LIMIT), and return the address of the next element of
7298    BUF.
7299
7300    If the property value is nil, set *STOP to the position where the
7301    property value is non-nil (limiting by LIMIT), and return BUF.  */
7302
7303 static inline int *
7304 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7305                            struct coding_system *coding, int *buf,
7306                            ptrdiff_t *stop)
7307 {
7308   Lisp_Object val, next;
7309   int id;
7310
7311   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7312   if (! NILP (val) && CHARSETP (val))
7313     id = XINT (CHARSET_SYMBOL_ID (val));
7314   else
7315     id = -1;
7316   ADD_CHARSET_DATA (buf, 0, id);
7317   next = Fnext_single_property_change (make_number (pos), Qcharset,
7318                                        coding->src_object,
7319                                        make_number (limit));
7320   *stop = XINT (next);
7321   return buf;
7322 }
7323
7324
/* Fill CODING->charbuf with characters read from CODING->source,
   starting at byte offset CODING->consumed, so that an encoder can
   process them.

   While reading, '\n' is converted according to the eol type of the
   coding system, characters are mapped through TRANSLATION_TABLE
   (unless it is nil), and annotation records are inserted at the
   positions where the annotation handlers ask to stop.  MAX_LOOKUP is
   the number of characters that may be collected for one lookup in
   TRANSLATION_TABLE; it sizes the lookup buffer.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used describe how far reading got and how much of
   the work buffer is filled; CODING->chars_at_source is reset to 0.  */
7325 static void
7326 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7327                int max_lookup)
7328 {
7329   int *buf = coding->charbuf;
7330   int *buf_end = coding->charbuf + coding->charbuf_size;
7331   const unsigned char *src = coding->source + coding->consumed;
7332   const unsigned char *src_end = coding->source + coding->src_bytes;
7333   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7334   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7335   int multibytep = coding->src_multibyte;
7336   Lisp_Object eol_type;
7337   int c;
7338   ptrdiff_t stop, stop_composition, stop_charset;
7339   int *lookup_buf = NULL;
7340
       /* The lookup buffer is only needed when a translation table is
          in use; it lives on the stack of this call.  */
7341   if (! NILP (translation_table))
7342     lookup_buf = alloca (sizeof (int) * max_lookup);
7343
7344   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
       /* A vector-valued eol type gets no eol conversion here.
          NOTE(review): presumably a vector means the eol format is
          still undecided -- confirm.  */
7345   if (VECTORP (eol_type))
7346     eol_type = Qunix;
7347
7348   /* Note: composition handling is not yet implemented.  */
7349   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7350
       /* STOP is the next position at which an annotation handler has
          to be consulted.  With the composition flag cleared just
          above, stop_composition always ends up as end_pos.  */
7351   if (NILP (coding->src_object))
7352     stop = stop_composition = stop_charset = end_pos;
7353   else
7354     {
7355       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7356         stop = stop_composition = pos;
7357       else
7358         stop = stop_composition = end_pos;
7359       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7360         stop = stop_charset = pos;
7361       else
7362         stop_charset = end_pos;
7363     }
7364
7365   /* Compensate for CRLF and conversion.  */
       /* That is, keep room in reserve so that a single iteration may
          store one extra '\r' plus an annotation record.  */
7366   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7367   while (buf < buf_end)
7368     {
7369       Lisp_Object trans;
7370
7371       if (pos == stop)
7372         {
7373           if (pos == end_pos)
7374             break;
7375           if (pos == stop_composition)
7376             buf = handle_composition_annotation (pos, end_pos, coding,
7377                                                  buf, &stop_composition);
7378           if (pos == stop_charset)
7379             buf = handle_charset_annotation (pos, end_pos, coding,
7380                                              buf, &stop_charset);
               /* The next stop is the nearer of the two.  */
7381           stop = (stop_composition < stop_charset
7382                   ? stop_composition : stop_charset);
7383         }
7384
           /* Fetch the next character C and advance SRC and POS.  For
              a unibyte source, the raw-text and CCL encoders get each
              byte as is; otherwise a valid multibyte sequence is read
              as one character (POS then advances by its byte length),
              and any other byte goes through BYTE8_TO_CHAR.  */
7385       if (! multibytep)
7386         {
7387           int bytes;
7388
7389           if (coding->encoder == encode_coding_raw_text
7390               || coding->encoder == encode_coding_ccl)
7391             c = *src++, pos++;
7392           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7393             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7394           else
7395             c = BYTE8_TO_CHAR (*src), src++, pos++;
7396         }
7397       else
7398         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7399       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7400         c = '\n';
           /* Eol conversion: for the dos type a '\r' is stored in front
              of the '\n'; for any other non-unix type the '\n' itself
              becomes '\r'.  */
7401       if (! EQ (eol_type, Qunix))
7402         {
7403           if (c == '\n')
7404             {
7405               if (EQ (eol_type, Qdos))
7406                 *buf++ = '\r';
7407               else
7408                 c = '\r';
7409             }
7410         }
7411
7412       trans = Qnil;
7413       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7414       if (NILP (trans))
7415         *buf++ = c;
7416       else
7417         {
7418           ptrdiff_t from_nchars = 1, to_nchars = 1;
7419           int *lookup_buf_end;
7420           const unsigned char *p = src;
7421           int i;
7422
               /* Collect C and up to max_lookup - 1 following
                  characters (read through P, so SRC itself does not
                  move) as the key handed to get_translation.  */
7423           lookup_buf[0] = c;
7424           for (i = 1; i < max_lookup && p < src_end; i++)
7425             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7426           lookup_buf_end = lookup_buf + i;
7427           trans = get_translation (trans, lookup_buf, lookup_buf_end);
               /* An integer result replaces C alone.  A cons result has
                  in its car an object whose ASIZE is the number of
                  source characters replaced, and in its cdr either one
                  integer or a vector of replacement characters.  Any
                  other result, or a replacement vector that does not
                  fit in the buffer, ends this call; note that SRC and
                  POS have already moved past C at that point.  */
7428           if (INTEGERP (trans))
7429             c = XINT (trans);
7430           else if (CONSP (trans))
7431             {
7432               from_nchars = ASIZE (XCAR (trans));
7433               trans = XCDR (trans);
7434               if (INTEGERP (trans))
7435                 c = XINT (trans);
7436               else
7437                 {
7438                   to_nchars = ASIZE (trans);
7439                   if (buf_end - buf < to_nchars)
7440                     break;
7441                   c = XINT (AREF (trans, 0));
7442                 }
7443             }
7444           else
7445             break;
7446           *buf++ = c;
7447           for (i = 1; i < to_nchars; i++)
7448             *buf++ = XINT (AREF (trans, i));
               /* Step over the other from_nchars - 1 source characters
                  that the translation covered.  */
7449           for (i = 1; i < from_nchars; i++, pos++)
7450             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7451         }
7452     }
7453
7454   coding->consumed = src - coding->source;
7455   coding->consumed_char = pos - coding->src_pos;
7456   coding->charbuf_used = buf - coding->charbuf;
7457   coding->chars_at_source = 0;
7458 }
7459
7460
7461 /* Encode the text at CODING->src_object into CODING->dst_object.
7462    CODING->src_object is a buffer or a string.
7463    CODING->dst_object is a buffer or nil.
7464
7465    If CODING->src_object is a buffer, it must be the current buffer.
7466    In this case, if CODING->src_pos is positive, it is a position of
7467    the source text in the buffer, otherwise. the source text is in the
7468    gap area of the buffer, and coding->src_pos specifies the offset of
7469    the text from GPT (which must be the same as PT).  If this is the
7470    same buffer as CODING->dst_object, CODING->src_pos must be
7471    negative and CODING should not have `pre-write-conversion'.
7472
7473    If CODING->src_object is a string, CODING should not have
7474    `pre-write-conversion'.
7475
7476    If CODING->dst_object is a buffer, the encoded data is inserted at
7477    the current point of that buffer.
7478
7479    If CODING->dst_object is nil, the encoded data is placed at the
7480    memory area specified by CODING->destination.  */
7481
7482 static int
7483 encode_coding (struct coding_system *coding)
7484 {
7485   Lisp_Object attrs;
7486   Lisp_Object translation_table;
7487   int max_lookup;
7488   struct ccl_spec cclspec;
7489
7490   attrs = CODING_ID_ATTRS (coding->id);
7491   if (coding->encoder == encode_coding_raw_text)
7492     translation_table = Qnil, max_lookup = 0;
7493   else
7494     translation_table = get_translation_table (attrs, 1, &max_lookup);
7495
7496   if (BUFFERP (coding->dst_object))
7497     {
7498       set_buffer_internal (XBUFFER (coding->dst_object));
7499       coding->dst_multibyte
7500         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7501     }
7502
7503   coding->consumed = coding->consumed_char = 0;
7504   coding->produced = coding->produced_char = 0;
7505   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7506   coding->errors = 0;
7507
7508   ALLOC_CONVERSION_WORK_AREA (coding);
7509
7510   if (coding->encoder == encode_coding_ccl)
7511     {
7512       coding->spec.ccl = &cclspec;
7513       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7514     }
7515   do {
7516     coding_set_source (coding);
7517     consume_chars (coding, translation_table, max_lookup);
7518     coding_set_destination (coding);
7519     (*(coding->encoder)) (coding);
7520   } while (coding->consumed_char < coding->src_chars);
7521
7522   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7523     insert_from_gap (coding->produced_char, coding->produced);
7524
7525   return (coding->result);
7526 }
7527
7528
7529 /* Name (or base name) of work buffer for code conversion.  */
7530 static Lisp_Object Vcode_conversion_workbuf_name;
7531
7532 /* A working buffer used by the top level conversion.  Once it is
7533    created, it is never destroyed.  It has the name
7534    Vcode_conversion_workbuf_name.  The other working buffers are
7535    destroyed after the use is finished, and their names are modified
7536    versions of Vcode_conversion_workbuf_name.  */
7537 static Lisp_Object Vcode_conversion_reused_workbuf;
7538
7539 /* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
        make_conversion_work_buffer post-increments it on every request,
        so it can exceed 1; code_conversion_restore resets it to 0 when
        the reused buffer is released.  */
7540 static int reused_workbuf_in_use;
7541
7542
7543 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7544    multibyteness of returning buffer.  */
7545
7546 static Lisp_Object
7547 make_conversion_work_buffer (int multibyte)
7548 {
7549   Lisp_Object name, workbuf;
7550   struct buffer *current;
7551
7552   if (reused_workbuf_in_use++)
7553     {
7554       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7555       workbuf = Fget_buffer_create (name);
7556     }
7557   else
7558     {
7559       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7560         Vcode_conversion_reused_workbuf
7561           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7562       workbuf = Vcode_conversion_reused_workbuf;
7563     }
7564   current = current_buffer;
7565   set_buffer_internal (XBUFFER (workbuf));
7566   /* We can't allow modification hooks to run in the work buffer.  For
7567      instance, directory_files_internal assumes that file decoding
7568      doesn't compile new regexps.  */
7569   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7570   Ferase_buffer ();
7571   BVAR (current_buffer, undo_list) = Qt;
7572   BVAR (current_buffer, enable_multibyte_characters) = multibyte ? Qt : Qnil;
7573   set_buffer_internal (current);
7574   return workbuf;
7575 }
7576
7577
7578 static Lisp_Object
7579 code_conversion_restore (Lisp_Object arg)
7580 {
7581   Lisp_Object current, workbuf;
7582   struct gcpro gcpro1;
7583
7584   GCPRO1 (arg);
7585   current = XCAR (arg);
7586   workbuf = XCDR (arg);
7587   if (! NILP (workbuf))
7588     {
7589       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7590         reused_workbuf_in_use = 0;
7591       else if (! NILP (Fbuffer_live_p (workbuf)))
7592         Fkill_buffer (workbuf);
7593     }
7594   set_buffer_internal (XBUFFER (current));
7595   UNGCPRO;
7596   return Qnil;
7597 }
7598
7599 Lisp_Object
7600 code_conversion_save (int with_work_buf, int multibyte)
7601 {
7602   Lisp_Object workbuf = Qnil;
7603
7604   if (with_work_buf)
7605     workbuf = make_conversion_work_buffer (multibyte);
7606   record_unwind_protect (code_conversion_restore,
7607                          Fcons (Fcurrent_buffer (), workbuf));
7608   return workbuf;
7609 }
7610
7611 int
7612 decode_coding_gap (struct coding_system *coding,
7613                    ptrdiff_t chars, ptrdiff_t bytes)
7614 {
7615   ptrdiff_t count = SPECPDL_INDEX ();
7616   Lisp_Object attrs;
7617
7618   code_conversion_save (0, 0);
7619
7620   coding->src_object = Fcurrent_buffer ();
7621   coding->src_chars = chars;
7622   coding->src_bytes = bytes;
7623   coding->src_pos = -chars;
7624   coding->src_pos_byte = -bytes;
7625   coding->src_multibyte = chars < bytes;
7626   coding->dst_object = coding->src_object;
7627   coding->dst_pos = PT;
7628   coding->dst_pos_byte = PT_BYTE;
7629   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7630
7631   if (CODING_REQUIRE_DETECTION (coding))
7632     detect_coding (coding);
7633
7634   coding->mode |= CODING_MODE_LAST_BLOCK;
7635   current_buffer->text->inhibit_shrinking = 1;
7636   decode_coding (coding);
7637   current_buffer->text->inhibit_shrinking = 0;
7638
7639   attrs = CODING_ID_ATTRS (coding->id);
7640   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7641     {
7642       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7643       Lisp_Object val;
7644
7645       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7646       val = call1 (CODING_ATTR_POST_READ (attrs),
7647                    make_number (coding->produced_char));
7648       CHECK_NATNUM (val);
7649       coding->produced_char += Z - prev_Z;
7650       coding->produced += Z_BYTE - prev_Z_BYTE;
7651     }
7652
7653   unbind_to (count, Qnil);
7654   return coding->result;
7655 }
7656
7657
7658 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7659    SRC_OBJECT into DST_OBJECT by coding context CODING.
7660
7661    SRC_OBJECT is a buffer, a string, or Qnil.
7662
7663    If it is a buffer, the text is at point of the buffer.  FROM and TO
7664    are positions in the buffer.
7665
7666    If it is a string, the text is at the beginning of the string.
7667    FROM and TO are indices to the string.
7668
7669    If it is nil, the text is at coding->source.  FROM and TO are
7670    indices to coding->source.
7671
7672    DST_OBJECT is a buffer, Qt, or Qnil.
7673
7674    If it is a buffer, the decoded text is inserted at point of the
7675    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7676    is deleted.
7677
7678    If it is Qt, a string is made from the decoded text, and
7679    set in CODING->dst_object.
7680
7681    If it is Qnil, the decoded text is stored at CODING->destination.
7682    The caller must allocate CODING->dst_bytes bytes at
7683    CODING->destination by xmalloc.  If the decoded text is longer than
7684    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7685  */
7686
7687 void
7688 decode_coding_object (struct coding_system *coding,
7689                       Lisp_Object src_object,
7690                       ptrdiff_t from, ptrdiff_t from_byte,
7691                       ptrdiff_t to, ptrdiff_t to_byte,
7692                       Lisp_Object dst_object)
7693 {
7694   ptrdiff_t count = SPECPDL_INDEX ();
7695   unsigned char *destination IF_LINT (= NULL);
7696   ptrdiff_t dst_bytes IF_LINT (= 0);
7697   ptrdiff_t chars = to - from;
7698   ptrdiff_t bytes = to_byte - from_byte;
7699   Lisp_Object attrs;
7700   int saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7701   int need_marker_adjustment = 0;
7702   Lisp_Object old_deactivate_mark;
7703
7704   old_deactivate_mark = Vdeactivate_mark;
7705
7706   if (NILP (dst_object))
7707     {
7708       destination = coding->destination;
7709       dst_bytes = coding->dst_bytes;
7710     }
7711
7712   coding->src_object = src_object;
7713   coding->src_chars = chars;
7714   coding->src_bytes = bytes;
7715   coding->src_multibyte = chars < bytes;
7716
7717   if (STRINGP (src_object))
7718     {
7719       coding->src_pos = from;
7720       coding->src_pos_byte = from_byte;
7721     }
7722   else if (BUFFERP (src_object))
7723     {
7724       set_buffer_internal (XBUFFER (src_object));
7725       if (from != GPT)
7726         move_gap_both (from, from_byte);
7727       if (EQ (src_object, dst_object))
7728         {
7729           struct Lisp_Marker *tail;
7730
7731           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7732             {
7733               tail->need_adjustment
7734                 = tail->charpos == (tail->insertion_type ? from : to);
7735               need_marker_adjustment |= tail->need_adjustment;
7736             }
7737           saved_pt = PT, saved_pt_byte = PT_BYTE;
7738           TEMP_SET_PT_BOTH (from, from_byte);
7739           current_buffer->text->inhibit_shrinking = 1;
7740           del_range_both (from, from_byte, to, to_byte, 1);
7741           coding->src_pos = -chars;
7742           coding->src_pos_byte = -bytes;
7743         }
7744       else
7745         {
7746           coding->src_pos = from;
7747           coding->src_pos_byte = from_byte;
7748         }
7749     }
7750
7751   if (CODING_REQUIRE_DETECTION (coding))
7752     detect_coding (coding);
7753   attrs = CODING_ID_ATTRS (coding->id);
7754
7755   if (EQ (dst_object, Qt)
7756       || (! NILP (CODING_ATTR_POST_READ (attrs))
7757           && NILP (dst_object)))
7758     {
7759       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7760       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7761       coding->dst_pos = BEG;
7762       coding->dst_pos_byte = BEG_BYTE;
7763     }
7764   else if (BUFFERP (dst_object))
7765     {
7766       code_conversion_save (0, 0);
7767       coding->dst_object = dst_object;
7768       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7769       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7770       coding->dst_multibyte
7771         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7772     }
7773   else
7774     {
7775       code_conversion_save (0, 0);
7776       coding->dst_object = Qnil;
7777       /* Most callers presume this will return a multibyte result, and they
7778          won't use `binary' or `raw-text' anyway, so let's not worry about
7779          CODING_FOR_UNIBYTE.  */
7780       coding->dst_multibyte = 1;
7781     }
7782
7783   decode_coding (coding);
7784
7785   if (BUFFERP (coding->dst_object))
7786     set_buffer_internal (XBUFFER (coding->dst_object));
7787
7788   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7789     {
7790       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7791       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7792       Lisp_Object val;
7793
7794       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7795       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7796               old_deactivate_mark);
7797       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7798                         make_number (coding->produced_char));
7799       UNGCPRO;
7800       CHECK_NATNUM (val);
7801       coding->produced_char += Z - prev_Z;
7802       coding->produced += Z_BYTE - prev_Z_BYTE;
7803     }
7804
7805   if (EQ (dst_object, Qt))
7806     {
7807       coding->dst_object = Fbuffer_string ();
7808     }
7809   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7810     {
7811       set_buffer_internal (XBUFFER (coding->dst_object));
7812       if (dst_bytes < coding->produced)
7813         {
7814           destination = xrealloc (destination, coding->produced);
7815           if (! destination)
7816             {
7817               record_conversion_result (coding,
7818                                         CODING_RESULT_INSUFFICIENT_MEM);
7819               unbind_to (count, Qnil);
7820               return;
7821             }
7822           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7823             move_gap_both (BEGV, BEGV_BYTE);
7824           memcpy (destination, BEGV_ADDR, coding->produced);
7825           coding->destination = destination;
7826         }
7827     }
7828
7829   if (saved_pt >= 0)
7830     {
7831       /* This is the case of:
7832          (BUFFERP (src_object) && EQ (src_object, dst_object))
7833          As we have moved PT while replacing the original buffer
7834          contents, we must recover it now.  */
7835       set_buffer_internal (XBUFFER (src_object));
7836       current_buffer->text->inhibit_shrinking = 0;
7837       if (saved_pt < from)
7838         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7839       else if (saved_pt < from + chars)
7840         TEMP_SET_PT_BOTH (from, from_byte);
7841       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7842         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7843                           saved_pt_byte + (coding->produced - bytes));
7844       else
7845         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7846                           saved_pt_byte + (coding->produced - bytes));
7847
7848       if (need_marker_adjustment)
7849         {
7850           struct Lisp_Marker *tail;
7851
7852           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7853             if (tail->need_adjustment)
7854               {
7855                 tail->need_adjustment = 0;
7856                 if (tail->insertion_type)
7857                   {
7858                     tail->bytepos = from_byte;
7859                     tail->charpos = from;
7860                   }
7861                 else
7862                   {
7863                     tail->bytepos = from_byte + coding->produced;
7864                     tail->charpos
7865                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7866                          ? tail->bytepos : from + coding->produced_char);
7867                   }
7868               }
7869         }
7870     }
7871
7872   Vdeactivate_mark = old_deactivate_mark;
7873   unbind_to (count, coding->dst_object);
7874 }
7875
7876
7877 void
7878 encode_coding_object (struct coding_system *coding,
7879                       Lisp_Object src_object,
7880                       ptrdiff_t from, ptrdiff_t from_byte,
7881                       ptrdiff_t to, ptrdiff_t to_byte,
7882                       Lisp_Object dst_object)
7883 {
7884   ptrdiff_t count = SPECPDL_INDEX ();
7885   ptrdiff_t chars = to - from;
7886   ptrdiff_t bytes = to_byte - from_byte;
7887   Lisp_Object attrs;
7888   int saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7889   int need_marker_adjustment = 0;
7890   int kill_src_buffer = 0;
7891   Lisp_Object old_deactivate_mark;
7892
7893   old_deactivate_mark = Vdeactivate_mark;
7894
7895   coding->src_object = src_object;
7896   coding->src_chars = chars;
7897   coding->src_bytes = bytes;
7898   coding->src_multibyte = chars < bytes;
7899
7900   attrs = CODING_ID_ATTRS (coding->id);
7901
7902   if (EQ (src_object, dst_object))
7903     {
7904       struct Lisp_Marker *tail;
7905
7906       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7907         {
7908           tail->need_adjustment
7909             = tail->charpos == (tail->insertion_type ? from : to);
7910           need_marker_adjustment |= tail->need_adjustment;
7911         }
7912     }
7913
7914   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7915     {
7916       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7917       set_buffer_internal (XBUFFER (coding->src_object));
7918       if (STRINGP (src_object))
7919         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7920       else if (BUFFERP (src_object))
7921         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7922       else
7923         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7924
7925       if (EQ (src_object, dst_object))
7926         {
7927           set_buffer_internal (XBUFFER (src_object));
7928           saved_pt = PT, saved_pt_byte = PT_BYTE;
7929           del_range_both (from, from_byte, to, to_byte, 1);
7930           set_buffer_internal (XBUFFER (coding->src_object));
7931         }
7932
7933       {
7934         Lisp_Object args[3];
7935         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7936
7937         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7938                 old_deactivate_mark);
7939         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7940         args[1] = make_number (BEG);
7941         args[2] = make_number (Z);
7942         safe_call (3, args);
7943         UNGCPRO;
7944       }
7945       if (XBUFFER (coding->src_object) != current_buffer)
7946         kill_src_buffer = 1;
7947       coding->src_object = Fcurrent_buffer ();
7948       if (BEG != GPT)
7949         move_gap_both (BEG, BEG_BYTE);
7950       coding->src_chars = Z - BEG;
7951       coding->src_bytes = Z_BYTE - BEG_BYTE;
7952       coding->src_pos = BEG;
7953       coding->src_pos_byte = BEG_BYTE;
7954       coding->src_multibyte = Z < Z_BYTE;
7955     }
7956   else if (STRINGP (src_object))
7957     {
7958       code_conversion_save (0, 0);
7959       coding->src_pos = from;
7960       coding->src_pos_byte = from_byte;
7961     }
7962   else if (BUFFERP (src_object))
7963     {
7964       code_conversion_save (0, 0);
7965       set_buffer_internal (XBUFFER (src_object));
7966       if (EQ (src_object, dst_object))
7967         {
7968           saved_pt = PT, saved_pt_byte = PT_BYTE;
7969           coding->src_object = del_range_1 (from, to, 1, 1);
7970           coding->src_pos = 0;
7971           coding->src_pos_byte = 0;
7972         }
7973       else
7974         {
7975           if (from < GPT && to >= GPT)
7976             move_gap_both (from, from_byte);
7977           coding->src_pos = from;
7978           coding->src_pos_byte = from_byte;
7979         }
7980     }
7981   else
7982     code_conversion_save (0, 0);
7983
7984   if (BUFFERP (dst_object))
7985     {
7986       coding->dst_object = dst_object;
7987       if (EQ (src_object, dst_object))
7988         {
7989           coding->dst_pos = from;
7990           coding->dst_pos_byte = from_byte;
7991         }
7992       else
7993         {
7994           struct buffer *current = current_buffer;
7995
7996           set_buffer_temp (XBUFFER (dst_object));
7997           coding->dst_pos = PT;
7998           coding->dst_pos_byte = PT_BYTE;
7999           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8000           set_buffer_temp (current);
8001         }
8002       coding->dst_multibyte
8003         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8004     }
8005   else if (EQ (dst_object, Qt))
8006     {
8007       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8008       coding->dst_object = Qnil;
8009       coding->destination = (unsigned char *) xmalloc (dst_bytes);
8010       coding->dst_bytes = dst_bytes;
8011       coding->dst_multibyte = 0;
8012     }
8013   else
8014     {
8015       coding->dst_object = Qnil;
8016       coding->dst_multibyte = 0;
8017     }
8018
8019   encode_coding (coding);
8020
8021   if (EQ (dst_object, Qt))
8022     {
8023       if (BUFFERP (coding->dst_object))
8024         coding->dst_object = Fbuffer_string ();
8025       else
8026         {
8027           coding->dst_object
8028             = make_unibyte_string ((char *) coding->destination,
8029                                    coding->produced);
8030           xfree (coding->destination);
8031         }
8032     }
8033
8034   if (saved_pt >= 0)
8035     {
8036       /* This is the case of:
8037          (BUFFERP (src_object) && EQ (src_object, dst_object))
8038          As we have moved PT while replacing the original buffer
8039          contents, we must recover it now.  */
8040       set_buffer_internal (XBUFFER (src_object));
8041       if (saved_pt < from)
8042         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8043       else if (saved_pt < from + chars)
8044         TEMP_SET_PT_BOTH (from, from_byte);
8045       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8046         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8047                           saved_pt_byte + (coding->produced - bytes));
8048       else
8049         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8050                           saved_pt_byte + (coding->produced - bytes));
8051
8052       if (need_marker_adjustment)
8053         {
8054           struct Lisp_Marker *tail;
8055
8056           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8057             if (tail->need_adjustment)
8058               {
8059                 tail->need_adjustment = 0;
8060                 if (tail->insertion_type)
8061                   {
8062                     tail->bytepos = from_byte;
8063                     tail->charpos = from;
8064                   }
8065                 else
8066                   {
8067                     tail->bytepos = from_byte + coding->produced;
8068                     tail->charpos
8069                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8070                          ? tail->bytepos : from + coding->produced_char);
8071                   }
8072               }
8073         }
8074     }
8075
8076   if (kill_src_buffer)
8077     Fkill_buffer (coding->src_object);
8078
8079   Vdeactivate_mark = old_deactivate_mark;
8080   unbind_to (count, Qnil);
8081 }
8082
8083
8084 Lisp_Object
8085 preferred_coding_system (void)
8086 {
8087   int id = coding_categories[coding_priorities[0]].id;
8088
8089   return CODING_ID_NAME (id);
8090 }
8091
8092 \f
8093 #ifdef emacs
8094 /*** 8. Emacs Lisp library functions ***/
8095
8096 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8097        doc: /* Return t if OBJECT is nil or a coding-system.
8098 See the documentation of `define-coding-system' for information
8099 about coding-system objects.  */)
8100   (Lisp_Object object)
8101 {
8102   if (NILP (object)
8103       || CODING_SYSTEM_ID (object) >= 0)
8104     return Qt;
8105   if (! SYMBOLP (object)
8106       || NILP (Fget (object, Qcoding_system_define_form)))
8107     return Qnil;
8108   return Qt;
8109 }
8110
8111 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8112        Sread_non_nil_coding_system, 1, 1, 0,
8113        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8114   (Lisp_Object prompt)
8115 {
8116   Lisp_Object val;
8117   do
8118     {
8119       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8120                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8121     }
8122   while (SCHARS (val) == 0);
8123   return (Fintern (val, Qnil));
8124 }
8125
8126 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8127        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8128 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8129 Ignores case when completing coding systems (all Emacs coding systems
8130 are lower-case).  */)
8131   (Lisp_Object prompt, Lisp_Object default_coding_system)
8132 {
8133   Lisp_Object val;
8134   ptrdiff_t count = SPECPDL_INDEX ();
8135
8136   if (SYMBOLP (default_coding_system))
8137     default_coding_system = SYMBOL_NAME (default_coding_system);
8138   specbind (Qcompletion_ignore_case, Qt);
8139   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8140                           Qt, Qnil, Qcoding_system_history,
8141                           default_coding_system, Qnil);
8142   unbind_to (count, Qnil);
8143   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8144 }
8145
8146 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8147        1, 1, 0,
8148        doc: /* Check validity of CODING-SYSTEM.
8149 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8150 It is valid if it is nil or a symbol defined as a coding system by the
8151 function `define-coding-system'.  */)
8152   (Lisp_Object coding_system)
8153 {
8154   Lisp_Object define_form;
8155
8156   define_form = Fget (coding_system, Qcoding_system_define_form);
8157   if (! NILP (define_form))
8158     {
8159       Fput (coding_system, Qcoding_system_define_form, Qnil);
8160       safe_eval (define_form);
8161     }
8162   if (!NILP (Fcoding_system_p (coding_system)))
8163     return coding_system;
8164   xsignal1 (Qcoding_system_error, coding_system);
8165 }
8166
8167 \f
8168 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8169    HIGHEST is nonzero, return the coding system of the highest
8170    priority among the detected coding systems.  Otherwise return a
8171    list of detected coding systems sorted by their priorities.  If
8172    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8173    multibyte form but contains only ASCII and eight-bit chars.
8174    Otherwise, the bytes are raw bytes.
8175
8176    CODING-SYSTEM controls the detection as below:
8177
8178    If it is nil, detect both text-format and eol-format.  If the
8179    text-format part of CODING-SYSTEM is already specified
8180    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8181    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8182    detect only text-format.
     
        SRC_CHARS is stored in the working coding structure as the
        character count of the source.

        When HIGHEST is nonzero and nothing was detected, the value is
        nil.  */
8183
8184 Lisp_Object
8185 detect_coding_system (const unsigned char *src,
8186                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8187                       int highest, int multibytep,
8188                       Lisp_Object coding_system)
8189 {
8190   const unsigned char *src_end = src + src_bytes;
8191   Lisp_Object attrs, eol_type;
       /* VAL collects coding system ids at first; the eol pass at the
          end replaces each id by a coding system name.  */
8192   Lisp_Object val = Qnil;
8193   struct coding_system coding;
8194   ptrdiff_t id;
8195   struct coding_detection_info detect_info;
8196   enum coding_category base_category;
       /* Set by the byte scan below when a zero byte, respectively a
          byte with the high bit set, is seen.  */
8197   int null_byte_found = 0, eight_bit_found = 0;
8198
8199   if (NILP (coding_system))
8200     coding_system = Qundecided;
8201   setup_coding_system (coding_system, &coding);
8202   attrs = CODING_ID_ATTRS (coding.id);
8203   eol_type = CODING_ID_EOL_TYPE (coding.id);
8204   coding_system = CODING_ATTR_BASE_NAME (attrs);
8205
8206   coding.source = src;
8207   coding.src_chars = src_chars;
8208   coding.src_bytes = src_bytes;
8209   coding.src_multibyte = multibytep;
8210   coding.consumed = 0;
8211   coding.mode |= CODING_MODE_LAST_BLOCK;
8212   coding.head_ascii = 0;
8213
8214   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8215
8216   /* At first, detect text-format if necessary.  */
8217   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8218   if (base_category == coding_category_undecided)
8219     {
8220       enum coding_category category IF_LINT (= 0);
8221       struct coding_system *this IF_LINT (= NULL);
8222       int c, i;
8223
8224       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
           /* coding.head_ascii is incremented for each 7-bit byte seen
              before the first 8-bit byte.  */
8225       for (; src < src_end; src++)
8226         {
8227           c = *src;
8228           if (c & 0x80)
8229             {
8230               eight_bit_found = 1;
                   /* Stop once both a null byte and an 8-bit byte
                      have been seen.  */
8231               if (null_byte_found)
8232                 break;
8233             }
8234           else if (c < 0x20)
8235             {
                   /* The ISO-2022 detector is tried only while no
                      category has been checked yet.  */
8236               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8237                   && ! inhibit_iso_escape_detection
8238                   && ! detect_info.checked)
8239                 {
8240                   if (detect_coding_iso_2022 (&coding, &detect_info))
8241                     {
8242                       /* We have scanned the whole data.  */
8243                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8244                         {
8245                           /* We didn't find an 8-bit code.  We may
8246                              have found a null-byte, but it's very
8247                              rare that a binary file conforms to
8248                              ISO-2022.  */
8249                           src = src_end;
8250                           coding.head_ascii = src - coding.source;
8251                         }
                           /* Reject every category outside the
                              ISO-escape mask.  */
8252                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8253                       break;
8254                     }
8255                 }
8256               else if (! c && !inhibit_null_byte_detection)
8257                 {
8258                   null_byte_found = 1;
8259                   if (eight_bit_found)
8260                     break;
8261                 }
8262               if (! eight_bit_found)
8263                 coding.head_ascii++;
8264             }
8265           else if (! eight_bit_found)
8266             coding.head_ascii++;
8267         }
8268
           /* Run the per-category detectors only if the scan saw
              something other than plain 7-bit text, or the ISO-2022
              detector already found something.  */
8269       if (null_byte_found || eight_bit_found
8270           || coding.head_ascii < coding.src_bytes
8271           || detect_info.found)
8272         {
8273           if (coding.head_ascii == coding.src_bytes)
8274             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8275             for (i = 0; i < coding_category_raw_text; i++)
8276               {
8277                 category = coding_priorities[i];
8278                 this = coding_categories + category;
8279                 if (detect_info.found & (1 << category))
8280                   break;
8281               }
8282           else
8283             {
8284               if (null_byte_found)
8285                 {
                       /* Mark every category outside the UTF-16 mask
                          as checked and rejected, so that the loop
                          below runs no detector for them.  */
8286                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8287                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8288                 }
                   /* Walk the categories in priority order.  With
                      HIGHEST set, stop at the first one found, leaving
                      CATEGORY and THIS pointing at it.  */
8289               for (i = 0; i < coding_category_raw_text; i++)
8290                 {
8291                   category = coding_priorities[i];
8292                   this = coding_categories + category;
8293
8294                   if (this->id < 0)
8295                     {
8296                       /* No coding system of this category is defined.  */
8297                       detect_info.rejected |= (1 << category);
8298                     }
8299                   else if (category >= coding_category_raw_text)
8300                     continue;
8301                   else if (detect_info.checked & (1 << category))
8302                     {
8303                       if (highest
8304                           && (detect_info.found & (1 << category)))
8305                         break;
8306                     }
8307                   else if ((*(this->detector)) (&coding, &detect_info)
8308                            && highest
8309                            && (detect_info.found & (1 << category)))
8310                     {
                           /* For the auto-detecting UTF-16 category,
                              narrow CATEGORY to the endianness that
                              was found.  */
8311                       if (category == coding_category_utf_16_auto)
8312                         {
8313                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8314                             category = coding_category_utf_16_le;
8315                           else
8316                             category = coding_category_utf_16_be;
8317                         }
8318                       break;
8319                     }
8320                 }
8321             }
8322         }
8323
           /* Turn the detection masks into a list of coding system
              ids in VAL.  */
8324       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8325           || null_byte_found)
8326         {
               /* Every category was rejected, or a null byte was
                  seen: report Qno_conversion.  */
8327           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8328           id = CODING_SYSTEM_ID (Qno_conversion);
8329           val = Fcons (make_number (id), Qnil);
8330         }
8331       else if (! detect_info.rejected && ! detect_info.found)
8332         {
               /* Nothing rejected and nothing found: report the
                  coding system of the undecided category.  */
8333           detect_info.found = CATEGORY_MASK_ANY;
8334           id = coding_categories[coding_category_undecided].id;
8335           val = Fcons (make_number (id), Qnil);
8336         }
8337       else if (highest)
8338         {
8339           if (detect_info.found)
8340             {
                   /* CATEGORY and THIS are as left by the loops
                      above.  */
8341               detect_info.found = 1 << category;
8342               val = Fcons (make_number (this->id), Qnil);
8343             }
8344           else
                 /* Nothing found: take the first category, in priority
                    order, that was not rejected.  */
8345             for (i = 0; i < coding_category_raw_text; i++)
8346               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8347                 {
8348                   detect_info.found = 1 << coding_priorities[i];
8349                   id = coding_categories[coding_priorities[i]].id;
8350                   val = Fcons (make_number (id), Qnil);
8351                   break;
8352                 }
8353         }
8354       else
8355         {
8356           int mask = detect_info.rejected | detect_info.found;
8357           int found = 0;
8358
               /* Both loops run from the last priority slot to the
                  first and push onto the front of VAL.  The first one
                  pushes the categories that were neither rejected nor
                  found; the second then pushes the found categories,
                  which therefore end up ahead of the others.  */
8359           for (i = coding_category_raw_text - 1; i >= 0; i--)
8360             {
8361               category = coding_priorities[i];
8362               if (! (mask & (1 << category)))
8363                 {
8364                   found |= 1 << category;
8365                   id = coding_categories[category].id;
8366                   if (id >= 0)
8367                     val = Fcons (make_number (id), val);
8368                 }
8369             }
8370           for (i = coding_category_raw_text - 1; i >= 0; i--)
8371             {
8372               category = coding_priorities[i];
8373               if (detect_info.found & (1 << category))
8374                 {
8375                   id = coding_categories[category].id;
8376                   val = Fcons (make_number (id), val);
8377                 }
8378             }
8379           detect_info.found |= found;
8380         }
8381     }
8382   else if (base_category == coding_category_utf_8_auto)
8383     {
           /* Only the choice between the UTF-8 variants with and
              without signature is left open.  */
8384       if (detect_coding_utf_8 (&coding, &detect_info))
8385         {
8386           struct coding_system *this;
8387
8388           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8389             this = coding_categories + coding_category_utf_8_sig;
8390           else
8391             this = coding_categories + coding_category_utf_8_nosig;
8392           val = Fcons (make_number (this->id), Qnil);
8393         }
8394     }
8395   else if (base_category == coding_category_utf_16_auto)
8396     {
           /* Pick one of the four UTF-16 variants from the found and
              rejected masks.  */
8397       if (detect_coding_utf_16 (&coding, &detect_info))
8398         {
8399           struct coding_system *this;
8400
8401           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8402             this = coding_categories + coding_category_utf_16_le;
8403           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8404             this = coding_categories + coding_category_utf_16_be;
8405           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8406             this = coding_categories + coding_category_utf_16_be_nosig;
8407           else
8408             this = coding_categories + coding_category_utf_16_le_nosig;
8409           val = Fcons (make_number (this->id), Qnil);
8410         }
8411     }
8412   else
8413     {
           /* The text-format is already decided: use CODING as is.  */
8414       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8415       val = Fcons (make_number (coding.id), Qnil);
8416     }
8417
8418   /* Then, detect eol-format if necessary.  */
8419   {
         /* One eol result per byte layout; each is left at -1 when
            that layout is not examined.  */
8420     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8421     Lisp_Object tail;
8422
         /* A vector eol type means the eol-format is still open;
            otherwise it is fixed by the requested coding system.  */
8423     if (VECTORP (eol_type))
8424       {
8425         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8426           {
8427             if (null_byte_found)
8428               normal_eol = EOL_SEEN_LF;
8429             else
8430               normal_eol = detect_eol (coding.source, src_bytes,
8431                                        coding_category_raw_text);
8432           }
8433         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8434                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8435           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8436                                       coding_category_utf_16_be);
8437         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8438                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8439           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8440                                       coding_category_utf_16_le);
8441       }
8442     else
8443       {
8444         if (EQ (eol_type, Qunix))
8445           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8446         else if (EQ (eol_type, Qdos))
8447           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8448         else
8449           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8450       }
8451
         /* Replace each id in VAL by a coding system: the element of
            its eol-type vector selected by the eol result (slot 0 for
            LF, 1 for CRLF, 2 for CR), or else the name registered for
            the id.  */
8452     for (tail = val; CONSP (tail); tail = XCDR (tail))
8453       {
8454         enum coding_category category;
8455         int this_eol;
8456
8457         id = XINT (XCAR (tail));
8458         attrs = CODING_ID_ATTRS (id);
8459         category = XINT (CODING_ATTR_CATEGORY (attrs));
8460         eol_type = CODING_ID_EOL_TYPE (id);
8461         if (VECTORP (eol_type))
8462           {
8463             if (category == coding_category_utf_16_be
8464                 || category == coding_category_utf_16_be_nosig)
8465               this_eol = utf_16_be_eol;
8466             else if (category == coding_category_utf_16_le
8467                      || category == coding_category_utf_16_le_nosig)
8468               this_eol = utf_16_le_eol;
8469             else
8470               this_eol = normal_eol;
8471
8472             if (this_eol == EOL_SEEN_LF)
8473               XSETCAR (tail, AREF (eol_type, 0));
8474             else if (this_eol == EOL_SEEN_CRLF)
8475               XSETCAR (tail, AREF (eol_type, 1));
8476             else if (this_eol == EOL_SEEN_CR)
8477               XSETCAR (tail, AREF (eol_type, 2));
8478             else
8479               XSETCAR (tail, CODING_ID_NAME (id));
8480           }
8481         else
8482           XSETCAR (tail, CODING_ID_NAME (id));
8483       }
8484   }
8485
       /* With HIGHEST, hand back the first element alone (nil if VAL
          is empty); otherwise the whole list.  */
8486   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8487 }
8488
8489
8490 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8491        2, 3, 0,
8492        doc: /* Detect coding system of the text in the region between START and END.
8493 Return a list of possible coding systems ordered by priority.
8494 The coding systems to try and their priorities follows what
8495 the function `coding-system-priority-list' (which see) returns.
8496
8497 If only ASCII characters are found (except for such ISO-2022 control
8498 characters as ESC), it returns a list of single element `undecided'
8499 or its subsidiary coding system according to a detected end-of-line
8500 format.
8501
8502 If optional argument HIGHEST is non-nil, return the coding system of
8503 highest priority.  */)
8504   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8505 {
8506   ptrdiff_t from, to;
8507   ptrdiff_t from_byte, to_byte;
8508
8509   CHECK_NUMBER_COERCE_MARKER (start);
8510   CHECK_NUMBER_COERCE_MARKER (end);
8511
8512   validate_region (&start, &end);
8513   from = XINT (start), to = XINT (end);
8514   from_byte = CHAR_TO_BYTE (from);
8515   to_byte = CHAR_TO_BYTE (to);
8516
8517   if (from < GPT && to >= GPT)
8518     move_gap_both (to, to_byte);
8519
8520   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8521                                to - from, to_byte - from_byte,
8522                                !NILP (highest),
8523                                !NILP (BVAR (current_buffer
8524                                       , enable_multibyte_characters)),
8525                                Qnil);
8526 }
8527
8528 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8529        1, 2, 0,
8530        doc: /* Detect coding system of the text in STRING.
8531 Return a list of possible coding systems ordered by priority.
8532 The coding systems to try and their priorities follows what
8533 the function `coding-system-priority-list' (which see) returns.
8534
8535 If only ASCII characters are found (except for such ISO-2022 control
8536 characters as ESC), it returns a list of single element `undecided'
8537 or its subsidiary coding system according to a detected end-of-line
8538 format.
8539
8540 If optional argument HIGHEST is non-nil, return the coding system of
8541 highest priority.  */)
8542   (Lisp_Object string, Lisp_Object highest)
8543 {
8544   CHECK_STRING (string);
8545
8546   return detect_coding_system (SDATA (string),
8547                                SCHARS (string), SBYTES (string),
8548                                !NILP (highest), STRING_MULTIBYTE (string),
8549                                Qnil);
8550 }
8551
8552
8553 static inline int
8554 char_encodable_p (int c, Lisp_Object attrs)
8555 {
8556   Lisp_Object tail;
8557   struct charset *charset;
8558   Lisp_Object translation_table;
8559
8560   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8561   if (! NILP (translation_table))
8562     c = translate_char (translation_table, c);
8563   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8564        CONSP (tail); tail = XCDR (tail))
8565     {
8566       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8567       if (CHAR_CHARSET_P (c, charset))
8568         break;
8569     }
8570   return (! NILP (tail));
8571 }
8572
8573
8574 /* Return a list of coding systems that safely encode the text between
8575    START and END.  If EXCLUDE is non-nil, it is a list of coding
8576    systems not to check.  The returned list doesn't contain any such
8577    coding systems.  In any case, if the text contains only ASCII or is
8578    unibyte, return t.  */
8579
8580 DEFUN ("find-coding-systems-region-internal",
8581        Ffind_coding_systems_region_internal,
8582        Sfind_coding_systems_region_internal, 2, 3, 0,
8583        doc: /* Internal use only.  */)
8584   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8585 {
8586   Lisp_Object coding_attrs_list, safe_codings;
8587   ptrdiff_t start_byte, end_byte;
8588   const unsigned char *p, *pbeg, *pend;
8589   int c;
8590   Lisp_Object tail, elt, work_table;
8591
8592   if (STRINGP (start))
8593     {
8594       if (!STRING_MULTIBYTE (start)
8595           || SCHARS (start) == SBYTES (start))
8596         return Qt;
8597       start_byte = 0;
8598       end_byte = SBYTES (start);
8599     }
8600   else
8601     {
8602       CHECK_NUMBER_COERCE_MARKER (start);
8603       CHECK_NUMBER_COERCE_MARKER (end);
8604       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8605         args_out_of_range (start, end);
8606       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8607         return Qt;
8608       start_byte = CHAR_TO_BYTE (XINT (start));
8609       end_byte = CHAR_TO_BYTE (XINT (end));
8610       if (XINT (end) - XINT (start) == end_byte - start_byte)
8611         return Qt;
8612
8613       if (XINT (start) < GPT && XINT (end) > GPT)
8614         {
8615           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8616             move_gap_both (XINT (start), start_byte);
8617           else
8618             move_gap_both (XINT (end), end_byte);
8619         }
8620     }
8621
8622   coding_attrs_list = Qnil;
8623   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8624     if (NILP (exclude)
8625         || NILP (Fmemq (XCAR (tail), exclude)))
8626       {
8627         Lisp_Object attrs;
8628
8629         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8630         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8631             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8632           {
8633             ASET (attrs, coding_attr_trans_tbl,
8634                   get_translation_table (attrs, 1, NULL));
8635             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8636           }
8637       }
8638
8639   if (STRINGP (start))
8640     p = pbeg = SDATA (start);
8641   else
8642     p = pbeg = BYTE_POS_ADDR (start_byte);
8643   pend = p + (end_byte - start_byte);
8644
8645   while (p < pend && ASCII_BYTE_P (*p)) p++;
8646   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8647
8648   work_table = Fmake_char_table (Qnil, Qnil);
8649   while (p < pend)
8650     {
8651       if (ASCII_BYTE_P (*p))
8652         p++;
8653       else
8654         {
8655           c = STRING_CHAR_ADVANCE (p);
8656           if (!NILP (char_table_ref (work_table, c)))
8657             /* This character was already checked.  Ignore it.  */
8658             continue;
8659
8660           charset_map_loaded = 0;
8661           for (tail = coding_attrs_list; CONSP (tail);)
8662             {
8663               elt = XCAR (tail);
8664               if (NILP (elt))
8665                 tail = XCDR (tail);
8666               else if (char_encodable_p (c, elt))
8667                 tail = XCDR (tail);
8668               else if (CONSP (XCDR (tail)))
8669                 {
8670                   XSETCAR (tail, XCAR (XCDR (tail)));
8671                   XSETCDR (tail, XCDR (XCDR (tail)));
8672                 }
8673               else
8674                 {
8675                   XSETCAR (tail, Qnil);
8676                   tail = XCDR (tail);
8677                 }
8678             }
8679           if (charset_map_loaded)
8680             {
8681               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8682
8683               if (STRINGP (start))
8684                 pbeg = SDATA (start);
8685               else
8686                 pbeg = BYTE_POS_ADDR (start_byte);
8687               p = pbeg + p_offset;
8688               pend = pbeg + pend_offset;
8689             }
8690           char_table_set (work_table, c, Qt);
8691         }
8692     }
8693
8694   safe_codings = list2 (Qraw_text, Qno_conversion);
8695   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8696     if (! NILP (XCAR (tail)))
8697       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8698
8699   return safe_codings;
8700 }
8701
8702
8703 DEFUN ("unencodable-char-position", Funencodable_char_position,
8704        Sunencodable_char_position, 3, 5, 0,
8705        doc: /*
8706 Return position of first un-encodable character in a region.
8707 START and END specify the region and CODING-SYSTEM specifies the
8708 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8709
8710 If optional 4th argument COUNT is non-nil, it specifies at most how
8711 many un-encodable characters to search.  In this case, the value is a
8712 list of positions.
8713
8714 If optional 5th argument STRING is non-nil, it is a string to search
8715 for un-encodable characters.  In that case, START and END are indexes
8716 to the string.  */)
8717   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8718 {
8719   EMACS_INT n;
8720   struct coding_system coding;
8721   Lisp_Object attrs, charset_list, translation_table;
8722   Lisp_Object positions;
8723   ptrdiff_t from, to;
8724   const unsigned char *p, *stop, *pend;
8725   int ascii_compatible;
8726
8727   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8728   attrs = CODING_ID_ATTRS (coding.id);
       /* Nothing is ever reported for a coding system of type raw-text.  */
8729   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8730     return Qnil;
8731   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8732   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8733   translation_table = get_translation_table (attrs, 1, NULL);
8734
8735   if (NILP (string))
8736     {
           /* Scan the current buffer.  */
8737       validate_region (&start, &end);
8738       from = XINT (start);
8739       to = XINT (end);
           /* Return early for a unibyte buffer, or when the coding system
              is ASCII compatible and the region has as many bytes as
              characters.  */
8740       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8741           || (ascii_compatible
8742               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8743         return Qnil;
8744       p = CHAR_POS_ADDR (from);
8745       pend = CHAR_POS_ADDR (to);
           /* When the region spans the gap, first scan only up to the
              gap; the loop below then continues from GAP_END_ADDR.  */
8746       if (from < GPT && to >= GPT)
8747         stop = GPT_ADDR;
8748       else
8749         stop = pend;
8750     }
8751   else
8752     {
           /* Scan STRING; START and END are character indexes into it.  */
8753       CHECK_STRING (string);
8754       CHECK_NATNUM (start);
8755       CHECK_NATNUM (end);
8756       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8757         args_out_of_range_3 (string, start, end);
8758       from = XINT (start);
8759       to = XINT (end);
8760       if (! STRING_MULTIBYTE (string))
8761         return Qnil;
8762       p = SDATA (string) + string_char_to_byte (string, from);
8763       stop = pend = SDATA (string) + string_char_to_byte (string, to);
           /* Same shortcut as for buffers: byte length equals char length.  */
8764       if (ascii_compatible && (to - from) == (pend - p))
8765         return Qnil;
8766     }
8767
       /* N is the number of positions still to be collected.
          NOTE(review): a COUNT of 0 makes N negative after the first
          decrement, so the `n == 0' test below never fires and every
          un-encodable position is collected -- confirm this is intended.  */
8768   if (NILP (count))
8769     n = 1;
8770   else
8771     {
8772       CHECK_NATNUM (count);
8773       n = XINT (count);
8774     }
8775
8776   positions = Qnil;
8777   charset_map_loaded = 0;
8778   while (1)
8779     {
8780       int c;
8781
           /* Skip a run of ASCII bytes, keeping FROM in step with P.  */
8782       if (ascii_compatible)
8783         while (p < stop && ASCII_BYTE_P (*p))
8784           p++, from++;
8785       if (p >= stop)
8786         {
8787           if (p >= pend)
8788             break;
               /* STOP was the gap: resume scanning after it.  */
8789           stop = pend;
8790           p = GAP_END_ADDR;
8791         }
8792
8793       c = STRING_CHAR_ADVANCE (p);
           /* Record FROM when the translated character is in none of the
              charsets of CHARSET_LIST.  */
8794       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8795           && ! char_charset (translate_char (translation_table, c),
8796                              charset_list, NULL))
8797         {
8798           positions = Fcons (make_number (from), positions);
8799           n--;
8800           if (n == 0)
8801             break;
8802         }
8803
8804       from++;
           /* Re-derive the raw pointers from character positions when
              charset_map_loaded was set during the check above
              (presumably because buffer text may have been relocated --
              TODO confirm).  Only done for buffers, not for STRING.  */
8805       if (charset_map_loaded && NILP (string))
8806         {
8807           p = CHAR_POS_ADDR (from);
8808           pend = CHAR_POS_ADDR (to);
8809           if (from < GPT && to >= GPT)
8810             stop = GPT_ADDR;
8811           else
8812             stop = pend;
8813           charset_map_loaded = 0;
8814         }
8815     }
8816
       /* POSITIONS was built newest-first.  Without COUNT return its single
          element (or nil); with COUNT return the list in ascending order.  */
8817   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8818 }
8819
8820
8821 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8822        Scheck_coding_systems_region, 3, 3, 0,
8823        doc: /* Check if the region is encodable by coding systems.
8824
8825 START and END are buffer positions specifying the region.
8826 CODING-SYSTEM-LIST is a list of coding systems to check.
8827
8828 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8829 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8830 whole region, POS0, POS1, ... are buffer positions where non-encodable
8831 characters are found.
8832
8833 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8834 value is nil.
8835
8836 START may be a string.  In that case, check if the string is
8837 encodable, and the value contains indices to the string instead of
8838 buffer positions.  END is ignored.
8839
8840 If the current buffer (or START if it is a string) is unibyte, the value
8841 is nil.  */)
8842   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8843 {
8844   Lisp_Object list;
8845   ptrdiff_t start_byte, end_byte;
8846   ptrdiff_t pos;
8847   const unsigned char *p, *pbeg, *pend;
8848   int c;
8849   Lisp_Object tail, elt, attrs;
8850
8851   if (STRINGP (start))
8852     {
           /* A unibyte string, or one with as many bytes as characters,
              yields nil immediately.  */
8853       if (!STRING_MULTIBYTE (start)
8854           || SCHARS (start) == SBYTES (start))
8855         return Qnil;
8856       start_byte = 0;
8857       end_byte = SBYTES (start);
8858       pos = 0;
8859     }
8860   else
8861     {
8862       CHECK_NUMBER_COERCE_MARKER (start);
8863       CHECK_NUMBER_COERCE_MARKER (end);
8864       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8865         args_out_of_range (start, end);
8866       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8867         return Qnil;
8868       start_byte = CHAR_TO_BYTE (XINT (start));
8869       end_byte = CHAR_TO_BYTE (XINT (end));
           /* Byte length equals character length: nothing to check.  */
8870       if (XINT (end) - XINT (start) == end_byte - start_byte)
8871         return Qnil;
8872
           /* The gap lies inside the region: move it to whichever end of
              the region is nearer, so that the scan below can walk one
              pointer range from START_BYTE.  */
8873       if (XINT (start) < GPT && XINT (end) > GPT)
8874         {
8875           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8876             move_gap_both (XINT (start), start_byte);
8877           else
8878             move_gap_both (XINT (end), end_byte);
8879         }
8880       pos = XINT (start);
8881     }
8882
       /* Build LIST with one element (CODING-SYSTEM ATTRS) per entry of
          CODING_SYSTEM_LIST, in reverse order.  The translation table is
          stored into the coding_attr_trans_tbl slot of ATTRS.  Positions
          of un-encodable characters are pushed after ATTRS later.  */
8883   list = Qnil;
8884   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8885     {
8886       elt = XCAR (tail);
8887       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8888       ASET (attrs, coding_attr_trans_tbl,
8889             get_translation_table (attrs, 1, NULL));
8890       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8891     }
8892
8893   if (STRINGP (start))
8894     p = pbeg = SDATA (start);
8895   else
8896     p = pbeg = BYTE_POS_ADDR (start_byte);
8897   pend = p + (end_byte - start_byte);
8898
       /* Skip leading ASCII bytes (advancing POS) and trim trailing ones.  */
8899   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8900   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8901
8902   while (p < pend)
8903     {
8904       if (ASCII_BYTE_P (*p))
8905         p++;
8906       else
8907         {
8908           c = STRING_CHAR_ADVANCE (p);
8909
8910           charset_map_loaded = 0;
               /* Push POS onto the position list of every coding system
                  that cannot encode C.  */
8911           for (tail = list; CONSP (tail); tail = XCDR (tail))
8912             {
8913               elt = XCDR (XCAR (tail));
8914               if (! char_encodable_p (c, XCAR (elt)))
8915                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8916             }
               /* Rebase P and PEND on a freshly fetched PBEG when
                  charset_map_loaded was set during the checks above
                  (presumably the text may have been relocated --
                  TODO confirm).  */
8917           if (charset_map_loaded)
8918             {
8919               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8920
8921               if (STRINGP (start))
8922                 pbeg = SDATA (start);
8923               else
8924                 pbeg = BYTE_POS_ADDR (start_byte);
8925               p = pbeg + p_offset;
8926               pend = pbeg + pend_offset;
8927             }
8928         }
8929       pos++;
8930     }
8931
       /* Keep only the coding systems that collected at least one
          position, dropping ATTRS and putting positions in ascending
          order.  Consing onto the front again restores the order of
          CODING_SYSTEM_LIST.  */
8932   tail = list;
8933   list = Qnil;
8934   for (; CONSP (tail); tail = XCDR (tail))
8935     {
8936       elt = XCAR (tail);
8937       if (CONSP (XCDR (XCDR (elt))))
8938         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8939                       list);
8940     }
8941
8942   return list;
8943 }
8944
8945
8946 static Lisp_Object
8947 code_convert_region (Lisp_Object start, Lisp_Object end,
8948                      Lisp_Object coding_system, Lisp_Object dst_object,
8949                      int encodep, int norecord)
8950 {
       /* Encode (ENCODEP nonzero) or decode (ENCODEP zero) the text
          between START and END of the current buffer by CODING_SYSTEM.
          A nil CODING_SYSTEM means no-conversion.  DST_OBJECT is nil
          (use the current buffer as destination), t, or a buffer.
          Unless NORECORD is nonzero, set Vlast_coding_system_used.
          Return the number of produced characters when the destination
          is a buffer, otherwise coding.dst_object.  */
8951   struct coding_system coding;
8952   ptrdiff_t from, from_byte, to, to_byte;
8953   Lisp_Object src_object;
8954
8955   CHECK_NUMBER_COERCE_MARKER (start);
8956   CHECK_NUMBER_COERCE_MARKER (end);
8957   if (NILP (coding_system))
8958     coding_system = Qno_conversion;
8959   else
8960     CHECK_CODING_SYSTEM (coding_system);
8961   src_object = Fcurrent_buffer ();
8962   if (NILP (dst_object))
8963     dst_object = src_object;
8964   else if (! EQ (dst_object, Qt))
8965     CHECK_BUFFER (dst_object);
8966
8967   validate_region (&start, &end);
8968   from = XFASTINT (start);
8969   from_byte = CHAR_TO_BYTE (from);
8970   to = XFASTINT (end);
8971   to_byte = CHAR_TO_BYTE (to);
8972
8973   setup_coding_system (coding_system, &coding);
       /* The whole region is converted in one go, so mark it as the
          last block.  */
8974   coding.mode |= CODING_MODE_LAST_BLOCK;
8975
8976   if (encodep)
8977     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8978                           dst_object);
8979   else
8980     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8981                           dst_object);
8982   if (! norecord)
8983     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8984
8985   return (BUFFERP (dst_object)
8986           ? make_number (coding.produced_char)
8987           : coding.dst_object);
8988 }
8989
8990
8991 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8992        3, 4, "r\nzCoding system: ",
8993        doc: /* Decode the current region from the specified coding system.
8994 When called from a program, takes four arguments:
8995         START, END, CODING-SYSTEM, and DESTINATION.
8996 START and END are buffer positions.
8997
8998 Optional 4th argument DESTINATION specifies where the decoded text goes.
8999 If nil, the region between START and END is replaced by the decoded text.
9000 If buffer, the decoded text is inserted in that buffer after point (point
9001 does not move).
9002 In those cases, the length of the decoded text is returned.
9003 If DESTINATION is t, the decoded text is returned.
9004
9005 This function sets `last-coding-system-used' to the precise coding system
9006 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9007 not fully specified.)  */)
9008   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9009 {
       /* ENCODEP is 0 (decode); NORECORD is 0, so the coding system
          used is recorded.  */
9010   return code_convert_region (start, end, coding_system, destination, 0, 0);
9011 }
9012
9013 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9014        3, 4, "r\nzCoding system: ",
9015        doc: /* Encode the current region by specified coding system.
9016 When called from a program, takes four arguments:
9017         START, END, CODING-SYSTEM and DESTINATION.
9018 START and END are buffer positions.
9019
9020 Optional 4th argument DESTINATION specifies where the encoded text goes.
9021 If nil, the region between START and END is replaced by the encoded text.
9022 If buffer, the encoded text is inserted in that buffer after point (point
9023 does not move).
9024 In those cases, the length of the encoded text is returned.
9025 If DESTINATION is t, the encoded text is returned.
9026
9027 This function sets `last-coding-system-used' to the precise coding system
9028 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9029 not fully specified.)  */)
9030   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9031 {
       /* ENCODEP is 1 (encode); NORECORD is 0, so the coding system
          used is recorded.  */
9032   return code_convert_region (start, end, coding_system, destination, 1, 0);
9033 }
9034
9035 Lisp_Object
9036 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9037                      Lisp_Object dst_object, int encodep, int nocopy, int norecord)
9038 {
       /* Encode (ENCODEP nonzero) or decode (ENCODEP zero) STRING by
          CODING_SYSTEM.  DST_OBJECT is nil or t (return the result as a
          string) or a buffer (insert the result there and return the
          number of produced characters).  NOCOPY nonzero means STRING
          itself may be returned when the conversion is trivial.  Unless
          NORECORD is nonzero, set Vlast_coding_system_used.  */
9039   struct coding_system coding;
9040   ptrdiff_t chars, bytes;
9041
9042   CHECK_STRING (string);
9043   if (NILP (coding_system))
9044     {
9045       if (! norecord)
9046         Vlast_coding_system_used = Qno_conversion;
           /* Trivial case: no coding system and no destination.  Hand
              back STRING itself only when the caller allowed it with
              NOCOPY; otherwise return a fresh copy.  */
9047       if (NILP (dst_object))
9048         return (nocopy ? string : Fcopy_sequence (string));
9049     }
9050
9051   if (NILP (coding_system))
9052     coding_system = Qno_conversion;
9053   else
9054     CHECK_CODING_SYSTEM (coding_system);
9055   if (NILP (dst_object))
9056     dst_object = Qt;
9057   else if (! EQ (dst_object, Qt))
9058     CHECK_BUFFER (dst_object);
9059
9060   setup_coding_system (coding_system, &coding);
       /* The whole string is converted at once, so it is the last block.  */
9061   coding.mode |= CODING_MODE_LAST_BLOCK;
9062   chars = SCHARS (string);
9063   bytes = SBYTES (string);
9064   if (encodep)
9065     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9066   else
9067     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9068   if (! norecord)
9069     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9070
9071   return (BUFFERP (dst_object)
9072           ? make_number (coding.produced_char)
9073           : coding.dst_object);
9074 }
9075
9076
9077 /* Encode or decode STRING according to CODING_SYSTEM.
9078    Do not set Vlast_coding_system_used.
     
        ENCODEP nonzero means encode, zero means decode.  The converted
        text is returned as a string: code_convert_string is called with
        DST_OBJECT t, NOCOPY 0 and NORECORD 1.
9079
9080    This function is called only from macros DECODE_FILE and
9081    ENCODE_FILE, thus we ignore character composition.  */
9082
9083 Lisp_Object
9084 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9085                               int encodep)
9086 {
9087   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9088 }
9089
9090
9091 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9092        2, 4, 0,
9093        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9094
9095 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9096 if the decoding operation is trivial.
9097
9098 Optional fourth arg BUFFER non-nil means that the decoded text is
9099 inserted in that buffer after point (point does not move).  In this
9100 case, the return value is the length of the decoded text.
9101
9102 This function sets `last-coding-system-used' to the precise coding system
9103 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9104 not fully specified.)  */)
9105   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9106 {
       /* ENCODEP is 0 (decode); NOCOPY is turned into a C boolean;
          NORECORD is 0.  */
9107   return code_convert_string (string, coding_system, buffer,
9108                               0, ! NILP (nocopy), 0);
9109 }
9110
9111 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9112        2, 4, 0,
9113        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9114
9115 Optional third arg NOCOPY non-nil means it is OK to return STRING
9116 itself if the encoding operation is trivial.
9117
9118 Optional fourth arg BUFFER non-nil means that the encoded text is
9119 inserted in that buffer after point (point does not move).  In this
9120 case, the return value is the length of the encoded text.
9121
9122 This function sets `last-coding-system-used' to the precise coding system
9123 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9124 not fully specified.)  */)
9125   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9126 {
       /* ENCODEP is 1 (encode); NOCOPY is turned into a C boolean;
          NORECORD is 0.  */
9127   return code_convert_string (string, coding_system, buffer,
9128                               1, ! NILP (nocopy), 0);
9129 }
9130
9131 \f
9132 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9133        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9134 Return the corresponding character.  */)
9135   (Lisp_Object code)
9136 {
9137   Lisp_Object spec, attrs, val;
9138   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9139   EMACS_INT ch;
9140   int c;
9141
9142   CHECK_NATNUM (code);
9143   ch = XFASTINT (code);
9144   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9145   attrs = AREF (spec, 0);
9146
       /* An ASCII code maps to itself when the coding system is ASCII
          compatible.  */
9147   if (ASCII_BYTE_P (ch)
9148       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9149     return code;
9150
       /* The charset list holds, in order, the roman, kana and kanji
          charsets.  */
9151   val = CODING_ATTR_CHARSET_LIST (attrs);
9152   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9153   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9154   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9155
9156   if (ch <= 0x7F)
9157     {
9158       c = ch;
9159       charset = charset_roman;
9160     }
       /* Single-byte kana.  The upper bound is inclusive: 0xDF is the
          last halfwidth katakana byte in Shift_JIS and must not fall
          through to the two-byte branch, where it would be rejected.  */
9161   else if (ch >= 0xA0 && ch <= 0xDF)
9162     {
9163       c = ch - 0x80;
9164       charset = charset_kana;
9165     }
9166   else
9167     {
           /* Two-byte code: C1 is the lead byte, C2 the trail byte.  */
9168       EMACS_INT c1 = ch >> 8;
9169       int c2 = ch & 0xFF;
9170
9171       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9172           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9173         error ("Invalid code: %"pI"d", ch);
9174       c = ch;
9175       SJIS_TO_JIS (c);
9176       charset = charset_kanji;
9177     }
9178   c = DECODE_CHAR (charset, c);
9179   if (c < 0)
9180     error ("Invalid code: %"pI"d", ch);
9181   return make_number (c);
9182 }
9183
9184
9185 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9186        doc: /* Encode a Japanese character CH to shift_jis encoding.
9187 Return the corresponding code in SJIS.  */)
9188   (Lisp_Object ch)
9189 {
9190   Lisp_Object spec, attrs, charset_list;
9191   int c;
9192   struct charset *charset;
9193   unsigned code;
9194
9195   CHECK_CHARACTER (ch);
9196   c = XFASTINT (ch);
9197   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9198   attrs = AREF (spec, 0);
9199
       /* An ASCII character maps to itself when the coding system is
          ASCII compatible.  */
9200   if (ASCII_CHAR_P (c)
9201       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9202     return ch;
9203
9204   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
       /* NOTE(review): CHARSET is dereferenced and CODE is read right
          after this call; verify that char_charset always returns a
          non-null charset and always stores into CODE.  */
9205   charset = char_charset (c, charset_list, &code);
9206   if (code == CHARSET_INVALID_CODE (charset))
9207     error ("Can't encode by shift_jis encoding: %c", c);
       /* CODE is converted in place to its Shift_JIS form.  */
9208   JIS_TO_SJIS (code);
9209
9210   return make_number (code);
9211 }
9212
9213 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9214        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9215 Return the corresponding character.  */)
9216   (Lisp_Object code)
9217 {
9218   Lisp_Object spec, attrs, val;
9219   struct charset *charset_roman, *charset_big5, *charset;
9220   EMACS_INT ch;
9221   int c;
9222
9223   CHECK_NATNUM (code);
9224   ch = XFASTINT (code);
9225   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9226   attrs = AREF (spec, 0);
9227
       /* An ASCII code maps to itself when the coding system is ASCII
          compatible.  */
9228   if (ASCII_BYTE_P (ch)
9229       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9230     return code;
9231
       /* The charset list holds, in order, the roman and Big5 charsets.  */
9232   val = CODING_ATTR_CHARSET_LIST (attrs);
9233   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9234   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9235
9236   if (ch <= 0x7F)
9237     {
9238       c = ch;
9239       charset = charset_roman;
9240     }
9241   else
9242     {
           /* Two-byte code: B1 is the lead byte, B2 the trail byte.  B2
              must keep all eight bits; masking with 0x7F would fold the
              valid trail bytes 0xA1..0xFE into the low half and make the
              range test below reject 0xA1..0xBF.  */
9243       EMACS_INT b1 = ch >> 8;
9244       int b2 = ch & 0xFF;
9245       if (b1 < 0xA1 || b1 > 0xFE
9246           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9247         error ("Invalid code: %"pI"d", ch);
9248       c = ch;
9249       charset = charset_big5;
9250     }
9251   c = DECODE_CHAR (charset, c);
9252   if (c < 0)
9253     error ("Invalid code: %"pI"d", ch);
9254   return make_number (c);
9255 }
9256
9257 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9258        doc: /* Encode the Big5 character CH to BIG5 coding system.
9259 Return the corresponding character code in Big5.  */)
9260   (Lisp_Object ch)
9261 {
9262   Lisp_Object spec, attrs, charset_list;
9263   struct charset *charset;
9264   int c;
9265   unsigned code;
9266
9267   CHECK_CHARACTER (ch);
9268   c = XFASTINT (ch);
9269   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9270   attrs = AREF (spec, 0);
       /* An ASCII character maps to itself when the coding system is
          ASCII compatible.  */
9271   if (ASCII_CHAR_P (c)
9272       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9273     return ch;
9274
9275   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
       /* NOTE(review): CHARSET is dereferenced and CODE is read right
          after this call; verify that char_charset always returns a
          non-null charset and always stores into CODE.  */
9276   charset = char_charset (c, charset_list, &code);
9277   if (code == CHARSET_INVALID_CODE (charset))
9278     error ("Can't encode by Big5 encoding: %c", c);
9279
9280   return make_number (code);
9281 }
9282
9283 \f
9284 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9285        Sset_terminal_coding_system_internal, 1, 2, 0,
9286        doc: /* Internal use only.  */)
9287   (Lisp_Object coding_system, Lisp_Object terminal)
9288 {
       /* Set up the terminal-output coding system of TERMINAL from
          CODING_SYSTEM, then refresh the terminal's charset list.  */
9289   struct terminal *term = get_terminal (terminal, 1);
9290   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9291   CHECK_SYMBOL (coding_system);
9292   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9293   /* We had better not send unsafe characters to terminal.  */
9294   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9295   /* Character composition should be disabled.  */
9296   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9297   terminal_coding->src_multibyte = 1;
9298   terminal_coding->dst_multibyte = 0;
       /* When the coding system does not require encoding, the terminal
          charset list is just ASCII.  */
9299   if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9300     term->charset_list = coding_charset_list (terminal_coding);
9301   else
9302     term->charset_list = Fcons (make_number (charset_ascii), Qnil);
9303   return Qnil;
9304 }
9305
9306 DEFUN ("set-safe-terminal-coding-system-internal",
9307        Fset_safe_terminal_coding_system_internal,
9308        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9309        doc: /* Internal use only.  */)
9310   (Lisp_Object coding_system)
9311 {
       /* Set up the global safe_terminal_coding from CODING_SYSTEM.  */
9312   CHECK_SYMBOL (coding_system);
9313   setup_coding_system (Fcheck_coding_system (coding_system),
9314                        &safe_terminal_coding);
9315   /* Character composition should be disabled.  */
9316   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9317   safe_terminal_coding.src_multibyte = 1;
9318   safe_terminal_coding.dst_multibyte = 0;
9319   return Qnil;
9320 }
9321
9322 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9323        Sterminal_coding_system, 0, 1, 0,
9324        doc: /* Return coding system specified for terminal output on the given terminal.
9325 TERMINAL may be a terminal object, a frame, or nil for the selected
9326 frame's terminal device.  */)
9327   (Lisp_Object terminal)
9328 {
       /* Look up the name of the coding system currently installed for
          output on TERMINAL.  */
9329   struct coding_system *terminal_coding
9330     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9331   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9332
9333   /* For backward compatibility, return nil if it is `undecided'.  */
9334   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9335 }
9336
9337 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9338        Sset_keyboard_coding_system_internal, 1, 2, 0,
9339        doc: /* Internal use only.  */)
9340   (Lisp_Object coding_system, Lisp_Object terminal)
9341 {
       /* Set up the keyboard coding system of TERMINAL from
          CODING_SYSTEM; nil stands for no-conversion.  */
9342   struct terminal *t = get_terminal (terminal, 1);
9343   CHECK_SYMBOL (coding_system);
9344   if (NILP (coding_system))
9345     coding_system = Qno_conversion;
9346   else
9347     Fcheck_coding_system (coding_system);
9348   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9349   /* Character composition should be disabled.  */
9350   TERMINAL_KEYBOARD_CODING (t)->common_flags
9351     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9352   return Qnil;
9353 }
9354
9355 DEFUN ("keyboard-coding-system",
9356        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9357        doc: /* Return coding system specified for decoding keyboard input.  */)
9358   (Lisp_Object terminal)
9359 {
       /* The value is the name of the keyboard coding system of the
          terminal that get_terminal resolves TERMINAL to.  */
9360   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9361                          (get_terminal (terminal, 1))->id);
9362 }
9363
9364 \f
9365 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9366        Sfind_operation_coding_system,  1, MANY, 0,
9367        doc: /* Choose a coding system for an operation based on the target name.
9368 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9369 DECODING-SYSTEM is the coding system to use for decoding
9370 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9371 for encoding (in case OPERATION does encoding).
9372
9373 The first argument OPERATION specifies an I/O primitive:
9374   For file I/O, `insert-file-contents' or `write-region'.
9375   For process I/O, `call-process', `call-process-region', or `start-process'.
9376   For network I/O, `open-network-stream'.
9377
9378 The remaining arguments should be the same arguments that were passed
9379 to the primitive.  Depending on which primitive, one of those arguments
9380 is selected as the TARGET.  For example, if OPERATION does file I/O,
9381 whichever argument specifies the file name is TARGET.
9382
9383 TARGET has a meaning which depends on OPERATION:
9384   For file I/O, TARGET is a file name (except for the special case below).
9385   For process I/O, TARGET is a process name.
9386   For network I/O, TARGET is a service name or a port number.
9387
9388 This function looks up what is specified for TARGET in
9389 `file-coding-system-alist', `process-coding-system-alist',
9390 or `network-coding-system-alist' depending on OPERATION.
9391 They may specify a coding system, a cons of coding systems,
9392 or a function symbol to call.
9393 In the last case, we call the function with one argument,
9394 which is a list of all the arguments given to this function.
9395 If the function can't decide a coding system, it can return
9396 `undecided' so that the normal code-detection is performed.
9397
9398 If OPERATION is `insert-file-contents', the argument corresponding to
9399 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9400 file name to look up, and BUFFER is a buffer that contains the file's
9401 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9402 function to call for FILENAME, that function should examine the
9403 contents of BUFFER instead of reading the file.
9404
9405 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9406   (ptrdiff_t nargs, Lisp_Object *args)
9407 {
9408   Lisp_Object operation, target_idx, target, val;
9409   register Lisp_Object chain;
9410
       /* Although the DEFUN declares a minimum of one argument, at
          least OPERATION plus one more argument is required here.  */
9411   if (nargs < 2)
9412     error ("Too few arguments");
9413   operation = args[0];
       /* The `target-idx' property of OPERATION says which of the
          remaining arguments (0-based) is the TARGET.  */
9414   if (!SYMBOLP (operation)
9415       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9416     error ("Invalid first argument");
9417   if (nargs <= 1 + XFASTINT (target_idx))
9418     error ("Too few arguments for operation `%s'",
9419            SDATA (SYMBOL_NAME (operation)));
9420   target = args[XFASTINT (target_idx) + 1];
       /* TARGET must be a string, a (STRING . BUFFER) cons for
          `insert-file-contents', or an integer for
          `open-network-stream'.  */
9421   if (!(STRINGP (target)
9422         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9423             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9424         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9425     error ("Invalid argument %"pI"d of operation `%s'",
9426            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
       /* For the cons form only the file name is used for matching.  */
9427   if (CONSP (target))
9428     target = XCAR (target);
9429
       /* Pick the alist to search according to the kind of OPERATION.  */
9430   chain = ((EQ (operation, Qinsert_file_contents)
9431             || EQ (operation, Qwrite_region))
9432            ? Vfile_coding_system_alist
9433            : (EQ (operation, Qopen_network_stream)
9434               ? Vnetwork_coding_system_alist
9435               : Vprocess_coding_system_alist));
9436   if (NILP (chain))
9437     return Qnil;
9438
       /* The first alist element whose key matches TARGET decides the
          result: a string key is matched as a regexp against a string
          TARGET, an integer TARGET is compared with EQ.  */
9439   for (; CONSP (chain); chain = XCDR (chain))
9440     {
9441       Lisp_Object elt;
9442
9443       elt = XCAR (chain);
9444       if (CONSP (elt)
9445           && ((STRINGP (target)
9446                && STRINGP (XCAR (elt))
9447                && fast_string_match (XCAR (elt), target) >= 0)
9448               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9449         {
9450           val = XCDR (elt);
9451           /* Here, if VAL is both a valid coding system and a valid
9452              function symbol, we return VAL as a coding system.  */
9453           if (CONSP (val))
9454             return val;
9455           if (! SYMBOLP (val))
9456             return Qnil;
9457           if (! NILP (Fcoding_system_p (val)))
9458             return Fcons (val, val);
9459           if (! NILP (Ffboundp (val)))
9460             {
9461               /* We use call1 rather than safe_call1
9462                  so as to get bug reports about functions called here
9463                  which don't handle the current interface.  */
9464               val = call1 (val, Flist (nargs, args));
9465               if (CONSP (val))
9466                 return val;
9467               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9468                 return Fcons (val, val);
9469             }
9470           return Qnil;
9471         }
9472     }
9473   return Qnil;
9474 }
9475
9476 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9477        Sset_coding_system_priority, 0, MANY, 0,
9478        doc: /* Assign higher priority to the coding systems given as arguments.
9479 If multiple coding systems belong to the same category,
9480 all but the first one are ignored.
9481
9482 usage: (set-coding-system-priority &rest coding-systems)  */)
9483   (ptrdiff_t nargs, Lisp_Object *args)
9484 {
9485   ptrdiff_t i, j;
       /* CHANGED[CAT] is nonzero once category CAT has been given a new
          priority; PRIORITIES is the new ordering being built.  */
9486   int changed[coding_category_max];
9487   enum coding_category priorities[coding_category_max];
9488
9489   memset (changed, 0, sizeof changed);
9490
       /* First pass: the categories of the arguments, in argument order,
          take the top J slots.  */
9491   for (i = j = 0; i < nargs; i++)
9492     {
9493       enum coding_category category;
9494       Lisp_Object spec, attrs;
9495
9496       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9497       attrs = AREF (spec, 0);
9498       category = XINT (CODING_ATTR_CATEGORY (attrs));
9499       if (changed[category])
9500         /* Ignore this coding system because a coding system of the
9501            same category already had a higher priority.  */
9502         continue;
9503       changed[category] = 1;
9504       priorities[j++] = category;
           /* Re-setup the category's coding system only when it is
              already set up and differs from the argument.  */
9505       if (coding_categories[category].id >= 0
9506           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9507         setup_coding_system (args[i], &coding_categories[category]);
9508       Fset (AREF (Vcoding_category_table, category), args[i]);
9509     }
9510
9511   /* Now we have decided top J priorities.  Reflect the order of the
9512      original priorities to the remaining priorities.  */
9513
       /* I indexes the new ordering, J walks the old one, skipping the
          categories that were already placed.  */
9514   for (i = j, j = 0; i < coding_category_max; i++, j++)
9515     {
9516       while (j < coding_category_max
9517              && changed[coding_priorities[j]])
9518         j++;
9519       if (j == coding_category_max)
9520         abort ();
9521       priorities[i] = coding_priorities[j];
9522     }
9523
9524   memcpy (coding_priorities, priorities, sizeof priorities);
9525
9526   /* Update `coding-category-list'.  */
       /* Consing from the last priority down yields a list ordered from
          highest to lowest priority.  */
9527   Vcoding_category_list = Qnil;
9528   for (i = coding_category_max; i-- > 0; )
9529     Vcoding_category_list
9530       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9531                Vcoding_category_list);
9532
9533   return Qnil;
9534 }
9535
9536 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9537        Scoding_system_priority_list, 0, 1, 0,
9538        doc: /* Return a list of coding systems ordered by their priorities.
9539 The list contains a subset of coding systems; i.e. coding systems
9540 assigned to each coding category (see `coding-category-list').
9541
9542 HIGHESTP non-nil means just return the highest priority one.  */)
9543   (Lisp_Object highestp)
9544 {
9545   int i;
9546   Lisp_Object val;
9547
       /* Walk the categories from highest to lowest priority.  */
9548   for (i = 0, val = Qnil; i < coding_category_max; i++)
9549     {
9550       enum coding_category category = coding_priorities[i];
9551       int id = coding_categories[category].id;
9552       Lisp_Object attrs;
9553
           /* Skip a category whose coding system id is negative.  */
9554       if (id < 0)
9555         continue;
9556       attrs = CODING_ID_ATTRS (id);
           /* With HIGHESTP, the first usable category is the answer.  */
9557       if (! NILP (highestp))
9558         return CODING_ATTR_BASE_NAME (attrs);
9559       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9560     }
       /* VAL was built in reverse; put it back in priority order.  */
9561   return Fnreverse (val);
9562 }
9563
9564 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9565
9566 static Lisp_Object
9567 make_subsidiaries (Lisp_Object base)
9568 {
       /* Return a 3-element vector of symbols whose names are the name
          of BASE followed by each entry of `suffixes', in that order.  */
9569   Lisp_Object subsidiaries;
9570   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
       /* 6 extra bytes hold the longest suffix ("-unix", 5 bytes) plus
          the terminating NUL.  */
9571   char *buf = (char *) alloca (base_name_len + 6);
9572   int i;
9573
9574   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9575   subsidiaries = Fmake_vector (make_number (3), Qnil);
9576   for (i = 0; i < 3; i++)
9577     {
           /* Overwrite the previous suffix in place, then intern.  */
9578       strcpy (buf + base_name_len, suffixes[i]);
9579       ASET (subsidiaries, i, intern (buf));
9580     }
9581   return subsidiaries;
9582 }
9583
9584
/* Lisp primitive `define-coding-system-internal'.

   Build the attribute vector of a new coding system from ARGS (indexed
   by the coding_arg_* constants), validating each argument on the way,
   compute the attributes specific to the coding type together with the
   coding category, and register the coding system in
   Vcoding_system_hash_table, Vcoding_system_list and
   Vcoding_system_alist.  When the eol-type argument is nil, the three
   subsidiaries produced by make_subsidiaries are registered as well.

   Signal `wrong-number-of-arguments' when NARGS is smaller than the
   count required for the given coding type.  Return nil otherwise.  */

DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
       Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
       doc: /* For internal use only.
usage: (define-coding-system-internal ...)  */)
  (ptrdiff_t nargs, Lisp_Object *args)
{
  Lisp_Object name;
  Lisp_Object spec_vec;         /* [ ATTRS ALIASES EOL_TYPE ] */
  Lisp_Object attrs;            /* Vector of attributes.  */
  Lisp_Object eol_type;
  Lisp_Object aliases;
  Lisp_Object coding_type, charset_list, safe_charsets;
  enum coding_category category;
  Lisp_Object tail, val;
  int max_charset_id = 0;
  int i;

  /* The arguments common to all coding types must all be present.  */
  if (nargs < coding_arg_max)
    goto short_args;

  attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);

  /* Base name: must be a symbol.  */
  name = args[coding_arg_name];
  CHECK_SYMBOL (name);
  CODING_ATTR_BASE_NAME (attrs) = name;

  /* Mnemonic: a string or a character.  */
  val = args[coding_arg_mnemonic];
  if (! STRINGP (val))
    CHECK_CHARACTER (val);
  CODING_ATTR_MNEMONIC (attrs) = val;

  coding_type = args[coding_arg_coding_type];
  CHECK_SYMBOL (coding_type);
  CODING_ATTR_TYPE (attrs) = coding_type;

  /* The charset list is either a symbol or a list of charsets.  The
     symbols `iso-2022' and `emacs-mule' stand for
     Viso_2022_charset_list and Vemacs_mule_charset_list respectively,
     and are accepted only when they match CODING_TYPE.  */
  charset_list = args[coding_arg_charset_list];
  if (SYMBOLP (charset_list))
    {
      if (EQ (charset_list, Qiso_2022))
        {
          if (! EQ (coding_type, Qiso_2022))
            error ("Invalid charset-list");
          charset_list = Viso_2022_charset_list;
        }
      else if (EQ (charset_list, Qemacs_mule))
        {
          if (! EQ (coding_type, Qemacs_mule))
            error ("Invalid charset-list");
          charset_list = Vemacs_mule_charset_list;
        }
      /* Here the elements must already be charset ids in the range
         0..INT_MAX-1; remember the largest one.  */
      for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
        {
          if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
            error ("Invalid charset-list");
          if (max_charset_id < XFASTINT (XCAR (tail)))
            max_charset_id = XFASTINT (XCAR (tail));
        }
    }
  else
    {
      /* Work on a copy, because each charset is replaced in place by
         its id below.  */
      charset_list = Fcopy_sequence (charset_list);
      for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
        {
          struct charset *charset;

          val = XCAR (tail);
          CHECK_CHARSET_GET_CHARSET (val, charset);
          /* An iso-2022 coding system needs charsets with a
             non-negative ISO final value, and an emacs-mule one needs
             charsets with a non-negative emacs-mule id.  */
          if (EQ (coding_type, Qiso_2022)
              ? CHARSET_ISO_FINAL (charset) < 0
              : EQ (coding_type, Qemacs_mule)
              ? CHARSET_EMACS_MULE_ID (charset) < 0
              : 0)
            error ("Can't handle charset `%s'",
                   SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

          XSETCAR (tail, make_number (charset->id));
          if (max_charset_id < charset->id)
            max_charset_id = charset->id;
        }
    }
  CODING_ATTR_CHARSET_LIST (attrs) = charset_list;

  /* SAFE_CHARSETS is a byte string indexed by charset id: 0 for each
     charset in CHARSET_LIST, 255 for every other id up to
     MAX_CHARSET_ID.  */
  safe_charsets = make_uninit_string (max_charset_id + 1);
  memset (SDATA (safe_charsets), 255, max_charset_id + 1);
  for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
    SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
  CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;

  /* Initial value; the type-specific code below may override it.  */
  CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];

  /* Translation tables: a char-table, a cons, or a symbol.  */
  val = args[coding_arg_decode_translation_table];
  if (! CHAR_TABLE_P (val) && ! CONSP (val))
    CHECK_SYMBOL (val);
  CODING_ATTR_DECODE_TBL (attrs) = val;

  val = args[coding_arg_encode_translation_table];
  if (! CHAR_TABLE_P (val) && ! CONSP (val))
    CHECK_SYMBOL (val);
  CODING_ATTR_ENCODE_TBL (attrs) = val;

  /* Conversion functions are given as symbols.  */
  val = args[coding_arg_post_read_conversion];
  CHECK_SYMBOL (val);
  CODING_ATTR_POST_READ (attrs) = val;

  val = args[coding_arg_pre_write_conversion];
  CHECK_SYMBOL (val);
  CODING_ATTR_PRE_WRITE (attrs) = val;

  /* A nil default character means the space character.  */
  val = args[coding_arg_default_char];
  if (NILP (val))
    CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
  else
    {
      CHECK_CHARACTER (val);
      CODING_ATTR_DEFAULT_CHAR (attrs) = val;
    }

  /* Normalize the for-unibyte flag to nil or t.  */
  val = args[coding_arg_for_unibyte];
  CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;

  val = args[coding_arg_plist];
  CHECK_LIST (val);
  CODING_ATTR_PLIST (attrs) = val;

  /* Type-specific attributes.  Every branch that does not signal an
     error sets CATEGORY.  */
  if (EQ (coding_type, Qcharset))
    {
      /* Generate a lisp vector of 256 elements.  Each element is nil,
         integer, or a list of charset IDs.

         If Nth element is nil, the byte code N is invalid in this
         coding system.

         If Nth element is a number NUM, N is the first byte of a
         charset whose ID is NUM.

         If Nth element is a list of charset IDs, N is the first byte
         of one of them.  The list is sorted by dimensions of the
         charsets.  A charset of smaller dimension comes first. */
      val = Fmake_vector (make_number (256), Qnil);

      for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
        {
          struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
          int dim = CHARSET_DIMENSION (charset);
          int idx = (dim - 1) * 4;

          /* One ASCII-compatible charset is enough to mark the coding
             system ASCII compatible.  */
          if (CHARSET_ASCII_COMPATIBLE_P (charset))
            CODING_ATTR_ASCII_COMPAT (attrs) = Qt;

          /* Walk the byte range code_space[IDX] .. code_space[IDX + 1].  */
          for (i = charset->code_space[idx];
               i <= charset->code_space[idx + 1]; i++)
            {
              Lisp_Object tmp, tmp2;
              int dim2;

              tmp = AREF (val, i);
              if (NILP (tmp))
                tmp = XCAR (tail);
              else if (NUMBERP (tmp))
                {
                  /* A second charset claims this byte: turn the entry
                     into a two-element list, smaller dimension first.  */
                  dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
                  if (dim < dim2)
                    tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
                  else
                    tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
                }
              else
                {
                  /* Find the first element of larger dimension and
                     insert the new id before it, or append when there
                     is none.  */
                  for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
                    {
                      dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
                      if (dim < dim2)
                        break;
                    }
                  if (NILP (tmp2))
                    tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
                  else
                    {
                      /* Insert in place: push the old car down one
                         cell, then overwrite the car.  */
                      XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
                      XSETCAR (tmp2, XCAR (tail));
                    }
                }
              ASET (val, i, tmp);
            }
        }
      ASET (attrs, coding_attr_charset_valids, val);
      category = coding_category_charset;
    }
  else if (EQ (coding_type, Qccl))
    {
      Lisp_Object valids;

      if (nargs < coding_arg_ccl_max)
        goto short_args;

      /* CCL programs given as vectors are copied before being stored.  */
      val = args[coding_arg_ccl_decoder];
      CHECK_CCL_PROGRAM (val);
      if (VECTORP (val))
        val = Fcopy_sequence (val);
      ASET (attrs, coding_attr_ccl_decoder, val);

      val = args[coding_arg_ccl_encoder];
      CHECK_CCL_PROGRAM (val);
      if (VECTORP (val))
        val = Fcopy_sequence (val);
      ASET (attrs, coding_attr_ccl_encoder, val);

      /* VALIDS is a 256-byte string; byte N is set to 1 when code N is
         listed in the valids argument, either as a single integer or
         inside a (FROM . TO) range.  All values must lie in 0..255.  */
      val = args[coding_arg_ccl_valids];
      valids = Fmake_string (make_number (256), make_number (0));
      for (tail = val; !NILP (tail); tail = Fcdr (tail))
        {
          int from, to;

          val = Fcar (tail);
          if (INTEGERP (val))
            {
              if (! (0 <= XINT (val) && XINT (val) <= 255))
                args_out_of_range_3 (val, make_number (0), make_number (255));
              from = to = XINT (val);
            }
          else
            {
              CHECK_CONS (val);
              CHECK_NATNUM_CAR (val);
              CHECK_NUMBER_CDR (val);
              if (XINT (XCAR (val)) > 255)
                args_out_of_range_3 (XCAR (val),
                                     make_number (0), make_number (255));
              from = XINT (XCAR (val));
              if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
                args_out_of_range_3 (XCDR (val),
                                     XCAR (val), make_number (255));
              to = XINT (XCDR (val));
            }
          for (i = from; i <= to; i++)
            SSET (valids, i, 1);
        }
      ASET (attrs, coding_attr_ccl_valids, valids);

      category = coding_category_ccl;
    }
  else if (EQ (coding_type, Qutf_16))
    {
      Lisp_Object bom, endian;

      CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;

      if (nargs < coding_arg_utf16_max)
        goto short_args;

      /* BOM must be nil, t, or a cons of two coding systems.  */
      bom = args[coding_arg_utf16_bom];
      if (! NILP (bom) && ! EQ (bom, Qt))
        {
          CHECK_CONS (bom);
          val = XCAR (bom);
          CHECK_CODING_SYSTEM (val);
          val = XCDR (bom);
          CHECK_CODING_SYSTEM (val);
        }
      ASET (attrs, coding_attr_utf_bom, bom);

      /* ENDIAN must be `big' or `little'; nil defaults to `big'.  */
      endian = args[coding_arg_utf16_endian];
      CHECK_SYMBOL (endian);
      if (NILP (endian))
        endian = Qbig;
      else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
        error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
      ASET (attrs, coding_attr_utf_16_endian, endian);

      /* A cons BOM selects the auto category, a nil BOM the nosig
         category of the given endianness, and t the plain be/le
         category.  */
      category = (CONSP (bom)
                  ? coding_category_utf_16_auto
                  : NILP (bom)
                  ? (EQ (endian, Qbig)
                     ? coding_category_utf_16_be_nosig
                     : coding_category_utf_16_le_nosig)
                  : (EQ (endian, Qbig)
                     ? coding_category_utf_16_be
                     : coding_category_utf_16_le));
    }
  else if (EQ (coding_type, Qiso_2022))
    {
      Lisp_Object initial, reg_usage, request, flags;

      if (nargs < coding_arg_iso2022_max)
        goto short_args;

      /* Replace each of the first four elements of the copied INITIAL
         vector by a charset id, or by -1 when the element is nil.  */
      initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
      CHECK_VECTOR (initial);
      for (i = 0; i < 4; i++)
        {
          val = Faref (initial, make_number (i));
          if (! NILP (val))
            {
              struct charset *charset;

              CHECK_CHARSET_GET_CHARSET (val, charset);
              ASET (initial, i, make_number (CHARSET_ID (charset)));
              if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
                CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
            }
          else
            ASET (initial, i, make_number (-1));
        }

      /* REG_USAGE is a cons of two integers.  */
      reg_usage = args[coding_arg_iso2022_reg_usage];
      CHECK_CONS (reg_usage);
      CHECK_NUMBER_CAR (reg_usage);
      CHECK_NUMBER_CDR (reg_usage);

      /* Each element of REQUEST is (CHARSET . REGISTER) with REGISTER
         in 0..3; CHARSET is replaced by its id.
         NOTE(review): Fcopy_sequence presumably copies only the list
         spine, so the XSETCAR below would also modify the conses
         passed by the caller -- confirm that this is intended.  */
      request = Fcopy_sequence (args[coding_arg_iso2022_request]);
      for (tail = request; ! NILP (tail); tail = Fcdr (tail))
        {
          int id;
          Lisp_Object tmp1;

          val = Fcar (tail);
          CHECK_CONS (val);
          tmp1 = XCAR (val);
          CHECK_CHARSET_GET_ID (tmp1, id);
          CHECK_NATNUM_CDR (val);
          if (XINT (XCDR (val)) >= 4)
            error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
          XSETCAR (val, make_number (id));
        }

      /* Keep the flag bits that fit in an int.  Passing the symbol
         `iso-2022' as the charset list adds the full-support flag.  */
      flags = args[coding_arg_iso2022_flags];
      CHECK_NATNUM (flags);
      i = XINT (flags) & INT_MAX;
      if (EQ (args[coding_arg_charset_list], Qiso_2022))
        i |= CODING_ISO_FLAG_FULL_SUPPORT;
      flags = make_number (i);

      ASET (attrs, coding_attr_iso_initial, initial);
      ASET (attrs, coding_attr_iso_usage, reg_usage);
      ASET (attrs, coding_attr_iso_request, request);
      ASET (attrs, coding_attr_iso_flags, flags);
      setup_iso_safe_charsets (attrs);

      /* Choose the category from the flags, from whether the full
         `iso-2022' charset list was requested, and (for 8-bit
         variants) from the dimension of the charset in INITIAL[1].  */
      if (i & CODING_ISO_FLAG_SEVEN_BITS)
        category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
                          | CODING_ISO_FLAG_SINGLE_SHIFT))
                    ? coding_category_iso_7_else
                    : EQ (args[coding_arg_charset_list], Qiso_2022)
                    ? coding_category_iso_7
                    : coding_category_iso_7_tight);
      else
        {
          int id = XINT (AREF (initial, 1));

          category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
                       || EQ (args[coding_arg_charset_list], Qiso_2022)
                       || id < 0)
                      ? coding_category_iso_8_else
                      : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
                      ? coding_category_iso_8_1
                      : coding_category_iso_8_2);
        }
      /* Only the iso-8-1 and iso-8-2 categories keep the
         ASCII-compatible value computed so far.  */
      if (category != coding_category_iso_8_1
          && category != coding_category_iso_8_2)
        CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
    }
  else if (EQ (coding_type, Qemacs_mule))
    {
      /* Passing the symbol `emacs-mule' as the charset list marks the
         coding system as the full emacs-mule one.  */
      if (EQ (args[coding_arg_charset_list], Qemacs_mule))
        ASET (attrs, coding_attr_emacs_mule_full, Qt);
      CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
      category = coding_category_emacs_mule;
    }
  else if (EQ (coding_type, Qshift_jis))
    {

      struct charset *charset;

      /* Exactly three or four charsets are required: the first two of
         dimension one, the third (and the optional fourth) of
         dimension two.  CHARSET_LIST is advanced while checking; the
         attribute was already stored above.  */
      if (XINT (Flength (charset_list)) != 3
          && XINT (Flength (charset_list)) != 4)
        error ("There should be three or four charsets");

      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 1)
        error ("Dimension of charset %s is not one",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
      if (CHARSET_ASCII_COMPATIBLE_P (charset))
        CODING_ATTR_ASCII_COMPAT (attrs) = Qt;

      charset_list = XCDR (charset_list);
      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 1)
        error ("Dimension of charset %s is not one",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

      charset_list = XCDR (charset_list);
      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 2)
        error ("Dimension of charset %s is not two",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

      charset_list = XCDR (charset_list);
      if (! NILP (charset_list))
        {
          charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
          if (CHARSET_DIMENSION (charset) != 2)
            error ("Dimension of charset %s is not two",
                   SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
        }

      category = coding_category_sjis;
      /* The most recently defined shift-jis coding system is recorded
         in Vsjis_coding_system.  */
      Vsjis_coding_system = name;
    }
  else if (EQ (coding_type, Qbig5))
    {
      struct charset *charset;

      /* Exactly two charsets are required: the first of dimension
         one, the second of dimension two.  */
      if (XINT (Flength (charset_list)) != 2)
        error ("There should be just two charsets");

      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 1)
        error ("Dimension of charset %s is not one",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
      if (CHARSET_ASCII_COMPATIBLE_P (charset))
        CODING_ATTR_ASCII_COMPAT (attrs) = Qt;

      charset_list = XCDR (charset_list);
      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 2)
        error ("Dimension of charset %s is not two",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

      category = coding_category_big5;
      /* The most recently defined big5 coding system is recorded in
         Vbig5_coding_system.  */
      Vbig5_coding_system = name;
    }
  else if (EQ (coding_type, Qraw_text))
    {
      category = coding_category_raw_text;
      CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
    }
  else if (EQ (coding_type, Qutf_8))
    {
      Lisp_Object bom;

      if (nargs < coding_arg_utf8_max)
        goto short_args;

      /* BOM must be nil, t, or a cons of two coding systems.  */
      bom = args[coding_arg_utf8_bom];
      if (! NILP (bom) && ! EQ (bom, Qt))
        {
          CHECK_CONS (bom);
          val = XCAR (bom);
          CHECK_CODING_SYSTEM (val);
          val = XCDR (bom);
          CHECK_CODING_SYSTEM (val);
        }
      ASET (attrs, coding_attr_utf_bom, bom);
      /* A nil BOM forces the ASCII-compatible attribute to t.  */
      if (NILP (bom))
        CODING_ATTR_ASCII_COMPAT (attrs) = Qt;

      category = (CONSP (bom) ? coding_category_utf_8_auto
                  : NILP (bom) ? coding_category_utf_8_nosig
                  : coding_category_utf_8_sig);
    }
  else if (EQ (coding_type, Qundecided))
    category = coding_category_undecided;
  else
    error ("Invalid coding system type: %s",
           SDATA (SYMBOL_NAME (coding_type)));

  /* Record the category both as an attribute and, together with the
     final ASCII-compatible value, at the front of the plist.  */
  CODING_ATTR_CATEGORY (attrs) = make_number (category);
  CODING_ATTR_PLIST (attrs)
    = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
                                CODING_ATTR_PLIST (attrs)));
  CODING_ATTR_PLIST (attrs)
    = Fcons (QCascii_compatible_p,
             Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
                    CODING_ATTR_PLIST (attrs)));

  /* The eol-type argument must be nil, `unix', `dos' or `mac'.  */
  eol_type = args[coding_arg_eol_type];
  if (! NILP (eol_type)
      && ! EQ (eol_type, Qunix)
      && ! EQ (eol_type, Qdos)
      && ! EQ (eol_type, Qmac))
    error ("Invalid eol-type");

  aliases = Fcons (name, Qnil);

  if (NILP (eol_type))
    {
      /* No fixed eol-type: EOL_TYPE becomes the vector of the three
         subsidiary names, and each subsidiary is registered with the
         same ATTRS and its own fixed eol-type.  */
      eol_type = make_subsidiaries (name);
      for (i = 0; i < 3; i++)
        {
          Lisp_Object this_spec, this_name, this_aliases, this_eol_type;

          this_name = AREF (eol_type, i);
          this_aliases = Fcons (this_name, Qnil);
          this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
          /* All three slots start as ATTRS; slots 1 and 2 are
             overwritten right away.  */
          this_spec = Fmake_vector (make_number (3), attrs);
          ASET (this_spec, 1, this_aliases);
          ASET (this_spec, 2, this_eol_type);
          Fputhash (this_name, this_spec, Vcoding_system_hash_table);
          Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
          /* Add the name to the alist only if it is not there yet.  */
          val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
          if (NILP (val))
            Vcoding_system_alist
              = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
                       Vcoding_system_alist);
        }
    }

  /* Register NAME itself with the spec [ ATTRS ALIASES EOL_TYPE ].  */
  spec_vec = Fmake_vector (make_number (3), attrs);
  ASET (spec_vec, 1, aliases);
  ASET (spec_vec, 2, eol_type);

  Fputhash (name, spec_vec, Vcoding_system_hash_table);
  Vcoding_system_list = Fcons (name, Vcoding_system_list);
  val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
  if (NILP (val))
    Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
                                  Vcoding_system_alist);

  {
    int id = coding_categories[category].id;

    /* Set up the per-category coding system from NAME when the
       category has none yet (negative id) or when its current one is
       already named NAME.  */
    if (id < 0 || EQ (name, CODING_ID_NAME (id)))
      setup_coding_system (name, &coding_categories[category]);
  }

  return Qnil;

  /* Reached when NARGS is too small for the requested coding type.  */
 short_args:
  return Fsignal (Qwrong_number_of_arguments,
                  Fcons (intern ("define-coding-system-internal"),
                         make_number (nargs)));
}
10117
10118
10119 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10120        3, 3, 0,
10121        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10122   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10123 {
10124   Lisp_Object spec, attrs;
10125
10126   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10127   attrs = AREF (spec, 0);
10128   if (EQ (prop, QCmnemonic))
10129     {
10130       if (! STRINGP (val))
10131         CHECK_CHARACTER (val);
10132       CODING_ATTR_MNEMONIC (attrs) = val;
10133     }
10134   else if (EQ (prop, QCdefault_char))
10135     {
10136       if (NILP (val))
10137         val = make_number (' ');
10138       else
10139         CHECK_CHARACTER (val);
10140       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10141     }
10142   else if (EQ (prop, QCdecode_translation_table))
10143     {
10144       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10145         CHECK_SYMBOL (val);
10146       CODING_ATTR_DECODE_TBL (attrs) = val;
10147     }
10148   else if (EQ (prop, QCencode_translation_table))
10149     {
10150       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10151         CHECK_SYMBOL (val);
10152       CODING_ATTR_ENCODE_TBL (attrs) = val;
10153     }
10154   else if (EQ (prop, QCpost_read_conversion))
10155     {
10156       CHECK_SYMBOL (val);
10157       CODING_ATTR_POST_READ (attrs) = val;
10158     }
10159   else if (EQ (prop, QCpre_write_conversion))
10160     {
10161       CHECK_SYMBOL (val);
10162       CODING_ATTR_PRE_WRITE (attrs) = val;
10163     }
10164   else if (EQ (prop, QCascii_compatible_p))
10165     {
10166       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10167     }
10168
10169   CODING_ATTR_PLIST (attrs)
10170     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10171   return val;
10172 }
10173
10174
10175 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10176        Sdefine_coding_system_alias, 2, 2, 0,
10177        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10178   (Lisp_Object alias, Lisp_Object coding_system)
10179 {
10180   Lisp_Object spec, aliases, eol_type, val;
10181
10182   CHECK_SYMBOL (alias);
10183   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10184   aliases = AREF (spec, 1);
10185   /* ALIASES should be a list of length more than zero, and the first
10186      element is a base coding system.  Append ALIAS at the tail of the
10187      list.  */
10188   while (!NILP (XCDR (aliases)))
10189     aliases = XCDR (aliases);
10190   XSETCDR (aliases, Fcons (alias, Qnil));
10191
10192   eol_type = AREF (spec, 2);
10193   if (VECTORP (eol_type))
10194     {
10195       Lisp_Object subsidiaries;
10196       int i;
10197
10198       subsidiaries = make_subsidiaries (alias);
10199       for (i = 0; i < 3; i++)
10200         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10201                                      AREF (eol_type, i));
10202     }
10203
10204   Fputhash (alias, spec, Vcoding_system_hash_table);
10205   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10206   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10207   if (NILP (val))
10208     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10209                                   Vcoding_system_alist);
10210
10211   return Qnil;
10212 }
10213
10214 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10215        1, 1, 0,
10216        doc: /* Return the base of CODING-SYSTEM.
10217 Any alias or subsidiary coding system is not a base coding system.  */)
10218   (Lisp_Object coding_system)
10219 {
10220   Lisp_Object spec, attrs;
10221
10222   if (NILP (coding_system))
10223     return (Qno_conversion);
10224   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10225   attrs = AREF (spec, 0);
10226   return CODING_ATTR_BASE_NAME (attrs);
10227 }
10228
10229 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10230        1, 1, 0,
10231        doc: "Return the property list of CODING-SYSTEM.")
10232   (Lisp_Object coding_system)
10233 {
10234   Lisp_Object spec, attrs;
10235
10236   if (NILP (coding_system))
10237     coding_system = Qno_conversion;
10238   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10239   attrs = AREF (spec, 0);
10240   return CODING_ATTR_PLIST (attrs);
10241 }
10242
10243
10244 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10245        1, 1, 0,
10246        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10247   (Lisp_Object coding_system)
10248 {
10249   Lisp_Object spec;
10250
10251   if (NILP (coding_system))
10252     coding_system = Qno_conversion;
10253   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10254   return AREF (spec, 1);
10255 }
10256
10257 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10258        Scoding_system_eol_type, 1, 1, 0,
10259        doc: /* Return eol-type of CODING-SYSTEM.
10260 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10261
10262 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10263 and CR respectively.
10264
10265 A vector value indicates that a format of end-of-line should be
10266 detected automatically.  Nth element of the vector is the subsidiary
10267 coding system whose eol-type is N.  */)
10268   (Lisp_Object coding_system)
10269 {
10270   Lisp_Object spec, eol_type;
10271   int n;
10272
10273   if (NILP (coding_system))
10274     coding_system = Qno_conversion;
10275   if (! CODING_SYSTEM_P (coding_system))
10276     return Qnil;
10277   spec = CODING_SYSTEM_SPEC (coding_system);
10278   eol_type = AREF (spec, 2);
10279   if (VECTORP (eol_type))
10280     return Fcopy_sequence (eol_type);
10281   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10282   return make_number (n);
10283 }
10284
10285 #endif /* emacs */
10286
10287 \f
/*** 12. Postamble ***/
10289
10290 void
10291 init_coding_once (void)
10292 {
10293   int i;
10294
10295   for (i = 0; i < coding_category_max; i++)
10296     {
10297       coding_categories[i].id = -1;
10298       coding_priorities[i] = i;
10299     }
10300
10301   /* ISO2022 specific initialize routine.  */
10302   for (i = 0; i < 0x20; i++)
10303     iso_code_class[i] = ISO_control_0;
10304   for (i = 0x21; i < 0x7F; i++)
10305     iso_code_class[i] = ISO_graphic_plane_0;
10306   for (i = 0x80; i < 0xA0; i++)
10307     iso_code_class[i] = ISO_control_1;
10308   for (i = 0xA1; i < 0xFF; i++)
10309     iso_code_class[i] = ISO_graphic_plane_1;
10310   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10311   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10312   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10313   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10314   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10315   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10316   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10317   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10318   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10319
10320   for (i = 0; i < 256; i++)
10321     {
10322       emacs_mule_bytes[i] = 1;
10323     }
10324   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10325   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10326   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10327   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10328 }
10329
10330 #ifdef emacs
10331
10332 void
10333 syms_of_coding (void)
10334 {
10335   staticpro (&Vcoding_system_hash_table);
10336   {
10337     Lisp_Object args[2];
10338     args[0] = QCtest;
10339     args[1] = Qeq;
10340     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10341   }
10342
10343   staticpro (&Vsjis_coding_system);
10344   Vsjis_coding_system = Qnil;
10345
10346   staticpro (&Vbig5_coding_system);
10347   Vbig5_coding_system = Qnil;
10348
10349   staticpro (&Vcode_conversion_reused_workbuf);
10350   Vcode_conversion_reused_workbuf = Qnil;
10351
10352   staticpro (&Vcode_conversion_workbuf_name);
10353   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10354
10355   reused_workbuf_in_use = 0;
10356
10357   DEFSYM (Qcharset, "charset");
10358   DEFSYM (Qtarget_idx, "target-idx");
10359   DEFSYM (Qcoding_system_history, "coding-system-history");
10360   Fset (Qcoding_system_history, Qnil);
10361
10362   /* Target FILENAME is the first argument.  */
10363   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10364   /* Target FILENAME is the third argument.  */
10365   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10366
10367   DEFSYM (Qcall_process, "call-process");
10368   /* Target PROGRAM is the first argument.  */
10369   Fput (Qcall_process, Qtarget_idx, make_number (0));
10370
10371   DEFSYM (Qcall_process_region, "call-process-region");
10372   /* Target PROGRAM is the third argument.  */
10373   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10374
10375   DEFSYM (Qstart_process, "start-process");
10376   /* Target PROGRAM is the third argument.  */
10377   Fput (Qstart_process, Qtarget_idx, make_number (2));
10378
10379   DEFSYM (Qopen_network_stream, "open-network-stream");
10380   /* Target SERVICE is the fourth argument.  */
10381   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10382
10383   DEFSYM (Qcoding_system, "coding-system");
10384   DEFSYM (Qcoding_aliases, "coding-aliases");
10385
10386   DEFSYM (Qeol_type, "eol-type");
10387   DEFSYM (Qunix, "unix");
10388   DEFSYM (Qdos, "dos");
10389
10390   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10391   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10392   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10393   DEFSYM (Qdefault_char, "default-char");
10394   DEFSYM (Qundecided, "undecided");
10395   DEFSYM (Qno_conversion, "no-conversion");
10396   DEFSYM (Qraw_text, "raw-text");
10397
10398   DEFSYM (Qiso_2022, "iso-2022");
10399
10400   DEFSYM (Qutf_8, "utf-8");
10401   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10402
10403   DEFSYM (Qutf_16, "utf-16");
10404   DEFSYM (Qbig, "big");
10405   DEFSYM (Qlittle, "little");
10406
10407   DEFSYM (Qshift_jis, "shift-jis");
10408   DEFSYM (Qbig5, "big5");
10409
10410   DEFSYM (Qcoding_system_p, "coding-system-p");
10411
10412   DEFSYM (Qcoding_system_error, "coding-system-error");
10413   Fput (Qcoding_system_error, Qerror_conditions,
10414         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10415   Fput (Qcoding_system_error, Qerror_message,
10416         make_pure_c_string ("Invalid coding system"));
10417
10418   /* Intern this now in case it isn't already done.
10419      Setting this variable twice is harmless.
10420      But don't staticpro it here--that is done in alloc.c.  */
10421   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10422
10423   DEFSYM (Qtranslation_table, "translation-table");
10424   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10425   DEFSYM (Qtranslation_table_id, "translation-table-id");
10426   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10427   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10428
10429   DEFSYM (Qvalid_codes, "valid-codes");
10430
10431   DEFSYM (Qemacs_mule, "emacs-mule");
10432
10433   DEFSYM (QCcategory, ":category");
10434   DEFSYM (QCmnemonic, ":mnemonic");
10435   DEFSYM (QCdefault_char, ":default-char");
10436   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10437   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10438   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10439   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10440   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10441
10442   Vcoding_category_table
10443     = Fmake_vector (make_number (coding_category_max), Qnil);
10444   staticpro (&Vcoding_category_table);
  /* The following are targets of code detection.  */
10446   ASET (Vcoding_category_table, coding_category_iso_7,
10447         intern_c_string ("coding-category-iso-7"));
10448   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10449         intern_c_string ("coding-category-iso-7-tight"));
10450   ASET (Vcoding_category_table, coding_category_iso_8_1,
10451         intern_c_string ("coding-category-iso-8-1"));
10452   ASET (Vcoding_category_table, coding_category_iso_8_2,
10453         intern_c_string ("coding-category-iso-8-2"));
10454   ASET (Vcoding_category_table, coding_category_iso_7_else,
10455         intern_c_string ("coding-category-iso-7-else"));
10456   ASET (Vcoding_category_table, coding_category_iso_8_else,
10457         intern_c_string ("coding-category-iso-8-else"));
10458   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10459         intern_c_string ("coding-category-utf-8-auto"));
10460   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10461         intern_c_string ("coding-category-utf-8"));
10462   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10463         intern_c_string ("coding-category-utf-8-sig"));
10464   ASET (Vcoding_category_table, coding_category_utf_16_be,
10465         intern_c_string ("coding-category-utf-16-be"));
10466   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10467         intern_c_string ("coding-category-utf-16-auto"));
10468   ASET (Vcoding_category_table, coding_category_utf_16_le,
10469         intern_c_string ("coding-category-utf-16-le"));
10470   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10471         intern_c_string ("coding-category-utf-16-be-nosig"));
10472   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10473         intern_c_string ("coding-category-utf-16-le-nosig"));
10474   ASET (Vcoding_category_table, coding_category_charset,
10475         intern_c_string ("coding-category-charset"));
10476   ASET (Vcoding_category_table, coding_category_sjis,
10477         intern_c_string ("coding-category-sjis"));
10478   ASET (Vcoding_category_table, coding_category_big5,
10479         intern_c_string ("coding-category-big5"));
10480   ASET (Vcoding_category_table, coding_category_ccl,
10481         intern_c_string ("coding-category-ccl"));
10482   ASET (Vcoding_category_table, coding_category_emacs_mule,
10483         intern_c_string ("coding-category-emacs-mule"));
  /* The following are NOT targets of code detection.  */
10485   ASET (Vcoding_category_table, coding_category_raw_text,
10486         intern_c_string ("coding-category-raw-text"));
10487   ASET (Vcoding_category_table, coding_category_undecided,
10488         intern_c_string ("coding-category-undecided"));
10489
10490   DEFSYM (Qinsufficient_source, "insufficient-source");
10491   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10492   DEFSYM (Qinvalid_source, "invalid-source");
10493   DEFSYM (Qinterrupted, "interrupted");
10494   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10495   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10496
10497   defsubr (&Scoding_system_p);
10498   defsubr (&Sread_coding_system);
10499   defsubr (&Sread_non_nil_coding_system);
10500   defsubr (&Scheck_coding_system);
10501   defsubr (&Sdetect_coding_region);
10502   defsubr (&Sdetect_coding_string);
10503   defsubr (&Sfind_coding_systems_region_internal);
10504   defsubr (&Sunencodable_char_position);
10505   defsubr (&Scheck_coding_systems_region);
10506   defsubr (&Sdecode_coding_region);
10507   defsubr (&Sencode_coding_region);
10508   defsubr (&Sdecode_coding_string);
10509   defsubr (&Sencode_coding_string);
10510   defsubr (&Sdecode_sjis_char);
10511   defsubr (&Sencode_sjis_char);
10512   defsubr (&Sdecode_big5_char);
10513   defsubr (&Sencode_big5_char);
10514   defsubr (&Sset_terminal_coding_system_internal);
10515   defsubr (&Sset_safe_terminal_coding_system_internal);
10516   defsubr (&Sterminal_coding_system);
10517   defsubr (&Sset_keyboard_coding_system_internal);
10518   defsubr (&Skeyboard_coding_system);
10519   defsubr (&Sfind_operation_coding_system);
10520   defsubr (&Sset_coding_system_priority);
10521   defsubr (&Sdefine_coding_system_internal);
10522   defsubr (&Sdefine_coding_system_alias);
10523   defsubr (&Scoding_system_put);
10524   defsubr (&Scoding_system_base);
10525   defsubr (&Scoding_system_plist);
10526   defsubr (&Scoding_system_aliases);
10527   defsubr (&Scoding_system_eol_type);
10528   defsubr (&Scoding_system_priority_list);
10529
10530   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10531                doc: /* List of coding systems.
10532
10533 Do not alter the value of this variable manually.  This variable should be
10534 updated by the functions `define-coding-system' and
10535 `define-coding-system-alias'.  */);
10536   Vcoding_system_list = Qnil;
10537
10538   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10539                doc: /* Alist of coding system names.
Each element is a one-element list containing a coding system name.
10541 This variable is given to `completing-read' as COLLECTION argument.
10542
10543 Do not alter the value of this variable manually.  This variable should be
10544 updated by the functions `make-coding-system' and
10545 `define-coding-system-alias'.  */);
10546   Vcoding_system_alist = Qnil;
10547
10548   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10549                doc: /* List of coding-categories (symbols) ordered by priority.
10550
10551 On detecting a coding system, Emacs tries code detection algorithms
10552 associated with each coding-category one by one in this order.  When
10553 one algorithm agrees with a byte sequence of source text, the coding
10554 system bound to the corresponding coding-category is selected.
10555
10556 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10557   {
10558     int i;
10559
10560     Vcoding_category_list = Qnil;
10561     for (i = coding_category_max - 1; i >= 0; i--)
10562       Vcoding_category_list
10563         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10564                  Vcoding_category_list);
10565   }
10566
10567   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10568                doc: /* Specify the coding system for read operations.
10569 It is useful to bind this variable with `let', but do not set it globally.
10570 If the value is a coding system, it is used for decoding on read operation.
10571 If not, an appropriate element is used from one of the coding system alists.
10572 There are three such tables: `file-coding-system-alist',
10573 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10574   Vcoding_system_for_read = Qnil;
10575
10576   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10577                doc: /* Specify the coding system for write operations.
10578 Programs bind this variable with `let', but you should not set it globally.
10579 If the value is a coding system, it is used for encoding of output,
10580 when writing it to a file and when sending it to a file or subprocess.
10581
10582 If this does not specify a coding system, an appropriate element
10583 is used from one of the coding system alists.
10584 There are three such tables: `file-coding-system-alist',
10585 `process-coding-system-alist', and `network-coding-system-alist'.
10586 For output to files, if the above procedure does not specify a coding system,
10587 the value of `buffer-file-coding-system' is used.  */);
10588   Vcoding_system_for_write = Qnil;
10589
10590   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10591                doc: /*
10592 Coding system used in the latest file or process I/O.  */);
10593   Vlast_coding_system_used = Qnil;
10594
10595   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10596                doc: /*
10597 Error status of the last code conversion.
10598
10599 When an error was detected in the last code conversion, this variable
10600 is set to one of the following symbols.
10601   `insufficient-source'
10602   `inconsistent-eol'
10603   `invalid-source'
10604   `interrupted'
10605   `insufficient-memory'
10606 When no error was detected, the value doesn't change.  So, to check
10607 the error status of a code conversion by this variable, you must
10608 explicitly set this variable to nil before performing code
10609 conversion.  */);
10610   Vlast_code_conversion_error = Qnil;
10611
10612   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10613                doc: /*
10614 *Non-nil means always inhibit code conversion of end-of-line format.
10615 See info node `Coding Systems' and info node `Text and Binary' concerning
10616 such conversion.  */);
10617   inhibit_eol_conversion = 0;
10618
10619   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10620                doc: /*
10621 Non-nil means process buffer inherits coding system of process output.
10622 Bind it to t if the process output is to be treated as if it were a file
10623 read from some filesystem.  */);
10624   inherit_process_coding_system = 0;
10625
10626   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10627                doc: /*
10628 Alist to decide a coding system to use for a file I/O operation.
10629 The format is ((PATTERN . VAL) ...),
10630 where PATTERN is a regular expression matching a file name,
10631 VAL is a coding system, a cons of coding systems, or a function symbol.
10632 If VAL is a coding system, it is used for both decoding and encoding
10633 the file contents.
10634 If VAL is a cons of coding systems, the car part is used for decoding,
10635 and the cdr part is used for encoding.
10636 If VAL is a function symbol, the function must return a coding system
10637 or a cons of coding systems which are used as above.  The function is
10638 called with an argument that is a list of the arguments with which
10639 `find-operation-coding-system' was called.  If the function can't decide
10640 a coding system, it can return `undecided' so that the normal
10641 code-detection is performed.
10642
10643 See also the function `find-operation-coding-system'
10644 and the variable `auto-coding-alist'.  */);
10645   Vfile_coding_system_alist = Qnil;
10646
10647   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10648                doc: /*
10649 Alist to decide a coding system to use for a process I/O operation.
10650 The format is ((PATTERN . VAL) ...),
10651 where PATTERN is a regular expression matching a program name,
10652 VAL is a coding system, a cons of coding systems, or a function symbol.
If VAL is a coding system, it is used for both decoding what is received
from the program and encoding what is sent to the program.
10655 If VAL is a cons of coding systems, the car part is used for decoding,
10656 and the cdr part is used for encoding.
10657 If VAL is a function symbol, the function must return a coding system
10658 or a cons of coding systems which are used as above.
10659
10660 See also the function `find-operation-coding-system'.  */);
10661   Vprocess_coding_system_alist = Qnil;
10662
10663   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10664                doc: /*
10665 Alist to decide a coding system to use for a network I/O operation.
10666 The format is ((PATTERN . VAL) ...),
10667 where PATTERN is a regular expression matching a network service name
10668 or is a port number to connect to,
10669 VAL is a coding system, a cons of coding systems, or a function symbol.
If VAL is a coding system, it is used for both decoding what is received
from the network stream and encoding what is sent to the network stream.
10672 If VAL is a cons of coding systems, the car part is used for decoding,
10673 and the cdr part is used for encoding.
10674 If VAL is a function symbol, the function must return a coding system
10675 or a cons of coding systems which are used as above.
10676
10677 See also the function `find-operation-coding-system'.  */);
10678   Vnetwork_coding_system_alist = Qnil;
10679
10680   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10681                doc: /* Coding system to use with system messages.
10682 Also used for decoding keyboard input on X Window system.  */);
10683   Vlocale_coding_system = Qnil;
10684
10685   /* The eol mnemonics are reset in startup.el system-dependently.  */
10686   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10687                doc: /*
10688 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10689   eol_mnemonic_unix = make_pure_c_string (":");
10690
10691   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10692                doc: /*
10693 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10694   eol_mnemonic_dos = make_pure_c_string ("\\");
10695
10696   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10697                doc: /*
10698 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10699   eol_mnemonic_mac = make_pure_c_string ("/");
10700
10701   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10702                doc: /*
10703 *String displayed in mode line when end-of-line format is not yet determined.  */);
10704   eol_mnemonic_undecided = make_pure_c_string (":");
10705
10706   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10707                doc: /*
10708 *Non-nil enables character translation while encoding and decoding.  */);
10709   Venable_character_translation = Qt;
10710
10711   DEFVAR_LISP ("standard-translation-table-for-decode",
10712                Vstandard_translation_table_for_decode,
10713                doc: /* Table for translating characters while decoding.  */);
10714   Vstandard_translation_table_for_decode = Qnil;
10715
10716   DEFVAR_LISP ("standard-translation-table-for-encode",
10717                Vstandard_translation_table_for_encode,
10718                doc: /* Table for translating characters while encoding.  */);
10719   Vstandard_translation_table_for_encode = Qnil;
10720
10721   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10722                doc: /* Alist of charsets vs revision numbers.
10723 While encoding, if a charset (car part of an element) is found,
10724 designate it with the escape sequence identifying revision (cdr part
10725 of the element).  */);
10726   Vcharset_revision_table = Qnil;
10727
10728   DEFVAR_LISP ("default-process-coding-system",
10729                Vdefault_process_coding_system,
10730                doc: /* Cons of coding systems used for process I/O by default.
10731 The car part is used for decoding a process output,
10732 the cdr part is used for encoding a text to be sent to a process.  */);
10733   Vdefault_process_coding_system = Qnil;
10734
10735   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10736                doc: /*
10737 Table of extra Latin codes in the range 128..159 (inclusive).
10738 This is a vector of length 256.
10739 If Nth element is non-nil, the existence of code N in a file
\(or output of subprocess) doesn't prevent it from being detected as
10741 a coding system of ISO 2022 variant which has a flag
10742 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10743 or reading output of a subprocess.
10744 Only 128th through 159th elements have a meaning.  */);
10745   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10746
10747   DEFVAR_LISP ("select-safe-coding-system-function",
10748                Vselect_safe_coding_system_function,
10749                doc: /*
10750 Function to call to select safe coding system for encoding a text.
10751
10752 If set, this function is called to force a user to select a proper
10753 coding system which can encode the text in the case that a default
10754 coding system used in each operation can't encode the text.  The
10755 function should take care that the buffer is not modified while
10756 the coding system is being selected.
10757
10758 The default value is `select-safe-coding-system' (which see).  */);
10759   Vselect_safe_coding_system_function = Qnil;
10760
10761   DEFVAR_BOOL ("coding-system-require-warning",
10762                coding_system_require_warning,
10763                doc: /* Internal use only.
10764 If non-nil, on writing a file, `select-safe-coding-system-function' is
10765 called even if `coding-system-for-write' is non-nil.  The command
10766 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10767   coding_system_require_warning = 0;
10768
10769
10770   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10771                inhibit_iso_escape_detection,
10772                doc: /*
10773 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10774
10775 When Emacs reads text, it tries to detect how the text is encoded.
10776 This code detection is sensitive to escape sequences.  If Emacs sees
10777 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10778 of the ISO2022 encodings, and decodes text by the corresponding coding
10779 system (e.g. `iso-2022-7bit').
10780
10781 However, there may be a case that you want to read escape sequences in
10782 a file as is.  In such a case, you can set this variable to non-nil.
10783 Then the code detection will ignore any escape sequences, and no text is
10784 detected as encoded in some ISO-2022 encoding.  The result is that all
10785 escape sequences become visible in a buffer.
10786
10787 The default value is nil, and it is strongly recommended not to change
10788 it.  That is because many Emacs Lisp source files that contain
10789 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10790 in Emacs's distribution, and they won't be decoded correctly on
10791 reading if you suppress escape sequence detection.
10792
10793 The other way to read escape sequences in a file without decoding is
10794 to explicitly specify some coding system that doesn't use ISO-2022
10795 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10796   inhibit_iso_escape_detection = 0;
10797
10798   DEFVAR_BOOL ("inhibit-null-byte-detection",
10799                inhibit_null_byte_detection,
10800                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10801 By default, Emacs treats it as binary data, and does not attempt to
10802 decode it.  The effect is as if you specified `no-conversion' for
10803 reading that text.
10804
10805 Set this to non-nil when a regular text happens to include null bytes.
10806 Examples are Index nodes of Info files and null-byte delimited output
10807 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10808 decode text as usual.  */);
10809   inhibit_null_byte_detection = 0;
10810
10811   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10812                doc: /* Char table for translating self-inserting characters.
10813 This is applied to the result of input methods, not their input.
10814 See also `keyboard-translate-table'.
10815
10816 Use of this variable for character code unification was rendered
10817 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10818 internal character representation.  */);
10819     Vtranslation_table_for_input = Qnil;
10820
10821   {
10822     Lisp_Object args[coding_arg_max];
10823     Lisp_Object plist[16];
10824     int i;
10825
10826     for (i = 0; i < coding_arg_max; i++)
10827       args[i] = Qnil;
10828
10829     plist[0] = intern_c_string (":name");
10830     plist[1] = args[coding_arg_name] = Qno_conversion;
10831     plist[2] = intern_c_string (":mnemonic");
10832     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10833     plist[4] = intern_c_string (":coding-type");
10834     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10835     plist[6] = intern_c_string (":ascii-compatible-p");
10836     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10837     plist[8] = intern_c_string (":default-char");
10838     plist[9] = args[coding_arg_default_char] = make_number (0);
10839     plist[10] = intern_c_string (":for-unibyte");
10840     plist[11] = args[coding_arg_for_unibyte] = Qt;
10841     plist[12] = intern_c_string (":docstring");
10842     plist[13] = make_pure_c_string ("Do no conversion.\n\
10843 \n\
10844 When you visit a file with this coding, the file is read into a\n\
10845 unibyte buffer as is, thus each byte of a file is treated as a\n\
10846 character.");
10847     plist[14] = intern_c_string (":eol-type");
10848     plist[15] = args[coding_arg_eol_type] = Qunix;
10849     args[coding_arg_plist] = Flist (16, plist);
10850     Fdefine_coding_system_internal (coding_arg_max, args);
10851
10852     plist[1] = args[coding_arg_name] = Qundecided;
10853     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10854     plist[5] = args[coding_arg_coding_type] = Qundecided;
10855     /* This is already set.
10856        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10857     plist[8] = intern_c_string (":charset-list");
10858     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10859     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10860     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10861     plist[15] = args[coding_arg_eol_type] = Qnil;
10862     args[coding_arg_plist] = Flist (16, plist);
10863     Fdefine_coding_system_internal (coding_arg_max, args);
10864   }
10865
10866   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10867
10868   {
10869     int i;
10870
10871     for (i = 0; i < coding_category_max; i++)
10872       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10873   }
10874 #if defined (DOS_NT)
10875   system_eol_type = Qdos;
10876 #else
10877   system_eol_type = Qunix;
10878 #endif
10879   staticpro (&system_eol_type);
10880 }
10881
10882 char *
10883 emacs_strerror (int error_number)
10884 {
10885   char *str;
10886
10887   synchronize_system_messages_locale ();
10888   str = strerror (error_number);
10889
10890   if (! NILP (Vlocale_coding_system))
10891     {
10892       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10893                                                       Vlocale_coding_system,
10894                                                       0);
10895       str = SSDATA (dec);
10896     }
10897
10898   return str;
10899 }
10900
10901 #endif /* emacs */