src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2011 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
     /* Template only (excluded from compilation by `#if 0'): check whether
        the source bytes of CODING conform to format XXX and record the
        outcome in DETECT_INFO.  Return 1 if the whole source was scanned
        without meeting a non-conforming byte, otherwise return 0.  */
 155 static int
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   int multibytep = coding->src_multibyte;
 162   EMACS_INT consumed_chars = 0;
 163   int found = 0;
 164   ...;  /* Other declarations, e.g. C and the SRC_BASE read by ONE_MORE_BYTE.  */
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
           /* Remember the category only on positive evidence for XXX.  */
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
     /* Template only (excluded from compilation by `#if 0'): decode the
        not-yet-consumed source bytes of CODING into CODING->charbuf and
        update CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used.  */
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   int multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce an encoded text until it
 256   reaches the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
     /* Template only (excluded from compilation by `#if 0'): encode the
        characters held in CODING->charbuf into CODING->destination and
        update CODING->produced and CODING->produced_char.  */
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   int multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
     /* CHARBUF is a plain `int *', so the end is CHARBUF plus the number
        of characters in use.  */
 265   int *charbuf_end = charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
     /* Stop early enough that one loop iteration cannot run past DST_END.  */
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   EMACS_INT produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288 #include <setjmp.h>
 289
 290 #include "lisp.h"
 291 #include "buffer.h"
 292 #include "character.h"
 293 #include "charset.h"
 294 #include "ccl.h"
 295 #include "composite.h"
 296 #include "coding.h"
 297 #include "window.h"
 298 #include "frame.h"
 299 #include "termhooks.h"
 300
     /* NOTE(review): presumably the hash table, described in the general
        comments of this file, that maps a coding system symbol to its
        vector of attributes -- confirm against its initialization.  */
 301 Lisp_Object Vcoding_system_hash_table;
 302
 303 static Lisp_Object Qcoding_system, Qeol_type;
 304 static Lisp_Object Qcoding_aliases;
 305 Lisp_Object Qunix, Qdos;
 306 Lisp_Object Qbuffer_file_coding_system;
 307 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 static Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qutf_8;
 311 static Lisp_Object Qiso_2022;
 312 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 313 static Lisp_Object Qbig, Qlittle;
 314 static Lisp_Object Qcoding_system_history;
 315 static Lisp_Object Qvalid_codes;
 316 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 317 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 318 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 319 static Lisp_Object QCascii_compatible_p;
 320
 321 Lisp_Object Qcall_process, Qcall_process_region;
 322 Lisp_Object Qstart_process, Qopen_network_stream;
 323 static Lisp_Object Qtarget_idx;
 324
 325 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 326 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 327
 328 /* If a symbol has this property, evaluate the value to define the
 329    symbol as a coding system.  */
 330 static Lisp_Object Qcoding_system_define_form;
 331
 332 /* Format of end-of-line decided by system.  This is Qunix on
 333    Unix and Mac, Qdos on DOS/Windows.
 334    This has an effect only for external encoding (i.e. for output to
 335    file and process), not for in-buffer or Lisp string encoding.  */
 336 static Lisp_Object system_eol_type;
 337
 338 #ifdef emacs
 339
 340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 341
 342 /* Coding system emacs-mule and raw-text are for converting only
 343    end-of-line format.  */
 344 Lisp_Object Qemacs_mule, Qraw_text;
 345 Lisp_Object Qutf_8_emacs;
 346
 347 /* Coding-systems are handed between Emacs Lisp programs and C internal
 348    routines by the following three variables.
        NOTE(review): only one variable, safe_terminal_coding, follows in
        this section, so the count above looks stale -- confirm.  */
 349 /* Coding system to be used to encode text for terminal display when
 350    terminal coding system is nil.  */
 351 struct coding_system safe_terminal_coding;
 352
 353 #endif /* emacs */
 354
 355 Lisp_Object Qtranslation_table;
 356 Lisp_Object Qtranslation_table_id;
 357 static Lisp_Object Qtranslation_table_for_decode;
 358 static Lisp_Object Qtranslation_table_for_encode;
 359
 360 /* Two special coding systems.  */
 361 static Lisp_Object Vsjis_coding_system;
 362 static Lisp_Object Vbig5_coding_system;
 363
 364 /* ISO2022 section */
 365
     /* Return, as a C integer, element REG of the vector stored in the
        `coding_attr_iso_initial' slot of CODING's attribute vector.  */
 366 #define CODING_ISO_INITIAL(coding, reg)                 \
 367   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 368                      coding_attr_iso_initial),          \
 369                reg)))
 370
 371
     /* Return CODING->safe_charsets[CHARSET_ID], or -1 when CHARSET_ID is
        greater than CODING->max_charset_id or the stored value is 255.
        CHARSET_ID is evaluated more than once.  */
 372 #define CODING_ISO_REQUEST(coding, charset_id)          \
 373   (((charset_id) <= (coding)->max_charset_id            \
 374     ? ((coding)->safe_charsets[charset_id] != 255       \
 375        ? (coding)->safe_charsets[charset_id]            \
 376        : -1)                                            \
 377     : -1))
 378
 379
     /* The macros below are plain accessors (usable as lvalues) for the
        members of CODING->spec.iso_2022.  */
 380 #define CODING_ISO_FLAGS(coding)        \
 381   ((coding)->spec.iso_2022.flags)
 382 #define CODING_ISO_DESIGNATION(coding, reg)     \
 383   ((coding)->spec.iso_2022.current_designation[reg])
 384 #define CODING_ISO_INVOCATION(coding, plane)    \
 385   ((coding)->spec.iso_2022.current_invocation[plane])
 386 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 387   ((coding)->spec.iso_2022.single_shifting)
 388 #define CODING_ISO_BOL(coding)  \
 389   ((coding)->spec.iso_2022.bol)
     /* The designation of whichever register is currently invoked to
        PLANE, i.e. current_designation[current_invocation[PLANE]].  */
 390 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 391   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
     /* Unlike the other accessors, this one yields the ADDRESS of the
        cmp_status member.  */
 392 #define CODING_ISO_CMP_STATUS(coding)   \
 393   (&(coding)->spec.iso_2022.cmp_status)
 394 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 395   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 396 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 397   ((coding)->spec.iso_2022.embedded_utf_8)
 398
 399 /* Control characters of ISO2022.  */
 400                         /* code */      /* function */
 401 #define ISO_CODE_SO     0x0E            /* shift-out */
 402 #define ISO_CODE_SI     0x0F            /* shift-in */
 403 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 404 #define ISO_CODE_ESC    0x1B            /* escape */
 405 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 406 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 407 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 408
 409 /* Every 1-byte code of ISO2022 is classified into one of the
 410    following classes.  */
 411 enum iso_code_class_type
 412   {
 413     ISO_control_0,              /* Control codes in the range
 414                                    0x00..0x1F and 0x7F, except for the
 415                                    following 4 codes.
                                        NOTE(review): 0x7F is also named
                                        by ISO_0x20_or_0x7F below; which
                                        class it gets is decided
                                        elsewhere -- confirm.  */
 416     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 417     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 418     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 419     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 420     ISO_control_1,              /* Control codes in the range
 421                                    0x80..0x9F, except for the
 422                                    following 3 codes.  */
 423     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 424     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 425     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 426     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 427     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 428     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 429     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 430   };
 431
 432 /** Each CODING_ISO_FLAG_XXX macro defines one flag bit of the
 433     `iso-flags' attribute of an iso2022 coding system.  */
 434
 435 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 436    instead of the correct short-form sequence (e.g. ESC $ A).  */
 437 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 438
 439 /* If set, reset graphic planes and registers at end-of-line to the
 440    initial state.  */
 441 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 442
 443 /* If set, reset graphic planes and registers before any control
 444    characters to the initial state.  */
 445 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 446
 447 /* If set, encode by 7-bit environment.  */
 448 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 449
 450 /* If set, use locking-shift function.  */
 451 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 452
 453 /* If set, use single-shift function.  Overrides
 454    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 455 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 456
 457 /* If set, use designation escape sequence.  */
 458 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 459
 460 /* If set, produce revision number sequence.  */
 461 #define CODING_ISO_FLAG_REVISION        0x0080
 462
 463 /* If set, produce ISO6429's direction specifying sequence.  */
 464 #define CODING_ISO_FLAG_DIRECTION       0x0100
 465
 466 /* If set, assume designation states are reset at beginning of line on
 467    output.  */
 468 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 469
 470 /* If set, designation sequence should be placed at beginning of line
 471    on output.  */
 472 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 473
 474 /* If set, do not encode unsafe characters on output.  */
 475 #define CODING_ISO_FLAG_SAFE            0x0800
 476
 477 /* If set, extra latin codes (128..159) are accepted as a valid code
 478    on input.  */
 479 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 480
     /* NOTE(review): no description is given for this flag; the name
        suggests it concerns composition sequences -- confirm.  */
 481 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 482
     /* Bit 0x4000 is currently unused; its former definition is disabled.  */
 483 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 484
     /* NOTE(review): the next three flags are undocumented here; their
        meaning must be taken from the code that tests them -- confirm.
        Note that 0x100000 is not the bit right after 0x10000.  */
 485 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 486
 487 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 488
 489 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 490
 491 /* A character to be produced on output if encoding of the original
 492    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 493 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 494
 495 /* UTF-8 section */
     /* Accessor (usable as an lvalue) for CODING->spec.utf_8_bom.  */
 496 #define CODING_UTF_8_BOM(coding)        \
 497   ((coding)->spec.utf_8_bom)
 498
 499 /* UTF-16 section */
     /* Accessors (usable as lvalues) for the bom, endian and surrogate
        members of CODING->spec.utf_16.  */
 500 #define CODING_UTF_16_BOM(coding)       \
 501   ((coding)->spec.utf_16.bom)
 502
 503 #define CODING_UTF_16_ENDIAN(coding)    \
 504   ((coding)->spec.utf_16.endian)
 505
 506 #define CODING_UTF_16_SURROGATE(coding) \
 507   ((coding)->spec.utf_16.surrogate)
 508
 509
 510 /* CCL section */
     /* Fetch the `coding_attr_ccl_decoder' / `coding_attr_ccl_encoder'
        slot of CODING's attribute vector.  */
 511 #define CODING_CCL_DECODER(coding)      \
 512   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 513 #define CODING_CCL_ENCODER(coding)      \
 514   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
     /* Apply SDATA to the `coding_attr_ccl_valids' slot of CODING's
        attribute vector (presumably yielding the data of a Lisp string
        -- confirm against the definition of SDATA).  */
 515 #define CODING_CCL_VALIDS(coding)                                          \
 516   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 517
 518 /* Index for each coding category in `coding_categories'.  Each
        CATEGORY_MASK_XXX flag in this file is defined as 1 << the
        corresponding enumerator, and coding_category_max is the number
        of categories (it sizes `coding_priorities' and
        `coding_categories').  */
 519
 520 enum coding_category
 521   {
 522     coding_category_iso_7,
 523     coding_category_iso_7_tight,
 524     coding_category_iso_8_1,
 525     coding_category_iso_8_2,
 526     coding_category_iso_7_else,
 527     coding_category_iso_8_else,
 528     coding_category_utf_8_auto,
 529     coding_category_utf_8_nosig,
 530     coding_category_utf_8_sig,
 531     coding_category_utf_16_auto,
 532     coding_category_utf_16_be,
 533     coding_category_utf_16_le,
 534     coding_category_utf_16_be_nosig,
 535     coding_category_utf_16_le_nosig,
 536     coding_category_charset,
 537     coding_category_sjis,
 538     coding_category_big5,
 539     coding_category_ccl,
 540     coding_category_emacs_mule,
 541     /* All above are targets of code detection.  */
 542     coding_category_raw_text,
 543     coding_category_undecided,
 544     coding_category_max
 545   };
 546
 547 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
        the single bit selected by the matching enum coding_category
        value.  */
 548 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 549 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 550 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 551 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 552 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 553 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 554 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 555 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 556 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 557 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 558 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 559 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 560 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 561 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 562 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 563 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 564 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 565 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 566 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 567 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 568
 569 /* This value is returned if detect_coding_mask () finds nothing other
 570    than ASCII characters.  It is the union of every mask defined above
        except CATEGORY_MASK_RAW_TEXT.  */
 571 #define CATEGORY_MASK_ANY               \
 572   (CATEGORY_MASK_ISO_7                  \
 573    | CATEGORY_MASK_ISO_7_TIGHT          \
 574    | CATEGORY_MASK_ISO_8_1              \
 575    | CATEGORY_MASK_ISO_8_2              \
 576    | CATEGORY_MASK_ISO_7_ELSE           \
 577    | CATEGORY_MASK_ISO_8_ELSE           \
 578    | CATEGORY_MASK_UTF_8_AUTO           \
 579    | CATEGORY_MASK_UTF_8_NOSIG          \
 580    | CATEGORY_MASK_UTF_8_SIG            \
 581    | CATEGORY_MASK_UTF_16_AUTO          \
 582    | CATEGORY_MASK_UTF_16_BE            \
 583    | CATEGORY_MASK_UTF_16_LE            \
 584    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 585    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 586    | CATEGORY_MASK_CHARSET              \
 587    | CATEGORY_MASK_SJIS                 \
 588    | CATEGORY_MASK_BIG5                 \
 589    | CATEGORY_MASK_CCL                  \
 590    | CATEGORY_MASK_EMACS_MULE)
 591
 592
     /* Convenience unions of the ISO2022 category masks.  */
 593 #define CATEGORY_MASK_ISO_7BIT \
 594   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 595
 596 #define CATEGORY_MASK_ISO_8BIT \
 597   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 598
 599 #define CATEGORY_MASK_ISO_ELSE \
 600   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 601
     /* Every ISO category except ISO_8_1 and ISO_8_2 (presumably those
        that may use escape sequences -- confirm).  */
 602 #define CATEGORY_MASK_ISO_ESCAPE        \
 603   (CATEGORY_MASK_ISO_7                  \
 604    | CATEGORY_MASK_ISO_7_TIGHT          \
 605    | CATEGORY_MASK_ISO_7_ELSE           \
 606    | CATEGORY_MASK_ISO_8_ELSE)
 607
     /* All six ISO categories.  */
 608 #define CATEGORY_MASK_ISO       \
 609   (  CATEGORY_MASK_ISO_7BIT     \
 610      | CATEGORY_MASK_ISO_8BIT   \
 611      | CATEGORY_MASK_ISO_ELSE)
 612
     /* All five UTF-16 categories.  */
 613 #define CATEGORY_MASK_UTF_16            \
 614   (CATEGORY_MASK_UTF_16_AUTO            \
 615    | CATEGORY_MASK_UTF_16_BE            \
 616    | CATEGORY_MASK_UTF_16_LE            \
 617    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 618    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 619
     /* All three UTF-8 categories.  */
 620 #define CATEGORY_MASK_UTF_8     \
 621   (CATEGORY_MASK_UTF_8_AUTO     \
 622    | CATEGORY_MASK_UTF_8_NOSIG  \
 623    | CATEGORY_MASK_UTF_8_SIG)
 624
 625 /* Table of coding categories (Lisp symbols).  This variable is for
 626    internal use only.  */
 627 static Lisp_Object Vcoding_category_table;
 628
 629 /* Table of coding-categories ordered by priority.  It has
        coding_category_max elements.  */
 630 static enum coding_category coding_priorities[coding_category_max];
 631
 632 /* Nth element is a coding context for the coding system bound to the
 633    Nth coding category, i.e. the array is indexed by enum
        coding_category values.  */
 634 static struct coding_system coding_categories[coding_category_max];
 635
 636 /*** Commonly used macros and functions ***/
 637
     /* Fallback definitions, used only when `min' / `max' are not already
        defined as macros.  Each one may evaluate an argument twice, so
        do not pass expressions with side effects.  */
 638 #ifndef min
 639 #define min(a, b) ((a) < (b) ? (a) : (b))
 640 #endif
 641 #ifndef max
 642 #define max(a, b) ((a) > (b) ? (a) : (b))
 643 #endif
 644
     /* Set ATTRS to the attribute vector of CODING (CODING_ID_ATTRS applied
        to CODING->id) and CHARSET_LIST to CODING_ATTR_CHARSET_LIST of that
        vector.  ATTRS and CHARSET_LIST are assigned to, so both must be
        lvalues.  */
 645 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 646   do {                                                  \
 647     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 648     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 649   } while (0)
 650
 651
 652 /* Safely get one byte from the source text pointed by SRC which ends
 653    at SRC_END, and set C to that byte.  If there are not enough bytes
 654    in the source, it jumps to `no_more_source'.  If multibytep is
 655    nonzero, and a multibyte character is found at SRC, set C to the
 656    negative value of the character code.  The caller should declare
 657    and set these variables appropriately in advance:
 658         src, src_end, src_base, multibytep, consumed_chars, coding
        and must provide the label `no_more_source'.

        Details of the expansion:
        - When the source is exhausted and SRC_BASE < SRC, it first
          records CODING_RESULT_INSUFFICIENT_SRC in CODING.
        - In the multibyte case a lead byte of 0xC0 or 0xC1 is not
          negated; C becomes ((C & 1) << 6) | <next byte> (presumably the
          two-byte form of a raw eight-bit byte -- confirm).
          NOTE(review): that branch reads the next byte without checking
          SRC against SRC_END again.
        - For any other multibyte character, C is the negated result of
          string_char and CODING_RESULT_INVALID_SRC is recorded.
        - CONSUMED_CHARS is incremented once per successful fetch.  */
 659
 660 #define ONE_MORE_BYTE(c)                                \
 661   do {                                                  \
 662     if (src == src_end)                                 \
 663       {                                                 \
 664         if (src_base < src)                             \
 665           record_conversion_result                      \
 666             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 667         goto no_more_source;                            \
 668       }                                                 \
 669     c = *src++;                                         \
 670     if (multibytep && (c & 0x80))                       \
 671       {                                                 \
 672         if ((c & 0xFE) == 0xC0)                         \
 673           c = ((c & 1) << 6) | *src++;                  \
 674         else                                            \
 675           {                                             \
 676             src--;                                      \
 677             c = - string_char (src, &src, NULL);        \
 678             record_conversion_result                    \
 679               (coding, CODING_RESULT_INVALID_SRC);      \
 680           }                                             \
 681       }                                                 \
 682     consumed_chars++;                                   \
 683   } while (0)
 684
 685 /* Safely get two bytes from the source text pointed by SRC which ends
 686    at SRC_END, and set C1 and C2 to those bytes while skipping the
 687    heading multibyte characters.  If there are not enough bytes in the
 688    source, it jumps to `no_more_source'.  If multibytep is nonzero and
 689    a multibyte character is found for C2, set C2 to -1 (the character
 690    code itself is not computed).  The caller should declare and set these
 691    variables appropriately in advance:
 692         src, src_end, multibytep
 693    It is intended that this macro is used in detect_coding_utf_16.

        Details of the expansion:
        - A lead byte of 0xC0 or 0xC1 is combined with the next byte as
          ((lead & 1) << 6) | <next byte>, for C1 and for C2 alike.
        - Any other multibyte character met while fetching C1 is skipped
          entirely (SRC advances by BYTES_BY_CHAR_HEAD) and the fetch is
          retried; for C2 only the lead byte is consumed.
        - Unlike ONE_MORE_BYTE, this macro neither updates CONSUMED_CHARS
          nor records a conversion result.  */
 694
 695 #define TWO_MORE_BYTES(c1, c2)                          \
 696   do {                                                  \
 697     do {                                                \
 698       if (src == src_end)                               \
 699         goto no_more_source;                            \
 700       c1 = *src++;                                      \
 701       if (multibytep && (c1 & 0x80))                    \
 702         {                                               \
 703           if ((c1 & 0xFE) == 0xC0)                      \
 704             c1 = ((c1 & 1) << 6) | *src++;              \
 705           else                                          \
 706             {                                           \
 707               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 708               c1 = -1;                                  \
 709             }                                           \
 710         }                                               \
 711     } while (c1 < 0);                                   \
 712     if (src == src_end)                                 \
 713       goto no_more_source;                              \
 714     c2 = *src++;                                        \
 715     if (multibytep && (c2 & 0x80))                      \
 716       {                                                 \
 717         if ((c2 & 0xFE) == 0xC0)                        \
 718           c2 = ((c2 & 1) << 6) | *src++;                \
 719         else                                            \
 720           c2 = -1;                                      \
 721       }                                                 \
 722   } while (0)
 723
 724
 725 /* Store a byte C in the place pointed by DST and increment DST to the
 726    next free point, and increment PRODUCED_CHARS.  The caller should
 727    assure that C is 0..127, and declare and set the variables `dst'
 728    and `produced_chars' appropriately in advance.  No check is made
        here that room is left at DST.
 729 */
 730
 731
 732 #define EMIT_ONE_ASCII_BYTE(c)  \
 733   do {                          \
 734     produced_chars++;           \
 735     *dst++ = (c);               \
 736   } while (0)
 737
 738
 739 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  C1 is
        stored first and PRODUCED_CHARS is incremented by 2.  */
 740
 741 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 742   do {                                  \
 743     produced_chars += 2;                \
 744     *dst++ = (c1), *dst++ = (c2);       \
 745   } while (0)
 746
 747
 748 /* Store a byte C in the place pointed by DST and increment DST to the
 749    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 750    nonzero, store in an appropriate multibyte form: a value of 0x80
        or above is first converted with BYTE8_TO_CHAR, and the result is
        written with CHAR_STRING_ADVANCE.  The caller should
 751    declare and set the variables `dst', `multibytep' and
 752    `produced_chars' appropriately in advance.  */
 753
 754 #define EMIT_ONE_BYTE(c)                \
 755   do {                                  \
 756     produced_chars++;                   \
 757     if (multibytep)                     \
 758       {                                 \
 759         unsigned ch = (c);              \
 760         if (ch >= 0x80)                 \
 761           ch = BYTE8_TO_CHAR (ch);      \
 762         CHAR_STRING_ADVANCE (ch, dst);  \
 763       }                                 \
 764     else                                \
 765       *dst++ = (c);                     \
 766   } while (0)
 767
 768
 769 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 770
 771 #define EMIT_TWO_BYTES(c1, c2)          \
 772   do {                                  \
 773     produced_chars += 2;                \
 774     if (multibytep)                     \
 775       {                                 \
 776         unsigned ch;                    \
 777                                         \
 778         ch = (c1);                      \
 779         if (ch >= 0x80)                 \
 780           ch = BYTE8_TO_CHAR (ch);      \
 781         CHAR_STRING_ADVANCE (ch, dst);  \
 782         ch = (c2);                      \
 783         if (ch >= 0x80)                 \
 784           ch = BYTE8_TO_CHAR (ch);      \
 785         CHAR_STRING_ADVANCE (ch, dst);  \
 786       }                                 \
 787     else                                \
 788       {                                 \
 789         *dst++ = (c1);                  \
 790         *dst++ = (c2);                  \
 791       }                                 \
 792   } while (0)
 793
 794
 795 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 796   do {                                  \
 797     EMIT_ONE_BYTE (c1);                 \
 798     EMIT_TWO_BYTES (c2, c3);            \
 799   } while (0)
 800
 801
 802 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 803   do {                                          \
 804     EMIT_TWO_BYTES (c1, c2);                    \
 805     EMIT_TWO_BYTES (c3, c4);                    \
 806   } while (0)
 807
 808
/* Prototypes for static functions.  */

/* Bookkeeping of the outcome of a conversion step.  */
static void record_conversion_result (struct coding_system *coding,
                                      enum coding_result_code result);

/* Per-format detect/decode/encode triples.  */
static int detect_coding_utf_8 (struct coding_system *,
                                struct coding_detection_info *info);
static void decode_coding_utf_8 (struct coding_system *);
static int encode_coding_utf_8 (struct coding_system *);

static int detect_coding_utf_16 (struct coding_system *,
                                 struct coding_detection_info *info);
static void decode_coding_utf_16 (struct coding_system *);
static int encode_coding_utf_16 (struct coding_system *);

static int detect_coding_iso_2022 (struct coding_system *,
                                   struct coding_detection_info *info);
static void decode_coding_iso_2022 (struct coding_system *);
static int encode_coding_iso_2022 (struct coding_system *);

static int detect_coding_emacs_mule (struct coding_system *,
                                     struct coding_detection_info *info);
static void decode_coding_emacs_mule (struct coding_system *);
static int encode_coding_emacs_mule (struct coding_system *);

static int detect_coding_sjis (struct coding_system *,
                               struct coding_detection_info *info);
static void decode_coding_sjis (struct coding_system *);
static int encode_coding_sjis (struct coding_system *);

static int detect_coding_big5 (struct coding_system *,
                               struct coding_detection_info *info);
static void decode_coding_big5 (struct coding_system *);
static int encode_coding_big5 (struct coding_system *);

static int detect_coding_ccl (struct coding_system *,
                              struct coding_detection_info *info);
static void decode_coding_ccl (struct coding_system *);
static int encode_coding_ccl (struct coding_system *);

/* Raw text has a decoder and an encoder but no detector.  */
static void decode_coding_raw_text (struct coding_system *);
static int encode_coding_raw_text (struct coding_system *);

/* Source/destination pointer maintenance and destination growth.  */
static void coding_set_source (struct coding_system *);
static void coding_set_destination (struct coding_system *);
static void coding_alloc_by_realloc (struct coding_system *, EMACS_INT);
static void coding_alloc_by_making_gap (struct coding_system *,
                                        EMACS_INT, EMACS_INT);
static unsigned char *alloc_destination (struct coding_system *,
                                         EMACS_INT, unsigned char *);

/* Remaining helpers, all defined later in this file.  */
static void setup_iso_safe_charsets (Lisp_Object);
static unsigned char *encode_designation_at_bol (struct coding_system *,
                                                 int *, unsigned char *);
static int detect_eol (const unsigned char *,
                       EMACS_INT, enum coding_category);
static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
static void decode_eol (struct coding_system *);
static Lisp_Object get_translation_table (Lisp_Object, int, int *);
static Lisp_Object get_translation (Lisp_Object, int *, int *);
static int produce_chars (struct coding_system *, Lisp_Object, int);
static inline void produce_charset (struct coding_system *, int *,
                                    EMACS_INT);
static void produce_annotation (struct coding_system *, EMACS_INT);
static int decode_coding (struct coding_system *);
static inline int *handle_composition_annotation (EMACS_INT, EMACS_INT,
                                                  struct coding_system *,
                                                  int *, EMACS_INT *);
static inline int *handle_charset_annotation (EMACS_INT, EMACS_INT,
                                              struct coding_system *,
                                              int *, EMACS_INT *);
static void consume_chars (struct coding_system *, Lisp_Object, int);
static int encode_coding (struct coding_system *);
static Lisp_Object make_conversion_work_buffer (int);
static Lisp_Object code_conversion_restore (Lisp_Object);
static inline int char_encodable_p (int, Lisp_Object);
static Lisp_Object make_subsidiaries (Lisp_Object);
 883
 884 static void
 885 record_conversion_result (struct coding_system *coding,
 886                           enum coding_result_code result)
 887 {
 888   coding->result = result;
 889   switch (result)
 890     {
 891     case CODING_RESULT_INSUFFICIENT_SRC:
 892       Vlast_code_conversion_error = Qinsufficient_source;
 893       break;
 894     case CODING_RESULT_INCONSISTENT_EOL:
 895       Vlast_code_conversion_error = Qinconsistent_eol;
 896       break;
 897     case CODING_RESULT_INVALID_SRC:
 898       Vlast_code_conversion_error = Qinvalid_source;
 899       break;
 900     case CODING_RESULT_INTERRUPT:
 901       Vlast_code_conversion_error = Qinterrupted;
 902       break;
 903     case CODING_RESULT_INSUFFICIENT_MEM:
 904       Vlast_code_conversion_error = Qinsufficient_memory;
 905       break;
 906     case CODING_RESULT_INSUFFICIENT_DST:
 907       /* Don't record this error in Vlast_code_conversion_error
 908          because it happens just temporarily and is resolved when the
 909          whole conversion is finished.  */
 910       break;
 911     case CODING_RESULT_SUCCESS:
 912       break;
 913     default:
 914       Vlast_code_conversion_error = intern ("Unknown error");
 915     }
 916 }
 917
/* This wrapper macro is used to preserve validity of pointers into
   buffer text across calls to decode_char, which could cause
   relocation of buffers if it loads a charset map, because loading a
   charset map allocates large structures.

   C receives the result of DECODE_CHAR (CHARSET, CODE).  If
   `charset_map_loaded' is nonzero afterwards, CODING->source is
   recomputed with coding_set_source and SRC, SRC_BASE and SRC_END are
   all shifted by the distance CODING->source moved.  SRC, SRC_BASE,
   SRC_END and C must therefore be assignable lvalues.  */
#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    /* Clear the flag so a load during this call can be detected.  */       \
    charset_map_loaded = 0;                                                  \
    c = DECODE_CHAR (charset, code);                                         \
    if (charset_map_loaded)                                                  \
      {                                                                      \
        /* Remember the old base to measure how far the text moved.  */     \
        const unsigned char *orig = coding->source;                          \
        EMACS_INT offset;                                                    \
                                                                             \
        coding_set_source (coding);                                          \
        offset = coding->source - orig;                                      \
        src += offset;                                                       \
        src_base += offset;                                                  \
        src_end += offset;                                                   \
      }                                                                      \
  } while (0)
 938
 939
/* Unless more than BYTES bytes of room remain at dst, allocate more
   memory for coding->destination and update dst and dst_end.  We
   don't have to take care of coding->source which will be relocated.
   It is handled by calling coding_set_source in encode_coding.  */
 944
 945 #define ASSURE_DESTINATION(bytes)                               \
 946   do {                                                          \
 947     if (dst + (bytes) >= dst_end)                               \
 948       {                                                         \
 949         EMACS_INT more_bytes = charbuf_end - charbuf + (bytes); \
 950                                                                 \
 951         dst = alloc_destination (coding, more_bytes, dst);      \
 952         dst_end = coding->destination + coding->dst_bytes;      \
 953       }                                                         \
 954   } while (0)
 955
 956
 957 /* Store multibyte form of the character C in P, and advance P to the
 958    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
 959    never calls MAYBE_UNIFY_CHAR.  */
 960
 961 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
 962   do {                                          \
 963     if ((c) <= MAX_1_BYTE_CHAR)                 \
 964       *(p)++ = (c);                             \
 965     else if ((c) <= MAX_2_BYTE_CHAR)            \
 966       *(p)++ = (0xC0 | ((c) >> 6)),             \
 967         *(p)++ = (0x80 | ((c) & 0x3F));         \
 968     else if ((c) <= MAX_3_BYTE_CHAR)            \
 969       *(p)++ = (0xE0 | ((c) >> 12)),            \
 970         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
 971         *(p)++ = (0x80 | ((c) & 0x3F));         \
 972     else if ((c) <= MAX_4_BYTE_CHAR)            \
 973       *(p)++ = (0xF0 | (c >> 18)),              \
 974         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 975         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 976         *(p)++ = (0x80 | (c & 0x3F));           \
 977     else if ((c) <= MAX_5_BYTE_CHAR)            \
 978       *(p)++ = 0xF8,                            \
 979         *(p)++ = (0x80 | ((c >> 18) & 0x0F)),   \
 980         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 981         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 982         *(p)++ = (0x80 | (c & 0x3F));           \
 983     else                                        \
 984       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
 985   } while (0)
 986
 987
/* Return the character code of character whose multibyte form is at
   P, and advance P to the end of the multibyte form.  This is like
   STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

   The length of the form is chosen from the leading byte alone (bits
   0x80, 0x20, 0x10 and 0x08 are tested in turn); trailing bytes are
   not validated.  In each multi-byte branch P is advanced first and
   the bytes are then read back with negative indices.  P is evaluated
   many times, so it must be a side-effect-free lvalue.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
  (!((p)[0] & 0x80)                                             \
   /* High bit clear: the byte is the character.  */           \
   ? *(p)++                                                     \
   : ! ((p)[0] & 0x20)                                          \
   /* Two-byte form.  A leading byte below 0xC2 additionally   \
      gets 0x3FFF80 added, the offset that                      \
      CHAR_STRING_ADVANCE_NO_UNIFY subtracts before calling     \
      BYTE8_STRING.  */                                         \
   ? ((p) += 2,                                                 \
      ((((p)[-2] & 0x1F) << 6)                                  \
       | ((p)[-1] & 0x3F)                                       \
       | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
   : ! ((p)[0] & 0x10)                                          \
   /* Three-byte form.  */                                      \
   ? ((p) += 3,                                                 \
      ((((p)[-3] & 0x0F) << 12)                                 \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ! ((p)[0] & 0x08)                                          \
   /* Four-byte form.  */                                       \
   ? ((p) += 4,                                                 \
      ((((p)[-4] & 0xF) << 18)                                  \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   /* Five-byte form: the leading byte (now at (p)[-5])         \
      contributes no bits to the result.  */                    \
   : ((p) += 5,                                                 \
      ((((p)[-4] & 0x3F) << 18)                                 \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F))))
1016
1017
1018 static void
1019 coding_set_source (struct coding_system *coding)
1020 {
1021   if (BUFFERP (coding->src_object))
1022     {
1023       struct buffer *buf = XBUFFER (coding->src_object);
1024
1025       if (coding->src_pos < 0)
1026         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1027       else
1028         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1029     }
1030   else if (STRINGP (coding->src_object))
1031     {
1032       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1033     }
1034   else
1035     {
1036       /* Otherwise, the source is C string and is never relocated
1037          automatically.  Thus we don't have to update anything.  */
1038     }
1039 }
1040
1041 static void
1042 coding_set_destination (struct coding_system *coding)
1043 {
1044   if (BUFFERP (coding->dst_object))
1045     {
1046       if (coding->src_pos < 0)
1047         {
1048           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1049           coding->dst_bytes = (GAP_END_ADDR
1050                                - (coding->src_bytes - coding->consumed)
1051                                - coding->destination);
1052         }
1053       else
1054         {
1055           /* We are sure that coding->dst_pos_byte is before the gap
1056              of the buffer. */
1057           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1058                                  + coding->dst_pos_byte - BEG_BYTE);
1059           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1060                                - coding->destination);
1061         }
1062     }
1063   else
1064     {
1065       /* Otherwise, the destination is C string and is never relocated
1066          automatically.  Thus we don't have to update anything.  */
1067     }
1068 }
1069
1070
1071 static void
1072 coding_alloc_by_realloc (struct coding_system *coding, EMACS_INT bytes)
1073 {
1074   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1075     string_overflow ();
1076   coding->destination = (unsigned char *) xrealloc (coding->destination,
1077                                                     coding->dst_bytes + bytes);
1078   coding->dst_bytes += bytes;
1079 }
1080
/* Enlarge the gap of the destination buffer of CODING by BYTES bytes
   through make_gap.  GAP_HEAD_USED is the number of bytes at the head
   of the gap that already hold produced data; it is only used when
   the source and destination objects are the same.  */
static void
coding_alloc_by_making_gap (struct coding_system *coding,
                            EMACS_INT gap_head_used, EMACS_INT bytes)
{
  if (EQ (coding->src_object, coding->dst_object))
    {
      /* The gap may contain the produced data at the head and not-yet
         consumed data at the tail.  To preserve those data, we at
         first make the gap size to zero, then increase the gap
         size.  */
      EMACS_INT add = GAP_SIZE;

      /* Move the gap start past the produced data and account the old
         gap as text while make_gap runs.  */
      GPT += gap_head_used, GPT_BYTE += gap_head_used;
      GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
      make_gap (bytes);
      /* Undo the temporary adjustments; the gap is now the old gap
         plus whatever make_gap added.  */
      GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
      GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
    }
  else
    {
      Lisp_Object this_buffer;

      /* Make the destination buffer current around the make_gap call,
         then switch back to the buffer that was current on entry.  */
      this_buffer = Fcurrent_buffer ();
      set_buffer_internal (XBUFFER (coding->dst_object));
      make_gap (bytes);
      set_buffer_internal (XBUFFER (this_buffer));
    }
}
1109
1110
1111 static unsigned char *
1112 alloc_destination (struct coding_system *coding, EMACS_INT nbytes,
1113                    unsigned char *dst)
1114 {
1115   EMACS_INT offset = dst - coding->destination;
1116
1117   if (BUFFERP (coding->dst_object))
1118     {
1119       struct buffer *buf = XBUFFER (coding->dst_object);
1120
1121       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1122     }
1123   else
1124     coding_alloc_by_realloc (coding, nbytes);
1125   coding_set_destination (coding);
1126   dst = coding->destination + offset;
1127   return dst;
1128 }
1129
1130 /** Macros for annotations.  */
1131
1132 /* An annotation data is stored in the array coding->charbuf in this
1133    format:
1134      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1135    LENGTH is the number of elements in the annotation.
1136    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1137    NCHARS is the number of characters in the text annotated.
1138
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1143      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1144
1145    NBYTES is the number of bytes specified in the header part of
1146    old-style emacs-mule encoding, or 0 for the other kind of
1147    composition.
1148
1149    METHOD is one of enum composition_method.
1150
1151    Optional COMPOSITION-COMPONENTS are characters and composition
1152    rules.
1153
1154    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1155    follows.
1156
1157    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1158    recover from an invalid annotation, and should be skipped by
1159    produce_annotation.  */
1160
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three-element annotation header -LEN, MASK, NCHARS at BUF,
   advance BUF past it, and mark `coding' (taken from the expansion
   scope) as annotated.  BUF is evaluated three times and must be a
   side-effect-free lvalue.  The expansion deliberately has no
   trailing semicolon, so the macro can be used as a single statement
   anywhere, including in front of `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1171
/* Store a composition annotation at BUF and advance BUF past it: the
   common header (length 5, CODING_ANNOTATE_COMPOSITION_MASK, NCHARS)
   followed by NBYTES and METHOD.  BUF is evaluated several times and
   must be a side-effect-free lvalue; all arguments are parenthesized
   so that expressions such as `*pp' work as BUF.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1178
1179
/* Store a charset annotation at BUF and advance BUF past it: the
   common header (length 4, CODING_ANNOTATE_CHARSET_MASK, NCHARS)
   followed by the charset ID.  BUF is evaluated several times and
   must be a side-effect-free lvalue; all arguments are parenthesized
   so that expressions such as `*pp' work as BUF.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1185
1186 \f
1187 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1188
1189
1190
1191 \f
1192 /*** 3. UTF-8 ***/
1193
1194 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1195    Check if a text is encoded in UTF-8.  If it is, return 1, else
1196    return 0.  */
1197
/* Classification of a single UTF-8 octet C.  Apart from the one-octet
   test, each predicate compares the bits of C selected by a mask with
   a fixed tag pattern.  */
#define UTF_8_OCTET_MATCH_P(c, mask, tag) (((c) & (mask)) == (tag))

#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_MATCH_P (c, 0xC0, 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFC, 0xF8)

/* The three octets of the UTF-8 signature (byte order mark).  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1208
/* Scan the source of CODING for well-formed UTF-8 sequences and
   record the verdict in DETECT_INFO.

   Return 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
   as soon as a sequence is malformed or, in the last block, when the
   source ends in the middle of a sequence.  Otherwise return 1;
   DETECT_INFO->found then receives the signature and/or no-signature
   category depending on whether the text started with the UTF-8 BOM
   and whether any multi-octet sequence was seen.  */
static int
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* multibytep and consumed_chars are not used directly below; they
     are presumably referenced by ONE_MORE_BYTE, which is defined
     earlier in the file.  */
  int multibytep = coding->src_multibyte;
  EMACS_INT consumed_chars = 0;
  int bom_found = 0;
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* src_base marks the start of the sequence being examined.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      /* Every following octet must be a continuation octet; the
         leading octet then decides how many of them are required.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a sequence at the very start of the source can be a
             signature.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      /* Not a valid leading octet of any supported length.  */
      break;
    }
  /* The loop is only left by `break' on a malformed sequence.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

  /* No statement in this function jumps here directly; the label is
     presumably the target ONE_MORE_BYTE uses once the source is
     exhausted.  */
 no_more_source:
  /* A sequence cut short at the end of the last block is an error.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading EF BB BF sequence doesn't necessarily mean a
         signature, so the no-signature category stays a candidate
         too.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1291
1292
1293 static void
1294 decode_coding_utf_8 (struct coding_system *coding)
1295 {
1296   const unsigned char *src = coding->source + coding->consumed;
1297   const unsigned char *src_end = coding->source + coding->src_bytes;
1298   const unsigned char *src_base;
1299   int *charbuf = coding->charbuf + coding->charbuf_used;
1300   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1301   EMACS_INT consumed_chars = 0, consumed_chars_base = 0;
1302   int multibytep = coding->src_multibyte;
1303   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1304   int eol_dos =
1305     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1306   int byte_after_cr = -1;
1307
1308   if (bom != utf_without_bom)
1309     {
1310       int c1, c2, c3;
1311
1312       src_base = src;
1313       ONE_MORE_BYTE (c1);
1314       if (! UTF_8_3_OCTET_LEADING_P (c1))
1315         src = src_base;
1316       else
1317         {
1318           ONE_MORE_BYTE (c2);
1319           if (! UTF_8_EXTRA_OCTET_P (c2))
1320             src = src_base;
1321           else
1322             {
1323               ONE_MORE_BYTE (c3);
1324               if (! UTF_8_EXTRA_OCTET_P (c3))
1325                 src = src_base;
1326               else
1327                 {
1328                   if ((c1 != UTF_8_BOM_1)
1329                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1330                     src = src_base;
1331                   else
1332                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1333                 }
1334             }
1335         }
1336     }
1337   CODING_UTF_8_BOM (coding) = utf_without_bom;
1338
1339   while (1)
1340     {
1341       int c, c1, c2, c3, c4, c5;
1342
1343       src_base = src;
1344       consumed_chars_base = consumed_chars;
1345
1346       if (charbuf >= charbuf_end)
1347         {
1348           if (byte_after_cr >= 0)
1349             src_base--;
1350           break;
1351         }
1352
1353       if (byte_after_cr >= 0)
1354         c1 = byte_after_cr, byte_after_cr = -1;
1355       else
1356         ONE_MORE_BYTE (c1);
1357       if (c1 < 0)
1358         {
1359           c = - c1;
1360         }
1361       else if (UTF_8_1_OCTET_P (c1))
1362         {
1363           if (eol_dos && c1 == '\r')
1364             ONE_MORE_BYTE (byte_after_cr);
1365           c = c1;
1366         }
1367       else
1368         {
1369           ONE_MORE_BYTE (c2);
1370           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1371             goto invalid_code;
1372           if (UTF_8_2_OCTET_LEADING_P (c1))
1373             {
1374               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1375               /* Reject overlong sequences here and below.  Encoders
1376                  producing them are incorrect, they can be misleading,
1377                  and they mess up read/write invariance.  */
1378               if (c < 128)
1379                 goto invalid_code;
1380             }
1381           else
1382             {
1383               ONE_MORE_BYTE (c3);
1384               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1385                 goto invalid_code;
1386               if (UTF_8_3_OCTET_LEADING_P (c1))
1387                 {
1388                   c = (((c1 & 0xF) << 12)
1389                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1390                   if (c < 0x800
1391                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1392                     goto invalid_code;
1393                 }
1394               else
1395                 {
1396                   ONE_MORE_BYTE (c4);
1397                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1398                     goto invalid_code;
1399                   if (UTF_8_4_OCTET_LEADING_P (c1))
1400                     {
1401                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1402                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1403                     if (c < 0x10000)
1404                       goto invalid_code;
1405                     }
1406                   else
1407                     {
1408                       ONE_MORE_BYTE (c5);
1409                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1410                         goto invalid_code;
1411                       if (UTF_8_5_OCTET_LEADING_P (c1))
1412                         {
1413                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1414                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1415                                | (c5 & 0x3F));
1416                           if ((c > MAX_CHAR) || (c < 0x200000))
1417                             goto invalid_code;
1418                         }
1419                       else
1420                         goto invalid_code;
1421                     }
1422                 }
1423             }
1424         }
1425
1426       *charbuf++ = c;
1427       continue;
1428
1429     invalid_code:
1430       src = src_base;
1431       consumed_chars = consumed_chars_base;
1432       ONE_MORE_BYTE (c);
1433       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1434       coding->errors++;
1435     }
1436
1437  no_more_source:
1438   coding->consumed_char += consumed_chars_base;
1439   coding->consumed = src_base - coding->source;
1440   coding->charbuf_used = charbuf - coding->charbuf;
1441 }
1442
1443
1444 static int
1445 encode_coding_utf_8 (struct coding_system *coding)
1446 {
1447   int multibytep = coding->dst_multibyte;
1448   int *charbuf = coding->charbuf;
1449   int *charbuf_end = charbuf + coding->charbuf_used;
1450   unsigned char *dst = coding->destination + coding->produced;
1451   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1452   EMACS_INT produced_chars = 0;
1453   int c;
1454
1455   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1456     {
1457       ASSURE_DESTINATION (3);
1458       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1459       CODING_UTF_8_BOM (coding) = utf_without_bom;
1460     }
1461
1462   if (multibytep)
1463     {
1464       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1465
1466       while (charbuf < charbuf_end)
1467         {
1468           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1469
1470           ASSURE_DESTINATION (safe_room);
1471           c = *charbuf++;
1472           if (CHAR_BYTE8_P (c))
1473             {
1474               c = CHAR_TO_BYTE8 (c);
1475               EMIT_ONE_BYTE (c);
1476             }
1477           else
1478             {
1479               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1480               for (p = str; p < pend; p++)
1481                 EMIT_ONE_BYTE (*p);
1482             }
1483         }
1484     }
1485   else
1486     {
1487       int safe_room = MAX_MULTIBYTE_LENGTH;
1488
1489       while (charbuf < charbuf_end)
1490         {
1491           ASSURE_DESTINATION (safe_room);
1492           c = *charbuf++;
1493           if (CHAR_BYTE8_P (c))
1494             *dst++ = CHAR_TO_BYTE8 (c);
1495           else
1496             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1497           produced_chars++;
1498         }
1499     }
1500   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1501   coding->produced_char += produced_chars;
1502   coding->produced = dst - coding->destination;
1503   return 0;
1504 }
1505
1506
1507 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1508    Check if a text is encoded in one of UTF-16 based coding systems.
1509    If it is, return 1, else return 0.  */
1510
/* The bits of a UTF-16 code unit VAL that tell the two kinds of
   surrogate apart.  */
#define UTF_16_SURROGATE_TAG(val) ((val) & 0xFC00)

/* Nonzero if VAL is a high (leading) surrogate.  */
#define UTF_16_HIGH_SURROGATE_P(val) (UTF_16_SURROGATE_TAG (val) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate.  */
#define UTF_16_LOW_SURROGATE_P(val) (UTF_16_SURROGATE_TAG (val) == 0xDC00)
1516
1517
/* Examine the source of CODING and record in DETECT_INFO which of the
   UTF-16 categories remain possible.

   CATEGORY_MASK_UTF_16 is always added to DETECT_INFO->checked.  If
   the first two bytes are a byte order mark, the matching signed
   category and CATEGORY_MASK_UTF_16_AUTO are marked as found and the
   remaining UTF-16 categories are rejected.  Otherwise the signed and
   auto categories are rejected, and each signature-less category is
   rejected once the bytes at the corresponding offsets take too many
   distinct values.

   Return 1 when control reaches the `no_more_source' label: either by
   falling through after a BOM was recognized or -- presumably -- by a
   jump out of TWO_MORE_BYTES when fewer than two bytes are left
   (NOTE(review): confirm against the macro definition).  Return 0 on
   every other path.  */

static int
detect_coding_utf_16 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  /* UTF-16 consists of two-byte units, so a final block whose
     src_chars count is odd is rejected outright.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* Bytes FF FE: little-endian byte order mark.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* Bytes FE FF: big-endian byte order mark.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      /* NOTE(review): a negative C2 presumably means TWO_MORE_BYTES
         met something other than a raw byte in a multibyte source --
         confirm.  In that case no UTF-16 variant is possible.  */
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* We check the dispersion of Eth and Oth bytes where E is even and
         O is odd.  If both are high, we assume binary data.

         E[B] (resp. O[B]) is 1 once byte value B has been seen at an
         even (resp. odd) offset; E_NUM and O_NUM count the distinct
         values seen so far.  They start at 1 because the first pair
         is recorded right below.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      /* No BOM was found, so only the signature-less variants can
         still apply.  */
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
                                |CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_LE);

      /* Scan until every UTF-16 category has been rejected or the
         input ends.  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
             != CATEGORY_MASK_UTF_16)
        {
          TWO_MORE_BYTES (c1, c2);
          if (c2 < 0)
            break;
          if (! e[c1])
            {
              e[c1] = 1;
              e_num++;
              /* 128 or more distinct values at even offsets rule out
                 big-endian UTF-16 without signature.  */
              if (e_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
            }
          if (! o[c2])
            {
              o[c2] = 1;
              o_num++;
              /* Likewise for odd offsets and little-endian UTF-16
                 without signature.  */
              if (o_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
            }
        }
      return 0;
    }

 no_more_source:
  return 1;
}
1600
1601 static void
1602 decode_coding_utf_16 (struct coding_system *coding)
1603 {
1604   const unsigned char *src = coding->source + coding->consumed;
1605   const unsigned char *src_end = coding->source + coding->src_bytes;
1606   const unsigned char *src_base;
1607   int *charbuf = coding->charbuf + coding->charbuf_used;
1608   /* We may produces at most 3 chars in one loop.  */
1609   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1610   EMACS_INT consumed_chars = 0, consumed_chars_base = 0;
1611   int multibytep = coding->src_multibyte;
1612   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1613   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1614   int surrogate = CODING_UTF_16_SURROGATE (coding);
1615   int eol_dos =
1616     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1617   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1618
1619   if (bom == utf_with_bom)
1620     {
1621       int c, c1, c2;
1622
1623       src_base = src;
1624       ONE_MORE_BYTE (c1);
1625       ONE_MORE_BYTE (c2);
1626       c = (c1 << 8) | c2;
1627
1628       if (endian == utf_16_big_endian
1629           ? c != 0xFEFF : c != 0xFFFE)
1630         {
1631           /* The first two bytes are not BOM.  Treat them as bytes
1632              for a normal character.  */
1633           src = src_base;
1634           coding->errors++;
1635         }
1636       CODING_UTF_16_BOM (coding) = utf_without_bom;
1637     }
1638   else if (bom == utf_detect_bom)
1639     {
1640       /* We have already tried to detect BOM and failed in
1641          detect_coding.  */
1642       CODING_UTF_16_BOM (coding) = utf_without_bom;
1643     }
1644
1645   while (1)
1646     {
1647       int c, c1, c2;
1648
1649       src_base = src;
1650       consumed_chars_base = consumed_chars;
1651
1652       if (charbuf >= charbuf_end)
1653         {
1654           if (byte_after_cr1 >= 0)
1655             src_base -= 2;
1656           break;
1657         }
1658
1659       if (byte_after_cr1 >= 0)
1660         c1 = byte_after_cr1, byte_after_cr1 = -1;
1661       else
1662         ONE_MORE_BYTE (c1);
1663       if (c1 < 0)
1664         {
1665           *charbuf++ = -c1;
1666           continue;
1667         }
1668       if (byte_after_cr2 >= 0)
1669         c2 = byte_after_cr2, byte_after_cr2 = -1;
1670       else
1671         ONE_MORE_BYTE (c2);
1672       if (c2 < 0)
1673         {
1674           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1675           *charbuf++ = -c2;
1676           continue;
1677         }
1678       c = (endian == utf_16_big_endian
1679            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1680
1681       if (surrogate)
1682         {
1683           if (! UTF_16_LOW_SURROGATE_P (c))
1684             {
1685               if (endian == utf_16_big_endian)
1686                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1687               else
1688                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1689               *charbuf++ = c1;
1690               *charbuf++ = c2;
1691               coding->errors++;
1692               if (UTF_16_HIGH_SURROGATE_P (c))
1693                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1694               else
1695                 *charbuf++ = c;
1696             }
1697           else
1698             {
1699               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1700               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1701               *charbuf++ = 0x10000 + c;
1702             }
1703         }
1704       else
1705         {
1706           if (UTF_16_HIGH_SURROGATE_P (c))
1707             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1708           else
1709             {
1710               if (eol_dos && c == '\r')
1711                 {
1712                   ONE_MORE_BYTE (byte_after_cr1);
1713                   ONE_MORE_BYTE (byte_after_cr2);
1714                 }
1715               *charbuf++ = c;
1716             }
1717         }
1718     }
1719
1720  no_more_source:
1721   coding->consumed_char += consumed_chars_base;
1722   coding->consumed = src_base - coding->source;
1723   coding->charbuf_used = charbuf - coding->charbuf;
1724 }
1725
1726 static int
1727 encode_coding_utf_16 (struct coding_system *coding)
1728 {
1729   int multibytep = coding->dst_multibyte;
1730   int *charbuf = coding->charbuf;
1731   int *charbuf_end = charbuf + coding->charbuf_used;
1732   unsigned char *dst = coding->destination + coding->produced;
1733   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1734   int safe_room = 8;
1735   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1736   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1737   EMACS_INT produced_chars = 0;
1738   int c;
1739
1740   if (bom != utf_without_bom)
1741     {
1742       ASSURE_DESTINATION (safe_room);
1743       if (big_endian)
1744         EMIT_TWO_BYTES (0xFE, 0xFF);
1745       else
1746         EMIT_TWO_BYTES (0xFF, 0xFE);
1747       CODING_UTF_16_BOM (coding) = utf_without_bom;
1748     }
1749
1750   while (charbuf < charbuf_end)
1751     {
1752       ASSURE_DESTINATION (safe_room);
1753       c = *charbuf++;
1754       if (c > MAX_UNICODE_CHAR)
1755         c = coding->default_char;
1756
1757       if (c < 0x10000)
1758         {
1759           if (big_endian)
1760             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1761           else
1762             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1763         }
1764       else
1765         {
1766           int c1, c2;
1767
1768           c -= 0x10000;
1769           c1 = (c >> 10) + 0xD800;
1770           c2 = (c & 0x3FF) + 0xDC00;
1771           if (big_endian)
1772             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1773           else
1774             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1775         }
1776     }
1777   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1778   coding->produced = dst - coding->destination;
1779   coding->produced_char += produced_chars;
1780   return 0;
1781 }
1782
1783 \f
1784 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1785
1786 /* Emacs' internal format for representation of multiple character
1787    sets is a kind of multi-byte encoding, i.e. characters are
1788    represented by variable-length sequences of one-byte codes.
1789
1790    ASCII characters and control characters (e.g. `tab', `newline') are
1791    represented by one-byte sequences which are their ASCII codes, in
1792    the range 0x00 through 0x7F.
1793
1794    8-bit characters of the range 0x80..0x9F are represented by
1795    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1796    code + 0x20).
1797
1798    8-bit characters of the range 0xA0..0xFF are represented by
1799    one-byte sequences which are their 8-bit code.
1800
1801    The other characters are represented by a sequence of `base
1802    leading-code', optional `extended leading-code', and one or two
1803    `position-code's.  The length of the sequence is determined by the
1804    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1805    whereas extended leading-code and position-code take the range 0xA0
1806    through 0xFF.  See `charset.h' for more details about leading-code
1807    and position-code.
1808
1809    --- CODE RANGE of Emacs' internal format ---
1810    character set        range
1811    -------------        -----
1812    ascii                0x00..0x7F
1813    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic    0xA0..0xFF
1815    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1816    ---------------------------------------------
1817
1818    As this is the internal character representation, the format is
1819    usually not used externally (i.e. in a file or in a data sent to a
1820    process).  But, it is possible to have a text externally in this
1821    format (i.e. by encoding by the coding system `emacs-mule').
1822
1823    In that case, a sequence of one-byte codes has a slightly different
1824    form.
1825
1826    At first, all characters in eight-bit-control are represented by
1827    one-byte sequences which are their 8-bit code.
1828
1829    Next, character composition data are represented by the byte
1830    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1831    where,
1832         METHOD is 0xF2 plus one of composition method (enum
1833         composition_method),
1834
1835         BYTES is 0xA0 plus a byte length of this composition data,
1836
1837         CHARS is 0xA0 plus a number of characters composed by this
1838         data,
1839
1840         COMPONENTs are characters of multibyte form or composition
1841         rules encoded by two-byte of ASCII codes.
1842
1843    In addition, for backward compatibility, the following formats are
1844    also recognized as composition data on decoding.
1845
1846    0x80 MSEQ ...
1847    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1848
1849    Here,
        MSEQ is a multibyte form, but in this special format:
1851           ASCII: 0xA0 ASCII_CODE+0x80,
1852           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1853         RULE is a one byte code of the range 0xA0..0xF0 that
1854         represents a composition rule.
1855   */
1856
/* Table indexed by byte value.  The code below uses
   emacs_mule_bytes[C] as the total length in bytes (1 through 4) of
   the emacs-mule sequence whose first byte is C.  The table is not
   filled in here (NOTE(review): presumably it is initialized
   elsewhere in this file -- confirm).  */
char emacs_mule_bytes[256];
1858
1859
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in `emacs-mule'.  If it is, return 1,
   else return 0.

   CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
   The category is added to DETECT_INFO->rejected whenever 0 is
   returned.  When 1 is returned, it is added to DETECT_INFO->found
   only if at least one multibyte sequence or composition was seen.

   NOTE(review): ONE_MORE_BYTE is defined elsewhere; the
   `no_more_source' label below is presumably reached by a jump out
   of that macro when the input is exhausted -- confirm.  */

static int
detect_coding_emacs_mule (struct coding_system *coding,
                          struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  EMACS_INT consumed_chars = 0;
  int c;
  /* Becomes CATEGORY_MASK_EMACS_MULE once positive evidence is seen.  */
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      /* Remember where the current sequence starts so that a
         truncated sequence can be recognized at `no_more_source'.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0)
        continue;
      if (c == 0x80)
        {
          /* Perhaps the start of composite character.  We simply skip
             it because analyzing it is too heavy for detecting.  But,
             at least, we check that the composite character
             consists of more than 4 bytes.  */
          const unsigned char *src_start;

        repeat:
          src_start = src;
          /* Skip the run of bytes >= 0xA0; C ends up holding the
             first byte that is not part of the run.  */
          do
            {
              ONE_MORE_BYTE (c);
            }
          while (c >= 0xA0);

          /* The distance includes the terminating byte just read.  */
          if (src - src_start <= 4)
            break;
          found = CATEGORY_MASK_EMACS_MULE;
          if (c == 0x80)
            goto repeat;
        }

      if (c < 0x80)
        {
          /* ESC, SI and SO rule out emacs-mule; any other byte below
             0x80 is accepted.  */
          if (c < 0x20
              && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
            break;
        }
      else
        {
          /* C is a leading byte; all the bytes that follow it in the
             sequence must be >= 0xA0.  */
          int more_bytes = emacs_mule_bytes[c] - 1;

          while (more_bytes > 0)
            {
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                {
                  src--;        /* Unread the last byte.  */
                  break;
                }
              more_bytes--;
            }
          /* Nonzero here means the sequence was cut short by a byte
             below 0xA0, or the table entry for C was 0.  */
          if (more_bytes != 0)
            break;
          found = CATEGORY_MASK_EMACS_MULE;
        }
    }
  /* Reached only through one of the `break's above.  */
  detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
  return 0;

 no_more_source:
  /* Input that ends in the middle of a sequence is acceptable only if
     more blocks are to follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
      return 0;
    }
  detect_info->found |= found;
  return 1;
}
1945
1946
/* Parse emacs-mule multibyte sequence at SRC and return the decoded
   character.  If CMP_STATUS indicates that we must expect MSEQ or
   RULE described above, decode it and return the negative value of
   the decoded character or rule.  If an invalid byte is found, return
   -1.  If SRC is too short, return -2.

   On success, *NBYTES is set to the number of source bytes parsed and
   *NCHARS to the value of CONSUMED_CHARS; *ID, if ID is non-null, is
   set to the ID of the charset of the character.  When a RULE byte is
   returned (negated), only *NBYTES and *NCHARS are set.  Nothing is
   stored on the -1 and -2 returns.

   NOTE(review): the -1 and -2 status codes share the negative range
   with negated MSEQ/RULE values; how callers tell them apart is not
   visible here -- confirm.

   NOTE(review): ONE_MORE_BYTE is defined elsewhere; it presumably
   updates CONSUMED_CHARS and jumps to `no_more_source' at the end of
   the input -- confirm.  */

static int
emacs_mule_char (struct coding_system *coding, const unsigned char *src,
                 int *nbytes, int *nchars, int *id,
                 struct composition_status *cmp_status)
{
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base = src;
  int multibytep = coding->src_multibyte;
  int charset_ID;
  unsigned code;
  int c;
  int consumed_chars = 0;
  /* Set to 1 when the sequence was in the old MSEQ form, in which
     case the decoded character is returned negated.  */
  int mseq_found = 0;

  ONE_MORE_BYTE (c);
  if (c < 0)
    {
      /* Not a plain byte: return the negated value unchanged and
         report the charset registered at emacs_mule_charset[0].  */
      c = -c;
      charset_ID = emacs_mule_charset[0];
    }
  else
    {
      if (c >= 0xA0)
        {
          /* A first byte >= 0xA0 is valid only inside an old-form
             composition.  */
          if (cmp_status->state != COMPOSING_NO
              && cmp_status->old_form)
            {
              if (cmp_status->state == COMPOSING_CHAR)
                {
                  /* MSEQ: map the byte back to a normal leading byte
                     (ASCII is 0xA0 followed by ASCII_CODE + 0x80;
                     anything else is LEADING_CODE + 0x20).  */
                  if (c == 0xA0)
                    {
                      ONE_MORE_BYTE (c);
                      c -= 0x80;
                      if (c < 0)
                        goto invalid_code;
                    }
                  else
                    c -= 0x20;
                  mseq_found = 1;
                }
              else
                {
                  /* In any other composing state the byte is handed
                     back negated without further decoding.  */
                  *nbytes = src - src_base;
                  *nchars = consumed_chars;
                  return -c;
                }
            }
          else
            goto invalid_code;
        }

      /* Dispatch on the total length of the sequence that the leading
         byte C announces.  Every position-code must be >= 0xA0, and
         only its low 7 bits go into CODE.  */
      switch (emacs_mule_bytes[c])
        {
        case 2:
          /* Leading byte plus one position-code.  */
          if ((charset_ID = emacs_mule_charset[c]) < 0)
            goto invalid_code;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code = c & 0x7F;
          break;

        case 3:
          if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
              || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
            {
              /* Private leading byte: the next byte selects the
                 charset and one position-code follows.  */
              ONE_MORE_BYTE (c);
              if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
                goto invalid_code;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code = c & 0x7F;
            }
          else
            {
              /* Leading byte plus two position-codes.  */
              if ((charset_ID = emacs_mule_charset[c]) < 0)
                goto invalid_code;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code = (c & 0x7F) << 8;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code |= c & 0x7F;
            }
          break;

        case 4:
          /* Leading byte, a byte selecting the charset, and two
             position-codes.  */
          ONE_MORE_BYTE (c);
          if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
            goto invalid_code;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code = (c & 0x7F) << 8;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code |= c & 0x7F;
          break;

        case 1:
          /* Single byte: ASCII or a raw eight-bit byte.  */
          code = c;
          charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
          break;

        default:
          /* Any other table value is treated as a fatal error.  */
          abort ();
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end,
                          CHARSET_FROM_ID (charset_ID), code, c);
      if (c < 0)
        goto invalid_code;
    }
  *nbytes = src - src_base;
  *nchars = consumed_chars;
  if (id)
    *id = charset_ID;
  return (mseq_found ? -c : c);

 no_more_source:
  return -2;

 invalid_code:
  return -1;
}
2081
2082
2083 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2084
/* Handle these composition sequences ('|': the end of header elements,
   BYTES and CHARS >= 0xA0):
2087
2088    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2089    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2090    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2091
   and these old forms:
2093
2094    (4) relative composition: 0x80 | MSEQ ... MSEQ
2095    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2096
2097    When the starter 0x80 and the following header elements are found,
2098    this annotation header is produced.
2099
2100         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2101
2102    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2103    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2104
2105    Then, upon reading the following elements, these codes are produced
2106    until the composition end is found:
2107
2108    (1) CHAR ... CHAR
2109    (2) ALT ... ALT CHAR ... CHAR
2110    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2111    (4) CHAR ... CHAR
2112    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2113
   When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:
2116
2117    (1) LENGTH: unchanged, NCHARS: unchanged
2118    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2119    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2120    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2121    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2122
2123    If an error is found while composing, the annotation header is
2124    changed to the original composition header (plus filler -1s) as
2125    below:
2126
2127    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
   (5)          [ 0x80 0xFF -1 -1 -1 ]
2129
2130    and the sequence [ -2 DECODED-RULE ] is changed to the original
2131    byte sequence as below:
2132         o the original byte sequence is B: [ B -1 ]
2133         o the original byte sequence is B1 B2: [ B1 B2 ]
2134
2135    Most of the routines are implemented by macros because many
2136    variables and labels in the caller decode_coding_emacs_mule must be
2137    accessible, and they are usually called just once (thus doesn't
2138    increase the size of compiled object).  */
2139
/* Decode an Emacs 20 style composition rule.  C holds the rule byte;
   it is reduced by 0xA0 in place and must then be in the range 0..80,
   otherwise control jumps to the `invalid_code' label of the
   expansion site.  GREF and NREF are the quotient and the remainder
   of that value divided by 9, except that a value of 4 is replaced by
   10.  RULE is set to COMPOSITION_ENCODE_RULE (GREF, NREF).  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (! (c >= 0 && c < 81))                           \
      goto invalid_code;                                \
    gref = c / 9;                                       \
    nref = c % 9;                                       \
    gref = (gref == 4 ? 10 : gref);                     \
    nref = (nref == 4 ? 10 : nref);                     \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2156
2157
/* Decode a composition rule represented by C and the following byte
   at SRC as a component of composition sequence of Emacs 21 style.
   Set RULE to the decoded rule.

   GREF is C - 0x20 and NREF is the next byte (read into C with
   ONE_MORE_BYTE) minus 0x20.  Each must be in the range 0..80,
   otherwise control jumps to the `invalid_code' label of the
   expansion site.  RULE is COMPOSITION_ENCODE_RULE (GREF, NREF).  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    gref = c - 0x20;                                    \
    if (gref < 0 || gref >= 81)                         \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (nref < 0 || nref >= 81)                         \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2175
2176
/* Start of Emacs 21 style format.  On entry C holds the method byte
   (0xF2 + METHOD).  The next two bytes, read here, are BYTES + 0xA0
   and CHARS + 0xA0, where BYTES is the byte length of this
   composition information and CHARS is the number of characters
   composed by this composition.

   Control jumps to the `invalid_code' label of the expansion site if
   the first of these bytes reads as negative, if BYTES is less than 3
   (or different from 4 for a relative composition), or if CHARS is
   not in the range 1..MAX_COMPOSITION_COMPONENTS - 1.  Otherwise
   *CMP_STATUS is initialized for a new-form composition and the
   annotation header is appended with ADD_COMPOSITION_DATA.

   The expansion uses the variables C, CHARBUF and CMP_STATUS of the
   expansion site, plus whatever ONE_MORE_BYTE requires.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    if (method == COMPOSITION_RELATIVE)                                 \
      cmp_status->state = COMPOSING_CHAR;                               \
    else                                                                \
      cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2208
2209
/* Start of Emacs 20 style format for relative composition.
   Initialize *CMP_STATUS as an old-form COMPOSITION_RELATIVE
   composition in state COMPOSING_CHAR with no characters or
   components counted yet, and append an annotation header whose
   NCHARS and NBYTES are both 0.  Uses the variables CHARBUF and
   CMP_STATUS of the expansion site.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->nchars = cmp_status->ncomps = 0;                \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2221
2222
/* Start of Emacs 20 style format for rule-base composition.
   Initialize *CMP_STATUS as an old-form COMPOSITION_WITH_RULE
   composition in state COMPOSING_CHAR with no characters or
   components counted yet, and append an annotation header whose
   NCHARS and NBYTES are both 0.  Uses the variables CHARBUF and
   CMP_STATUS of the expansion site.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->nchars = cmp_status->ncomps = 0;                \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2234
2235
/* Start a composition.  The byte that follows at SRC is read into C
   and selects the format:

     C - 0xF2 within COMPOSITION_RELATIVE ..
       COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 style header.
     0xA0 <= C < 0xC0: Emacs 20 style relative composition.  C is
       already the first component, so SRC is moved back for C to be
       read again.
     C == 0xFF: Emacs 20 style rule-base composition.

   For a negative C or any other value, control jumps to the
   `invalid_code' label of the expansion site.  Uses the variables C
   and SRC of the expansion site, plus whatever the macros invoked
   here require.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *current_src = src;             \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c < 0xA0)                                  \
      goto invalid_code;                                \
    else if (c < 0xC0)                                  \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* Re-read C as a composition component.  */    \
        src = current_src;                              \
      }                                                 \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2259
/* Finish the composition in progress and patch its annotation header,
   which starts CMP_STATUS->length elements before CHARBUF.  For an
   old-form composition, header element 2 (NCHARS) is set to the
   number of characters counted in CMP_STATUS.  For a new-form
   composition whose method is beyond COMPOSITION_RELATIVE, header
   element 0 is set to NCHARS minus CMP_STATUS->length.  In every case
   the state becomes COMPOSING_NO.  Uses the variables CHARBUF and
   CMP_STATUS of the expansion site.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int idx = - cmp_status->length;                             \
                                                                \
    if (cmp_status->old_form)                                   \
      charbuf[idx + 2] = cmp_status->nchars;                    \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2270
2271
2272 static int
2273 emacs_mule_finish_composition (int *charbuf,
2274                                struct composition_status *cmp_status)
2275 {
2276   int idx = - cmp_status->length;
2277   int new_chars;
2278
2279   if (cmp_status->old_form && cmp_status->nchars > 0)
2280     {
2281       charbuf[idx + 2] = cmp_status->nchars;
2282       new_chars = 0;
2283       if (cmp_status->method == COMPOSITION_WITH_RULE
2284           && cmp_status->state == COMPOSING_CHAR)
2285         {
2286           /* The last rule was invalid.  */
2287           int rule = charbuf[-1] + 0xA0;
2288
2289           charbuf[-2] = BYTE8_TO_CHAR (rule);
2290           charbuf[-1] = -1;
2291           new_chars = 1;
2292         }
2293     }
2294   else
2295     {
2296       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2297
2298       if (cmp_status->method == COMPOSITION_WITH_RULE)
2299         {
2300           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2301           charbuf[idx++] = -3;
2302           charbuf[idx++] = 0;
2303           new_chars = 1;
2304         }
2305       else
2306         {
2307           int nchars = charbuf[idx + 1] + 0xA0;
2308           int nbytes = charbuf[idx + 2] + 0xA0;
2309
2310           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2311           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2312           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2313           charbuf[idx++] = -1;
2314           new_chars = 4;
2315         }
2316     }
2317   cmp_status->state = COMPOSING_NO;
2318   return new_chars;
2319 }
2320
/* If a composition is in progress, finish it with
   emacs_mule_finish_composition and add the value it returns to
   CHAR_OFFSET.  Uses the variables CHARBUF, CMP_STATUS and
   CHAR_OFFSET of the expansion site.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
  do {                                                                    \
    if (cmp_status->state != COMPOSING_NO)                                \
      char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
  } while (0)
2326
2327
     /* Decode emacs-mule encoded bytes from CODING->source, beginning at
        byte offset CODING->consumed, and append the resulting character
        codes and annotations to CODING->charbuf, beginning at index
        CODING->charbuf_used.

        The composition state is kept in
        CODING->spec.emacs_mule.cmp_status.  A composition that is still
        open when decoding stops (and this is not the last block) is
        moved out of the output into the `carryover' array of that
        structure and put back at the start of the next call.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated.  CODING->errors is incremented
        once for each byte that is passed through as an invalid code.  */
2328 static void
2329 decode_coding_emacs_mule (struct coding_system *coding)
2330 {
2331   const unsigned char *src = coding->source + coding->consumed;
2332   const unsigned char *src_end = coding->source + coding->src_bytes;
2333   const unsigned char *src_base;
2334   int *charbuf = coding->charbuf + coding->charbuf_used;
2335   /* We may produce two annotations (charset and composition) in one
2336      loop and one more charset annotation at the end.  */
2337   int *charbuf_end
2338     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2339       /* We can produce up to 2 characters in a loop.  */
2340       - 1;
2341   EMACS_INT consumed_chars = 0, consumed_chars_base;
2342   int multibytep = coding->src_multibyte;
2343   EMACS_INT char_offset = coding->produced_char;
       /* LAST_ID is the charset of the current run of characters and
          LAST_OFFSET the character offset at which that run started; a
          charset annotation is emitted whenever a non-ASCII run ends.  */
2344   EMACS_INT last_offset = char_offset;
2345   int last_id = charset_ascii;
2346   int eol_dos =
2347     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_DOS is set, or -1 if none.  */
2348   int byte_after_cr = -1;
2349   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2350
       /* A composition was left open by the previous call: put the
          elements saved in `carryover' back at the head of the output.  */
2351   if (cmp_status->state != COMPOSING_NO)
2352     {
2353       int i;
2354
2355       if (charbuf_end - charbuf < cmp_status->length)
2356         abort ();
2357       for (i = 0; i < cmp_status->length; i++)
2358         *charbuf++ = cmp_status->carryover[i];
2359       coding->annotated = 1;
2360     }
2361
2362   while (1)
2363     {
2364       int c, id IF_LINT (= 0);
2365
           /* Remember where this unit starts, so that it can be re-read
              (invalid code) or left unconsumed (output full or source
              exhausted).  */
2366       src_base = src;
2367       consumed_chars_base = consumed_chars;
2368
           /* The output buffer is full.  If a byte was read ahead after a
              CR, step SRC_BASE back by one so that byte is not reported
              as consumed.  */
2369       if (charbuf >= charbuf_end)
2370         {
2371           if (byte_after_cr >= 0)
2372             src_base--;
2373           break;
2374         }
2375
           /* Use the byte read ahead in the previous iteration, if any.  */
2376       if (byte_after_cr >= 0)
2377         c = byte_after_cr, byte_after_cr = -1;
2378       else
2379         ONE_MORE_BYTE (c);
2380
           /* A negative value, or the byte 0x80 which is handed to
              DECODE_EMACS_MULE_COMPOSITION_START: in both cases any
              composition in progress ends here.  */
2381       if (c < 0 || c == 0x80)
2382         {
2383           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2384           if (c < 0)
2385             {
                   /* Store the negated value as one character.  */
2386               *charbuf++ = -c;
2387               char_offset++;
2388             }
2389           else
2390             DECODE_EMACS_MULE_COMPOSITION_START ();
2391           continue;
2392         }
2393
2394       if (c < 0x80)
2395         {
               /* With DOS end-of-line, read the byte following a CR now
                  and keep it in BYTE_AFTER_CR for the next iteration.  */
2396           if (eol_dos && c == '\r')
2397             ONE_MORE_BYTE (byte_after_cr);
2398           id = charset_ascii;
2399           if (cmp_status->state != COMPOSING_NO)
2400             {
                   /* An ASCII byte ends an old-form composition; in the
                      component states it counts against NCOMPS.  */
2401               if (cmp_status->old_form)
2402                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2403               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2404                 cmp_status->ncomps--;
2405             }
2406         }
2407       else
2408         {
2409           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2410           /* emacs_mule_char can load a charset map from a file, which
2411              allocates a large structure and might cause buffer text
2412              to be relocated as result.  Thus, we need to remember the
2413              original pointer to buffer text, and fix up all related
2414              pointers after the call.  */
2415           const unsigned char *orig = coding->source;
2416           EMACS_INT offset;
2417
2418           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2419                                cmp_status);
2420           offset = coding->source - orig;
2421           if (offset)
2422             {
2423               src += offset;
2424               src_base += offset;
2425               src_end += offset;
2426             }
               /* -1 is handled as an invalid code; -2 stops decoding with
                  this unit left unconsumed.  Other negative values fall
                  through (see the comment after this block).  */
2427           if (c < 0)
2428             {
2429               if (c == -1)
2430                 goto invalid_code;
2431               if (c == -2)
2432                 break;
2433             }
2434           src = src_base + nbytes;
2435           consumed_chars = consumed_chars_base + nchars;
2436           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2437             cmp_status->ncomps -= nchars;
2438         }
2439
2440       /* Now if C >= 0, we found a normally encoded character, if C <
2441          0, we found an old-style composition component character or
2442          rule.  */
2443
2444       if (cmp_status->state == COMPOSING_NO)
2445         {
               /* Plain character.  When the charset changes, close the
                  previous non-ASCII run with a charset annotation.  */
2446           if (last_id != id)
2447             {
2448               if (last_id != charset_ascii)
2449                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2450                                   last_id);
2451               last_id = id;
2452               last_offset = char_offset;
2453             }
2454           *charbuf++ = c;
2455           char_offset++;
2456         }
2457       else if (cmp_status->state == COMPOSING_CHAR)
2458         {
2459           if (cmp_status->old_form)
2460             {
                   /* Old form: a normally encoded character ends the
                      composition; a negative C is one more component,
                      with NCHARS counting up to the maximum.  */
2461               if (c >= 0)
2462                 {
2463                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2464                   *charbuf++ = c;
2465                   char_offset++;
2466                 }
2467               else
2468                 {
2469                   *charbuf++ = -c;
2470                   cmp_status->nchars++;
2471                   cmp_status->length++;
2472                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2473                     EMACS_MULE_COMPOSITION_END ();
2474                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2475                     cmp_status->state = COMPOSING_RULE;
2476                 }
2477             }
2478           else
2479             {
                   /* Here NCHARS counts down; the composition ends when
                      it reaches zero.  */
2480               *charbuf++ = c;
2481               cmp_status->length++;
2482               cmp_status->nchars--;
2483               if (cmp_status->nchars == 0)
2484                 EMACS_MULE_COMPOSITION_END ();
2485             }
2486         }
2487       else if (cmp_status->state == COMPOSING_RULE)
2488         {
2489           int rule;
2490
               /* A rule is expected.  A normally encoded character ends
                  the composition instead; otherwise the rule is stored
                  as the pair -2, RULE.  */
2491           if (c >= 0)
2492             {
2493               EMACS_MULE_COMPOSITION_END ();
2494               *charbuf++ = c;
2495               char_offset++;
2496             }
2497           else
2498             {
2499               c = -c;
2500               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2501               if (rule < 0)
2502                 goto invalid_code;
2503               *charbuf++ = -2;
2504               *charbuf++ = rule;
2505               cmp_status->length += 2;
2506               cmp_status->state = COMPOSING_CHAR;
2507             }
2508         }
2509       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2510         {
               /* NCOMPS was already decremented above for this unit.
                  Zero means the components are complete; a negative
                  count ends the composition.  */
2511           *charbuf++ = c;
2512           cmp_status->length++;
2513           if (cmp_status->ncomps == 0)
2514             cmp_status->state = COMPOSING_CHAR;
2515           else if (cmp_status->ncomps > 0)
2516             {
2517               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2518                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2519             }
2520           else
2521             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2522         }
2523       else                      /* COMPOSING_COMPONENT_RULE */
2524         {
2525           int rule;
2526
2527           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2528           if (rule < 0)
2529             goto invalid_code;
2530           *charbuf++ = -2;
2531           *charbuf++ = rule;
2532           cmp_status->length += 2;
2533           cmp_status->ncomps--;
2534           if (cmp_status->ncomps > 0)
2535             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2536           else
2537             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2538         }
2539       continue;
2540
           /* Close any open composition, go back to the start of the
              offending unit and emit its first byte alone: unchanged if
              ASCII, otherwise converted with BYTE8_TO_CHAR.  The bytes
              that follow are decoded again by the next iterations.  */
2541     invalid_code:
2542       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2543       src = src_base;
2544       consumed_chars = consumed_chars_base;
2545       ONE_MORE_BYTE (c);
2546       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2547       char_offset++;
2548       coding->errors++;
2549     }
2550
       /* Reached when the loop above is left with `break'.  No goto to
          this label is visible in this function; presumably the
          ONE_MORE_BYTE macro jumps here when the source is exhausted --
          confirm at its definition.  */
2551  no_more_source:
2552   if (cmp_status->state != COMPOSING_NO)
2553     {
2554       if (coding->mode & CODING_MODE_LAST_BLOCK)
2555         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2556       else
2557         {
2558           int i;
2559
               /* More input may follow: take the elements of the
                  unfinished composition out of the output and save them
                  for the next call.  */
2560           charbuf -= cmp_status->length;
2561           for (i = 0; i < cmp_status->length; i++)
2562             cmp_status->carryover[i] = charbuf[i];
2563         }
2564     }
2565   if (last_id != charset_ascii)
2566     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Report as consumed only the input up to the start of the last
          unit, which may not have been fully processed.  */
2567   coding->consumed_char += consumed_chars_base;
2568   coding->consumed = src_base - coding->source;
2569   coding->charbuf_used = charbuf - coding->charbuf;
2570 }
2571
2572
     /* Store in CODES[0] and CODES[1] the leading code(s) for the
        emacs-mule charset id ID.

        If ID is less than 0xA0, ID itself is the only leading code and
        CODES[1] is set to 0.  Otherwise CODES[1] is ID and CODES[0] is
        0x9A for ID below 0xE0, 0x9B for ID below 0xF0, 0x9C for ID below
        0xF5, and 0x9D for anything else.

        ID and CODES are expanded without parentheses and ID is
        evaluated more than once, so pass simple expressions without
        side effects.

        NOTE(review): the expansion ends in `while (0);', including the
        semicolon.  An invocation written with its own semicolon is
        therefore two statements and cannot be the unbraced body of an
        `if' that has an `else'.  */
2573 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2574   do {                                          \
2575     if (id < 0xA0)                              \
2576       codes[0] = id, codes[1] = 0;              \
2577     else if (id < 0xE0)                         \
2578       codes[0] = 0x9A, codes[1] = id;           \
2579     else if (id < 0xF0)                         \
2580       codes[0] = 0x9B, codes[1] = id;           \
2581     else if (id < 0xF5)                         \
2582       codes[0] = 0x9C, codes[1] = id;           \
2583     else                                        \
2584       codes[0] = 0x9D, codes[1] = id;           \
2585   } while (0);
2586
2587
     /* Encode the contents of CODING->charbuf (CODING->charbuf_used
        elements) in emacs-mule format and write the bytes to
        CODING->destination, starting at offset CODING->produced.

        A negative element of charbuf starts an annotation; any other
        element is a character.  A charset annotation selects the charset
        preferred for the characters that follow, provided it is a member
        of Vemacs_mule_charset_list.  Composition annotations are
        skipped.

        Records CODING_RESULT_SUCCESS, updates CODING->produced_char and
        CODING->produced, and returns 0.  */
2588 static int
2589 encode_coding_emacs_mule (struct coding_system *coding)
2590 {
2591   int multibytep = coding->dst_multibyte;
2592   int *charbuf = coding->charbuf;
2593   int *charbuf_end = charbuf + coding->charbuf_used;
2594   unsigned char *dst = coding->destination + coding->produced;
2595   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each element;
          presumably an upper bound on the bytes one iteration can
          write -- confirm against the EMIT_* macros.  */
2596   int safe_room = 8;
2597   EMACS_INT produced_chars = 0;
2598   Lisp_Object attrs, charset_list;
2599   int c;
2600   int preferred_charset_id = -1;
2601
       /* This encoder always works with Vemacs_mule_charset_list; store
          it in the coding system's attributes if it is not there yet.  */
2602   CODING_GET_INFO (coding, attrs, charset_list);
2603   if (! EQ (charset_list, Vemacs_mule_charset_list))
2604     {
2605       CODING_ATTR_CHARSET_LIST (attrs)
2606         = charset_list = Vemacs_mule_charset_list;
2607     }
2608
2609   while (charbuf < charbuf_end)
2610     {
2611       ASSURE_DESTINATION (safe_room);
2612       c = *charbuf++;
2613
2614       if (c < 0)
2615         {
2616           /* Handle an annotation.  */
               /* CHARBUF now points at the element after C, which holds
                  the annotation type.  -C is the total number of
                  elements in the annotation, C itself included.  */
2617           switch (*charbuf)
2618             {
2619             case CODING_ANNOTATE_COMPOSITION_MASK:
2620               /* Not yet implemented.  */
2621               break;
2622             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Ignore the requested charset unless it is in
                      CHARSET_LIST.  */
2623               preferred_charset_id = charbuf[3];
2624               if (preferred_charset_id >= 0
2625                   && NILP (Fmemq (make_number (preferred_charset_id),
2626                                   charset_list)))
2627                 preferred_charset_id = -1;
2628               break;
2629             default:
2630               abort ();
2631             }
               /* Skip the remaining elements of the annotation.  */
2632           charbuf += -c - 1;
2633           continue;
2634         }
2635
2636       if (ASCII_CHAR_P (c))
2637         EMIT_ONE_ASCII_BYTE (c);
2638       else if (CHAR_BYTE8_P (c))
2639         {
2640           c = CHAR_TO_BYTE8 (c);
2641           EMIT_ONE_BYTE (c);
2642         }
2643       else
2644         {
2645           struct charset *charset;
2646           unsigned code;
2647           int dimension;
2648           int emacs_mule_id;
2649           unsigned char leading_codes[2];
2650
               /* Try the preferred charset first, then the first charset
                  in CHARSET_LIST that contains C.  */
2651           if (preferred_charset_id >= 0)
2652             {
2653               charset = CHARSET_FROM_ID (preferred_charset_id);
2654               if (CHAR_CHARSET_P (c, charset))
2655                 code = ENCODE_CHAR (charset, c);
2656               else
2657                 charset = char_charset (c, charset_list, &code);
2658             }
2659           else
2660             charset = char_charset (c, charset_list, &code);
               /* No charset can encode C: encode CODING->default_char
                  in its place.
                  NOTE(review): if char_charset also returns null for the
                  default character, CHARSET is dereferenced below while
                  null -- confirm that this cannot happen.  */
2661           if (! charset)
2662             {
2663               c = coding->default_char;
2664               if (ASCII_CHAR_P (c))
2665                 {
2666                   EMIT_ONE_ASCII_BYTE (c);
2667                   continue;
2668                 }
2669               charset = char_charset (c, charset_list, &code);
2670             }
               /* Output one or two leading codes, then one or two code
                  bytes (by charset dimension) with the high bit set.  */
2671           dimension = CHARSET_DIMENSION (charset);
2672           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2673           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2674           EMIT_ONE_BYTE (leading_codes[0]);
2675           if (leading_codes[1])
2676             EMIT_ONE_BYTE (leading_codes[1]);
2677           if (dimension == 1)
2678             EMIT_ONE_BYTE (code | 0x80);
2679           else
2680             {
2681               code |= 0x8080;
2682               EMIT_ONE_BYTE (code >> 8);
2683               EMIT_ONE_BYTE (code & 0xFF);
2684             }
2685         }
2686     }
2687   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2688   coding->produced_char += produced_chars;
2689   coding->produced = dst - coding->destination;
2690   return 0;
2691 }
2692
2693 \f
2694 /*** 7. ISO2022 handlers ***/
2695
2696 /* The following note describes the coding system ISO2022 briefly.
2697    Since the intention of this note is to help understand the
2698    functions in this file, some parts are NOT ACCURATE or are OVERLY
2699    SIMPLIFIED.  For thorough understanding, please refer to the
2700    original document of ISO2022.  This is equivalent to the standard
2701    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2702
2703    ISO2022 provides many mechanisms to encode several character sets
2704    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2705    is encoded using bytes less than 128.  This may make the encoded
2706    text a little bit longer, but the text passes more easily through
2707    several types of gateway, some of which strip off the MSB (Most
2708    Significant Bit).
2709
2710    There are two kinds of character sets: control character sets and
2711    graphic character sets.  The former contain control characters such
2712    as `newline' and `escape' to provide control functions (control
2713    functions are also provided by escape sequences).  The latter
2714    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2715    two control character sets and many graphic character sets.
2716
2717    Graphic character sets are classified into one of the following
2718    four classes, according to the number of bytes (DIMENSION) and
2719    number of characters in one dimension (CHARS) of the set:
2720    - DIMENSION1_CHARS94
2721    - DIMENSION1_CHARS96
2722    - DIMENSION2_CHARS94
2723    - DIMENSION2_CHARS96
2724
2725    In addition, each character set is assigned an identification tag,
2726    unique for each set, called the "final character" (denoted as <F>
2727    hereafter).  The <F> of each character set is decided by ECMA(*)
2728    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2729    (0x30..0x3F are for private use only).
2730
2731    Note (*): ECMA = European Computer Manufacturers Association
2732
2733    Here are examples of graphic character sets [NAME(<F>)]:
2734         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2735         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2736         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2737         o DIMENSION2_CHARS96 -- none for the moment
2738
2739    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2740         C0 [0x00..0x1F] -- control character plane 0
2741         GL [0x20..0x7F] -- graphic character plane 0
2742         C1 [0x80..0x9F] -- control character plane 1
2743         GR [0xA0..0xFF] -- graphic character plane 1
2744
2745    A control character set is directly designated and invoked to C0 or
2746    C1 by an escape sequence.  The most common case is that:
2747    - ISO646's  control character set is designated/invoked to C0, and
2748    - ISO6429's control character set is designated/invoked to C1,
2749    and usually these designations/invocations are omitted in encoded
2750    text.  In a 7-bit environment, only C0 can be used, and a control
2751    character for C1 is encoded by an appropriate escape sequence to
2752    fit into the environment.  All control characters for C1 are
2753    defined to have corresponding escape sequences.
2754
2755    A graphic character set is at first designated to one of four
2756    graphic registers (G0 through G3), then these graphic registers are
2757    invoked to GL or GR.  These designations and invocations can be
2758    done independently.  The most common case is that G0 is invoked to
2759    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2760    these invocations and designations are omitted in encoded text.
2761    In a 7-bit environment, only GL can be used.
2762
2763    When a graphic character set of CHARS94 is invoked to GL, codes
2764    0x20 and 0x7F of the GL area work as control characters SPACE and
2765    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2766    be used.
2767
2768    There are two ways of invocation: locking-shift and single-shift.
2769    With locking-shift, the invocation lasts until the next different
2770    invocation, whereas with single-shift, the invocation affects the
2771    following character only and doesn't affect the locking-shift
2772    state.  Invocations are done by the following control characters or
2773    escape sequences:
2774
2775    ----------------------------------------------------------------------
2776    abbrev  function                  cntrl escape seq   description
2777    ----------------------------------------------------------------------
2778    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2779    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2780    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2781    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2782    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2783    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2784    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2785    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2786    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2787    ----------------------------------------------------------------------
2788    (*) These are not used by any known coding system.
2789
2790    Control characters for these functions are defined by macros
2791    ISO_CODE_XXX in `coding.h'.
2792
2793    Designations are done by the following escape sequences:
2794    ----------------------------------------------------------------------
2795    escape sequence      description
2796    ----------------------------------------------------------------------
2797    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2798    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2799    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2800    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2801    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2802    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2803    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2804    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2805    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2806    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2807    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2808    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2809    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2810    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2811    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2812    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2813    ----------------------------------------------------------------------
2814
2815    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2816    of dimension 1, chars 94, and final character <F>, etc...
2817
2818    Note (*): Although these designations are not allowed in ISO2022,
2819    Emacs accepts them on decoding, and produces them on encoding
2820    CHARS96 character sets in a coding system which is characterized as
2821    7-bit environment, non-locking-shift, and non-single-shift.
2822
2823    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2824    '(' must be omitted.  We refer to this as "short-form" hereafter.
2825
2826    Now you may notice that there are a lot of ways of encoding the
2827    same multilingual text in ISO2022.  Actually, there exist many
2828    coding systems such as Compound Text (used in X11's inter client
2829    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2830    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2831    localized platforms), and all of these are variants of ISO2022.
2832
2833    In addition to the above, Emacs handles two more kinds of escape
2834    sequences: ISO6429's direction specification and Emacs' private
2835    sequence for specifying character composition.
2836
2837    ISO6429's direction specification takes the following form:
2838         o CSI ']'      -- end of the current direction
2839         o CSI '0' ']'  -- end of the current direction
2840         o CSI '1' ']'  -- start of left-to-right text
2841         o CSI '2' ']'  -- start of right-to-left text
2842    The control character CSI (0x9B: control sequence introducer) is
2843    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2844
2845    Character composition specification takes the following form:
2846         o ESC '0' -- start relative composition
2847         o ESC '1' -- end composition
2848         o ESC '2' -- start rule-base composition (*)
2849         o ESC '3' -- start relative composition with alternate chars  (**)
2850         o ESC '4' -- start rule-base composition with alternate chars  (**)
2851   Since these are not standard escape sequences of any ISO standard,
2852   the use of them with these meanings is restricted to Emacs only.
2853
2854   (*) This form is used only in Emacs 20.7 and older versions,
2855   but newer versions can safely decode it.
2856   (**) This form is used only in Emacs 21.1 and newer versions,
2857   and older versions can't decode it.
2858
2859   Here's a list of example usages of these composition escape
2860   sequences (categorized by `enum composition_method').
2861
2862   COMPOSITION_RELATIVE:
2863         ESC 0 CHAR [ CHAR ] ESC 1
2864   COMPOSITION_WITH_RULE:
2865         ESC 2 CHAR [ RULE CHAR ] ESC 1
2866   COMPOSITION_WITH_ALTCHARS:
2867         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2868   COMPOSITION_WITH_RULE_ALTCHARS:
2869         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2870
     /* Table of 256 `enum iso_code_class_type' values.  Presumably it is
        indexed by byte value and gives the ISO 2022 class of each byte;
        no use of it is visible near this declaration, so confirm at the
        places that fill and read it.  */
2871 static enum iso_code_class_type iso_code_class[256];
2872
     /* Nonzero if the charset whose id is ID is safe for CODING: ID must
        not exceed CODING->max_charset_id and its entry in
        CODING->safe_charsets must not be 255.  255 is the value
        setup_iso_safe_charsets leaves for a charset to which it assigns
        no register.

        ID is evaluated twice and is not checked against a lower bound.  */
2873 #define SAFE_CHARSET_P(coding, id)      \
2874   ((id) <= (coding)->max_charset_id     \
2875    && (coding)->safe_charsets[id] != 255)
2876
     /* Build the safe-charsets table for the coding system attributes
        ATTRS and store it in the coding_attr_safe_charsets slot.

        The table is a string with one byte per charset id, from 0 to the
        largest id in the charset list.  The byte for a charset in the
        list is the register assigned to it; every other byte is 255.

        If CODING_ISO_FLAG_FULL_SUPPORT is set in the ISO flags and the
        charset list is not Viso_2022_charset_list, the charset list is
        first replaced by Viso_2022_charset_list and the existing table
        is discarded.  If the slot already holds a string after that
        step, nothing more is done.  */
2877 static void
2878 setup_iso_safe_charsets (Lisp_Object attrs)
2879 {
2880   Lisp_Object charset_list, safe_charsets;
2881   Lisp_Object request;
2882   Lisp_Object reg_usage;
2883   Lisp_Object tail;
2884   int reg94, reg96;
2885   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2886   int max_charset_id;
2887
2888   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2889   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2890       && ! EQ (charset_list, Viso_2022_charset_list))
2891     {
2892       CODING_ATTR_CHARSET_LIST (attrs)
2893         = charset_list = Viso_2022_charset_list;
2894       ASET (attrs, coding_attr_safe_charsets, Qnil);
2895     }
2896
       /* The table has already been built.  */
2897   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2898     return;
2899
       /* The largest charset id in the list determines the table size.  */
2900   max_charset_id = 0;
2901   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2902     {
2903       int id = XINT (XCAR (tail));
2904       if (max_charset_id < id)
2905         max_charset_id = id;
2906     }
2907
       /* Start with every entry marked 255, i.e. not safe.  */
2908   safe_charsets = make_uninit_string (max_charset_id + 1);
2909   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2910   request = AREF (attrs, coding_attr_iso_request);
2911   reg_usage = AREF (attrs, coding_attr_iso_usage);
2912   reg94 = XINT (XCAR (reg_usage));
2913   reg96 = XINT (XCDR (reg_usage));
2914
       /* A register given for the charset in REQUEST takes precedence.
          Otherwise use REG96 or REG94, according to the charset's
          iso_chars_96 flag, but only when that value is less than 4.  */
2915   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2916     {
2917       Lisp_Object id;
2918       Lisp_Object reg;
2919       struct charset *charset;
2920
2921       id = XCAR (tail);
2922       charset = CHARSET_FROM_ID (XINT (id));
2923       reg = Fcdr (Fassq (id, request));
2924       if (! NILP (reg))
2925         SSET (safe_charsets, XINT (id), XINT (reg));
2926       else if (charset->iso_chars_96)
2927         {
2928           if (reg96 < 4)
2929             SSET (safe_charsets, XINT (id), reg96);
2930         }
2931       else
2932         {
2933           if (reg94 < 4)
2934             SSET (safe_charsets, XINT (id), reg94);
2935         }
2936     }
2937   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2938 }
2939
2940
2941 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2942    Check if a text is encoded in one of ISO-2022 based coding systems.
2943    If it is, return 1, else return 0.  */
2944
2945 static int
2946 detect_coding_iso_2022 (struct coding_system *coding,
2947                         struct coding_detection_info *detect_info)
2948 {
2949   const unsigned char *src = coding->source, *src_base = src;
2950   const unsigned char *src_end = coding->source + coding->src_bytes;
2951   int multibytep = coding->src_multibyte;
2952   int single_shifting = 0;
2953   int id;
2954   int c, c1;
2955   EMACS_INT consumed_chars = 0;
2956   int i;
2957   int rejected = 0;
2958   int found = 0;
2959   int composition_count = -1;
2960
2961   detect_info->checked |= CATEGORY_MASK_ISO;
2962
2963   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2964     {
2965       struct coding_system *this = &(coding_categories[i]);
2966       Lisp_Object attrs, val;
2967
2968       if (this->id < 0)
2969         continue;
2970       attrs = CODING_ID_ATTRS (this->id);
2971       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2972           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2973         setup_iso_safe_charsets (attrs);
2974       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2975       this->max_charset_id = SCHARS (val) - 1;
2976       this->safe_charsets = SDATA (val);
2977     }
2978
2979   /* A coding system of this category is always ASCII compatible.  */
2980   src += coding->head_ascii;
2981
2982   while (rejected != CATEGORY_MASK_ISO)
2983     {
2984       src_base = src;
2985       ONE_MORE_BYTE (c);
2986       switch (c)
2987         {
2988         case ISO_CODE_ESC:
2989           if (inhibit_iso_escape_detection)
2990             break;
2991           single_shifting = 0;
2992           ONE_MORE_BYTE (c);
2993           if (c == 'N' || c == 'O')
2994             {
2995               /* ESC <Fe> for SS2 or SS3.  */
2996               single_shifting = 1;
2997               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2998             }
2999           else if (c == '1')
3000             {
3001               /* End of composition.  */
3002               if (composition_count < 0
3003                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3004                 /* Invalid */
3005                 break;
3006               composition_count = -1;
3007               found |= CATEGORY_MASK_ISO;
3008             }
3009           else if (c >= '0' && c <= '4')
3010             {
3011               /* ESC <Fp> for start/end composition.  */
3012               composition_count = 0;
3013             }
3014           else
3015             {
3016               if (c >= '(' && c <= '/')
3017                 {
3018                   /* Designation sequence for a charset of dimension 1.  */
3019                   ONE_MORE_BYTE (c1);
3020                   if (c1 < ' ' || c1 >= 0x80
3021                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3022                     /* Invalid designation sequence.  Just ignore.  */
3023                     break;
3024                 }
3025               else if (c == '$')
3026                 {
3027                   /* Designation sequence for a charset of dimension 2.  */
3028                   ONE_MORE_BYTE (c);
3029                   if (c >= '@' && c <= 'B')
3030                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3031                     id = iso_charset_table[1][0][c];
3032                   else if (c >= '(' && c <= '/')
3033                     {
3034                       ONE_MORE_BYTE (c1);
3035                       if (c1 < ' ' || c1 >= 0x80
3036                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3037                         /* Invalid designation sequence.  Just ignore.  */
3038                         break;
3039                     }
3040                   else
3041                     /* Invalid designation sequence.  Just ignore it.  */
3042                     break;
3043                 }
3044               else
3045                 {
3046                   /* Invalid escape sequence.  Just ignore it.  */
3047                   break;
3048                 }
3049
3050               /* We found a valid designation sequence for CHARSET.  */
3051               rejected |= CATEGORY_MASK_ISO_8BIT;
3052               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3053                                   id))
3054                 found |= CATEGORY_MASK_ISO_7;
3055               else
3056                 rejected |= CATEGORY_MASK_ISO_7;
3057               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3058                                   id))
3059                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3060               else
3061                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3062               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3063                                   id))
3064                 found |= CATEGORY_MASK_ISO_7_ELSE;
3065               else
3066                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3067               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3068                                   id))
3069                 found |= CATEGORY_MASK_ISO_8_ELSE;
3070               else
3071                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3072             }
3073           break;
3074
3075         case ISO_CODE_SO:
3076         case ISO_CODE_SI:
3077           /* Locking shift out/in.  */
3078           if (inhibit_iso_escape_detection)
3079             break;
3080           single_shifting = 0;
3081           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3082           break;
3083
3084         case ISO_CODE_CSI:
3085           /* Control sequence introducer.  */
3086           single_shifting = 0;
3087           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3088           found |= CATEGORY_MASK_ISO_8_ELSE;
3089           goto check_extra_latin;
3090
3091         case ISO_CODE_SS2:
3092         case ISO_CODE_SS3:
3093           /* Single shift.   */
3094           if (inhibit_iso_escape_detection)
3095             break;
3096           single_shifting = 0;
3097           rejected |= CATEGORY_MASK_ISO_7BIT;
3098           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3099               & CODING_ISO_FLAG_SINGLE_SHIFT)
3100             {
3101               found |= CATEGORY_MASK_ISO_8_1;
3102               single_shifting = 1;
3103             }
3104           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3105               & CODING_ISO_FLAG_SINGLE_SHIFT)
3106             {
3107               found |= CATEGORY_MASK_ISO_8_2;
3108               single_shifting = 1;
3109             }
3110           if (single_shifting)
3111             break;
3112         check_extra_latin:
3113           if (! VECTORP (Vlatin_extra_code_table)
3114               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3115             {
3116               rejected = CATEGORY_MASK_ISO;
3117               break;
3118             }
3119           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3120               & CODING_ISO_FLAG_LATIN_EXTRA)
3121             found |= CATEGORY_MASK_ISO_8_1;
3122           else
3123             rejected |= CATEGORY_MASK_ISO_8_1;
3124           rejected |= CATEGORY_MASK_ISO_8_2;
3125           break;
3126
3127         default:
3128           if (c < 0)
3129             continue;
3130           if (c < 0x80)
3131             {
3132               if (composition_count >= 0)
3133                 composition_count++;
3134               single_shifting = 0;
3135               break;
3136             }
3137           if (c >= 0xA0)
3138             {
3139               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3140               found |= CATEGORY_MASK_ISO_8_1;
3141               /* Check the length of succeeding codes of the range
3142                  0xA0..0xFF.  If the byte length is even, we include
3143                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3144                  only when we are not single shifting.  */
3145               if (! single_shifting
3146                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3147                 {
3148                   int len = 1;
3149                   while (src < src_end)
3150                     {
3151                       src_base = src;
3152                       ONE_MORE_BYTE (c);
3153                       if (c < 0xA0)
3154                         {
3155                           src = src_base;
3156                           break;
3157                         }
3158                       len++;
3159                     }
3160
3161                   if (len & 1 && src < src_end)
3162                     {
3163                       rejected |= CATEGORY_MASK_ISO_8_2;
3164                       if (composition_count >= 0)
3165                         composition_count += len;
3166                     }
3167                   else
3168                     {
3169                       found |= CATEGORY_MASK_ISO_8_2;
3170                       if (composition_count >= 0)
3171                         composition_count += len / 2;
3172                     }
3173                 }
3174               break;
3175             }
3176         }
3177     }
3178   detect_info->rejected |= CATEGORY_MASK_ISO;
3179   return 0;
3180
3181  no_more_source:
3182   detect_info->rejected |= rejected;
3183   detect_info->found |= (found & ~rejected);
3184   return 1;
3185 }
3186
3187
3188 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3189    escape sequence should be kept.
        Look up the charset for DIM, CHARS_96 and FINAL with
        ISO_CHARSET_TABLE and record its id as the designation of
        graphic register REG.  If FINAL is outside the range '0'..127,
        if no charset is found, or if the charset is not
        SAFE_CHARSET_P for CODING, record -2 for REG instead, set
        CHARS_96 to -1, and leave the macro at once.
        A designation of charset_jisx0201_roman is recorded as
        charset_ascii when CODING_ISO_FLAG_USE_ROMAN is set, and a
        designation of charset_jisx0208_1978 as charset_jisx0208 when
        CODING_ISO_FLAG_USE_OLDJIS is set.
        The expansion refers to the variable `coding' of the enclosing
        scope, assigns to CHARS_96 (so it must be an lvalue), and
        evaluates REG and FINAL more than once.  */
3190 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3191   do {                                                                  \
3192     int id, prev;                                                       \
3193                                                                         \
3194     if (final < '0' || final >= 128                                     \
3195         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3196         || !SAFE_CHARSET_P (coding, id))                                \
3197       {                                                                 \
3198         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3199         chars_96 = -1;                                                  \
3200         break;                                                          \
3201       }                                                                 \
3202     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3203     if (id == charset_jisx0201_roman)                                   \
3204       {                                                                 \
3205         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3206           id = charset_ascii;                                           \
3207       }                                                                 \
3208     else if (id == charset_jisx0208_1978)                               \
3209       {                                                                 \
3210         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3211           id = charset_jisx0208;                                        \
3212       }                                                                 \
3213     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3214     /* If there was an invalid designation to REG previously, and this  \
3215        designation is ASCII to REG, we should keep this designation     \
3216        sequence.  */                                                    \
3217     if (prev == -2 && id == charset_ascii)                              \
3218       chars_96 = -1;                                                    \
3219   } while (0)
3220
3221
3222 /* Handle these composition sequences (ALT: alternate char):
3223
3224    (1) relative composition: ESC 0 CHAR ... ESC 1
3225    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3226    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3227    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3228
3229    When the start sequence (ESC 0/2/3/4) is found, this annotation
3230    header is produced.
3231
3232         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3233
3234    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3235    produced until the end sequence (ESC 1) is found:
3236
3237    (1) CHAR ... CHAR
3238    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3239    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3240    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3241
3242    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3243    annotation header are updated as below:
3244
3245    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3246    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3247    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3248    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3249
3250    If an error is found while composing, the annotation header is
3251    changed to:
3252
3253         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3254
3255    and the sequence [ -2 DECODED-RULE ] is changed to the original
3256    byte sequence as below:
3257         o the original byte sequence is B: [ B -1 ]
3258         o the original byte sequence is B1 B2: [ B1 B2 ]
3259    and the sequence [ -1 -1 ] is changed to the original byte
3260    sequence:
3261         [ ESC '0' ]
3262 */
3263
3264 /* Decode a composition rule C1 and maybe one more byte from the
3265    source, and set RULE to the encoded composition rule.  If the rule
3266    is invalid, goto invalid_code.
        C1 is the variable `c1' of the enclosing scope, and RULE must
        be an lvalue.  If C1 - 32 is below 81, the rule is in the old
        one-byte format GREF * 9 + NREF, where a reference value of 4
        is converted to 10 before encoding.  Otherwise the rule is in
        the new two-byte format: one more byte is read with
        ONE_MORE_BYTE, C1 - 32 - 81 and that byte minus 32 are checked
        with COMPOSITION_ENCODE_RULE_VALID and encoded, and 0x100 is
        added to the result to tell it from the old format.  */
3267
3268 #define DECODE_COMPOSITION_RULE(rule)                                   \
3269   do {                                                                  \
3270     rule = c1 - 32;                                                     \
3271     if (rule < 0)                                                       \
3272       goto invalid_code;                                                \
3273     if (rule < 81)              /* old format (before ver.21) */        \
3274       {                                                                 \
3275         int gref = (rule) / 9;                                          \
3276         int nref = (rule) % 9;                                          \
3277         if (gref == 4) gref = 10;                                       \
3278         if (nref == 4) nref = 10;                                       \
3279         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3280       }                                                                 \
3281     else                        /* new format (after ver.21) */         \
3282       {                                                                 \
3283         int b;                                                          \
3284                                                                         \
3285         ONE_MORE_BYTE (b);                                              \
3286         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3287           goto invalid_code;                                            \
3288         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3289         rule += 0x100;   /* Distinguish it from the old format.  */     \
3290       }                                                                 \
3291   } while (0)
3292
     /* Turn the decoded composition rule RULE back into its original
        byte sequence, stored in charbuf[idx] and charbuf[idx + 1].
        A RULE below 0x100 is in the old format: the single byte
        32 + GREF * 9 + NREF (a reference value of 10 converted back
        to 4) is stored, followed by -1, and new_chars is incremented
        by one.  Otherwise RULE is in the new format: the two bytes
        32 + 81 + GREF and 32 + NREF are stored, and new_chars is
        incremented by two.
        The expansion uses `charbuf', `idx' and `new_chars' of the
        enclosing scope.  RULE is evaluated several times, always
        before the two stores, so it may be charbuf[idx + 1] itself.  */
3293 #define ENCODE_COMPOSITION_RULE(rule)                           \
3294   do {                                                          \
3295     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3296                                                                 \
3297     if (rule < 0x100)           /* old format */                \
3298       {                                                         \
3299         if (gref == 10) gref = 4;                               \
3300         if (nref == 10) nref = 4;                               \
3301         charbuf[idx] = 32 + gref * 9 + nref;                    \
3302         charbuf[idx + 1] = -1;                                  \
3303         new_chars++;                                            \
3304       }                                                         \
3305     else                                /* new format */        \
3306       {                                                         \
3307         charbuf[idx] = 32 + 81 + gref;                          \
3308         charbuf[idx + 1] = 32 + nref;                           \
3309         new_chars += 2;                                         \
3310       }                                                         \
3311   } while (0)
3312
3313 /* Finish the current composition as invalid.
        CHARBUF points just after the data produced so far for the
        composition; those data begin with the annotation header at
        CHARBUF[- CMP_STATUS->length].  Overwrite the five header
        elements with [ ESC '0'/'2'/'3'/'4' -2 0 -1 ], the digit being
        chosen by CMP_STATUS->method.  For methods from
        COMPOSITION_WITH_RULE on, also scan the rest of the data:
        each [ -2 DECODED-RULE ] is turned back into the bytes of the
        rule by ENCODE_COMPOSITION_RULE, and an element -1 together
        with the element following it is overwritten with ESC '0'.
        Set CMP_STATUS->state to COMPOSING_NO.
        Return CMP_STATUS->nchars, plus one or two for each restored
        rule, plus two for each restored ESC '0'.
        The locals `idx' and `new_chars' are referred to by name in
        the expansion of ENCODE_COMPOSITION_RULE.  */
3314
3315 static int finish_composition (int *, struct composition_status *);
3316
3317 static int
3318 finish_composition (int *charbuf, struct composition_status *cmp_status)
3319 {
3320   int idx = - cmp_status->length;
3321   int new_chars;
3322
3323   /* Overwrite the annotation header with the original ESC sequence.  */
3324   charbuf[idx++] = ISO_CODE_ESC;
3325   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3326                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3327                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3328                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3329                     : '4');
3330   charbuf[idx++] = -2;
3331   charbuf[idx++] = 0;
3332   charbuf[idx++] = -1;
3333   new_chars = cmp_status->nchars;
3334   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3335     for (; idx < 0; idx++)
3336       {
3337         int elt = charbuf[idx];
3338
3339         if (elt == -2)
3340           {
3341             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3342             idx++;
3343           }
3344         else if (elt == -1)
3345           {
3346             charbuf[idx++] = ISO_CODE_ESC;
3347             charbuf[idx] = '0';
3348             new_chars += 2;
3349           }
3350       }
3351   cmp_status->state = COMPOSING_NO;
3352   return new_chars;
3353 }
3354
3355 /* If characters are under composition, finish the composition.
        That is, when cmp_status->state is not COMPOSING_NO, call
        finish_composition and add its return value to char_offset.
        The expansion uses `cmp_status', `charbuf' and `char_offset'
        of the enclosing scope.  */
3356 #define MAYBE_FINISH_COMPOSITION()                              \
3357   do {                                                          \
3358     if (cmp_status->state != COMPOSING_NO)                      \
3359       char_offset += finish_composition (charbuf, cmp_status);  \
3360   } while (0)
3361
3362 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3363
3364    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3365    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3366    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3367    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3368
3369    Produce this annotation sequence now:
3370
3371    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
        If C1 is '0' while the alternate chars of an ESC 3 or ESC 4
        composition are being read, it does not start a new
        composition: [ -1 -1 ] is stored instead, and the state is
        changed to COMPOSING_CHAR.  In every other case a composition
        still in progress is first finished as invalid by
        MAYBE_FINISH_COMPOSITION.
        The expansion uses `cmp_status', `charbuf', `char_offset' and
        `coding' of the enclosing scope, and evaluates C1 more than
        once.
3372 */
3373
3374 #define DECODE_COMPOSITION_START(c1)                                       \
3375   do {                                                                     \
3376     if (c1 == '0'                                                          \
3377         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3378              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3379             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3380                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3381       {                                                                    \
3382         *charbuf++ = -1;                                                   \
3383         *charbuf++= -1;                                                    \
3384         cmp_status->state = COMPOSING_CHAR;                                \
3385         cmp_status->length += 2;                                           \
3386       }                                                                    \
3387     else                                                                   \
3388       {                                                                    \
3389         MAYBE_FINISH_COMPOSITION ();                                       \
3390         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3391                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3392                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3393                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3394         cmp_status->state                                                  \
3395           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3396         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3397         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3398         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3399         coding->annotated = 1;                                             \
3400       }                                                                    \
3401   } while (0)
3402
3403
3404 /* Handle composition end sequence ESC 1.
        The sequence is invalid if no CHAR has been stored
        (cmp_status->nchars is 0), or if the state does not fit the
        method: COMPOSITION_WITH_RULE must not end in the state
        COMPOSING_CHAR, and every other method must end in it.  In
        that case finish the composition as invalid and goto
        invalid_code.
        Otherwise update the annotation header that begins at
        charbuf[- cmp_status->length]: the negated LENGTH is decreased
        by the number of ALTs + 2 for COMPOSITION_WITH_ALTCHARS and by
        the number of ALTs * 3 for COMPOSITION_WITH_RULE_ALTCHARS, and
        NCHARS is set to cmp_status->nchars.  Then add nchars to
        char_offset and set the state to COMPOSING_NO.
        The expansion uses `cmp_status', `charbuf' and `char_offset'
        of the enclosing scope.  */
3405
3406 #define DECODE_COMPOSITION_END()                                        \
3407   do {                                                                  \
3408     if (cmp_status->nchars == 0                                         \
3409         || ((cmp_status->state == COMPOSING_CHAR)                       \
3410             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3411       {                                                                 \
3412         MAYBE_FINISH_COMPOSITION ();                                    \
3413         goto invalid_code;                                              \
3414       }                                                                 \
3415     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3416       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3417     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3418       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3419     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3420     char_offset += cmp_status->nchars;                                  \
3421     cmp_status->state = COMPOSING_NO;                                   \
3422   } while (0)
3423
3424 /* Store a composition rule RULE in charbuf, and update cmp_status.
        The rule is stored as the two elements [ -2 RULE ], and
        cmp_status->length grows by two.  The state is decremented;
        NOTE(review): this presumably steps from a ..._RULE state back
        to the matching ..._CHAR state, which depends on the order of
        the state enumeration -- confirm against its definition.  */
3425
3426 #define STORE_COMPOSITION_RULE(rule)    \
3427   do {                                  \
3428     *charbuf++ = -2;                    \
3429     *charbuf++ = rule;                  \
3430     cmp_status->length += 2;            \
3431     cmp_status->state--;                \
3432   } while (0)
3433
3434 /* Store a composed char or a component char C in charbuf, and update
3435    cmp_status.
        cmp_status->length grows by one.  C is counted in
        cmp_status->nchars when the state is COMPOSING_CHAR, and in
        cmp_status->ncomps otherwise.  The state is incremented when
        the method is COMPOSITION_WITH_RULE, or when it is
        COMPOSITION_WITH_RULE_ALTCHARS and the state is
        COMPOSING_COMPONENT_CHAR; NOTE(review): this presumably steps
        to the state in which a rule is read next -- confirm against
        the order of the state enumeration.  */
3436
3437 #define STORE_COMPOSITION_CHAR(c)                                       \
3438   do {                                                                  \
3439     *charbuf++ = (c);                                                   \
3440     cmp_status->length++;                                               \
3441     if (cmp_status->state == COMPOSING_CHAR)                            \
3442       cmp_status->nchars++;                                             \
3443     else                                                                \
3444       cmp_status->ncomps++;                                             \
3445     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3446         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3447             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3448       cmp_status->state++;                                              \
3449   } while (0)
3450
3451
3452 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3453
3454 static void
3455 decode_coding_iso_2022 (struct coding_system *coding)
3456 {
3457   const unsigned char *src = coding->source + coding->consumed;
3458   const unsigned char *src_end = coding->source + coding->src_bytes;
3459   const unsigned char *src_base;
3460   int *charbuf = coding->charbuf + coding->charbuf_used;
3461   /* We may produce two annotations (charset and composition) in one
3462      loop and one more charset annotation at the end.  */
3463   int *charbuf_end
3464     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3465   EMACS_INT consumed_chars = 0, consumed_chars_base;
3466   int multibytep = coding->src_multibyte;
3467   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3468   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3469   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3470   int charset_id_2, charset_id_3;
3471   struct charset *charset;
3472   int c;
3473   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3474   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3475   EMACS_INT char_offset = coding->produced_char;
3476   EMACS_INT last_offset = char_offset;
3477   int last_id = charset_ascii;
3478   int eol_dos =
3479     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3480   int byte_after_cr = -1;
3481   int i;
3482
3483   setup_iso_safe_charsets (attrs);
3484   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3485
3486   if (cmp_status->state != COMPOSING_NO)
3487     {
3488       if (charbuf_end - charbuf < cmp_status->length)
3489         abort ();
3490       for (i = 0; i < cmp_status->length; i++)
3491         *charbuf++ = cmp_status->carryover[i];
3492       coding->annotated = 1;
3493     }
3494
3495   while (1)
3496     {
3497       int c1, c2, c3;
3498
3499       src_base = src;
3500       consumed_chars_base = consumed_chars;
3501
3502       if (charbuf >= charbuf_end)
3503         {
3504           if (byte_after_cr >= 0)
3505             src_base--;
3506           break;
3507         }
3508
3509       if (byte_after_cr >= 0)
3510         c1 = byte_after_cr, byte_after_cr = -1;
3511       else
3512         ONE_MORE_BYTE (c1);
3513       if (c1 < 0)
3514         goto invalid_code;
3515
3516       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3517         {
3518           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3519           char_offset++;
3520           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3521           continue;
3522         }
3523
3524       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3525         {
3526           if (c1 == ISO_CODE_ESC)
3527             {
3528               if (src + 1 >= src_end)
3529                 goto no_more_source;
3530               *charbuf++ = ISO_CODE_ESC;
3531               char_offset++;
3532               if (src[0] == '%' && src[1] == '@')
3533                 {
3534                   src += 2;
3535                   consumed_chars += 2;
3536                   char_offset += 2;
3537                   /* We are sure charbuf can contain two more chars. */
3538                   *charbuf++ = '%';
3539                   *charbuf++ = '@';
3540                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3541                 }
3542             }
3543           else
3544             {
3545               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3546               char_offset++;
3547             }
3548           continue;
3549         }
3550
3551       if ((cmp_status->state == COMPOSING_RULE
3552            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3553           && c1 != ISO_CODE_ESC)
3554         {
3555           int rule;
3556
3557           DECODE_COMPOSITION_RULE (rule);
3558           STORE_COMPOSITION_RULE (rule);
3559           continue;
3560         }
3561
3562       /* We produce at most one character.  */
3563       switch (iso_code_class [c1])
3564         {
3565         case ISO_0x20_or_0x7F:
3566           if (charset_id_0 < 0
3567               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3568             /* This is SPACE or DEL.  */
3569             charset = CHARSET_FROM_ID (charset_ascii);
3570           else
3571             charset = CHARSET_FROM_ID (charset_id_0);
3572           break;
3573
3574         case ISO_graphic_plane_0:
3575           if (charset_id_0 < 0)
3576             charset = CHARSET_FROM_ID (charset_ascii);
3577           else
3578             charset = CHARSET_FROM_ID (charset_id_0);
3579           break;
3580
3581         case ISO_0xA0_or_0xFF:
3582           if (charset_id_1 < 0
3583               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3584               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3585             goto invalid_code;
3586           /* This is a graphic character, we fall down ... */
3587
3588         case ISO_graphic_plane_1:
3589           if (charset_id_1 < 0)
3590             goto invalid_code;
3591           charset = CHARSET_FROM_ID (charset_id_1);
3592           break;
3593
3594         case ISO_control_0:
3595           if (eol_dos && c1 == '\r')
3596             ONE_MORE_BYTE (byte_after_cr);
3597           MAYBE_FINISH_COMPOSITION ();
3598           charset = CHARSET_FROM_ID (charset_ascii);
3599           break;
3600
3601         case ISO_control_1:
3602           goto invalid_code;
3603
3604         case ISO_shift_out:
3605           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3606               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3607             goto invalid_code;
3608           CODING_ISO_INVOCATION (coding, 0) = 1;
3609           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3610           continue;
3611
3612         case ISO_shift_in:
3613           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3614             goto invalid_code;
3615           CODING_ISO_INVOCATION (coding, 0) = 0;
3616           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3617           continue;
3618
3619         case ISO_single_shift_2_7:
3620           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3621             goto invalid_code;
3622         case ISO_single_shift_2:
3623           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3624             goto invalid_code;
3625           /* SS2 is handled as an escape sequence of ESC 'N' */
3626           c1 = 'N';
3627           goto label_escape_sequence;
3628
3629         case ISO_single_shift_3:
3630           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3631             goto invalid_code;
3632           /* SS3 is handled as an escape sequence of ESC 'O' */
3633           c1 = 'O';
3634           goto label_escape_sequence;
3635
3636         case ISO_control_sequence_introducer:
3637           /* CSI is handled as an escape sequence of ESC '[' ...  */
3638           c1 = '[';
3639           goto label_escape_sequence;
3640
3641         case ISO_escape:
3642           ONE_MORE_BYTE (c1);
3643         label_escape_sequence:
3644           /* Escape sequences handled here are invocation,
3645              designation, direction specification, and character
3646              composition specification.  */
3647           switch (c1)
3648             {
3649             case '&':           /* revision of following character set */
3650               ONE_MORE_BYTE (c1);
3651               if (!(c1 >= '@' && c1 <= '~'))
3652                 goto invalid_code;
3653               ONE_MORE_BYTE (c1);
3654               if (c1 != ISO_CODE_ESC)
3655                 goto invalid_code;
3656               ONE_MORE_BYTE (c1);
3657               goto label_escape_sequence;
3658
3659             case '$':           /* designation of 2-byte character set */
3660               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3661                 goto invalid_code;
3662               {
3663                 int reg, chars96;
3664
3665                 ONE_MORE_BYTE (c1);
3666                 if (c1 >= '@' && c1 <= 'B')
3667                   {     /* designation of JISX0208.1978, GB2312.1980,
3668                            or JISX0208.1980 */
3669                     reg = 0, chars96 = 0;
3670                   }
3671                 else if (c1 >= 0x28 && c1 <= 0x2B)
3672                   { /* designation of DIMENSION2_CHARS94 character set */
3673                     reg = c1 - 0x28, chars96 = 0;
3674                     ONE_MORE_BYTE (c1);
3675                   }
3676                 else if (c1 >= 0x2C && c1 <= 0x2F)
3677                   { /* designation of DIMENSION2_CHARS96 character set */
3678                     reg = c1 - 0x2C, chars96 = 1;
3679                     ONE_MORE_BYTE (c1);
3680                   }
3681                 else
3682                   goto invalid_code;
3683                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3684                 /* We must update these variables now.  */
3685                 if (reg == 0)
3686                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3687                 else if (reg == 1)
3688                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3689                 if (chars96 < 0)
3690                   goto invalid_code;
3691               }
3692               continue;
3693
3694             case 'n':           /* invocation of locking-shift-2 */
3695               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3696                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3697                 goto invalid_code;
3698               CODING_ISO_INVOCATION (coding, 0) = 2;
3699               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3700               continue;
3701
3702             case 'o':           /* invocation of locking-shift-3 */
3703               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3704                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3705                 goto invalid_code;
3706               CODING_ISO_INVOCATION (coding, 0) = 3;
3707               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3708               continue;
3709
3710             case 'N':           /* invocation of single-shift-2 */
3711               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3712                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3713                 goto invalid_code;
3714               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3715               if (charset_id_2 < 0)
3716                 charset = CHARSET_FROM_ID (charset_ascii);
3717               else
3718                 charset = CHARSET_FROM_ID (charset_id_2);
3719               ONE_MORE_BYTE (c1);
3720               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3721                 goto invalid_code;
3722               break;
3723
3724             case 'O':           /* invocation of single-shift-3 */
3725               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3726                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3727                 goto invalid_code;
3728               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3729               if (charset_id_3 < 0)
3730                 charset = CHARSET_FROM_ID (charset_ascii);
3731               else
3732                 charset = CHARSET_FROM_ID (charset_id_3);
3733               ONE_MORE_BYTE (c1);
3734               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3735                 goto invalid_code;
3736               break;
3737
3738             case '0': case '2': case '3': case '4': /* start composition */
3739               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3740                 goto invalid_code;
3741               if (last_id != charset_ascii)
3742                 {
3743                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3744                   last_id = charset_ascii;
3745                   last_offset = char_offset;
3746                 }
3747               DECODE_COMPOSITION_START (c1);
3748               continue;
3749
3750             case '1':           /* end composition */
3751               if (cmp_status->state == COMPOSING_NO)
3752                 goto invalid_code;
3753               DECODE_COMPOSITION_END ();
3754               continue;
3755
3756             case '[':           /* specification of direction */
3757               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3758                 goto invalid_code;
3759               /* For the moment, nested direction is not supported.
3760                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3761                  left-to-right, and nonzero means right-to-left.  */
3762               ONE_MORE_BYTE (c1);
3763               switch (c1)
3764                 {
3765                 case ']':       /* end of the current direction */
3766                   coding->mode &= ~CODING_MODE_DIRECTION;
3767
3768                 case '0':       /* end of the current direction */
3769                 case '1':       /* start of left-to-right direction */
3770                   ONE_MORE_BYTE (c1);
3771                   if (c1 == ']')
3772                     coding->mode &= ~CODING_MODE_DIRECTION;
3773                   else
3774                     goto invalid_code;
3775                   break;
3776
3777                 case '2':       /* start of right-to-left direction */
3778                   ONE_MORE_BYTE (c1);
3779                   if (c1 == ']')
3780                     coding->mode |= CODING_MODE_DIRECTION;
3781                   else
3782                     goto invalid_code;
3783                   break;
3784
3785                 default:
3786                   goto invalid_code;
3787                 }
3788               continue;
3789
3790             case '%':
3791               ONE_MORE_BYTE (c1);
3792               if (c1 == '/')
3793                 {
3794                   /* CTEXT extended segment:
3795                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3796                      We keep these bytes as is for the moment.
3797                      They may be decoded by post-read-conversion.  */
3798                   int dim, M, L;
3799                   int size;
3800
3801                   ONE_MORE_BYTE (dim);
3802                   if (dim < '0' || dim > '4')
3803                     goto invalid_code;
3804                   ONE_MORE_BYTE (M);
3805                   if (M < 128)
3806                     goto invalid_code;
3807                   ONE_MORE_BYTE (L);
3808                   if (L < 128)
3809                     goto invalid_code;
3810                   size = ((M - 128) * 128) + (L - 128);
3811                   if (charbuf + 6 > charbuf_end)
3812                     goto break_loop;
3813                   *charbuf++ = ISO_CODE_ESC;
3814                   *charbuf++ = '%';
3815                   *charbuf++ = '/';
3816                   *charbuf++ = dim;
3817                   *charbuf++ = BYTE8_TO_CHAR (M);
3818                   *charbuf++ = BYTE8_TO_CHAR (L);
3819                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3820                 }
3821               else if (c1 == 'G')
3822                 {
3823                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3824                      ESC % G --UTF-8-BYTES-- ESC % @
3825                      We keep these bytes as is for the moment.
3826                      They may be decoded by post-read-conversion.  */
3827                   if (charbuf + 3 > charbuf_end)
3828                     goto break_loop;
3829                   *charbuf++ = ISO_CODE_ESC;
3830                   *charbuf++ = '%';
3831                   *charbuf++ = 'G';
3832                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3833                 }
3834               else
3835                 goto invalid_code;
3836               continue;
3837               break;
3838
3839             default:
3840               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3841                 goto invalid_code;
3842               {
3843                 int reg, chars96;
3844
3845                 if (c1 >= 0x28 && c1 <= 0x2B)
3846                   { /* designation of DIMENSION1_CHARS94 character set */
3847                     reg = c1 - 0x28, chars96 = 0;
3848                     ONE_MORE_BYTE (c1);
3849                   }
3850                 else if (c1 >= 0x2C && c1 <= 0x2F)
3851                   { /* designation of DIMENSION1_CHARS96 character set */
3852                     reg = c1 - 0x2C, chars96 = 1;
3853                     ONE_MORE_BYTE (c1);
3854                   }
3855                 else
3856                   goto invalid_code;
3857                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3858                 /* We must update these variables now.  */
3859                 if (reg == 0)
3860                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3861                 else if (reg == 1)
3862                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3863                 if (chars96 < 0)
3864                   goto invalid_code;
3865               }
3866               continue;
3867             }
3868           break;
3869
3870         default:
3871           abort ();
3872         }
3873
3874       if (cmp_status->state == COMPOSING_NO
3875           && charset->id != charset_ascii
3876           && last_id != charset->id)
3877         {
3878           if (last_id != charset_ascii)
3879             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3880           last_id = charset->id;
3881           last_offset = char_offset;
3882         }
3883
3884       /* Now we know CHARSET and 1st position code C1 of a character.
3885          Produce a decoded character while getting 2nd and 3rd
3886          position codes C2, C3 if necessary.  */
3887       if (CHARSET_DIMENSION (charset) > 1)
3888         {
3889           ONE_MORE_BYTE (c2);
3890           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3891               || ((c1 & 0x80) != (c2 & 0x80)))
3892             /* C2 is not in a valid range.  */
3893             goto invalid_code;
3894           if (CHARSET_DIMENSION (charset) == 2)
3895             c1 = (c1 << 8) | c2;
3896           else
3897             {
3898               ONE_MORE_BYTE (c3);
3899               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3900                   || ((c1 & 0x80) != (c3 & 0x80)))
3901                 /* C3 is not in a valid range.  */
3902                 goto invalid_code;
3903               c1 = (c1 << 16) | (c2 << 8) | c2;
3904             }
3905         }
3906       c1 &= 0x7F7F7F;
3907       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3908       if (c < 0)
3909         {
3910           MAYBE_FINISH_COMPOSITION ();
3911           for (; src_base < src; src_base++, char_offset++)
3912             {
3913               if (ASCII_BYTE_P (*src_base))
3914                 *charbuf++ = *src_base;
3915               else
3916                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3917             }
3918         }
3919       else if (cmp_status->state == COMPOSING_NO)
3920         {
3921           *charbuf++ = c;
3922           char_offset++;
3923         }
3924       else if ((cmp_status->state == COMPOSING_CHAR
3925                 ? cmp_status->nchars
3926                 : cmp_status->ncomps)
3927                >= MAX_COMPOSITION_COMPONENTS)
3928         {
3929           /* Too long composition.  */
3930           MAYBE_FINISH_COMPOSITION ();
3931           *charbuf++ = c;
3932           char_offset++;
3933         }
3934       else
3935         STORE_COMPOSITION_CHAR (c);
3936       continue;
3937
3938     invalid_code:
3939       MAYBE_FINISH_COMPOSITION ();
3940       src = src_base;
3941       consumed_chars = consumed_chars_base;
3942       ONE_MORE_BYTE (c);
3943       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3944       char_offset++;
3945       coding->errors++;
3946       continue;
3947
3948     break_loop:
3949       break;
3950     }
3951
3952  no_more_source:
3953   if (cmp_status->state != COMPOSING_NO)
3954     {
3955       if (coding->mode & CODING_MODE_LAST_BLOCK)
3956         MAYBE_FINISH_COMPOSITION ();
3957       else
3958         {
3959           charbuf -= cmp_status->length;
3960           for (i = 0; i < cmp_status->length; i++)
3961             cmp_status->carryover[i] = charbuf[i];
3962         }
3963     }
3964   else if (last_id != charset_ascii)
3965     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3966   coding->consumed_char += consumed_chars_base;
3967   coding->consumed = src_base - coding->source;
3968   coding->charbuf_used = charbuf - coding->charbuf;
3969 }
3970
3971
3972 /* ISO2022 encoding stuff.  */
3973
3974 /*
   When encoding, it is not enough to say just "ISO2022"; we have to
   specify more details.  In Emacs, each coding system of the ISO2022
   variant has the following specifications:
3978         1. Initial designation to G0 thru G3.
3979         2. Allows short-form designation?
3980         3. ASCII should be designated to G0 before control characters?
3981         4. ASCII should be designated to G0 at end of line?
3982         5. 7-bit environment or 8-bit environment?
3983         6. Use locking-shift?
3984         7. Use Single-shift?
3985    And the following two are only for Japanese:
3986         8. Use ASCII in place of JIS0201-1976-Roman?
3987         9. Use JISX0208-1983 in place of JISX0208-1978?
3988    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3989    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3990    details.
3991 */
3992
/* Emit, through the EMIT_* macros, the escape sequence that designates
   CHARSET to graphic register REG, then record CHARSET as the current
   designation of REG in CODING.

   The sequence consists of:
     - ESC & <'@' + revision>, only when CODING_ISO_FLAG_REVISION is set
       for CODING and CHARSET has a non-negative ISO revision number;
     - ESC;
     - '$' when the dimension of CHARSET is not 1;
     - one intermediate byte selected by REG, taken from "()*+" for a
       94-character set or from ",-./" for a 96-character set;
     - the <final-char> of CHARSET.
   The intermediate byte is left out (short form) only for a
   multi-dimension 94-character set designated to register 0 whose
   <final-char> is '@', 'A', or 'B', and only when
   CODING_ISO_FLAG_LONG_FORM is not set for CODING.

   REG is expected to be in the range 0..3.  The arguments are
   evaluated more than once.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char iso_final = CHARSET_ISO_FINAL (charset);              \
    int is_96_set = CHARSET_ISO_CHARS_96 (charset) != 0;                \
    int is_multi_dimension = CHARSET_DIMENSION (charset) != 1;          \
    const char *intermediates = is_96_set ? ",-./" : "()*+";            \
    int iso_revision                                                    \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
    /* Non-zero when the intermediate byte must be omitted.  */         \
    int short_form                                                      \
      = (is_multi_dimension                                             \
         && ! is_96_set                                                 \
         && ! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)   \
         && reg == 0                                                    \
         && iso_final >= '@' && iso_final <= 'B');                      \
                                                                        \
    if (iso_revision >= 0)                                              \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + iso_revision);                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (is_multi_dimension)                                             \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
      }                                                                 \
    if (! short_form)                                                   \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE (intermediates[reg]);                       \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (iso_final);                                    \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4040
4041
/* The following two macros produce the code for the ISO2022
   single-shift functions (single-shift-2 and single-shift-3): a
   two-byte escape sequence when CODING_ISO_FLAG_SEVEN_BITS is set for
   `coding', otherwise a single control byte.  Both then mark `coding'
   as being in the single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4054
4055
/* Single-shift-3: emit ESC O when CODING_ISO_FLAG_SEVEN_BITS is set
   for `coding', otherwise the single byte ISO_CODE_SS3, and mark
   `coding' as being in the single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4064
4065
/* The following four macros produce the code (a control character or
   an escape sequence) for the ISO2022 locking-shift functions
   shift-in, shift-out, locking-shift-2, and locking-shift-3.  Each
   one also records in `coding' which graphic register is now invoked
   to graphic plane 0.  */

/* Shift-in: graphic register 0 is invoked to graphic plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4075
4076
/* Shift-out: graphic register 1 is invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4082
4083
/* Locking-shift-2 (ESC n): graphic register 2 is invoked to graphic
   plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4089
4090
/* Locking-shift-3: emit ESC o and record that graphic register 3 is
   now invoked to graphic plane 0.

   The final byte must be 'o'.  ESC n is locking-shift-2 (see
   ENCODE_LOCKING_SHIFT_2), so emitting 'n' here would make the byte
   stream invoke G2 while `coding' records G3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4096
4097
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING_ISO_FLAG_USE_ROMAN
   is set and CHARSET is ASCII, it is replaced by the charset whose id
   is charset_jisx0201_roman.  The expansion uses `coding', `dst' and
   `produced_chars' from the place where the macro is expanded.

   The body is a loop that ends (via `break') as soon as the byte has
   been emitted:
     - while a single shift is pending, emit C1 with the high bit
       cleared (7-bit environment) or set (otherwise), and clear the
       pending single shift;
     - if CHARSET is invoked to graphic plane 0, emit C1 with the high
       bit cleared;
     - if CHARSET is invoked to graphic plane 1, emit C1 with the high
       bit set;
     - otherwise call encode_invocation_designation and try again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4140
4141
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, first producing whatever designation
   and invocation codes are needed.

   CHARSET must be a modifiable lvalue: when CODING_ISO_FLAG_USE_OLDJIS
   is set and CHARSET is charset_jisx0208, it is replaced by the
   charset whose id is charset_jisx0208_1978.  The expansion uses
   `coding', `dst' and `produced_chars' from the place where the macro
   is expanded.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    for (;;)                                                            \
      {                                                                 \
        /* Non-zero: emit both bytes with the high bit cleared.         \
           Zero: emit both bytes with the high bit set.  */             \
        int seven_bit_form;                                             \
                                                                        \
        if (CODING_ISO_SINGLE_SHIFTING (coding))                        \
          {                                                             \
            seven_bit_form                                              \
              = ((CODING_ISO_FLAGS (coding)                             \
                  & CODING_ISO_FLAG_SEVEN_BITS) != 0);                  \
            CODING_ISO_SINGLE_SHIFTING (coding) = 0;                    \
          }                                                             \
        else if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))       \
          seven_bit_form = 1;                                           \
        else if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))       \
          seven_bit_form = 0;                                           \
        else                                                            \
          {                                                             \
            /* CHARSET is not invoked to either graphic plane yet.      \
               Designate and/or invoke it, then go round again to       \
               actually produce the character.  */                      \
            dst = encode_invocation_designation (charset, coding, dst,  \
                                                 &produced_chars);      \
            continue;                                                   \
          }                                                             \
                                                                        \
        if (seven_bit_form)                                             \
          {                                                             \
            EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);            \
          }                                                             \
        else                                                            \
          {                                                             \
            EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                  \
          }                                                             \
        break;                                                          \
      }                                                                 \
  } while (0)
4184
4185
/* Encode character C in CHARSET: obtain its code with ENCODE_CHAR,
   then emit it as one byte when the dimension of CHARSET is 1, or
   otherwise as two bytes taken from bits 8 and up and from the low 8
   bits of the code.  CHARSET must be a modifiable lvalue, because the
   DIMENSION1/DIMENSION2 macros may assign to it.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset), (c));                               \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8,             \
                                         code & 0xFF);                     \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                 \
      }                                                                    \
  } while (0)
4195
4196
4197 /* Produce designation and invocation codes at a place pointed by DST
4198    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4199    Return new DST.  */
4200
4201 static unsigned char *
4202 encode_invocation_designation (struct charset *charset,
4203                                struct coding_system *coding,
4204                                unsigned char *dst, EMACS_INT *p_nchars)
4205 {
4206   int multibytep = coding->dst_multibyte;
4207   EMACS_INT produced_chars = *p_nchars;
4208   int reg;                      /* graphic register number */
4209   int id = CHARSET_ID (charset);
4210
4211   /* At first, check designations.  */
4212   for (reg = 0; reg < 4; reg++)
4213     if (id == CODING_ISO_DESIGNATION (coding, reg))
4214       break;
4215
4216   if (reg >= 4)
4217     {
4218       /* CHARSET is not yet designated to any graphic registers.  */
4219       /* At first check the requested designation.  */
4220       reg = CODING_ISO_REQUEST (coding, id);
4221       if (reg < 0)
4222         /* Since CHARSET requests no special designation, designate it
4223            to graphic register 0.  */
4224         reg = 0;
4225
4226       ENCODE_DESIGNATION (charset, reg, coding);
4227     }
4228
4229   if (CODING_ISO_INVOCATION (coding, 0) != reg
4230       && CODING_ISO_INVOCATION (coding, 1) != reg)
4231     {
4232       /* Since the graphic register REG is not invoked to any graphic
4233          planes, invoke it to graphic plane 0.  */
4234       switch (reg)
4235         {
4236         case 0:                 /* graphic register 0 */
4237           ENCODE_SHIFT_IN;
4238           break;
4239
4240         case 1:                 /* graphic register 1 */
4241           ENCODE_SHIFT_OUT;
4242           break;
4243
4244         case 2:                 /* graphic register 2 */
4245           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4246             ENCODE_SINGLE_SHIFT_2;
4247           else
4248             ENCODE_LOCKING_SHIFT_2;
4249           break;
4250
4251         case 3:                 /* graphic register 3 */
4252           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4253             ENCODE_SINGLE_SHIFT_3;
4254           else
4255             ENCODE_LOCKING_SHIFT_3;
4256           break;
4257         }
4258     }
4259
4260   *p_nchars = produced_chars;
4261   return dst;
4262 }
4263
4264
/* Bring the graphic planes and registers back to their initial state:
   emit shift-in unless graphic register 0 is already invoked to
   graphic plane 0, then re-designate every graphic register whose
   initial charset is set (non-negative) and differs from its current
   designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int g;                                                              \
    struct charset *initial_charset;                                    \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      {                                                                 \
        ENCODE_SHIFT_IN;                                                \
      }                                                                 \
    for (g = 0; g < 4; g++)                                             \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, g);                \
                                                                        \
        if (initial_id < 0                                              \
            || CODING_ISO_DESIGNATION (coding, g) == initial_id)        \
          continue;                                                     \
        initial_charset = CHARSET_FROM_ID (initial_id);                 \
        ENCODE_DESIGNATION (initial_charset, g, coding);                \
      }                                                                 \
  } while (0)
4283
4284
4285 /* Produce designation sequences of charsets in the line started from
4286    SRC to a place pointed by DST, and return updated DST.
4287
4288    If the current block ends before any end-of-line, we may fail to
4289    find all the necessary designations.  */
4290
4291 static unsigned char *
4292 encode_designation_at_bol (struct coding_system *coding, int *charbuf,
4293                            unsigned char *dst)
4294 {
4295   struct charset *charset;
4296   /* Table of charsets to be designated to each graphic register.  */
4297   int r[4];
4298   int c, found = 0, reg;
4299   EMACS_INT produced_chars = 0;
4300   int multibytep = coding->dst_multibyte;
4301   Lisp_Object attrs;
4302   Lisp_Object charset_list;
4303
4304   attrs = CODING_ID_ATTRS (coding->id);
4305   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4306   if (EQ (charset_list, Qiso_2022))
4307     charset_list = Viso_2022_charset_list;
4308
4309   for (reg = 0; reg < 4; reg++)
4310     r[reg] = -1;
4311
4312   while (found < 4)
4313     {
4314       int id;
4315
4316       c = *charbuf++;
4317       if (c == '\n')
4318         break;
4319       charset = char_charset (c, charset_list, NULL);
4320       id = CHARSET_ID (charset);
4321       reg = CODING_ISO_REQUEST (coding, id);
4322       if (reg >= 0 && r[reg] < 0)
4323         {
4324           found++;
4325           r[reg] = id;
4326         }
4327     }
4328
4329   if (found)
4330     {
4331       for (reg = 0; reg < 4; reg++)
4332         if (r[reg] >= 0
4333             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4334           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4335     }
4336
4337   return dst;
4338 }
4339
4340 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4341
4342 static int
4343 encode_coding_iso_2022 (struct coding_system *coding)
4344 {
4345   int multibytep = coding->dst_multibyte;
4346   int *charbuf = coding->charbuf;
4347   int *charbuf_end = charbuf + coding->charbuf_used;
4348   unsigned char *dst = coding->destination + coding->produced;
4349   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4350   int safe_room = 16;
4351   int bol_designation
4352     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4353        && CODING_ISO_BOL (coding));
4354   EMACS_INT produced_chars = 0;
4355   Lisp_Object attrs, eol_type, charset_list;
4356   int ascii_compatible;
4357   int c;
4358   int preferred_charset_id = -1;
4359
4360   CODING_GET_INFO (coding, attrs, charset_list);
4361   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4362   if (VECTORP (eol_type))
4363     eol_type = Qunix;
4364
4365   setup_iso_safe_charsets (attrs);
4366   /* Charset list may have been changed.  */
4367   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4368   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4369
4370   ascii_compatible
4371     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4372        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4373                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4374
4375   while (charbuf < charbuf_end)
4376     {
4377       ASSURE_DESTINATION (safe_room);
4378
4379       if (bol_designation)
4380         {
4381           unsigned char *dst_prev = dst;
4382
4383           /* We have to produce designation sequences if any now.  */
4384           dst = encode_designation_at_bol (coding, charbuf, dst);
4385           bol_designation = 0;
4386           /* We are sure that designation sequences are all ASCII bytes.  */
4387           produced_chars += dst - dst_prev;
4388         }
4389
4390       c = *charbuf++;
4391
4392       if (c < 0)
4393         {
4394           /* Handle an annotation.  */
4395           switch (*charbuf)
4396             {
4397             case CODING_ANNOTATE_COMPOSITION_MASK:
4398               /* Not yet implemented.  */
4399               break;
4400             case CODING_ANNOTATE_CHARSET_MASK:
4401               preferred_charset_id = charbuf[2];
4402               if (preferred_charset_id >= 0
4403                   && NILP (Fmemq (make_number (preferred_charset_id),
4404                                   charset_list)))
4405                 preferred_charset_id = -1;
4406               break;
4407             default:
4408               abort ();
4409             }
4410           charbuf += -c - 1;
4411           continue;
4412         }
4413
4414       /* Now encode the character C.  */
4415       if (c < 0x20 || c == 0x7F)
4416         {
4417           if (c == '\n'
4418               || (c == '\r' && EQ (eol_type, Qmac)))
4419             {
4420               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4421                 ENCODE_RESET_PLANE_AND_REGISTER ();
4422               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4423                 {
4424                   int i;
4425
4426                   for (i = 0; i < 4; i++)
4427                     CODING_ISO_DESIGNATION (coding, i)
4428                       = CODING_ISO_INITIAL (coding, i);
4429                 }
4430               bol_designation
4431                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4432             }
4433           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4434             ENCODE_RESET_PLANE_AND_REGISTER ();
4435           EMIT_ONE_ASCII_BYTE (c);
4436         }
4437       else if (ASCII_CHAR_P (c))
4438         {
4439           if (ascii_compatible)
4440             EMIT_ONE_ASCII_BYTE (c);
4441           else
4442             {
4443               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4444               ENCODE_ISO_CHARACTER (charset, c);
4445             }
4446         }
4447       else if (CHAR_BYTE8_P (c))
4448         {
4449           c = CHAR_TO_BYTE8 (c);
4450           EMIT_ONE_BYTE (c);
4451         }
4452       else
4453         {
4454           struct charset *charset;
4455
4456           if (preferred_charset_id >= 0)
4457             {
4458               charset = CHARSET_FROM_ID (preferred_charset_id);
4459               if (! CHAR_CHARSET_P (c, charset))
4460                 charset = char_charset (c, charset_list, NULL);
4461             }
4462           else
4463             charset = char_charset (c, charset_list, NULL);
4464           if (!charset)
4465             {
4466               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4467                 {
4468                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4469                   charset = CHARSET_FROM_ID (charset_ascii);
4470                 }
4471               else
4472                 {
4473                   c = coding->default_char;
4474                   charset = char_charset (c, charset_list, NULL);
4475                 }
4476             }
4477           ENCODE_ISO_CHARACTER (charset, c);
4478         }
4479     }
4480
4481   if (coding->mode & CODING_MODE_LAST_BLOCK
4482       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4483     {
4484       ASSURE_DESTINATION (safe_room);
4485       ENCODE_RESET_PLANE_AND_REGISTER ();
4486     }
4487   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4488   CODING_ISO_BOL (coding) = bol_designation;
4489   coding->produced_char += produced_chars;
4490   coding->produced = dst - coding->destination;
4491   return 0;
4492 }
4493
4494 \f
4495 /*** 8,9. SJIS and BIG5 handlers ***/
4496
4497 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4498    quite widely.  So, for the moment, Emacs supports them in the bare
4499    C code.  But, in the future, they may be supported only by CCL.  */
4500
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in 2 bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
4507
4508    --- CODE RANGE of SJIS ---
4509    (character set)      (range)
4510    ASCII                0x00 .. 0x7F
4511    KATAKANA-JISX0201    0xA0 .. 0xDF
4512    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4513             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4514    -------------------------------
4515
4516 */
4517
4518 /* BIG5 is a coding system encoding two character sets: ASCII and
4519    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
4521
4522    --- CODE RANGE of BIG5 ---
4523    (character set)      (range)
4524    ASCII                0x00 .. 0x7F
4525    Big5 (1st byte)      0xA1 .. 0xFE
4526         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4527    --------------------------
4528
4529   */
4530
4531 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4532    Check if a text is encoded in SJIS.  If it is, return
4533    CATEGORY_MASK_SJIS, else return 0.  */
4534
4535 static int
4536 detect_coding_sjis (struct coding_system *coding,
4537                     struct coding_detection_info *detect_info)
4538 {
4539   const unsigned char *src = coding->source, *src_base;
4540   const unsigned char *src_end = coding->source + coding->src_bytes;
4541   int multibytep = coding->src_multibyte;
4542   EMACS_INT consumed_chars = 0;
4543   int found = 0;
4544   int c;
4545   Lisp_Object attrs, charset_list;
4546   int max_first_byte_of_2_byte_code;
4547
4548   CODING_GET_INFO (coding, attrs, charset_list);
4549   max_first_byte_of_2_byte_code
4550     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4551
4552   detect_info->checked |= CATEGORY_MASK_SJIS;
4553   /* A coding system of this category is always ASCII compatible.  */
4554   src += coding->head_ascii;
4555
4556   while (1)
4557     {
4558       src_base = src;
4559       ONE_MORE_BYTE (c);
4560       if (c < 0x80)
4561         continue;
4562       if ((c >= 0x81 && c <= 0x9F)
4563           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4564         {
4565           ONE_MORE_BYTE (c);
4566           if (c < 0x40 || c == 0x7F || c > 0xFC)
4567             break;
4568           found = CATEGORY_MASK_SJIS;
4569         }
4570       else if (c >= 0xA0 && c < 0xE0)
4571         found = CATEGORY_MASK_SJIS;
4572       else
4573         break;
4574     }
4575   detect_info->rejected |= CATEGORY_MASK_SJIS;
4576   return 0;
4577
4578  no_more_source:
4579   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4580     {
4581       detect_info->rejected |= CATEGORY_MASK_SJIS;
4582       return 0;
4583     }
4584   detect_info->found |= found;
4585   return 1;
4586 }
4587
4588 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4589    Check if a text is encoded in BIG5.  If it is, return
4590    CATEGORY_MASK_BIG5, else return 0.  */
4591
4592 static int
4593 detect_coding_big5 (struct coding_system *coding,
4594                     struct coding_detection_info *detect_info)
4595 {
4596   const unsigned char *src = coding->source, *src_base;
4597   const unsigned char *src_end = coding->source + coding->src_bytes;
4598   int multibytep = coding->src_multibyte;
4599   EMACS_INT consumed_chars = 0;
4600   int found = 0;
4601   int c;
4602
4603   detect_info->checked |= CATEGORY_MASK_BIG5;
4604   /* A coding system of this category is always ASCII compatible.  */
4605   src += coding->head_ascii;
4606
4607   while (1)
4608     {
4609       src_base = src;
4610       ONE_MORE_BYTE (c);
4611       if (c < 0x80)
4612         continue;
4613       if (c >= 0xA1)
4614         {
4615           ONE_MORE_BYTE (c);
4616           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4617             return 0;
4618           found = CATEGORY_MASK_BIG5;
4619         }
4620       else
4621         break;
4622     }
4623   detect_info->rejected |= CATEGORY_MASK_BIG5;
4624   return 0;
4625
4626  no_more_source:
4627   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4628     {
4629       detect_info->rejected |= CATEGORY_MASK_BIG5;
4630       return 0;
4631     }
4632   detect_info->found |= found;
4633   return 1;
4634 }
4635
4636 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4637    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4638
4639 static void
4640 decode_coding_sjis (struct coding_system *coding)
4641 {
4642   const unsigned char *src = coding->source + coding->consumed;
4643   const unsigned char *src_end = coding->source + coding->src_bytes;
4644   const unsigned char *src_base;
4645   int *charbuf = coding->charbuf + coding->charbuf_used;
4646   /* We may produce one charset annotation in one loop and one more at
4647      the end.  */
4648   int *charbuf_end
4649     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4650   EMACS_INT consumed_chars = 0, consumed_chars_base;
4651   int multibytep = coding->src_multibyte;
4652   struct charset *charset_roman, *charset_kanji, *charset_kana;
4653   struct charset *charset_kanji2;
4654   Lisp_Object attrs, charset_list, val;
4655   EMACS_INT char_offset = coding->produced_char;
4656   EMACS_INT last_offset = char_offset;
4657   int last_id = charset_ascii;
4658   int eol_dos =
4659     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4660   int byte_after_cr = -1;
4661
4662   CODING_GET_INFO (coding, attrs, charset_list);
4663
4664   val = charset_list;
4665   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4666   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4667   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4668   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4669
4670   while (1)
4671     {
4672       int c, c1;
4673       struct charset *charset;
4674
4675       src_base = src;
4676       consumed_chars_base = consumed_chars;
4677
4678       if (charbuf >= charbuf_end)
4679         {
4680           if (byte_after_cr >= 0)
4681             src_base--;
4682           break;
4683         }
4684
4685       if (byte_after_cr >= 0)
4686         c = byte_after_cr, byte_after_cr = -1;
4687       else
4688         ONE_MORE_BYTE (c);
4689       if (c < 0)
4690         goto invalid_code;
4691       if (c < 0x80)
4692         {
4693           if (eol_dos && c == '\r')
4694             ONE_MORE_BYTE (byte_after_cr);
4695           charset = charset_roman;
4696         }
4697       else if (c == 0x80 || c == 0xA0)
4698         goto invalid_code;
4699       else if (c >= 0xA1 && c <= 0xDF)
4700         {
4701           /* SJIS -> JISX0201-Kana */
4702           c &= 0x7F;
4703           charset = charset_kana;
4704         }
4705       else if (c <= 0xEF)
4706         {
4707           /* SJIS -> JISX0208 */
4708           ONE_MORE_BYTE (c1);
4709           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4710             goto invalid_code;
4711           c = (c << 8) | c1;
4712           SJIS_TO_JIS (c);
4713           charset = charset_kanji;
4714         }
4715       else if (c <= 0xFC && charset_kanji2)
4716         {
4717           /* SJIS -> JISX0213-2 */
4718           ONE_MORE_BYTE (c1);
4719           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4720             goto invalid_code;
4721           c = (c << 8) | c1;
4722           SJIS_TO_JIS2 (c);
4723           charset = charset_kanji2;
4724         }
4725       else
4726         goto invalid_code;
4727       if (charset->id != charset_ascii
4728           && last_id != charset->id)
4729         {
4730           if (last_id != charset_ascii)
4731             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4732           last_id = charset->id;
4733           last_offset = char_offset;
4734         }
4735       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4736       *charbuf++ = c;
4737       char_offset++;
4738       continue;
4739
4740     invalid_code:
4741       src = src_base;
4742       consumed_chars = consumed_chars_base;
4743       ONE_MORE_BYTE (c);
4744       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4745       char_offset++;
4746       coding->errors++;
4747     }
4748
4749  no_more_source:
4750   if (last_id != charset_ascii)
4751     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4752   coding->consumed_char += consumed_chars_base;
4753   coding->consumed = src_base - coding->source;
4754   coding->charbuf_used = charbuf - coding->charbuf;
4755 }
4756
4757 static void
4758 decode_coding_big5 (struct coding_system *coding)
4759 {
4760   const unsigned char *src = coding->source + coding->consumed;
4761   const unsigned char *src_end = coding->source + coding->src_bytes;
4762   const unsigned char *src_base;
4763   int *charbuf = coding->charbuf + coding->charbuf_used;
4764   /* We may produce one charset annotation in one loop and one more at
4765      the end.  */
4766   int *charbuf_end
4767     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4768   EMACS_INT consumed_chars = 0, consumed_chars_base;
4769   int multibytep = coding->src_multibyte;
4770   struct charset *charset_roman, *charset_big5;
4771   Lisp_Object attrs, charset_list, val;
4772   EMACS_INT char_offset = coding->produced_char;
4773   EMACS_INT last_offset = char_offset;
4774   int last_id = charset_ascii;
4775   int eol_dos =
4776     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4777   int byte_after_cr = -1;
4778
4779   CODING_GET_INFO (coding, attrs, charset_list);
4780   val = charset_list;
4781   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4782   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4783
4784   while (1)
4785     {
4786       int c, c1;
4787       struct charset *charset;
4788
4789       src_base = src;
4790       consumed_chars_base = consumed_chars;
4791
4792       if (charbuf >= charbuf_end)
4793         {
4794           if (byte_after_cr >= 0)
4795             src_base--;
4796           break;
4797         }
4798
4799       if (byte_after_cr >= 0)
4800         c = byte_after_cr, byte_after_cr = -1;
4801       else
4802         ONE_MORE_BYTE (c);
4803
4804       if (c < 0)
4805         goto invalid_code;
4806       if (c < 0x80)
4807         {
4808           if (eol_dos && c == '\r')
4809             ONE_MORE_BYTE (byte_after_cr);
4810           charset = charset_roman;
4811         }
4812       else
4813         {
4814           /* BIG5 -> Big5 */
4815           if (c < 0xA1 || c > 0xFE)
4816             goto invalid_code;
4817           ONE_MORE_BYTE (c1);
4818           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4819             goto invalid_code;
4820           c = c << 8 | c1;
4821           charset = charset_big5;
4822         }
4823       if (charset->id != charset_ascii
4824           && last_id != charset->id)
4825         {
4826           if (last_id != charset_ascii)
4827             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4828           last_id = charset->id;
4829           last_offset = char_offset;
4830         }
4831       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4832       *charbuf++ = c;
4833       char_offset++;
4834       continue;
4835
4836     invalid_code:
4837       src = src_base;
4838       consumed_chars = consumed_chars_base;
4839       ONE_MORE_BYTE (c);
4840       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4841       char_offset++;
4842       coding->errors++;
4843     }
4844
4845  no_more_source:
4846   if (last_id != charset_ascii)
4847     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4848   coding->consumed_char += consumed_chars_base;
4849   coding->consumed = src_base - coding->source;
4850   coding->charbuf_used = charbuf - coding->charbuf;
4851 }
4852
4853 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4854    This function can encode charsets `ascii', `katakana-jisx0201',
4855    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4856    are sure that all these charsets are registered as official charset
4857    (i.e. do not have extended leading-codes).  Characters of other
4858    charsets are produced without any encoding.  If SJIS_P is 1, encode
4859    SJIS text, else encode BIG5 text.  */
4860
4861 static int
4862 encode_coding_sjis (struct coding_system *coding)
4863 {
4864   int multibytep = coding->dst_multibyte;
4865   int *charbuf = coding->charbuf;
4866   int *charbuf_end = charbuf + coding->charbuf_used;
4867   unsigned char *dst = coding->destination + coding->produced;
4868   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4869   int safe_room = 4;
4870   EMACS_INT produced_chars = 0;
4871   Lisp_Object attrs, charset_list, val;
4872   int ascii_compatible;
4873   struct charset *charset_kanji, *charset_kana;
4874   struct charset *charset_kanji2;
4875   int c;
4876
4877   CODING_GET_INFO (coding, attrs, charset_list);
4878   val = XCDR (charset_list);
4879   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4880   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4881   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4882
4883   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4884
4885   while (charbuf < charbuf_end)
4886     {
4887       ASSURE_DESTINATION (safe_room);
4888       c = *charbuf++;
4889       /* Now encode the character C.  */
4890       if (ASCII_CHAR_P (c) && ascii_compatible)
4891         EMIT_ONE_ASCII_BYTE (c);
4892       else if (CHAR_BYTE8_P (c))
4893         {
4894           c = CHAR_TO_BYTE8 (c);
4895           EMIT_ONE_BYTE (c);
4896         }
4897       else
4898         {
4899           unsigned code;
4900           struct charset *charset = char_charset (c, charset_list, &code);
4901
4902           if (!charset)
4903             {
4904               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4905                 {
4906                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4907                   charset = CHARSET_FROM_ID (charset_ascii);
4908                 }
4909               else
4910                 {
4911                   c = coding->default_char;
4912                   charset = char_charset (c, charset_list, &code);
4913                 }
4914             }
4915           if (code == CHARSET_INVALID_CODE (charset))
4916             abort ();
4917           if (charset == charset_kanji)
4918             {
4919               int c1, c2;
4920               JIS_TO_SJIS (code);
4921               c1 = code >> 8, c2 = code & 0xFF;
4922               EMIT_TWO_BYTES (c1, c2);
4923             }
4924           else if (charset == charset_kana)
4925             EMIT_ONE_BYTE (code | 0x80);
4926           else if (charset_kanji2 && charset == charset_kanji2)
4927             {
4928               int c1, c2;
4929
4930               c1 = code >> 8;
4931               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4932                   || c1 == 0x28
4933                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4934                 {
4935                   JIS_TO_SJIS2 (code);
4936                   c1 = code >> 8, c2 = code & 0xFF;
4937                   EMIT_TWO_BYTES (c1, c2);
4938                 }
4939               else
4940                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4941             }
4942           else
4943             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4944         }
4945     }
4946   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4947   coding->produced_char += produced_chars;
4948   coding->produced = dst - coding->destination;
4949   return 0;
4950 }
4951
4952 static int
4953 encode_coding_big5 (struct coding_system *coding)
4954 {
4955   int multibytep = coding->dst_multibyte;
4956   int *charbuf = coding->charbuf;
4957   int *charbuf_end = charbuf + coding->charbuf_used;
4958   unsigned char *dst = coding->destination + coding->produced;
4959   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4960   int safe_room = 4;
4961   EMACS_INT produced_chars = 0;
4962   Lisp_Object attrs, charset_list, val;
4963   int ascii_compatible;
4964   struct charset *charset_big5;
4965   int c;
4966
4967   CODING_GET_INFO (coding, attrs, charset_list);
4968   val = XCDR (charset_list);
4969   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4970   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4971
4972   while (charbuf < charbuf_end)
4973     {
4974       ASSURE_DESTINATION (safe_room);
4975       c = *charbuf++;
4976       /* Now encode the character C.  */
4977       if (ASCII_CHAR_P (c) && ascii_compatible)
4978         EMIT_ONE_ASCII_BYTE (c);
4979       else if (CHAR_BYTE8_P (c))
4980         {
4981           c = CHAR_TO_BYTE8 (c);
4982           EMIT_ONE_BYTE (c);
4983         }
4984       else
4985         {
4986           unsigned code;
4987           struct charset *charset = char_charset (c, charset_list, &code);
4988
4989           if (! charset)
4990             {
4991               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4992                 {
4993                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4994                   charset = CHARSET_FROM_ID (charset_ascii);
4995                 }
4996               else
4997                 {
4998                   c = coding->default_char;
4999                   charset = char_charset (c, charset_list, &code);
5000                 }
5001             }
5002           if (code == CHARSET_INVALID_CODE (charset))
5003             abort ();
5004           if (charset == charset_big5)
5005             {
5006               int c1, c2;
5007
5008               c1 = code >> 8, c2 = code & 0xFF;
5009               EMIT_TWO_BYTES (c1, c2);
5010             }
5011           else
5012             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5013         }
5014     }
5015   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5016   coding->produced_char += produced_chars;
5017   coding->produced = dst - coding->destination;
5018   return 0;
5019 }
5020
5021 \f
5022 /*** 10. CCL handlers ***/
5023
5024 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5025    Check if a text is encoded in a coding system of which
5026    encoder/decoder are written in CCL program.  If it is, return
5027    CATEGORY_MASK_CCL, else return 0.  */
5028
5029 static int
5030 detect_coding_ccl (struct coding_system *coding,
5031                    struct coding_detection_info *detect_info)
5032 {
5033   const unsigned char *src = coding->source, *src_base;
5034   const unsigned char *src_end = coding->source + coding->src_bytes;
5035   int multibytep = coding->src_multibyte;
5036   EMACS_INT consumed_chars = 0;
5037   int found = 0;
5038   unsigned char *valids;
5039   EMACS_INT head_ascii = coding->head_ascii;
5040   Lisp_Object attrs;
5041
5042   detect_info->checked |= CATEGORY_MASK_CCL;
5043
5044   coding = &coding_categories[coding_category_ccl];
5045   valids = CODING_CCL_VALIDS (coding);
5046   attrs = CODING_ID_ATTRS (coding->id);
5047   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5048     src += head_ascii;
5049
5050   while (1)
5051     {
5052       int c;
5053
5054       src_base = src;
5055       ONE_MORE_BYTE (c);
5056       if (c < 0 || ! valids[c])
5057         break;
5058       if ((valids[c] > 1))
5059         found = CATEGORY_MASK_CCL;
5060     }
5061   detect_info->rejected |= CATEGORY_MASK_CCL;
5062   return 0;
5063
5064  no_more_source:
5065   detect_info->found |= found;
5066   return 1;
5067 }
5068
5069 static void
5070 decode_coding_ccl (struct coding_system *coding)
5071 {
5072   const unsigned char *src = coding->source + coding->consumed;
5073   const unsigned char *src_end = coding->source + coding->src_bytes;
5074   int *charbuf = coding->charbuf + coding->charbuf_used;
5075   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5076   EMACS_INT consumed_chars = 0;
5077   int multibytep = coding->src_multibyte;
5078   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5079   int source_charbuf[1024];
5080   int source_byteidx[1025];
5081   Lisp_Object attrs, charset_list;
5082
5083   CODING_GET_INFO (coding, attrs, charset_list);
5084
5085   while (1)
5086     {
5087       const unsigned char *p = src;
5088       int i = 0;
5089
5090       if (multibytep)
5091         {
5092           while (i < 1024 && p < src_end)
5093             {
5094               source_byteidx[i] = p - src;
5095               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5096             }
5097           source_byteidx[i] = p - src;
5098         }
5099       else
5100         while (i < 1024 && p < src_end)
5101           source_charbuf[i++] = *p++;
5102
5103       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5104         ccl->last_block = 1;
5105       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5106                   charset_list);
5107       charbuf += ccl->produced;
5108       if (multibytep)
5109         src += source_byteidx[ccl->consumed];
5110       else
5111         src += ccl->consumed;
5112       consumed_chars += ccl->consumed;
5113       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5114         break;
5115     }
5116
5117   switch (ccl->status)
5118     {
5119     case CCL_STAT_SUSPEND_BY_SRC:
5120       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5121       break;
5122     case CCL_STAT_SUSPEND_BY_DST:
5123       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5124       break;
5125     case CCL_STAT_QUIT:
5126     case CCL_STAT_INVALID_CMD:
5127       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5128       break;
5129     default:
5130       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5131       break;
5132     }
5133   coding->consumed_char += consumed_chars;
5134   coding->consumed = src - coding->source;
5135   coding->charbuf_used = charbuf - coding->charbuf;
5136 }
5137
5138 static int
5139 encode_coding_ccl (struct coding_system *coding)
5140 {
5141   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5142   int multibytep = coding->dst_multibyte;
5143   int *charbuf = coding->charbuf;
5144   int *charbuf_end = charbuf + coding->charbuf_used;
5145   unsigned char *dst = coding->destination + coding->produced;
5146   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5147   int destination_charbuf[1024];
5148   EMACS_INT produced_chars = 0;
5149   int i;
5150   Lisp_Object attrs, charset_list;
5151
5152   CODING_GET_INFO (coding, attrs, charset_list);
5153   if (coding->consumed_char == coding->src_chars
5154       && coding->mode & CODING_MODE_LAST_BLOCK)
5155     ccl->last_block = 1;
5156
5157   while (charbuf < charbuf_end)
5158     {
5159       ccl_driver (ccl, charbuf, destination_charbuf,
5160                   charbuf_end - charbuf, 1024, charset_list);
5161       if (multibytep)
5162         {
5163           ASSURE_DESTINATION (ccl->produced * 2);
5164           for (i = 0; i < ccl->produced; i++)
5165             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5166         }
5167       else
5168         {
5169           ASSURE_DESTINATION (ccl->produced);
5170           for (i = 0; i < ccl->produced; i++)
5171             *dst++ = destination_charbuf[i] & 0xFF;
5172           produced_chars += ccl->produced;
5173         }
5174       charbuf += ccl->consumed;
5175       if (ccl->status == CCL_STAT_QUIT
5176           || ccl->status == CCL_STAT_INVALID_CMD)
5177         break;
5178     }
5179
5180   switch (ccl->status)
5181     {
5182     case CCL_STAT_SUSPEND_BY_SRC:
5183       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5184       break;
5185     case CCL_STAT_SUSPEND_BY_DST:
5186       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5187       break;
5188     case CCL_STAT_QUIT:
5189     case CCL_STAT_INVALID_CMD:
5190       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5191       break;
5192     default:
5193       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5194       break;
5195     }
5196
5197   coding->produced_char += produced_chars;
5198   coding->produced = dst - coding->destination;
5199   return 0;
5200 }
5201
5202
5203 \f
5204 /*** 10, 11. no-conversion handlers ***/
5205
5206 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5207
5208 static void
5209 decode_coding_raw_text (struct coding_system *coding)
5210 {
5211   int eol_dos =
5212     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5213
5214   coding->chars_at_source = 1;
5215   coding->consumed_char = coding->src_chars;
5216   coding->consumed = coding->src_bytes;
5217   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5218     {
5219       coding->consumed_char--;
5220       coding->consumed--;
5221       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5222     }
5223   else
5224     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5225 }
5226
5227 static int
5228 encode_coding_raw_text (struct coding_system *coding)
5229 {
5230   int multibytep = coding->dst_multibyte;
5231   int *charbuf = coding->charbuf;
5232   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5233   unsigned char *dst = coding->destination + coding->produced;
5234   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5235   EMACS_INT produced_chars = 0;
5236   int c;
5237
5238   if (multibytep)
5239     {
5240       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5241
5242       if (coding->src_multibyte)
5243         while (charbuf < charbuf_end)
5244           {
5245             ASSURE_DESTINATION (safe_room);
5246             c = *charbuf++;
5247             if (ASCII_CHAR_P (c))
5248               EMIT_ONE_ASCII_BYTE (c);
5249             else if (CHAR_BYTE8_P (c))
5250               {
5251                 c = CHAR_TO_BYTE8 (c);
5252                 EMIT_ONE_BYTE (c);
5253               }
5254             else
5255               {
5256                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5257
5258                 CHAR_STRING_ADVANCE (c, p1);
5259                 do
5260                   {
5261                     EMIT_ONE_BYTE (*p0);
5262                     p0++;
5263                   }
5264                 while (p0 < p1);
5265               }
5266           }
5267       else
5268         while (charbuf < charbuf_end)
5269           {
5270             ASSURE_DESTINATION (safe_room);
5271             c = *charbuf++;
5272             EMIT_ONE_BYTE (c);
5273           }
5274     }
5275   else
5276     {
5277       if (coding->src_multibyte)
5278         {
5279           int safe_room = MAX_MULTIBYTE_LENGTH;
5280
5281           while (charbuf < charbuf_end)
5282             {
5283               ASSURE_DESTINATION (safe_room);
5284               c = *charbuf++;
5285               if (ASCII_CHAR_P (c))
5286                 *dst++ = c;
5287               else if (CHAR_BYTE8_P (c))
5288                 *dst++ = CHAR_TO_BYTE8 (c);
5289               else
5290                 CHAR_STRING_ADVANCE (c, dst);
5291             }
5292         }
5293       else
5294         {
5295           ASSURE_DESTINATION (charbuf_end - charbuf);
5296           while (charbuf < charbuf_end && dst < dst_end)
5297             *dst++ = *charbuf++;
5298         }
5299       produced_chars = dst - (coding->destination + coding->produced);
5300     }
5301   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5302   coding->produced_char += produced_chars;
5303   coding->produced = dst - coding->destination;
5304   return 0;
5305 }
5306
5307 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5308    Check if a text is encoded in a charset-based coding system.  If it
5309    is, return 1, else return 0.  */
5310
5311 static int
5312 detect_coding_charset (struct coding_system *coding,
5313                        struct coding_detection_info *detect_info)
5314 {
5315   const unsigned char *src = coding->source, *src_base;
5316   const unsigned char *src_end = coding->source + coding->src_bytes;
5317   int multibytep = coding->src_multibyte;
5318   EMACS_INT consumed_chars = 0;
5319   Lisp_Object attrs, valids, name;
5320   int found = 0;
5321   EMACS_INT head_ascii = coding->head_ascii;
5322   int check_latin_extra = 0;
5323
5324   detect_info->checked |= CATEGORY_MASK_CHARSET;
5325
5326   coding = &coding_categories[coding_category_charset];
5327   attrs = CODING_ID_ATTRS (coding->id);
5328   valids = AREF (attrs, coding_attr_charset_valids);
5329   name = CODING_ID_NAME (coding->id);
5330   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5331                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5332       || strncmp (SSDATA (SYMBOL_NAME (name)),
5333                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5334     check_latin_extra = 1;
5335
5336   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5337     src += head_ascii;
5338
5339   while (1)
5340     {
5341       int c;
5342       Lisp_Object val;
5343       struct charset *charset;
5344       int dim, idx;
5345
5346       src_base = src;
5347       ONE_MORE_BYTE (c);
5348       if (c < 0)
5349         continue;
5350       val = AREF (valids, c);
5351       if (NILP (val))
5352         break;
5353       if (c >= 0x80)
5354         {
5355           if (c < 0xA0
5356               && check_latin_extra
5357               && (!VECTORP (Vlatin_extra_code_table)
5358                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5359             break;
5360           found = CATEGORY_MASK_CHARSET;
5361         }
5362       if (INTEGERP (val))
5363         {
5364           charset = CHARSET_FROM_ID (XFASTINT (val));
5365           dim = CHARSET_DIMENSION (charset);
5366           for (idx = 1; idx < dim; idx++)
5367             {
5368               if (src == src_end)
5369                 goto too_short;
5370               ONE_MORE_BYTE (c);
5371               if (c < charset->code_space[(dim - 1 - idx) * 4]
5372                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5373                 break;
5374             }
5375           if (idx < dim)
5376             break;
5377         }
5378       else
5379         {
5380           idx = 1;
5381           for (; CONSP (val); val = XCDR (val))
5382             {
5383               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5384               dim = CHARSET_DIMENSION (charset);
5385               while (idx < dim)
5386                 {
5387                   if (src == src_end)
5388                     goto too_short;
5389                   ONE_MORE_BYTE (c);
5390                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5391                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5392                     break;
5393                   idx++;
5394                 }
5395               if (idx == dim)
5396                 {
5397                   val = Qnil;
5398                   break;
5399                 }
5400             }
5401           if (CONSP (val))
5402             break;
5403         }
5404     }
5405  too_short:
5406   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5407   return 0;
5408
5409  no_more_source:
5410   detect_info->found |= found;
5411   return 1;
5412 }
5413
5414 static void
5415 decode_coding_charset (struct coding_system *coding)
5416 {
       /* Decode the bytes of CODING->source from offset CODING->consumed
          up to CODING->src_bytes into CODING->charbuf, appending at index
          CODING->charbuf_used.  The first byte of every code is looked up
          in the `charset_valids' attribute of the coding system; the
          entry is either a charset ID, a list of charset IDs to try in
          order, or anything else, which marks the byte as invalid.  On
          exit CODING->consumed, CODING->consumed_char and
          CODING->charbuf_used are updated, and CODING->errors has been
          incremented once per byte that could not be decoded.  */
5417   const unsigned char *src = coding->source + coding->consumed;
5418   const unsigned char *src_end = coding->source + coding->src_bytes;
5419   const unsigned char *src_base;
5420   int *charbuf = coding->charbuf + coding->charbuf_used;
5421   /* We may produce one charset annotation in one loop and one more at
5422      the end.  */
5423   int *charbuf_end
5424     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5425   EMACS_INT consumed_chars = 0, consumed_chars_base;
5426   int multibytep = coding->src_multibyte;
5427   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5428   Lisp_Object valids;
5429   EMACS_INT char_offset = coding->produced_char;
5430   EMACS_INT last_offset = char_offset;
5431   int last_id = charset_ascii;
5432   int eol_dos =
5433     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_DOS is set, or -1 if none.  */
5434   int byte_after_cr = -1;
5435
5436   valids = AREF (attrs, coding_attr_charset_valids);
5437
5438   while (1)
5439     {
5440       int c;
5441       Lisp_Object val;
5442       struct charset *charset;
5443       int dim;
5444       int len = 1;
5445       unsigned code;
5446
           /* Remember where this code starts so that the invalid_code
              path can rewind and so that only completely decoded codes
              are reported as consumed.  */
5447       src_base = src;
5448       consumed_chars_base = consumed_chars;
5449
           /* Stop when the output buffer has no room left.  If a byte was
              read ahead after a CR, step SRC_BASE back so that this byte
              is not counted in CODING->consumed.  */
5450       if (charbuf >= charbuf_end)
5451         {
5452           if (byte_after_cr >= 0)
5453             src_base--;
5454           break;
5455         }
5456
           /* Use the byte read ahead on the previous iteration, if any.  */
5457       if (byte_after_cr >= 0)
5458         {
5459           c = byte_after_cr;
5460           byte_after_cr = -1;
5461         }
5462       else
5463         {
5464           ONE_MORE_BYTE (c);
               /* With DOS end-of-line, read the byte following a CR ahead
                  of time.  NOTE(review): presumably ONE_MORE_BYTE jumps
                  to `no_more_source' when the input is exhausted, which
                  would leave the CR unconsumed -- confirm against the
                  macro definition.  */
5465           if (eol_dos && c == '\r')
5466             ONE_MORE_BYTE (byte_after_cr);
5467         }
5468       if (c < 0)
5469         goto invalid_code;
5470       code = c;
5471
5472       val = AREF (valids, c);
5473       if (! INTEGERP (val) && ! CONSP (val))
5474         goto invalid_code;
5475       if (INTEGERP (val))
5476         {
               /* Exactly one charset can start with this byte.  Gather
                  the remaining DIM - 1 bytes, most significant first.  */
5477           charset = CHARSET_FROM_ID (XFASTINT (val));
5478           dim = CHARSET_DIMENSION (charset);
5479           while (len < dim)
5480             {
5481               ONE_MORE_BYTE (c);
5482               code = (code << 8) | c;
5483               len++;
5484             }
5485           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5486                               charset, code, c);
5487         }
5488       else
5489         {
5490           /* VAL is a list of charset IDs.  It is assured that the
5491              list is sorted by charset dimensions (smaller one
5492              comes first).  */
               /* Try each candidate in turn, extending CODE only by the
                  bytes not yet read, and stop at the first charset for
                  which the decoded character C is non-negative.  */
5493           while (CONSP (val))
5494             {
5495               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5496               dim = CHARSET_DIMENSION (charset);
5497               while (len < dim)
5498                 {
5499                   ONE_MORE_BYTE (c);
5500                   code = (code << 8) | c;
5501                   len++;
5502                 }
5503               CODING_DECODE_CHAR (coding, src, src_base,
5504                                   src_end, charset, code, c);
5505               if (c >= 0)
5506                 break;
5507               val = XCDR (val);
5508             }
5509         }
5510       if (c < 0)
5511         goto invalid_code;
           /* When the charset changes to a different non-ASCII charset,
              close the run of the previous non-ASCII charset with an
              annotation and start a new run at the current offset.  */
5512       if (charset->id != charset_ascii
5513           && last_id != charset->id)
5514         {
5515           if (last_id != charset_ascii)
5516             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5517           last_id = charset->id;
5518           last_offset = char_offset;
5519         }
5520
5521       *charbuf++ = c;
5522       char_offset++;
5523       continue;
5524
5525     invalid_code:
           /* Rewind to the first byte of the failed code and pass that
              single byte through: an ASCII byte as is, any other byte via
              BYTE8_TO_CHAR, and a negative value from ONE_MORE_BYTE
              negated.  The bytes after it are decoded again on the next
              iteration.  */
5526       src = src_base;
5527       consumed_chars = consumed_chars_base;
5528       ONE_MORE_BYTE (c);
5529       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5530       char_offset++;
5531       coding->errors++;
5532     }
5533
5534  no_more_source:
       /* Flush the annotation for a still-open non-ASCII run, then
          publish the progress up to the start of the last incomplete
          code.  */
5535   if (last_id != charset_ascii)
5536     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5537   coding->consumed_char += consumed_chars_base;
5538   coding->consumed = src_base - coding->source;
5539   coding->charbuf_used = charbuf - coding->charbuf;
5540 }
5541
5542 static int
5543 encode_coding_charset (struct coding_system *coding)
5544 {
5545   int multibytep = coding->dst_multibyte;
5546   int *charbuf = coding->charbuf;
5547   int *charbuf_end = charbuf + coding->charbuf_used;
5548   unsigned char *dst = coding->destination + coding->produced;
5549   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5550   int safe_room = MAX_MULTIBYTE_LENGTH;
5551   EMACS_INT produced_chars = 0;
5552   Lisp_Object attrs, charset_list;
5553   int ascii_compatible;
5554   int c;
5555
5556   CODING_GET_INFO (coding, attrs, charset_list);
5557   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5558
5559   while (charbuf < charbuf_end)
5560     {
5561       struct charset *charset;
5562       unsigned code;
5563
5564       ASSURE_DESTINATION (safe_room);
5565       c = *charbuf++;
5566       if (ascii_compatible && ASCII_CHAR_P (c))
5567         EMIT_ONE_ASCII_BYTE (c);
5568       else if (CHAR_BYTE8_P (c))
5569         {
5570           c = CHAR_TO_BYTE8 (c);
5571           EMIT_ONE_BYTE (c);
5572         }
5573       else
5574         {
5575           charset = char_charset (c, charset_list, &code);
5576           if (charset)
5577             {
5578               if (CHARSET_DIMENSION (charset) == 1)
5579                 EMIT_ONE_BYTE (code);
5580               else if (CHARSET_DIMENSION (charset) == 2)
5581                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5582               else if (CHARSET_DIMENSION (charset) == 3)
5583                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5584               else
5585                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5586                                  (code >> 8) & 0xFF, code & 0xFF);
5587             }
5588           else
5589             {
5590               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5591                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5592               else
5593                 c = coding->default_char;
5594               EMIT_ONE_BYTE (c);
5595             }
5596         }
5597     }
5598
5599   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5600   coding->produced_char += produced_chars;
5601   coding->produced = dst - coding->destination;
5602   return 0;
5603 }
5604
5605 \f
5606 /*** 10. C library functions ***/
5607
5608 /* Setup coding context CODING from information about CODING_SYSTEM.
5609    If CODING_SYSTEM is nil, `undecided' is used instead.  If
5610    CODING_SYSTEM is invalid, signal an error.

        This resets CODING->mode, head_ascii, common_flags,
        max_charset_id, safe_charsets, default_char and carryover_bytes,
        and then installs the detector, decoder and encoder functions
        (plus any per-type state) that match the `type' attribute of the
        coding system.  */
5611
5612 void
5613 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5614 {
5615   Lisp_Object attrs;
5616   Lisp_Object eol_type;
5617   Lisp_Object coding_type;
5618   Lisp_Object val;
5619
5620   if (NILP (coding_system))
5621     coding_system = Qundecided;
5622
5623   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5624
5625   attrs = CODING_ID_ATTRS (coding->id);
       /* When end-of-line conversion is inhibited, behave as if the
          coding system used Unix line ends.  */
5626   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5627
5628   coding->mode = 0;
5629   coding->head_ascii = -1;
       /* A vector EOL type means the end-of-line format is not decided
          yet, so it has to be detected while decoding.  A decided
          non-Unix format needs conversion in both directions.  */
5630   if (VECTORP (eol_type))
5631     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5632                             | CODING_REQUIRE_DETECTION_MASK);
5633   else if (! EQ (eol_type, Qunix))
5634     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5635                             | CODING_REQUIRE_ENCODING_MASK);
5636   else
5637     coding->common_flags = 0;
5638   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5639     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5640   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5641     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5642   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5643     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5644
       /* The safe-charsets attribute is a string; its length gives the
          largest charset ID it covers.  */
5645   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5646   coding->max_charset_id = SCHARS (val) - 1;
5647   coding->safe_charsets = SDATA (val);
5648   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5649   coding->carryover_bytes = 0;
5650
       /* Install the handlers that correspond to the coding type.  */
5651   coding_type = CODING_ATTR_TYPE (attrs);
5652   if (EQ (coding_type, Qundecided))
5653     {
           /* Nothing is known yet: use the raw-text handlers and ask for
              detection.  */
5654       coding->detector = NULL;
5655       coding->decoder = decode_coding_raw_text;
5656       coding->encoder = encode_coding_raw_text;
5657       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5658     }
5659   else if (EQ (coding_type, Qiso_2022))
5660     {
5661       int i;
5662       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5663
5664       /* Invoke graphic register 0 to plane 0.  */
5665       CODING_ISO_INVOCATION (coding, 0) = 0;
5666       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5667       CODING_ISO_INVOCATION (coding, 1)
5668         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5669       /* Setup the initial status of designation.  */
5670       for (i = 0; i < 4; i++)
5671         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5672       /* Not single shifting initially.  */
5673       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5674       /* Beginning of buffer should also be regarded as bol. */
5675       CODING_ISO_BOL (coding) = 1;
5676       coding->detector = detect_coding_iso_2022;
5677       coding->decoder = decode_coding_iso_2022;
5678       coding->encoder = encode_coding_iso_2022;
5679       if (flags & CODING_ISO_FLAG_SAFE)
5680         coding->mode |= CODING_MODE_SAFE_ENCODING;
5681       coding->common_flags
5682         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5683             | CODING_REQUIRE_FLUSHING_MASK);
5684       if (flags & CODING_ISO_FLAG_COMPOSITION)
5685         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5686       if (flags & CODING_ISO_FLAG_DESIGNATION)
5687         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5688       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5689         {
               /* setup_iso_safe_charsets presumably refreshes the
                  safe-charsets attribute of ATTRS, so it is read again
                  here -- confirm against that function.  */
5690           setup_iso_safe_charsets (attrs);
5691           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5692           coding->max_charset_id = SCHARS (val) - 1;
5693           coding->safe_charsets = SDATA (val);
5694         }
5695       CODING_ISO_FLAGS (coding) = flags;
5696       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5697       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5698       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5699       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5700     }
5701   else if (EQ (coding_type, Qcharset))
5702     {
5703       coding->detector = detect_coding_charset;
5704       coding->decoder = decode_coding_charset;
5705       coding->encoder = encode_coding_charset;
5706       coding->common_flags
5707         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5708     }
5709   else if (EQ (coding_type, Qutf_8))
5710     {
           /* The BOM attribute is a cons when the presence of a BOM has
              to be detected, t when a BOM is used, and anything else
              when it is not.  */
5711       val = AREF (attrs, coding_attr_utf_bom);
5712       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5713                                    : EQ (val, Qt) ? utf_with_bom
5714                                    : utf_without_bom);
5715       coding->detector = detect_coding_utf_8;
5716       coding->decoder = decode_coding_utf_8;
5717       coding->encoder = encode_coding_utf_8;
5718       coding->common_flags
5719         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5720       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5721         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5722     }
5723   else if (EQ (coding_type, Qutf_16))
5724     {
5725       val = AREF (attrs, coding_attr_utf_bom);
5726       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5727                                     : EQ (val, Qt) ? utf_with_bom
5728                                     : utf_without_bom);
           /* Any endian attribute other than `big' selects little
              endian.  */
5729       val = AREF (attrs, coding_attr_utf_16_endian);
5730       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5731                                        : utf_16_little_endian);
5732       CODING_UTF_16_SURROGATE (coding) = 0;
5733       coding->detector = detect_coding_utf_16;
5734       coding->decoder = decode_coding_utf_16;
5735       coding->encoder = encode_coding_utf_16;
5736       coding->common_flags
5737         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5738       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5739         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5740     }
5741   else if (EQ (coding_type, Qccl))
5742     {
5743       coding->detector = detect_coding_ccl;
5744       coding->decoder = decode_coding_ccl;
5745       coding->encoder = encode_coding_ccl;
5746       coding->common_flags
5747         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5748             | CODING_REQUIRE_FLUSHING_MASK);
5749     }
5750   else if (EQ (coding_type, Qemacs_mule))
5751     {
5752       coding->detector = detect_coding_emacs_mule;
5753       coding->decoder = decode_coding_emacs_mule;
5754       coding->encoder = encode_coding_emacs_mule;
5755       coding->common_flags
5756         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
           /* NOTE(review): full_support is set to 1 here and again inside
              the branch below, so as written it is 1 unconditionally and
              the second assignment is redundant -- confirm whether this
              one was meant to be 0.  */
5757       coding->spec.emacs_mule.full_support = 1;
5758       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5759           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5760         {
5761           Lisp_Object tail, safe_charsets;
5762           int max_charset_id = 0;
5763
               /* Build a fresh safe-charsets string indexed by charset
                  ID: 255 everywhere except 0 at the ID of every charset
                  in Vemacs_mule_charset_list.  */
5764           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5765                tail = XCDR (tail))
5766             if (max_charset_id < XFASTINT (XCAR (tail)))
5767               max_charset_id = XFASTINT (XCAR (tail));
5768           safe_charsets = make_uninit_string (max_charset_id + 1);
5769           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5770           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5771                tail = XCDR (tail))
5772             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5773           coding->max_charset_id = max_charset_id;
5774           coding->safe_charsets = SDATA (safe_charsets);
5775           coding->spec.emacs_mule.full_support = 1;
5776         }
5777       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5778       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5779     }
5780   else if (EQ (coding_type, Qshift_jis))
5781     {
5782       coding->detector = detect_coding_sjis;
5783       coding->decoder = decode_coding_sjis;
5784       coding->encoder = encode_coding_sjis;
5785       coding->common_flags
5786         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5787     }
5788   else if (EQ (coding_type, Qbig5))
5789     {
5790       coding->detector = detect_coding_big5;
5791       coding->decoder = decode_coding_big5;
5792       coding->encoder = encode_coding_big5;
5793       coding->common_flags
5794         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5795     }
5796   else                          /* EQ (coding_type, Qraw_text) */
5797     {
5798       coding->detector = NULL;
5799       coding->decoder = decode_coding_raw_text;
5800       coding->encoder = encode_coding_raw_text;
           /* Raw text needs work only for end-of-line conversion:
              decoding for any non-Unix EOL type, encoding only when the
              EOL format is already decided.  */
5801       if (! EQ (eol_type, Qunix))
5802         {
5803           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5804           if (! VECTORP (eol_type))
5805             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5806         }
5807
5808     }
5809
5810   return;
5811 }
5812
5813 /* Return a list of charsets supported by CODING.  */
5814
5815 Lisp_Object
5816 coding_charset_list (struct coding_system *coding)
5817 {
5818   Lisp_Object attrs, charset_list;
5819
5820   CODING_GET_INFO (coding, attrs, charset_list);
5821   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5822     {
5823       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5824
5825       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5826         charset_list = Viso_2022_charset_list;
5827     }
5828   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5829     {
5830       charset_list = Vemacs_mule_charset_list;
5831     }
5832   return charset_list;
5833 }
5834
5835
5836 /* Return a list of charsets supported by CODING-SYSTEM.  */
5837
5838 Lisp_Object
5839 coding_system_charset_list (Lisp_Object coding_system)
5840 {
5841   ptrdiff_t id;
5842   Lisp_Object attrs, charset_list;
5843
5844   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5845   attrs = CODING_ID_ATTRS (id);
5846
5847   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5848     {
5849       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5850
5851       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5852         charset_list = Viso_2022_charset_list;
5853       else
5854         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5855     }
5856   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5857     {
5858       charset_list = Vemacs_mule_charset_list;
5859     }
5860   else
5861     {
5862       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5863     }
5864   return charset_list;
5865 }
5866
5867
5868 /* Return raw-text or one of its subsidiaries that has the same
5869    eol_type as CODING-SYSTEM.  */
5870
5871 Lisp_Object
5872 raw_text_coding_system (Lisp_Object coding_system)
5873 {
5874   Lisp_Object spec, attrs;
5875   Lisp_Object eol_type, raw_text_eol_type;
5876
5877   if (NILP (coding_system))
5878     return Qraw_text;
5879   spec = CODING_SYSTEM_SPEC (coding_system);
5880   attrs = AREF (spec, 0);
5881
5882   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5883     return coding_system;
5884
5885   eol_type = AREF (spec, 2);
5886   if (VECTORP (eol_type))
5887     return Qraw_text;
5888   spec = CODING_SYSTEM_SPEC (Qraw_text);
5889   raw_text_eol_type = AREF (spec, 2);
5890   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5891           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5892           : AREF (raw_text_eol_type, 2));
5893 }
5894
5895
5896 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5897    the subsidiary that has the same eol-spec as PARENT (if it is not
5898    nil and specifies end-of-line format) or the system's setting
5899    (system_eol_type).  */
5900
5901 Lisp_Object
5902 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5903 {
5904   Lisp_Object spec, eol_type;
5905
5906   if (NILP (coding_system))
5907     coding_system = Qraw_text;
5908   spec = CODING_SYSTEM_SPEC (coding_system);
5909   eol_type = AREF (spec, 2);
5910   if (VECTORP (eol_type))
5911     {
5912       Lisp_Object parent_eol_type;
5913
5914       if (! NILP (parent))
5915         {
5916           Lisp_Object parent_spec;
5917
5918           parent_spec = CODING_SYSTEM_SPEC (parent);
5919           parent_eol_type = AREF (parent_spec, 2);
5920           if (VECTORP (parent_eol_type))
5921             parent_eol_type = system_eol_type;
5922         }
5923       else
5924         parent_eol_type = system_eol_type;
5925       if (EQ (parent_eol_type, Qunix))
5926         coding_system = AREF (eol_type, 0);
5927       else if (EQ (parent_eol_type, Qdos))
5928         coding_system = AREF (eol_type, 1);
5929       else if (EQ (parent_eol_type, Qmac))
5930         coding_system = AREF (eol_type, 2);
5931     }
5932   return coding_system;
5933 }
5934
5935
5936 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5937    decided for writing to a process.  If not, complement them, and
5938    return a new coding system.  */
5939
5940 Lisp_Object
5941 complement_process_encoding_system (Lisp_Object coding_system)
5942 {
5943   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5944   Lisp_Object spec, attrs;
5945   int i;
5946
5947   for (i = 0; i < 3; i++)
5948     {
5949       if (i == 1)
5950         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5951       else if (i == 2)
5952         coding_system = preferred_coding_system ();
5953       spec = CODING_SYSTEM_SPEC (coding_system);
5954       if (NILP (spec))
5955         continue;
5956       attrs = AREF (spec, 0);
5957       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5958         coding_base = CODING_ATTR_BASE_NAME (attrs);
5959       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5960         eol_base = coding_system;
5961       if (! NILP (coding_base) && ! NILP (eol_base))
5962         break;
5963     }
5964
5965   if (i > 0)
5966     /* The original CODING_SYSTEM didn't specify text-conversion or
5967        eol-conversion.  Be sure that we return a fully complemented
5968        coding system.  */
5969     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5970   return coding_system;
5971 }
5972
5973
5974 /* Emacs has a mechanism to automatically detect a coding system if it
5975    is one of Emacs' internal format, ISO2022, SJIS, BIG5, UTF-8, and
5976    UTF-16.  But it is impossible to distinguish some coding systems
5977    accurately because they use the same range of codes.  So, at first,
5978    coding systems are classified into the following categories:
5979
5980    o coding-category-emacs-mule
5981
5982         The category for a coding system which has the same code range
5983         as Emacs' internal format.  Assigned the coding-system (Lisp
5984         symbol) `emacs-mule' by default.
5985
5986    o coding-category-sjis
5987
5988         The category for a coding system which has the same code range
5989         as SJIS.  Assigned the coding-system (Lisp
5990         symbol) `japanese-shift-jis' by default.
5991
5992    o coding-category-iso-7
5993
5994         The category for a coding system which has the same code range
5995         as ISO2022 of 7-bit environment.  This doesn't use any locking
5996         shift and single shift functions.  This can encode/decode all
5997         charsets.  Assigned the coding-system (Lisp symbol)
5998         `iso-2022-7bit' by default.
5999
6000    o coding-category-iso-7-tight
6001
6002         Same as coding-category-iso-7 except that this can
6003         encode/decode only the specified charsets.
6004
6005    o coding-category-iso-8-1
6006
6007         The category for a coding system which has the same code range
6008         as ISO2022 of 8-bit environment and graphic plane 1 used only
6009         for DIMENSION1 charset.  This doesn't use any locking shift
6010         and single shift functions.  Assigned the coding-system (Lisp
6011         symbol) `iso-latin-1' by default.
6012
6013    o coding-category-iso-8-2
6014
6015         The category for a coding system which has the same code range
6016         as ISO2022 of 8-bit environment and graphic plane 1 used only
6017         for DIMENSION2 charset.  This doesn't use any locking shift
6018         and single shift functions.  Assigned the coding-system (Lisp
6019         symbol) `japanese-iso-8bit' by default.
6020
6021    o coding-category-iso-7-else
6022
6023         The category for a coding system which has the same code range
6024         as ISO2022 of 7-bit environment but uses locking shift or
6025         single shift functions.  Assigned the coding-system (Lisp
6026         symbol) `iso-2022-7bit-lock' by default.
6027
6028    o coding-category-iso-8-else
6029
6030         The category for a coding system which has the same code range
6031         as ISO2022 of 8-bit environment but uses locking shift or
6032         single shift functions.  Assigned the coding-system (Lisp
6033         symbol) `iso-2022-8bit-ss2' by default.
6034
6035    o coding-category-big5
6036
6037         The category for a coding system which has the same code range
6038         as BIG5.  Assigned the coding-system (Lisp symbol)
6039         `cn-big5' by default.
6040
6041    o coding-category-utf-8
6042
6043         The category for a coding system which has the same code range
6044         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6045         symbol) `utf-8' by default.
6046
6047    o coding-category-utf-16-be
6048
6049         The category for a coding system in which a text has an
6050         Unicode signature (cf. Unicode Standard) in the order of BIG
6051         endian at the head.  Assigned the coding-system (Lisp symbol)
6052         `utf-16-be' by default.
6053
6054    o coding-category-utf-16-le
6055
6056         The category for a coding system in which a text has an
6057         Unicode signature (cf. Unicode Standard) in the order of
6058         LITTLE endian at the head.  Assigned the coding-system (Lisp
6059         symbol) `utf-16-le' by default.
6060
6061    o coding-category-ccl
6062
6063         The category for a coding system of which encoder/decoder is
6064         written in CCL programs.  The default value is nil, i.e., no
6065         coding system is assigned.
6066
6067    o coding-category-binary
6068
6069         The category for a coding system not categorized in any of the
6070         above.  Assigned the coding-system (Lisp symbol)
6071         `no-conversion' by default.
6072
6073    Each of them is a Lisp symbol and the value is an actual
6074    `coding-system's (this is also a Lisp symbol) assigned by a user.
6075    What Emacs does actually is to detect a category of coding system.
6076    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6077    decide only one possible category, it selects a category of the
6078    highest priority.  Priorities of categories are also specified by a
6079    user in a Lisp variable `coding-category-list'.
6080
6081 */
6082
6083 #define EOL_SEEN_NONE   0
6084 #define EOL_SEEN_LF     1
6085 #define EOL_SEEN_CR     2
6086 #define EOL_SEEN_CRLF   4
6087
6088 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6089    SOURCE is encoded.  If CATEGORY is one of
6090    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6091    two-byte, else they are encoded by one-byte.
6092
6093    Return one of EOL_SEEN_XXX.  */
6094
6095 #define MAX_EOL_CHECK_COUNT 3
6096
6097 static int
6098 detect_eol (const unsigned char *source, EMACS_INT src_bytes,
6099             enum coding_category category)
6100 {
6101   const unsigned char *src = source, *src_end = src + src_bytes;
6102   unsigned char c;
6103   int total  = 0;
6104   int eol_seen = EOL_SEEN_NONE;
6105
6106   if ((1 << category) & CATEGORY_MASK_UTF_16)
6107     {
6108       int msb, lsb;
6109
6110       msb = category == (coding_category_utf_16_le
6111                          | coding_category_utf_16_le_nosig);
6112       lsb = 1 - msb;
6113
6114       while (src + 1 < src_end)
6115         {
6116           c = src[lsb];
6117           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6118             {
6119               int this_eol;
6120
6121               if (c == '\n')
6122                 this_eol = EOL_SEEN_LF;
6123               else if (src + 3 >= src_end
6124                        || src[msb + 2] != 0
6125                        || src[lsb + 2] != '\n')
6126                 this_eol = EOL_SEEN_CR;
6127               else
6128                 {
6129                   this_eol = EOL_SEEN_CRLF;
6130                   src += 2;
6131                 }
6132
6133               if (eol_seen == EOL_SEEN_NONE)
6134                 /* This is the first end-of-line.  */
6135                 eol_seen = this_eol;
6136               else if (eol_seen != this_eol)
6137                 {
6138                   /* The found type is different from what found before.
6139                      Allow for stray ^M characters in DOS EOL files.  */
6140                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6141                       || (eol_seen == EOL_SEEN_CRLF
6142                           && this_eol == EOL_SEEN_CR))
6143                     eol_seen = EOL_SEEN_CRLF;
6144                   else
6145                     {
6146                       eol_seen = EOL_SEEN_LF;
6147                       break;
6148                     }
6149                 }
6150               if (++total == MAX_EOL_CHECK_COUNT)
6151                 break;
6152             }
6153           src += 2;
6154         }
6155     }
6156   else
6157     while (src < src_end)
6158       {
6159         c = *src++;
6160         if (c == '\n' || c == '\r')
6161           {
6162             int this_eol;
6163
6164             if (c == '\n')
6165               this_eol = EOL_SEEN_LF;
6166             else if (src >= src_end || *src != '\n')
6167               this_eol = EOL_SEEN_CR;
6168             else
6169               this_eol = EOL_SEEN_CRLF, src++;
6170
6171             if (eol_seen == EOL_SEEN_NONE)
6172               /* This is the first end-of-line.  */
6173               eol_seen = this_eol;
6174             else if (eol_seen != this_eol)
6175               {
6176                 /* The found type is different from what found before.
6177                    Allow for stray ^M characters in DOS EOL files.  */
6178                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6179                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6180                   eol_seen = EOL_SEEN_CRLF;
6181                 else
6182                   {
6183                     eol_seen = EOL_SEEN_LF;
6184                     break;
6185                   }
6186               }
6187             if (++total == MAX_EOL_CHECK_COUNT)
6188               break;
6189           }
6190       }
6191   return eol_seen;
6192 }
6193
6194
6195 static Lisp_Object
6196 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6197 {
6198   Lisp_Object eol_type;
6199
6200   eol_type = CODING_ID_EOL_TYPE (coding->id);
6201   if (eol_seen & EOL_SEEN_LF)
6202     {
6203       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6204       eol_type = Qunix;
6205     }
6206   else if (eol_seen & EOL_SEEN_CRLF)
6207     {
6208       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6209       eol_type = Qdos;
6210     }
6211   else if (eol_seen & EOL_SEEN_CR)
6212     {
6213       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6214       eol_type = Qmac;
6215     }
6216   return eol_type;
6217 }
6218
6219 /* Detect how the text specified in CODING is encoded.  If a coding
6220    system is detected, update the fields of CODING to those of the
6221    detected coding system (by calling setup_coding_system).

        What is examined depends on the coding system currently set in
        CODING:

        - If its type is `undecided', the source bytes are scanned for
          8-bit bytes, null bytes and the ISO-2022 codes ESC, SI and SO,
          and then the detectors of the coding categories are tried in
          the order given by coding_priorities.

        - If its category is utf-8-auto or utf-16-auto, the choice is
          only between the two coding systems in the cons stored in its
          coding_attr_utf_bom attribute.

        - Otherwise nothing is detected.

        The consumed/produced counters and CODING->head_ascii are reset
        on entry; CODING->mode is saved on entry and restored on exit.  */
6222
6223 static void
6224 detect_coding (struct coding_system *coding)
6225 {
6226   const unsigned char *src, *src_end;
6227   int saved_mode = coding->mode;
6228
6229   coding->consumed = coding->consumed_char = 0;
6230   coding->produced = coding->produced_char = 0;
6231   coding_set_source (coding);
6232
6233   src_end = coding->source + coding->src_bytes;
6234   coding->head_ascii = 0;
6235
6236   /* If we have not yet decided the text encoding type, detect it
6237      now.  */
6238   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6239     {
6240       int c, i;
6241       struct coding_detection_info detect_info;
6242       int null_byte_found = 0, eight_bit_found = 0;
6243
6244       detect_info.checked = detect_info.found = detect_info.rejected = 0;
           /* Scan the source.  CODING->head_ascii counts the bytes below
              0x80 seen before the first 8-bit byte.  The scan stops early
              once both a null byte and an 8-bit byte have been seen, or
              when detect_coding_iso_2022 returns nonzero.  */
6245       for (src = coding->source; src < src_end; src++)
6246         {
6247           c = *src;
6248           if (c & 0x80)
6249             {
6250               eight_bit_found = 1;
6251               if (null_byte_found)
6252                 break;
6253             }
6254           else if (c < 0x20)
6255             {
6256               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6257                   && ! inhibit_iso_escape_detection
6258                   && ! detect_info.checked)
6259                 {
6260                   if (detect_coding_iso_2022 (coding, &detect_info))
6261                     {
6262                       /* We have scanned the whole data.  */
6263                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6264                         {
6265                           /* We didn't find an 8-bit code.  We may
6266                              have found a null-byte, but it's very
6267                              rare that a binary file conforms to
6268                              ISO-2022.  */
6269                           src = src_end;
6270                           coding->head_ascii = src - coding->source;
6271                         }
6272                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6273                       break;
6274                     }
6275                 }
6276               else if (! c && !inhibit_null_byte_detection)
6277                 {
6278                   null_byte_found = 1;
6279                   if (eight_bit_found)
6280                     break;
6281                 }
6282               if (! eight_bit_found)
6283                 coding->head_ascii++;
6284             }
6285           else if (! eight_bit_found)
6286             coding->head_ascii++;
6287         }
6288
           /* Nothing more is done if every byte was counted in head_ascii,
              no null byte was seen, and the ISO-2022 detector found
              nothing.  */
6289       if (null_byte_found || eight_bit_found
6290           || coding->head_ascii < coding->src_bytes
6291           || detect_info.found)
6292         {
6293           enum coding_category category;
6294           struct coding_system *this;
6295
6296           if (coding->head_ascii == coding->src_bytes)
6297             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6298             for (i = 0; i < coding_category_raw_text; i++)
6299               {
6300                 category = coding_priorities[i];
6301                 this = coding_categories + category;
6302                 if (detect_info.found & (1 << category))
6303                   break;
6304               }
6305           else
6306             {
6307               if (null_byte_found)
6308                 {
                       /* Treat every category outside CATEGORY_MASK_UTF_16
                          as already checked and rejected.  */
6309                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6310                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6311                 }
6312               for (i = 0; i < coding_category_raw_text; i++)
6313                 {
6314                   category = coding_priorities[i];
6315                   this = coding_categories + category;
6316                   if (this->id < 0)
6317                     {
6318                       /* No coding system of this category is defined.  */
6319                       detect_info.rejected |= (1 << category);
6320                     }
6321                   else if (category >= coding_category_raw_text)
6322                     continue;
6323                   else if (detect_info.checked & (1 << category))
6324                     {
6325                       if (detect_info.found & (1 << category))
6326                         break;
6327                     }
6328                   else if ((*(this->detector)) (coding, &detect_info)
6329                            && detect_info.found & (1 << category))
6330                     {
6331                       if (category == coding_category_utf_16_auto)
6332                         {
6333                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6334                             category = coding_category_utf_16_le;
6335                           else
6336                             category = coding_category_utf_16_be;
6337                         }
6338                       break;
6339                     }
6340                 }
6341             }
6342
               /* Pick the result.  I < coding_category_raw_text means that
                  one of the loops above stopped at a detected category, and
                  THIS is its entry in coding_categories.  Failing that, use
                  no-conversion if a null byte was seen, raw-text if every
                  category of CATEGORY_MASK_ANY was rejected, and else the
                  first category in coding_priorities order that was not
                  rejected.  If nothing was rejected either, CODING is left
                  as it is.  */
6343           if (i < coding_category_raw_text)
6344             setup_coding_system (CODING_ID_NAME (this->id), coding);
6345           else if (null_byte_found)
6346             setup_coding_system (Qno_conversion, coding);
6347           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6348                    == CATEGORY_MASK_ANY)
6349             setup_coding_system (Qraw_text, coding);
6350           else if (detect_info.rejected)
6351             for (i = 0; i < coding_category_raw_text; i++)
6352               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6353                 {
6354                   this = coding_categories + coding_priorities[i];
6355                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6356                   break;
6357                 }
6358         }
6359     }
6360   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6361            == coding_category_utf_8_auto)
6362     {
6363       Lisp_Object coding_systems;
6364       struct coding_detection_info detect_info;
6365
6366       coding_systems
6367         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6368       detect_info.found = detect_info.rejected = 0;
6369       coding->head_ascii = 0;
           /* Use the car of the attribute when detect_coding_utf_8 reports
              CATEGORY_MASK_UTF_8_SIG, and its cdr otherwise.  Nothing is
              changed if the attribute is not a cons or detection fails.  */
6370       if (CONSP (coding_systems)
6371           && detect_coding_utf_8 (coding, &detect_info))
6372         {
6373           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6374             setup_coding_system (XCAR (coding_systems), coding);
6375           else
6376             setup_coding_system (XCDR (coding_systems), coding);
6377         }
6378     }
6379   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6380            == coding_category_utf_16_auto)
6381     {
6382       Lisp_Object coding_systems;
6383       struct coding_detection_info detect_info;
6384
6385       coding_systems
6386         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6387       detect_info.found = detect_info.rejected = 0;
6388       coding->head_ascii = 0;
           /* Use the car of the attribute for little endian and its cdr
              for big endian; change nothing if neither was found.  */
6389       if (CONSP (coding_systems)
6390           && detect_coding_utf_16 (coding, &detect_info))
6391         {
6392           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6393             setup_coding_system (XCAR (coding_systems), coding);
6394           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6395             setup_coding_system (XCDR (coding_systems), coding);
6396         }
6397     }
6398   coding->mode = saved_mode;
6399 }
6400
6401 /* Convert the end-of-line format of the text produced by CODING to
        LF, in place.  The text is the CODING->produced bytes found at
        CODING->destination if CODING->dst_object is nil, and otherwise at
        the address BYTE_POS_ADDR gives for CODING->dst_pos_byte.

        Nothing is done if the eol type of CODING is `unix' or if
        inhibit_eol_conversion is nonzero.  If the eol type is a vector
        (NOTE(review): presumably meaning that it is not decided yet --
        confirm), the text is first scanned for LF, CRLF and lone CR, and
        adjust_coding_eol_type picks the type from what was seen.

        For `mac', each CR becomes LF.  For `dos', CRs are deleted, and
        CODING->produced and CODING->produced_char are decreased by the
        number of deletions.  */

6402 static void
6403 decode_eol (struct coding_system *coding)
6404 {
6405   Lisp_Object eol_type;
6406   unsigned char *p, *pbeg, *pend;
6407
6408   eol_type = CODING_ID_EOL_TYPE (coding->id);
6409   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6410     return;
6411
       /* Locate the produced text: [PBEG, PEND).  */
6412   if (NILP (coding->dst_object))
6413     pbeg = coding->destination;
6414   else
6415     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6416   pend = pbeg + coding->produced;
6417
6418   if (VECTORP (eol_type))
6419     {
6420       int eol_seen = EOL_SEEN_NONE;
6421
           /* Record which kinds of line ending occur.  A CR followed by
              LF counts as CRLF only, not also as LF.  */
6422       for (p = pbeg; p < pend; p++)
6423         {
6424           if (*p == '\n')
6425             eol_seen |= EOL_SEEN_LF;
6426           else if (*p == '\r')
6427             {
6428               if (p + 1 < pend && *(p + 1) == '\n')
6429                 {
6430                   eol_seen |= EOL_SEEN_CRLF;
6431                   p++;
6432                 }
6433               else
6434                 eol_seen |= EOL_SEEN_CR;
6435             }
6436         }
6437       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6438       if ((eol_seen & EOL_SEEN_CRLF) != 0
6439           && (eol_seen & EOL_SEEN_CR) != 0
6440           && (eol_seen & EOL_SEEN_LF) == 0)
6441         eol_seen = EOL_SEEN_CRLF;
           /* Any other mixture of line endings is treated as LF.  */
6442       else if (eol_seen != EOL_SEEN_NONE
6443           && eol_seen != EOL_SEEN_LF
6444           && eol_seen != EOL_SEEN_CRLF
6445           && eol_seen != EOL_SEEN_CR)
6446         eol_seen = EOL_SEEN_LF;
6447       if (eol_seen != EOL_SEEN_NONE)
6448         eol_type = adjust_coding_eol_type (coding, eol_seen);
6449     }
6450
6451   if (EQ (eol_type, Qmac))
6452     {
6453       for (p = pbeg; p < pend; p++)
6454         if (*p == '\r')
6455           *p = '\n';
6456     }
6457   else if (EQ (eol_type, Qdos))
6458     {
           /* Number of '\r' bytes deleted.  */
6459       EMACS_INT n = 0;
6460
6461       if (NILP (coding->dst_object))
6462         {
6463           /* Start deleting '\r' from the tail to minimize the memory
6464              movement.  Every '\r' except one in the very last byte is
                  deleted, whether or not it is followed by '\n'.  */
6465           for (p = pend - 2; p >= pbeg; p--)
6466             if (*p == '\r')
6467               {
6468                 memmove (p, p + 1, pend-- - p - 1);
6469                 n++;
6470               }
6471         }
6472       else
6473         {
6474           EMACS_INT pos_byte = coding->dst_pos_byte;
6475           EMACS_INT pos = coding->dst_pos;
6476           EMACS_INT pos_end = pos + coding->produced_char - 1;
6477
               /* Here only a '\r' immediately followed by '\n' is deleted,
                  with one del_range_2 call each.  POS_END stops one
                  character short of the end of the produced text and is
                  decremented after each deletion.  */
6478           while (pos < pos_end)
6479             {
6480               p = BYTE_POS_ADDR (pos_byte);
6481               if (*p == '\r' && p[1] == '\n')
6482                 {
6483                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6484                   n++;
6485                   pos_end--;
6486                 }
6487               pos++;
6488               if (coding->dst_multibyte)
6489                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6490               else
6491                 pos_byte++;
6492             }
6493         }
6494       coding->produced -= n;
6495       coding->produced_char -= n;
6496     }
6497 }
6498
6499
6500 /* Return the translation table (a char-table or a list of them) for
6501    encoding (ENCODEP nonzero) or decoding (ENCODEP zero) with the
6502    coding system whose attribute vector is ATTRS, or Qnil if character
6503    translation is disabled.  The standard table is returned alone if
6504    ATTRS names no table, and is appended if it is a char-table.  If
6505    MAX_LOOKUP is non-NULL, set *MAX_LOOKUP to 0 in the disabled case,
6506    else to the largest extra-slot-1 natural number found (minimum 1).  */
6507
6508 static Lisp_Object
6509 get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
6510 {
6511   Lisp_Object table, fallback;
6512
6513   if (NILP (Venable_character_translation))
6514     {
6515       if (max_lookup)
6516         *max_lookup = 0;
6517       return Qnil;
6518     }
6519
6520   table = (encodep ? CODING_ATTR_ENCODE_TBL (attrs)
6521            : CODING_ATTR_DECODE_TBL (attrs));
6522   fallback = (encodep ? Vstandard_translation_table_for_encode
6523               : Vstandard_translation_table_for_decode);
6524   if (NILP (table))
6525     table = fallback;
6526   else
6527     {
6528       /* Replace each symbol by its `translation-table' property.  */
6529       if (SYMBOLP (table))
6530         table = Fget (table, Qtranslation_table);
6531       else if (CONSP (table))
6532         {
6533           Lisp_Object cell;
6534           table = Fcopy_sequence (table);
6535           for (cell = table; CONSP (cell); cell = XCDR (cell))
6536             if (SYMBOLP (XCAR (cell)))
6537               XSETCAR (cell, Fget (XCAR (cell), Qtranslation_table));
6538         }
6539       if (CHAR_TABLE_P (fallback))
6540         {
6541           Lisp_Object last = Fcons (fallback, Qnil);
6542           table = CONSP (table) ? nconc2 (table, last) : Fcons (table, last);
6543         }
6544     }
6545   if (! max_lookup)
6546     return table;
6547
6548   *max_lookup = 1;
6549   if (CHAR_TABLE_P (table))
6550     {
6551       if (CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (table)) > 1)
6552         {
6553           Lisp_Object n = XCHAR_TABLE (table)->extras[1];
6554           if (NATNUMP (n) && XFASTINT (n) > *max_lookup)
6555             *max_lookup = XFASTINT (n);
6556         }
6557     }
6558   else
6559     {
6560       Lisp_Object rest, n;
6561       for (rest = table; CONSP (rest); rest = XCDR (rest))
6562         if (CHAR_TABLE_P (XCAR (rest))
6563             && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (rest))) > 1)
6564           {
6565             n = XCHAR_TABLE (XCAR (rest))->extras[1];
6566             if (NATNUMP (n) && XFASTINT (n) > *max_lookup)
6567               *max_lookup = XFASTINT (n);
6568           }
6569     }
6570   return table;
6571 }
6572
     /* Look up character C in TABLE and set TRANS to the result.

        If TABLE is a char-table, TRANS is set to its entry for C.  If
        that entry is a character, C is replaced by it and TRANS is set
        to Qnil.

        If TABLE is a list, each element that is a char-table is
        consulted in turn in the same way, with the possibly already
        replaced C, until one of them has an entry that is neither nil
        nor a character; TRANS is then left set to that entry.

        TRANS is Qnil in all other cases.  C and TRANS must be lvalues,
        and TABLE and C are evaluated more than once.  */
6573 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6574   do {                                                          \
6575     trans = Qnil;                                               \
6576     if (CHAR_TABLE_P (table))                                   \
6577       {                                                         \
6578         trans = CHAR_TABLE_REF (table, c);                      \
6579         if (CHARACTERP (trans))                                 \
6580           c = XFASTINT (trans), trans = Qnil;                   \
6581       }                                                         \
6582     else if (CONSP (table))                                     \
6583       {                                                         \
6584         Lisp_Object tail;                                       \
6585                                                                 \
6586         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6587           if (CHAR_TABLE_P (XCAR (tail)))                       \
6588             {                                                   \
6589               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6590               if (CHARACTERP (trans))                           \
6591                 c = XFASTINT (trans), trans = Qnil;             \
6592               else if (! NILP (trans))                          \
6593                 break;                                          \
6594             }                                                   \
6595       }                                                         \
6596   } while (0)
6597
6598
6599 /* Look up the character(s) starting at BUF in TRANS, where BUF_END
6600    points just past the last available character.  TRANS is TO-CHAR
6601    or ((FROM . TO) ...), where FROM = [FROM-CHAR ...] and TO is TO-CHAR
6602    or [TO-CHAR ...].  Return TO-CHAR, or the first element
6603    ([FROM-CHAR ...] . TO) whose FROM matches the characters at BUF, or
6604    Qnil if none does.  Return Qt if BUF runs out while FROM matches.  */
6605
6606 static Lisp_Object
6607 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6608 {
6609   Lisp_Object rest;
6610
6611   if (INTEGERP (trans))
6612     return trans;
6613   for (rest = trans; CONSP (rest); rest = XCDR (rest))
6614     {
6615       Lisp_Object entry = XCAR (rest);
6616       Lisp_Object from = XCAR (entry);
6617       int nfrom = ASIZE (from);
6618       int k = 0;
6619
6620       /* Advance K over the leading characters of FROM found at BUF.  */
6621       while (k < nfrom && buf + k != buf_end
6622              && XINT (AREF (from, k)) == buf[k])
6623         k++;
6624       if (k == nfrom)
6625         return entry;
6626       if (buf + k == buf_end)
6627         return Qt;
6628     }
6629   return Qnil;
6630 }
6631
6632 /* Store characters in the destination of CODING, starting at
        CODING->destination + CODING->produced, and return the number of
        elements of CODING->charbuf left unprocessed (the carryover).

        If CODING->chars_at_source is zero, the characters are taken from
        CODING->charbuf.  A non-negative element is a character, which is
        translated through TRANSLATION_TABLE before being stored; a
        negative element -N starts an annotation datum of N elements,
        which is skipped.  When get_translation returns Qt for a
        character, processing stops there unless LAST_BLOCK is nonzero.

        Otherwise the bytes at CODING->source are copied, going through
        ONE_MORE_BYTE or EMIT_ONE_BYTE when the multibyteness of source
        and destination differ; TRANSLATION_TABLE is not used and the
        return value is 0.

        The destination is enlarged with alloc_destination as needed.
        Finally CODING->produced and CODING->produced_char are advanced,
        and insert_from_gap is called if CODING->dst_object is a buffer
        and at least one character was produced.  */

6633 static int
6634 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6635                int last_block)
6636 {
6637   unsigned char *dst = coding->destination + coding->produced;
6638   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6639   EMACS_INT produced;
6640   EMACS_INT produced_chars = 0;
6641   int carryover = 0;
6642
6643   if (! coding->chars_at_source)
6644     {
6645       /* Source characters are in coding->charbuf.  */
6646       int *buf = coding->charbuf;
6647       int *buf_end = buf + coding->charbuf_used;
6648
           /* Source and destination are the same object: limit the
              destination to the part of the source already consumed.  */
6649       if (EQ (coding->src_object, coding->dst_object))
6650         {
6651           coding_set_source (coding);
6652           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6653         }
6654
6655       while (buf < buf_end)
6656         {
6657           int c = *buf, i;
6658
6659           if (c >= 0)
6660             {
                   /* Number of charbuf elements consumed and of characters
                      stored for this translation.  */
6661               EMACS_INT from_nchars = 1, to_nchars = 1;
6662               Lisp_Object trans = Qnil;
6663
6664               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6665               if (! NILP (trans))
6666                 {
6667                   trans = get_translation (trans, buf, buf_end);
6668                   if (INTEGERP (trans))
6669                     c = XINT (trans);
6670                   else if (CONSP (trans))
6671                     {
6672                       from_nchars = ASIZE (XCAR (trans));
6673                       trans = XCDR (trans);
6674                       if (INTEGERP (trans))
6675                         c = XINT (trans);
6676                       else
6677                         {
6678                           to_nchars = ASIZE (trans);
6679                           c = XINT (AREF (trans, 0));
6680                         }
6681                     }
6682                   else if (EQ (trans, Qt) && ! last_block)
6683                     break;
6684                 }
6685
6686               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6687                 {
                       /* Guard the size computed for alloc_destination
                          below against overflow.  */
6688                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6689                        / MAX_MULTIBYTE_LENGTH)
6690                       < to_nchars)
6691                     memory_full (SIZE_MAX);
6692                   dst = alloc_destination (coding,
6693                                            buf_end - buf
6694                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6695                                            dst);
6696                   if (EQ (coding->src_object, coding->dst_object))
6697                     {
6698                       coding_set_source (coding);
6699                       dst_end = (((unsigned char *) coding->source)
6700                                  + coding->consumed);
6701                     }
6702                   else
6703                     dst_end = coding->destination + coding->dst_bytes;
6704                 }
6705
                   /* Store C, then the remaining characters of a
                      multi-character translation.  */
6706               for (i = 0; i < to_nchars; i++)
6707                 {
6708                   if (i > 0)
6709                     c = XINT (AREF (trans, i));
6710                   if (coding->dst_multibyte
6711                       || ! CHAR_BYTE8_P (c))
6712                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6713                   else
6714                     *dst++ = CHAR_TO_BYTE8 (c);
6715                 }
6716               produced_chars += to_nchars;
6717               buf += from_nchars;
6718             }
6719           else
6720             /* This is an annotation datum.  (-C) is the length.  */
6721             buf += -c;
6722         }
6723       carryover = buf_end - buf;
6724     }
6725   else
6726     {
6727       /* Source characters are at coding->source.  */
6728       const unsigned char *src = coding->source;
6729       const unsigned char *src_end = src + coding->consumed;
6730
6731       if (EQ (coding->dst_object, coding->src_object))
6732         dst_end = (unsigned char *) src;
6733       if (coding->src_multibyte != coding->dst_multibyte)
6734         {
6735           if (coding->src_multibyte)
6736             {
6737               int multibytep = 1;
6738               EMACS_INT consumed_chars = 0;
6739
                   /* NOTE(review): ONE_MORE_BYTE is defined outside this
                      function; it presumably uses SRC, SRC_END, SRC_BASE,
                      MULTIBYTEP and CONSUMED_CHARS and jumps to the label
                      no_more_source at the end of the source -- confirm
                      against its definition.  */
6740               while (1)
6741                 {
6742                   const unsigned char *src_base = src;
6743                   int c;
6744
6745                   ONE_MORE_BYTE (c);
6746                   if (dst == dst_end)
6747                     {
6748                       if (EQ (coding->src_object, coding->dst_object))
6749                         dst_end = (unsigned char *) src;
6750                       if (dst == dst_end)
6751                         {
6752                           EMACS_INT offset = src - coding->source;
6753
6754                           dst = alloc_destination (coding, src_end - src + 1,
6755                                                    dst);
6756                           dst_end = coding->destination + coding->dst_bytes;
                               /* Recompute the source pointers after the
                                  reallocation.  NOTE(review): SRC_END was
                                  initialized from CODING->consumed but is
                                  recomputed from CODING->src_bytes here and
                                  in the two similar places below;
                                  presumably both are equal in this mode --
                                  confirm.  */
6757                           coding_set_source (coding);
6758                           src = coding->source + offset;
6759                           src_end = coding->source + coding->src_bytes;
6760                           if (EQ (coding->src_object, coding->dst_object))
6761                             dst_end = (unsigned char *) src;
6762                         }
6763                     }
6764                   *dst++ = c;
6765                   produced_chars++;
6766                 }
6767             no_more_source:
6768               ;
6769             }
6770           else
6771             while (src < src_end)
6772               {
6773                 int multibytep = 1;
6774                 int c = *src++;
6775
                     /* Make sure that at least two bytes are available
                        at DST before emitting C.  */
6776                 if (dst >= dst_end - 1)
6777                   {
6778                     if (EQ (coding->src_object, coding->dst_object))
6779                       dst_end = (unsigned char *) src;
6780                     if (dst >= dst_end - 1)
6781                       {
6782                         EMACS_INT offset = src - coding->source;
6783                         EMACS_INT more_bytes;
6784
6785                         if (EQ (coding->src_object, coding->dst_object))
6786                           more_bytes = ((src_end - src) / 2) + 2;
6787                         else
6788                           more_bytes = src_end - src + 2;
6789                         dst = alloc_destination (coding, more_bytes, dst);
6790                         dst_end = coding->destination + coding->dst_bytes;
6791                         coding_set_source (coding);
6792                         src = coding->source + offset;
6793                         src_end = coding->source + coding->src_bytes;
6794                         if (EQ (coding->src_object, coding->dst_object))
6795                           dst_end = (unsigned char *) src;
6796                       }
6797                   }
6798                 EMIT_ONE_BYTE (c);
6799               }
6800         }
6801       else
6802         {
               /* Same multibyteness on both sides: a plain byte copy,
                  preceded by a single enlargement of the destination if
                  it is smaller than the source.  */
6803           if (!EQ (coding->src_object, coding->dst_object))
6804             {
6805               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6806
6807               if (require > 0)
6808                 {
6809                   EMACS_INT offset = src - coding->source;
6810
6811                   dst = alloc_destination (coding, require, dst);
6812                   coding_set_source (coding);
6813                   src = coding->source + offset;
6814                   src_end = coding->source + coding->src_bytes;
6815                 }
6816             }
6817           produced_chars = coding->consumed_char;
6818           while (src < src_end)
6819             *dst++ = *src++;
6820         }
6821     }
6822
       /* Number of bytes stored by this call.  */
6823   produced = dst - (coding->destination + coding->produced);
6824   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6825     insert_from_gap (produced_chars, produced);
6826   coding->produced += produced;
6827   coding->produced_char += produced_chars;
6828   return carryover;
6829 }
6830
6831 /* Compose the text in CODING->dst_object that starts at POS according
6832    to the annotation data at CHARBUF.  CHARBUF is an array:
6833      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6834    and the composed text is NCHARS characters long.  COMPONENTS is
6835    ignored for COMPOSITION_RELATIVE.  Otherwise it is
6836      [ CHAR ... CHAR ] or [ CHAR -2 RULE ... CHAR ]
6837    possibly ended early by -1, and is handed to compose_text as a
6838    string when it holds only characters, else as a vector in which
6839    each rule is reduced modulo 0x100.  */
6840
6841 static inline void
6842 produce_composition (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6843 {
6844   int ncomponents = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6845   EMACS_INT end = pos + charbuf[2];
6846   enum composition_method method = (enum composition_method) (charbuf[4]);
6847   Lisp_Object components = Qnil;
6848
6849   if (method != COMPOSITION_RELATIVE)
6850     {
6851       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6852       int *data = charbuf + MAX_ANNOTATION_LENGTH;
6853       int src = 0, nargs = 0;
6854
6855       /* NCHARS characters with a (-2 RULE) pair between neighbors.  */
6856       if (method == COMPOSITION_WITH_RULE)
6857         ncomponents = charbuf[2] * 3 - 2;
6858       while (src < ncomponents && data[src] != -1)
6859         {
6860           int elt = data[src];
6861
6862           /* A negative element announces a rule in the next one.  */
6863           if (elt < 0)
6864             elt = data[++src] % 0x100;
6865           args[nargs] = make_number (elt);
6866           src++, nargs++;
6867         }
6868       components = (src == nargs
6869                     ? Fstring (nargs, args) : Fvector (nargs, args));
6870     }
6871   compose_text (pos, end, components, Qnil, coding->dst_object);
6872 }
6873
6874
6875 /* Put the `charset' text property on the NCHARS characters that end
6876    at POS in CODING->dst_object, as described by the annotation data
6877    at CHARBUF.  CHARBUF is an array:
6878      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]  */
6879
6880 static inline void
6881 produce_charset (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6882 {
6883   struct charset *cs = CHARSET_FROM_ID (charbuf[3]);
6884   Lisp_Object start = make_number (pos - charbuf[2]);
6885   Lisp_Object end = make_number (pos);
6886
6887   Fput_text_property (start, end, Qcharset, CHARSET_NAME (cs),
6888                       coding->dst_object);
6889 }
6890
6891 /* Number of `int' elements first requested for CODING->charbuf.  */
6892 #define CHARBUF_SIZE 0x4000
6893
     /* Allocate the work area CODING->charbuf with alloca and record its
        size, in elements, in CODING->charbuf_size.  CHARBUF_SIZE elements
        are requested first; as long as alloca returns NULL and the size
        is above 1024, the request is retried with half the size.

        If no memory could be obtained, record
        CODING_RESULT_INSUFFICIENT_MEM and return CODING->result from the
        function that uses this macro.  That function must therefore
        return a value, and must not rely on any code placed after the
        macro being run.

        As the memory comes from alloca, it is released when the function
        that uses this macro returns.  */
6894 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
6895   do {                                                                  \
6896     int size = CHARBUF_SIZE;                                            \
6897                                                                         \
6898     coding->charbuf = NULL;                                             \
6899     while (size > 1024)                                                 \
6900       {                                                                 \
6901         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
6902         if (coding->charbuf)                                            \
6903           break;                                                        \
6904         size >>= 1;                                                     \
6905       }                                                                 \
6906     if (! coding->charbuf)                                              \
6907       {                                                                 \
6908         record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6909         return coding->result;                                          \
6910       }                                                                 \
6911     coding->charbuf_size = size;                                        \
6912   } while (0)
6913
6914 /* Apply the annotation data stored in CODING->charbuf to the text in
6915    CODING->dst_object.  POS is the position, in that object, of the
6916    first character stored in the charbuf.  Do nothing if
6917    CODING->dst_object is nil.  Data of length 2 or less are skipped.  */
6918
6919 static void
6920 produce_annotation (struct coding_system *coding, EMACS_INT pos)
6921 {
6922   int *p, *end;
6923
6924   if (NILP (coding->dst_object))
6925     return;
6926
6927   p = coding->charbuf;
6928   end = p + coding->charbuf_used;
6929   while (p < end)
6930     {
6931       int len;
6932
6933       if (*p >= 0)
6934         {
6935           pos++, p++;
6936           continue;
6937         }
6938       len = -*p;
6939       if (len > 2 && p[1] == CODING_ANNOTATE_COMPOSITION_MASK)
6940         produce_composition (coding, p, pos);
6941       else if (len > 2 && p[1] == CODING_ANNOTATE_CHARSET_MASK)
6942         produce_charset (coding, p, pos);
6943       p += len;
6944     }
6945 }
6946
6947 /* Decode the data at CODING->src_object into CODING->dst_object.
6948    CODING->src_object is a buffer, a string, or nil.
6949    CODING->dst_object is a buffer.
6950
6951    If CODING->src_object is a buffer, it must be the current buffer.
6952    In this case, if CODING->src_pos is positive, it is a position of
6953    the source text in the buffer, otherwise, the source text is in the
6954    gap area of the buffer, and CODING->src_pos specifies the offset of
6955    the text from GPT (which must be the same as PT).  If this is the
6956    same buffer as CODING->dst_object, CODING->src_pos must be
6957    negative.
6958
6959    If CODING->src_object is a string, CODING->src_pos is an index to
6960    that string.
6961
6962    If CODING->src_object is nil, CODING->source must already point to
6963    the non-relocatable memory area.  In this case, CODING->src_pos is
6964    an offset from CODING->source.
6965
6966    The decoded data is inserted at the current point of the buffer
6967    CODING->dst_object.

        Source bytes that the decoder leaves unconsumed are, if
        CODING_MODE_LAST_BLOCK is set in CODING->mode, output as they are
        (those with the high bit set as BYTE8_TO_CHAR characters), and
        otherwise saved in CODING->carryover.  End-of-line conversion by
        decode_eol follows unless the eol type is `unix' or
        inhibit_eol_conversion is nonzero.  While decoding into a buffer,
        its undo list is set to t; afterwards it is restored and the
        insertion is recorded with a single record_insert.

        Return CODING->result.
6968 */
6969
6970 static int
6971 decode_coding (struct coding_system *coding)
6972 {
6973   Lisp_Object attrs;
6974   Lisp_Object undo_list;
6975   Lisp_Object translation_table;
6976   struct ccl_spec cclspec;
6977   int carryover;
6978   int i;
6979
       /* If the gap lies inside the source text, move it to the start of
          that text.  */
6980   if (BUFFERP (coding->src_object)
6981       && coding->src_pos > 0
6982       && coding->src_pos < GPT
6983       && coding->src_pos + coding->src_chars > GPT)
6984     move_gap_both (coding->src_pos, coding->src_pos_byte);
6985
6986   undo_list = Qt;
6987   if (BUFFERP (coding->dst_object))
6988     {
6989       if (current_buffer != XBUFFER (coding->dst_object))
6990         set_buffer_internal (XBUFFER (coding->dst_object));
6991       if (GPT != PT)
6992         move_gap_both (PT, PT_BYTE);
           /* Save the undo list and turn undo recording off (t) for the
              duration of the conversion.  */
6993       undo_list = BVAR (current_buffer, undo_list);
6994       BVAR (current_buffer, undo_list) = Qt;
6995     }
6996
6997   coding->consumed = coding->consumed_char = 0;
6998   coding->produced = coding->produced_char = 0;
6999   coding->chars_at_source = 0;
7000   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7001   coding->errors = 0;
7002
       /* Allocate CODING->charbuf on the stack.  NOTE(review): the macro
          returns from this function if the allocation fails, which would
          skip the restoration of the undo list at the end -- confirm
          that this cannot happen in practice.  */
7003   ALLOC_CONVERSION_WORK_AREA (coding);
7004
7005   attrs = CODING_ID_ATTRS (coding->id);
7006   translation_table = get_translation_table (attrs, 0, NULL);
7007
7008   carryover = 0;
7009   if (coding->decoder == decode_coding_ccl)
7010     {
7011       coding->spec.ccl = &cclspec;
7012       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7013     }
       /* Decode block by block: the decoder fills CODING->charbuf and
          produce_chars writes it out.  The elements that produce_chars
          could not process yet are moved to the front of the charbuf
          for the next round.  */
7014   do
7015     {
7016       EMACS_INT pos = coding->dst_pos + coding->produced_char;
7017
7018       coding_set_source (coding);
7019       coding->annotated = 0;
7020       coding->charbuf_used = carryover;
7021       (*(coding->decoder)) (coding);
7022       coding_set_destination (coding);
7023       carryover = produce_chars (coding, translation_table, 0);
7024       if (coding->annotated)
7025         produce_annotation (coding, pos);
7026       for (i = 0; i < carryover; i++)
7027         coding->charbuf[i]
7028           = coding->charbuf[coding->charbuf_used - carryover + i];
7029     }
7030   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7031          || (coding->consumed < coding->src_bytes
7032              && (coding->result == CODING_RESULT_SUCCESS
7033                  || coding->result == CODING_RESULT_INVALID_SRC)));
7034
       /* Output the characters still carried over, this time as the last
          block so that produce_chars does not hold them back again.  */
7035   if (carryover > 0)
7036     {
7037       coding_set_destination (coding);
7038       coding->charbuf_used = carryover;
7039       produce_chars (coding, translation_table, 1);
7040     }
7041
7042   coding->carryover_bytes = 0;
7043   if (coding->consumed < coding->src_bytes)
7044     {
7045       int nbytes = coding->src_bytes - coding->consumed;
7046       const unsigned char *src;
7047
7048       coding_set_source (coding);
7049       coding_set_destination (coding);
7050       src = coding->source + coding->consumed;
7051
7052       if (coding->mode & CODING_MODE_LAST_BLOCK)
7053         {
7054           /* Flush out unprocessed data as binary chars.  We are sure
7055              that the number of data is less than the size of
7056              coding->charbuf.  */
7057           coding->charbuf_used = 0;
7058           coding->chars_at_source = 0;
7059
7060           while (nbytes-- > 0)
7061             {
7062               int c = *src++;
7063
7064               if (c & 0x80)
7065                 c = BYTE8_TO_CHAR (c);
7066               coding->charbuf[coding->charbuf_used++] = c;
7067             }
7068           produce_chars (coding, Qnil, 1);
7069         }
7070       else
7071         {
7072           /* Record unprocessed bytes in coding->carryover.  We are
7073              sure that the number of data is less than the size of
7074              coding->carryover.  */
7075           unsigned char *p = coding->carryover;
7076
               /* Even so, never copy more than the array can hold.  */
7077           if (nbytes > sizeof coding->carryover)
7078             nbytes = sizeof coding->carryover;
7079           coding->carryover_bytes = nbytes;
7080           while (nbytes-- > 0)
7081             *p++ = *src++;
7082         }
7083       coding->consumed = coding->src_bytes;
7084     }
7085
7086   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7087       && !inhibit_eol_conversion)
7088     decode_eol (coding);
7089   if (BUFFERP (coding->dst_object))
7090     {
7091       BVAR (current_buffer, undo_list) = undo_list;
7092       record_insert (coding->dst_pos, coding->produced_char);
7093     }
7094   return coding->result;
7095 }
7096
7097
7098 /* Extract an annotation datum from a composition starting at POS and
7099    ending before LIMIT of CODING->src_object (buffer or string), store
7100    the data in BUF, set *STOP to a starting position of the next
7101    composition (if any) or to LIMIT, and return the address of the
7102    next element of BUF.
7103
7104    If such an annotation is not found, set *STOP to a starting
7105    position of a composition after POS (if any) or to LIMIT, and
7106    return BUF.  */
7107
7108 static inline int *
7109 handle_composition_annotation (EMACS_INT pos, EMACS_INT limit,
7110                                struct coding_system *coding, int *buf,
7111                                EMACS_INT *stop)
7112 {
7113   EMACS_INT start, end;
7114   Lisp_Object prop;
7115
       /* Nothing to annotate before LIMIT when find_composition fails or
          when the composition it reports ends beyond LIMIT.  */
7116   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7117       || end > limit)
7118     *stop = limit;
       /* The reported composition begins after POS: stop again there.  */
7119   else if (start > pos)
7120     *stop = start;
7121   else
7122     {
7123       if (start == pos)
7124         {
7125           /* We found a composition.  Store the corresponding
7126              annotation data in BUF.  */
7127           int *head = buf;
7128           enum composition_method method = COMPOSITION_METHOD (prop);
7129           int nchars = COMPOSITION_LENGTH (prop);
7130
7131           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7132           if (method != COMPOSITION_RELATIVE)
7133             {
7134               Lisp_Object components;
7135               int len, i, i_byte;
7136
                   /* The components may be a vector, a string, a single
                      integer or a list; each element is appended to BUF
                      as an int.  Any other type aborts.  */
7137               components = COMPOSITION_COMPONENTS (prop);
7138               if (VECTORP (components))
7139                 {
7140                   len = ASIZE (components);
7141                   for (i = 0; i < len; i++)
7142                     *buf++ = XINT (AREF (components, i));
7143                 }
7144               else if (STRINGP (components))
7145                 {
7146                   len = SCHARS (components);
7147                   i = i_byte = 0;
7148                   while (i < len)
7149                     {
7150                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7151                       buf++;
7152                     }
7153                 }
7154               else if (INTEGERP (components))
7155                 {
7156                   len = 1;
7157                   *buf++ = XINT (components);
7158                 }
7159               else if (CONSP (components))
7160                 {
7161                   for (len = 0; CONSP (components);
7162                        len++, components = XCDR (components))
7163                     *buf++ = XINT (XCAR (components));
7164                 }
7165               else
7166                 abort ();
                   /* HEAD is where ADD_COMPOSITION_DATA started writing;
                      account for the LEN component codes stored after it.
                      NOTE(review): presumably that slot holds a negated
                      annotation length -- confirm against the macro.  */
7167               *head -= len;
7168             }
7169         }
7170
           /* Decide where the caller has to stop next: the start of a
              composition found from END on, provided that it ends
              within LIMIT; otherwise LIMIT itself.  */
7171       if (find_composition (end, limit, &start, &end, &prop,
7172                             coding->src_object)
7173           && end <= limit)
7174         *stop = start;
7175       else
7176         *stop = limit;
7177     }
7178   return buf;
7179 }
7180
7181
7182 /* Annotate the `charset' text property in effect at POS of
7183    CODING->src_object (a buffer or a string).  The annotation is
7184    written to BUF by ADD_CHARSET_DATA and carries the charset ID when
7185    the property value is a charset, and -1 otherwise.  Return BUF as
7186    left by ADD_CHARSET_DATA.
7187
7188    Set *STOP to the position, no later than LIMIT, at which the value
7189    of the `charset' property next changes.  */
7190
7191 static inline int *
7192 handle_charset_annotation (EMACS_INT pos, EMACS_INT limit,
7193                            struct coding_system *coding, int *buf,
7194                            EMACS_INT *stop)
7195 {
7196   Lisp_Object here = make_number (pos);
7197   Lisp_Object charset, change;
7198   int charset_id = -1;
7199
7200   charset = Fget_text_property (here, Qcharset, coding->src_object);
7201   if (! NILP (charset) && CHARSETP (charset))
7202     charset_id = XINT (CHARSET_SYMBOL_ID (charset));
7203
7204   ADD_CHARSET_DATA (buf, 0, charset_id);
7205
7206   change = Fnext_single_property_change (here, Qcharset, coding->src_object,
7207                                          make_number (limit));
7208   *stop = XINT (change);
7209   return buf;
7210 }
7211
7212
/* Fill CODING->charbuf with characters read from the source text of
   CODING, resuming at byte offset CODING->consumed (character offset
   CODING->consumed_char).  While copying, run
   handle_charset_annotation at the positions where it is due, convert
   newlines according to the EOL type of CODING (unless
   inhibit_eol_conversion is set) and, when TRANSLATION_TABLE is
   non-nil, translate characters through it using a lookahead of at
   most MAX_LOOKUP characters.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used tell how far the source was read and how many
   elements of CODING->charbuf were filled; CODING->chars_at_source is
   reset to 0.  */
7213 static void
7214 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7215                int max_lookup)
7216 {
7217   int *buf = coding->charbuf;
7218   int *buf_end = coding->charbuf + coding->charbuf_size;
7219   const unsigned char *src = coding->source + coding->consumed;
7220   const unsigned char *src_end = coding->source + coding->src_bytes;
7221   EMACS_INT pos = coding->src_pos + coding->consumed_char;
7222   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7223   int multibytep = coding->src_multibyte;
7224   Lisp_Object eol_type;
7225   int c;
7226   EMACS_INT stop, stop_composition, stop_charset;
7227   int *lookup_buf = NULL;
7228
       /* The lookahead buffer is needed only when translating.  */
7229   if (! NILP (translation_table))
7230     lookup_buf = alloca (sizeof (int) * max_lookup);
7231
       /* An EOL type that is still a vector is handled like Qunix, i.e.
          newlines are left alone.  */
7232   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7233   if (VECTORP (eol_type))
7234     eol_type = Qunix;
7235
7236   /* Note: composition handling is not yet implemented.  */
7237   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7238
       /* STOP is the next position at which an annotation handler is
          due.  Without a source object none is run before END_POS.
          As the composition flag was cleared just above, the
          composition stop always ends up at END_POS here.  */
7239   if (NILP (coding->src_object))
7240     stop = stop_composition = stop_charset = end_pos;
7241   else
7242     {
7243       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7244         stop = stop_composition = pos;
7245       else
7246         stop = stop_composition = end_pos;
7247       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7248         stop = stop_charset = pos;
7249       else
7250         stop_charset = end_pos;
7251     }
7252
7253   /* Compensate for CRLF and conversion.  */
7254   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7255   while (buf < buf_end)
7256     {
7257       Lisp_Object trans;
7258
7259       if (pos == stop)
7260         {
7261           if (pos == end_pos)
7262             break;
7263           if (pos == stop_composition)
7264             buf = handle_composition_annotation (pos, end_pos, coding,
7265                                                  buf, &stop_composition);
7266           if (pos == stop_charset)
7267             buf = handle_charset_annotation (pos, end_pos, coding,
7268                                              buf, &stop_charset);
               /* Resume at whichever handler is due first.  */
7269           stop = (stop_composition < stop_charset
7270                   ? stop_composition : stop_charset);
7271         }
7272
           /* Fetch the next character C.  For a unibyte source POS
              advances by bytes: the raw-text and CCL encoders take each
              byte as is; otherwise, when MULTIBYTE_LENGTH reports a
              positive length the bytes are read as one character, and
              any other byte is mapped with BYTE8_TO_CHAR.  */
7273       if (! multibytep)
7274         {
7275           EMACS_INT bytes;
7276
7277           if (coding->encoder == encode_coding_raw_text
7278               || coding->encoder == encode_coding_ccl)
7279             c = *src++, pos++;
7280           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7281             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7282           else
7283             c = BYTE8_TO_CHAR (*src), src++, pos++;
7284         }
7285       else
7286         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7287       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7288         c = '\n';
           /* EOL conversion of a newline: for Qdos a CR is stored in
              front of it, for any other non-Unix type it is replaced by
              CR.  */
7289       if (! EQ (eol_type, Qunix))
7290         {
7291           if (c == '\n')
7292             {
7293               if (EQ (eol_type, Qdos))
7294                 *buf++ = '\r';
7295               else
7296                 c = '\r';
7297             }
7298         }
7299
7300       trans = Qnil;
7301       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7302       if (NILP (trans))
7303         *buf++ = c;
7304       else
7305         {
7306           int from_nchars = 1, to_nchars = 1;
7307           int *lookup_buf_end;
7308           const unsigned char *p = src;
7309           int i;
7310
               /* Collect C and up to MAX_LOOKUP - 1 following source
                  characters (without advancing SRC) for
                  get_translation.  */
7311           lookup_buf[0] = c;
7312           for (i = 1; i < max_lookup && p < src_end; i++)
7313             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7314           lookup_buf_end = lookup_buf + i;
7315           trans = get_translation (trans, lookup_buf, lookup_buf_end);
               /* The result is either a single character, or a cons whose
                  car is a vector of FROM_NCHARS source characters and
                  whose cdr is the replacement (a character or a vector
                  of TO_NCHARS characters).  Anything else ends this
                  pass.  */
7316           if (INTEGERP (trans))
7317             c = XINT (trans);
7318           else if (CONSP (trans))
7319             {
7320               from_nchars = ASIZE (XCAR (trans));
7321               trans = XCDR (trans);
7322               if (INTEGERP (trans))
7323                 c = XINT (trans);
7324               else
7325                 {
7326                   to_nchars = ASIZE (trans);
                       /* Give up for now if the replacement would not
                          fit into the remaining room.  */
7327                   if (buf + to_nchars > buf_end)
7328                     break;
7329                   c = XINT (AREF (trans, 0));
7330                 }
7331             }
7332           else
7333             break;
7334           *buf++ = c;
7335           for (i = 1; i < to_nchars; i++)
7336             *buf++ = XINT (AREF (trans, i));
               /* Skip the other FROM_NCHARS - 1 source characters that
                  the translation covered.  */
7337           for (i = 1; i < from_nchars; i++, pos++)
7338             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7339         }
7340     }
7341
7342   coding->consumed = src - coding->source;
7343   coding->consumed_char = pos - coding->src_pos;
7344   coding->charbuf_used = buf - coding->charbuf;
7345   coding->chars_at_source = 0;
7346 }
7347
7348
7349 /* Encode the text at CODING->src_object into CODING->dst_object.
7350    CODING->src_object is a buffer or a string.
7351    CODING->dst_object is a buffer or nil.
7352
7353    If CODING->src_object is a buffer, it must be the current buffer.
7354    In this case, if CODING->src_pos is positive, it is a position of
7355    the source text in the buffer, otherwise, the source text is in the
7356    gap area of the buffer, and coding->src_pos specifies the offset of
7357    the text from GPT (which must be the same as PT).  If this is the
7358    same buffer as CODING->dst_object, CODING->src_pos must be
7359    negative and CODING should not have `pre-write-conversion'.
7360
7361    If CODING->src_object is a string, CODING should not have
7362    `pre-write-conversion'.
7363
7364    If CODING->dst_object is a buffer, the encoded data is inserted at
7365    the current point of that buffer.
7366
7367    If CODING->dst_object is nil, the encoded data is placed at the
7368    memory area specified by CODING->destination.  */
7369
7370 static int
7371 encode_coding (struct coding_system *coding)
7372 {
7373   Lisp_Object attrs;
7374   Lisp_Object translation_table;
7375   int max_lookup;
7376   struct ccl_spec cclspec;
7377
       /* The raw-text encoder works without a translation table.  */
7378   attrs = CODING_ID_ATTRS (coding->id);
7379   if (coding->encoder == encode_coding_raw_text)
7380     translation_table = Qnil, max_lookup = 0;
7381   else
7382     translation_table = get_translation_table (attrs, 1, &max_lookup);
7383
       /* A destination buffer becomes the current buffer and decides
          whether the destination is multibyte.  */
7384   if (BUFFERP (coding->dst_object))
7385     {
7386       set_buffer_internal (XBUFFER (coding->dst_object));
7387       coding->dst_multibyte
7388         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7389     }
7390
7391   coding->consumed = coding->consumed_char = 0;
7392   coding->produced = coding->produced_char = 0;
7393   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7394   coding->errors = 0;
7395
7396   ALLOC_CONVERSION_WORK_AREA (coding);
7397
       /* The CCL encoder keeps its state in CCLSPEC, which lives on
          this stack frame.  */
7398   if (coding->encoder == encode_coding_ccl)
7399     {
7400       coding->spec.ccl = &cclspec;
7401       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7402     }
       /* Alternate between filling CODING->charbuf from the source and
          running the encoder on it until every source character has
          been consumed.  The source and destination are set up anew on
          each round.  */
7403   do {
7404     coding_set_source (coding);
7405     consume_chars (coding, translation_table, max_lookup);
7406     coding_set_destination (coding);
7407     (*(coding->encoder)) (coding);
7408   } while (coding->consumed_char < coding->src_chars);
7409
       /* NOTE(review): presumably the encoder wrote the bytes into the
          gap of the destination buffer and this makes them part of the
          buffer text -- confirm against insert_from_gap.  */
7410   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7411     insert_from_gap (coding->produced_char, coding->produced);
7412
7413   return (coding->result);
7414 }
7415
7416
7417 /* Name (or base name) of work buffer for code conversion.  */
7418 static Lisp_Object Vcode_conversion_workbuf_name;
7419
7420 /* A working buffer used by the top level conversion.  Once it is
7421    created, it is never destroyed.  It has the name
7422    Vcode_conversion_workbuf_name.  The other working buffers are
7423    destroyed after the use is finished, and their names are modified
7424    versions of Vcode_conversion_workbuf_name.  */
7425 static Lisp_Object Vcode_conversion_reused_workbuf;
7426
7427 /* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
        It is incremented by every call to make_conversion_work_buffer
        and reset to 0 by code_conversion_restore when the reused buffer
        is released.  */
7428 static int reused_workbuf_in_use;
7429
7430
7431 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7432    multibyteness of the returned buffer.
        The shared buffer Vcode_conversion_reused_workbuf is handed out
        unless it is already in use; in that case a new buffer is
        created under a name generated from
        Vcode_conversion_workbuf_name.  The buffer is erased, gets
        modification hooks inhibited and its undo list set to t, and
        the buffer that was current on entry is current again on
        return.  */
7433
7434 static Lisp_Object
7435 make_conversion_work_buffer (int multibyte)
7436 {
7437   Lisp_Object name, workbuf;
7438   struct buffer *current;
7439
       /* The test sees the value before the increment, so only a caller
          that finds the counter at 0 gets the shared buffer.  */
7440   if (reused_workbuf_in_use++)
7441     {
7442       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7443       workbuf = Fget_buffer_create (name);
7444     }
7445   else
7446     {
           /* (Re)create the shared buffer if it is not live.  */
7447       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7448         Vcode_conversion_reused_workbuf
7449           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7450       workbuf = Vcode_conversion_reused_workbuf;
7451     }
       /* Set the work buffer up while it is temporarily current.  */
7452   current = current_buffer;
7453   set_buffer_internal (XBUFFER (workbuf));
7454   /* We can't allow modification hooks to run in the work buffer.  For
7455      instance, directory_files_internal assumes that file decoding
7456      doesn't compile new regexps.  */
7457   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7458   Ferase_buffer ();
7459   BVAR (current_buffer, undo_list) = Qt;
7460   BVAR (current_buffer, enable_multibyte_characters) = multibyte ? Qt : Qnil;
7461   set_buffer_internal (current);
7462   return workbuf;
7463 }
7464
7465
/* Unwind function recorded by code_conversion_save.  ARG is the cons
   (CURRENT . WORKBUF) built there.  Release WORKBUF if it is non-nil:
   the shared buffer Vcode_conversion_reused_workbuf is only marked as
   no longer in use, any other work buffer that is still live is
   killed.  Then make CURRENT the current buffer again.  Return
   Qnil.  */
7466 static Lisp_Object
7467 code_conversion_restore (Lisp_Object arg)
7468 {
7469   Lisp_Object current, workbuf;
7470   struct gcpro gcpro1;
7471
       /* ARG stays registered with GCPRO1 until UNGCPRO below, i.e.
          across the call to Fkill_buffer.  */
7472   GCPRO1 (arg);
7473   current = XCAR (arg);
7474   workbuf = XCDR (arg);
7475   if (! NILP (workbuf))
7476     {
7477       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7478         reused_workbuf_in_use = 0;
7479       else if (! NILP (Fbuffer_live_p (workbuf)))
7480         Fkill_buffer (workbuf);
7481     }
7482   set_buffer_internal (XBUFFER (current));
7483   UNGCPRO;
7484   return Qnil;
7485 }
7486
/* Arrange for code_conversion_restore to run when the specpdl is
   unwound, passing it the cons (current buffer . work buffer).  When
   WITH_WORK_BUF is nonzero the work buffer is first obtained from
   make_conversion_work_buffer (MULTIBYTE); otherwise it is Qnil.
   Return the work buffer (or Qnil).  */
7487 Lisp_Object
7488 code_conversion_save (int with_work_buf, int multibyte)
7489 {
7490   Lisp_Object work = (with_work_buf
7491                       ? make_conversion_work_buffer (multibyte)
7492                       : Qnil);
7493   Lisp_Object saved = Fcons (Fcurrent_buffer (), work);
7494
7495   record_unwind_protect (code_conversion_restore, saved);
7496   return work;
7497 }
7498
/* Decode text of CHARS characters and BYTES bytes by CODING, using the
   current buffer both as source and as destination.  The source
   position is given as the negative offsets -CHARS and -BYTES, and
   the destination position is point.
   NOTE(review): presumably this means that the text lies in the gap
   of the current buffer -- confirm against decode_coding.

   The text is decoded as the last block.  If CODING_ATTR_POST_READ of
   the coding system's attributes is non-nil, it is called with the
   number of produced characters, and CODING->produced_char and
   CODING->produced are adjusted by the change of Z and Z_BYTE it
   caused.  Return CODING->result.  */
7499 int
7500 decode_coding_gap (struct coding_system *coding,
7501                    EMACS_INT chars, EMACS_INT bytes)
7502 {
7503   int count = SPECPDL_INDEX ();
7504   Lisp_Object attrs;
7505
       /* No work buffer is needed; this only records the current buffer
          for code_conversion_restore, which runs at unbind_to below.  */
7506   code_conversion_save (0, 0);
7507
7508   coding->src_object = Fcurrent_buffer ();
7509   coding->src_chars = chars;
7510   coding->src_bytes = bytes;
7511   coding->src_pos = -chars;
7512   coding->src_pos_byte = -bytes;
       /* More bytes than characters means a multibyte source.  */
7513   coding->src_multibyte = chars < bytes;
7514   coding->dst_object = coding->src_object;
7515   coding->dst_pos = PT;
7516   coding->dst_pos_byte = PT_BYTE;
7517   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7518
7519   if (CODING_REQUIRE_DETECTION (coding))
7520     detect_coding (coding);
7521
7522   coding->mode |= CODING_MODE_LAST_BLOCK;
       /* NOTE(review): inhibit_shrinking is set only around the decoding;
          presumably it keeps the buffer text from being shrunk while
          source and destination share it -- confirm.  */
7523   current_buffer->text->inhibit_shrinking = 1;
7524   decode_coding (coding);
7525   current_buffer->text->inhibit_shrinking = 0;
7526
7527   attrs = CODING_ID_ATTRS (coding->id);
7528   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7529     {
7530       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7531       Lisp_Object val;
7532
           /* Call the function with point at the start of the decoded
              text, then account for the text it added or removed.  */
7533       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7534       val = call1 (CODING_ATTR_POST_READ (attrs),
7535                    make_number (coding->produced_char));
7536       CHECK_NATNUM (val);
7537       coding->produced_char += Z - prev_Z;
7538       coding->produced += Z_BYTE - prev_Z_BYTE;
7539     }
7540
7541   unbind_to (count, Qnil);
7542   return coding->result;
7543 }
7544
7545
7546 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7547    SRC_OBJECT into DST_OBJECT by coding context CODING.
7548
7549    SRC_OBJECT is a buffer, a string, or Qnil.
7550
7551    If it is a buffer, the text is at point of the buffer.  FROM and TO
7552    are positions in the buffer.
7553
7554    If it is a string, the text is at the beginning of the string.
7555    FROM and TO are indices to the string.
7556
7557    If it is nil, the text is at coding->source.  FROM and TO are
7558    indices to coding->source.
7559
7560    DST_OBJECT is a buffer, Qt, or Qnil.
7561
7562    If it is a buffer, the decoded text is inserted at point of the
7563    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7564    is deleted.
7565
7566    If it is Qt, a string is made from the decoded text, and
7567    set in CODING->dst_object.
7568
7569    If it is Qnil, the decoded text is stored at CODING->destination.
7570    The caller must allocate CODING->dst_bytes bytes at
7571    CODING->destination by xmalloc.  If the decoded text is longer than
7572    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7573  */
7574
7575 void
7576 decode_coding_object (struct coding_system *coding,
7577                       Lisp_Object src_object,
7578                       EMACS_INT from, EMACS_INT from_byte,
7579                       EMACS_INT to, EMACS_INT to_byte,
7580                       Lisp_Object dst_object)
7581 {
7582   int count = SPECPDL_INDEX ();
7583   unsigned char *destination IF_LINT (= NULL);
7584   EMACS_INT dst_bytes IF_LINT (= 0);
7585   EMACS_INT chars = to - from;
7586   EMACS_INT bytes = to_byte - from_byte;
7587   Lisp_Object attrs;
       /* The saved point is a buffer position and therefore EMACS_INT,
          like FROM and TO; an `int' would truncate PT in a buffer
          larger than INT_MAX.  -1 means that point was not saved.  */
7588   EMACS_INT saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7589   int need_marker_adjustment = 0;
7590   Lisp_Object old_deactivate_mark;
7591
       /* Vdeactivate_mark is restored before returning normally.  */
7592   old_deactivate_mark = Vdeactivate_mark;
7593
       /* Remember the memory area supplied by the caller; it is used
          again below if the decoding went through a work buffer.  */
7594   if (NILP (dst_object))
7595     {
7596       destination = coding->destination;
7597       dst_bytes = coding->dst_bytes;
7598     }
7599
7600   coding->src_object = src_object;
7601   coding->src_chars = chars;
7602   coding->src_bytes = bytes;
       /* More bytes than characters means a multibyte source.  */
7603   coding->src_multibyte = chars < bytes;
7604
7605   if (STRINGP (src_object))
7606     {
7607       coding->src_pos = from;
7608       coding->src_pos_byte = from_byte;
7609     }
7610   else if (BUFFERP (src_object))
7611     {
7612       set_buffer_internal (XBUFFER (src_object));
7613       if (from != GPT)
7614         move_gap_both (from, from_byte);
7615       if (EQ (src_object, dst_object))
7616         {
7617           struct Lisp_Marker *tail;
7618
               /* Decoding in place.  Flag the markers that sit exactly
                  at FROM (insertion-type ones) or at TO (the others);
                  they are repositioned by hand at the end.  */
7619           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7620             {
7621               tail->need_adjustment
7622                 = tail->charpos == (tail->insertion_type ? from : to);
7623               need_marker_adjustment |= tail->need_adjustment;
7624             }
               /* Save point, delete the source text and describe the
                  source by negative offsets.
                  NOTE(review): presumably the deleted text remains
                  readable in the gap, with inhibit_shrinking keeping
                  it there -- confirm against decode_coding.  */
7625           saved_pt = PT, saved_pt_byte = PT_BYTE;
7626           TEMP_SET_PT_BOTH (from, from_byte);
7627           current_buffer->text->inhibit_shrinking = 1;
7628           del_range_both (from, from_byte, to, to_byte, 1);
7629           coding->src_pos = -chars;
7630           coding->src_pos_byte = -bytes;
7631         }
7632       else
7633         {
7634           coding->src_pos = from;
7635           coding->src_pos_byte = from_byte;
7636         }
7637     }
7638
7639   if (CODING_REQUIRE_DETECTION (coding))
7640     detect_coding (coding);
7641   attrs = CODING_ID_ATTRS (coding->id);
7642
       /* Choose the decoding target.  A string result, and a memory
          destination combined with a post-read function, are decoded
          into a work buffer first.  */
7643   if (EQ (dst_object, Qt)
7644       || (! NILP (CODING_ATTR_POST_READ (attrs))
7645           && NILP (dst_object)))
7646     {
7647       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7648       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7649       coding->dst_pos = BEG;
7650       coding->dst_pos_byte = BEG_BYTE;
7651     }
7652   else if (BUFFERP (dst_object))
7653     {
7654       code_conversion_save (0, 0);
7655       coding->dst_object = dst_object;
7656       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7657       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7658       coding->dst_multibyte
7659         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7660     }
7661   else
7662     {
7663       code_conversion_save (0, 0);
7664       coding->dst_object = Qnil;
7665       /* Most callers presume this will return a multibyte result, and they
7666          won't use `binary' or `raw-text' anyway, so let's not worry about
7667          CODING_FOR_UNIBYTE.  */
7668       coding->dst_multibyte = 1;
7669     }
7670
7671   decode_coding (coding);
7672
7673   if (BUFFERP (coding->dst_object))
7674     set_buffer_internal (XBUFFER (coding->dst_object));
7675
7676   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7677     {
7678       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7679       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7680       Lisp_Object val;
7681
           /* Call the post-read function with point at the start of the
              decoded text, then account for the text it added or
              removed.  */
7682       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7683       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7684               old_deactivate_mark);
7685       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7686                         make_number (coding->produced_char));
7687       UNGCPRO;
7688       CHECK_NATNUM (val);
7689       coding->produced_char += Z - prev_Z;
7690       coding->produced += Z_BYTE - prev_Z_BYTE;
7691     }
7692
       /* Deliver the result in the form the caller asked for.  When a
          work buffer was used it is the current buffer here.  */
7693   if (EQ (dst_object, Qt))
7694     {
7695       coding->dst_object = Fbuffer_string ();
7696     }
7697   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7698     {
7699       set_buffer_internal (XBUFFER (coding->dst_object));
           /* NOTE(review): the text is copied out of the work buffer
              only when the caller's area is too small; when it is
              large enough nothing is copied here -- confirm that this
              is intended.  */
7700       if (dst_bytes < coding->produced)
7701         {
7702           destination = xrealloc (destination, coding->produced);
7703           if (! destination)
7704             {
7705               record_conversion_result (coding,
7706                                         CODING_RESULT_INSUFFICIENT_MEM);
7707               unbind_to (count, Qnil);
7708               return;
7709             }
               /* Move the gap out of the produced text before copying
                  it from BEGV_ADDR.  */
7710           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7711             move_gap_both (BEGV, BEGV_BYTE);
7712           memcpy (destination, BEGV_ADDR, coding->produced);
7713           coding->destination = destination;
7714         }
7715     }
7716
7717   if (saved_pt >= 0)
7718     {
7719       /* This is the case of:
7720          (BUFFERP (src_object) && EQ (src_object, dst_object))
7721          As we have moved PT while replacing the original buffer
7722          contents, we must recover it now.  */
7723       set_buffer_internal (XBUFFER (src_object));
7724       current_buffer->text->inhibit_shrinking = 0;
           /* Point before the replaced text stays where it was, point
              inside it goes to FROM, and point after it is shifted by
              the change in size (counted in bytes only for a unibyte
              buffer).  */
7725       if (saved_pt < from)
7726         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7727       else if (saved_pt < from + chars)
7728         TEMP_SET_PT_BOTH (from, from_byte);
7729       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7730         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7731                           saved_pt_byte + (coding->produced - bytes));
7732       else
7733         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7734                           saved_pt_byte + (coding->produced - bytes));
7735
7736       if (need_marker_adjustment)
7737         {
7738           struct Lisp_Marker *tail;
7739
               /* Put the flagged markers at the start (insertion-type)
                  or at the end (others) of the decoded text.  */
7740           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7741             if (tail->need_adjustment)
7742               {
7743                 tail->need_adjustment = 0;
7744                 if (tail->insertion_type)
7745                   {
7746                     tail->bytepos = from_byte;
7747                     tail->charpos = from;
7748                   }
7749                 else
7750                   {
7751                     tail->bytepos = from_byte + coding->produced;
7752                     tail->charpos
7753                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7754                          ? tail->bytepos : from + coding->produced_char);
7755                   }
7756               }
7757         }
7758     }
7759
7760   Vdeactivate_mark = old_deactivate_mark;
7761   unbind_to (count, coding->dst_object);
7762 }
7763
7764
/* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a string, a buffer, or something else; in the last
   case the text is taken from CODING->source when a pre-write
   function has to be run on it.

   If CODING_ATTR_PRE_WRITE of the coding system's attributes is
   non-nil, the source text is first copied into a work buffer and
   that function is called with BEG and Z as arguments; the whole of
   the buffer that is current afterwards is then encoded.  If the
   function left a different buffer current, that buffer is killed at
   the end.

   If DST_OBJECT is a buffer, the encoded text goes to point of that
   buffer, or, when it is the same buffer as SRC_OBJECT, to FROM after
   the source text has been deleted; point and the markers at the
   edges of the text are recovered afterwards.  If DST_OBJECT is Qt,
   a unibyte string made from the encoded text is set in
   CODING->dst_object.  Otherwise CODING->dst_object is set to Qnil
   and the destination is whatever CODING->destination describes.  */
7765 void
7766 encode_coding_object (struct coding_system *coding,
7767                       Lisp_Object src_object,
7768                       EMACS_INT from, EMACS_INT from_byte,
7769                       EMACS_INT to, EMACS_INT to_byte,
7770                       Lisp_Object dst_object)
7771 {
7772   int count = SPECPDL_INDEX ();
7773   EMACS_INT chars = to - from;
7774   EMACS_INT bytes = to_byte - from_byte;
7775   Lisp_Object attrs;
       /* The saved point is a buffer position and therefore EMACS_INT,
          like FROM and TO; an `int' would truncate PT in a buffer
          larger than INT_MAX.  -1 means that point was not saved.  */
7776   EMACS_INT saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7777   int need_marker_adjustment = 0;
7778   int kill_src_buffer = 0;
7779   Lisp_Object old_deactivate_mark;
7780
       /* Vdeactivate_mark is restored before returning.  */
7781   old_deactivate_mark = Vdeactivate_mark;
7782
7783   coding->src_object = src_object;
7784   coding->src_chars = chars;
7785   coding->src_bytes = bytes;
       /* More bytes than characters means a multibyte source.  */
7786   coding->src_multibyte = chars < bytes;
7787
7788   attrs = CODING_ID_ATTRS (coding->id);
7789
7790   if (EQ (src_object, dst_object))
7791     {
7792       struct Lisp_Marker *tail;
7793
           /* Encoding in place.  Flag the markers of the current buffer
              that sit exactly at FROM (insertion-type ones) or at TO
              (the others); they are repositioned by hand at the end.  */
7794       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7795         {
7796           tail->need_adjustment
7797             = tail->charpos == (tail->insertion_type ? from : to);
7798           need_marker_adjustment |= tail->need_adjustment;
7799         }
7800     }
7801
7802   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7803     {
           /* Copy the source text into a work buffer for the pre-write
              function to operate on.  */
7804       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7805       set_buffer_internal (XBUFFER (coding->src_object));
7806       if (STRINGP (src_object))
7807         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7808       else if (BUFFERP (src_object))
7809         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7810       else
7811         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7812
7813       if (EQ (src_object, dst_object))
7814         {
               /* The copy is safe in the work buffer; save point and
                  delete the original text, then return to the work
                  buffer.  */
7815           set_buffer_internal (XBUFFER (src_object));
7816           saved_pt = PT, saved_pt_byte = PT_BYTE;
7817           del_range_both (from, from_byte, to, to_byte, 1);
7818           set_buffer_internal (XBUFFER (coding->src_object));
7819         }
7820
7821       {
7822         Lisp_Object args[3];
7823         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7824
7825         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7826                 old_deactivate_mark);
7827         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7828         args[1] = make_number (BEG);
7829         args[2] = make_number (Z);
7830         safe_call (3, args);
7831         UNGCPRO;
7832       }
           /* If the function left another buffer current, encode from
              that buffer and kill it once the encoding is done.  In
              either case the whole current buffer is the source.  */
7833       if (XBUFFER (coding->src_object) != current_buffer)
7834         kill_src_buffer = 1;
7835       coding->src_object = Fcurrent_buffer ();
7836       if (BEG != GPT)
7837         move_gap_both (BEG, BEG_BYTE);
7838       coding->src_chars = Z - BEG;
7839       coding->src_bytes = Z_BYTE - BEG_BYTE;
7840       coding->src_pos = BEG;
7841       coding->src_pos_byte = BEG_BYTE;
7842       coding->src_multibyte = Z < Z_BYTE;
7843     }
7844   else if (STRINGP (src_object))
7845     {
7846       code_conversion_save (0, 0);
7847       coding->src_pos = from;
7848       coding->src_pos_byte = from_byte;
7849     }
7850   else if (BUFFERP (src_object))
7851     {
7852       code_conversion_save (0, 0);
7853       set_buffer_internal (XBUFFER (src_object));
7854       if (EQ (src_object, dst_object))
7855         {
               /* Encoding in place without a pre-write function: the
                  value returned by del_range_1 becomes the source,
                  read from offset 0.
                  NOTE(review): presumably that value is the deleted
                  text as a string -- confirm against del_range_1.  */
7856           saved_pt = PT, saved_pt_byte = PT_BYTE;
7857           coding->src_object = del_range_1 (from, to, 1, 1);
7858           coding->src_pos = 0;
7859           coding->src_pos_byte = 0;
7860         }
7861       else
7862         {
               /* Move the gap to FROM when it lies within the source
                  text (or right at TO).  */
7863           if (from < GPT && to >= GPT)
7864             move_gap_both (from, from_byte);
7865           coding->src_pos = from;
7866           coding->src_pos_byte = from_byte;
7867         }
7868     }
7869   else
7870     code_conversion_save (0, 0);
7871
7872   if (BUFFERP (dst_object))
7873     {
7874       coding->dst_object = dst_object;
7875       if (EQ (src_object, dst_object))
7876         {
7877           coding->dst_pos = from;
7878           coding->dst_pos_byte = from_byte;
7879         }
7880       else
7881         {
7882           struct buffer *current = current_buffer;
7883
               /* The destination is point of DST_OBJECT; move the gap of
                  that buffer there.  */
7884           set_buffer_temp (XBUFFER (dst_object));
7885           coding->dst_pos = PT;
7886           coding->dst_pos_byte = PT_BYTE;
7887           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7888           set_buffer_temp (current);
7889         }
7890       coding->dst_multibyte
7891         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7892     }
7893   else if (EQ (dst_object, Qt))
7894     {
           /* Encode into a freshly allocated memory area of at least one
              byte; it is turned into a string and freed below.  */
7895       ptrdiff_t dst_bytes = max (1, coding->src_chars);
7896       coding->dst_object = Qnil;
7897       coding->destination = (unsigned char *) xmalloc (dst_bytes);
7898       coding->dst_bytes = dst_bytes;
7899       coding->dst_multibyte = 0;
7900     }
7901   else
7902     {
7903       coding->dst_object = Qnil;
7904       coding->dst_multibyte = 0;
7905     }
7906
7907   encode_coding (coding);
7908
7909   if (EQ (dst_object, Qt))
7910     {
7911       if (BUFFERP (coding->dst_object))
7912         coding->dst_object = Fbuffer_string ();
7913       else
7914         {
7915           coding->dst_object
7916             = make_unibyte_string ((char *) coding->destination,
7917                                    coding->produced);
7918           xfree (coding->destination);
7919         }
7920     }
7921
7922   if (saved_pt >= 0)
7923     {
7924       /* This is the case of:
7925          (BUFFERP (src_object) && EQ (src_object, dst_object))
7926          As we have moved PT while replacing the original buffer
7927          contents, we must recover it now.  */
7928       set_buffer_internal (XBUFFER (src_object));
           /* Point before the replaced text stays where it was, point
              inside it goes to FROM, and point after it is shifted by
              the change in size (counted in bytes only for a unibyte
              buffer).  */
7929       if (saved_pt < from)
7930         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7931       else if (saved_pt < from + chars)
7932         TEMP_SET_PT_BOTH (from, from_byte);
7933       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7934         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7935                           saved_pt_byte + (coding->produced - bytes));
7936       else
7937         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7938                           saved_pt_byte + (coding->produced - bytes));
7939
7940       if (need_marker_adjustment)
7941         {
7942           struct Lisp_Marker *tail;
7943
               /* Put the flagged markers at the start (insertion-type)
                  or at the end (others) of the encoded text.  */
7944           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7945             if (tail->need_adjustment)
7946               {
7947                 tail->need_adjustment = 0;
7948                 if (tail->insertion_type)
7949                   {
7950                     tail->bytepos = from_byte;
7951                     tail->charpos = from;
7952                   }
7953                 else
7954                   {
7955                     tail->bytepos = from_byte + coding->produced;
7956                     tail->charpos
7957                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7958                          ? tail->bytepos : from + coding->produced_char);
7959                   }
7960               }
7961         }
7962     }
7963
7964   if (kill_src_buffer)
7965     Fkill_buffer (coding->src_object);
7966
7967   Vdeactivate_mark = old_deactivate_mark;
7968   unbind_to (count, Qnil);
7969 }
7970
7971
7972 Lisp_Object
7973 preferred_coding_system (void)
7974 {
7975   int id = coding_categories[coding_priorities[0]].id;
7976
7977   return CODING_ID_NAME (id);
7978 }
7979
7980 \f
7981 #ifdef emacs
7982 /*** 11. Emacs Lisp library functions ***/
7983
7984 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7985        doc: /* Return t if OBJECT is nil or a coding-system.
7986 See the documentation of `define-coding-system' for information
7987 about coding-system objects.  */)
7988   (Lisp_Object object)
7989 {
7990   if (NILP (object)
7991       || CODING_SYSTEM_ID (object) >= 0)
7992     return Qt;
7993   if (! SYMBOLP (object)
7994       || NILP (Fget (object, Qcoding_system_define_form)))
7995     return Qnil;
7996   return Qt;
7997 }
7998
7999 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8000        Sread_non_nil_coding_system, 1, 1, 0,
8001        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8002   (Lisp_Object prompt)
8003 {
8004   Lisp_Object val;
8005   do
8006     {
8007       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8008                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8009     }
8010   while (SCHARS (val) == 0);
8011   return (Fintern (val, Qnil));
8012 }
8013
8014 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8015        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8016 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8017 Ignores case when completing coding systems (all Emacs coding systems
8018 are lower-case).  */)
8019   (Lisp_Object prompt, Lisp_Object default_coding_system)
8020 {
8021   Lisp_Object val;
8022   int count = SPECPDL_INDEX ();
8023
8024   if (SYMBOLP (default_coding_system))
8025     default_coding_system = SYMBOL_NAME (default_coding_system);
8026   specbind (Qcompletion_ignore_case, Qt);
8027   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8028                           Qt, Qnil, Qcoding_system_history,
8029                           default_coding_system, Qnil);
8030   unbind_to (count, Qnil);
8031   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8032 }
8033
8034 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8035        1, 1, 0,
8036        doc: /* Check validity of CODING-SYSTEM.
8037 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8038 It is valid if it is nil or a symbol defined as a coding system by the
8039 function `define-coding-system'.  */)
8040   (Lisp_Object coding_system)
8041 {
8042   Lisp_Object define_form;
8043
8044   define_form = Fget (coding_system, Qcoding_system_define_form);
8045   if (! NILP (define_form))
8046     {
8047       Fput (coding_system, Qcoding_system_define_form, Qnil);
8048       safe_eval (define_form);
8049     }
8050   if (!NILP (Fcoding_system_p (coding_system)))
8051     return coding_system;
8052   xsignal1 (Qcoding_system_error, coding_system);
8053 }
8054
8055 \f
8056 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8057    HIGHEST is nonzero, return the coding system of the highest
8058    priority among the detected coding systems.  Otherwise return a
8059    list of detected coding systems sorted by their priorities.  If
8060    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8061    multibyte form but contain only ASCII and eight-bit chars.
8062    Otherwise, the bytes are raw bytes.
8063
8064    CODING-SYSTEM controls the detection as below:
8065
8066    If it is nil, detect both text-format and eol-format.  If the
8067    text-format part of CODING-SYSTEM is already specified
8068    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8069    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8070    detect only text-format.  */
8071
8072 Lisp_Object
8073 detect_coding_system (const unsigned char *src,
8074                       EMACS_INT src_chars, EMACS_INT src_bytes,
8075                       int highest, int multibytep,
8076                       Lisp_Object coding_system)
8077 {
8078   const unsigned char *src_end = src + src_bytes;
8079   Lisp_Object attrs, eol_type;
8080   Lisp_Object val = Qnil;
8081   struct coding_system coding;
8082   ptrdiff_t id;
8083   struct coding_detection_info detect_info;
8084   enum coding_category base_category;
8085   int null_byte_found = 0, eight_bit_found = 0;
8086
       /* A nil CODING_SYSTEM is treated as `undecided'.  */
8087   if (NILP (coding_system))
8088     coding_system = Qundecided;
8089   setup_coding_system (coding_system, &coding);
8090   attrs = CODING_ID_ATTRS (coding.id);
8091   eol_type = CODING_ID_EOL_TYPE (coding.id);
       /* NOTE(review): the value stored in CODING_SYSTEM here is not read
          again anywhere below in this function.  */
8092   coding_system = CODING_ATTR_BASE_NAME (attrs);
8093
       /* Describe the source text to the detector routines; SRC_CHARS is
          recorded as its character count next to SRC_BYTES.  */
8094   coding.source = src;
8095   coding.src_chars = src_chars;
8096   coding.src_bytes = src_bytes;
8097   coding.src_multibyte = multibytep;
8098   coding.consumed = 0;
8099   coding.mode |= CODING_MODE_LAST_BLOCK;
8100   coding.head_ascii = 0;
8101
8102   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8103
8104   /* At first, detect text-format if necessary.  */
8105   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8106   if (base_category == coding_category_undecided)
8107     {
8108       enum coding_category category IF_LINT (= 0);
8109       struct coding_system *this IF_LINT (= NULL);
8110       int c, i;
8111
8112       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
           /* The scan stops early once both an eight-bit byte and a null
              byte have been seen, or once the ISO-2022 detector reports
              that it scanned the whole data.  coding.head_ascii counts the
              7-bit bytes that precede the first eight-bit byte.  */
8113       for (; src < src_end; src++)
8114         {
8115           c = *src;
8116           if (c & 0x80)
8117             {
8118               eight_bit_found = 1;
8119               if (null_byte_found)
8120                 break;
8121             }
8122           else if (c < 0x20)
8123             {
                   /* The ISO-2022 detector is tried at most once: only
                      while detect_info.checked is still zero.  */
8124               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8125                   && ! inhibit_iso_escape_detection
8126                   && ! detect_info.checked)
8127                 {
8128                   if (detect_coding_iso_2022 (&coding, &detect_info))
8129                     {
8130                       /* We have scanned the whole data.  */
8131                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8132                         {
8133                           /* We didn't find an 8-bit code.  We may
8134                              have found a null-byte, but it's very
8135                              rare that a binary file conforms to
8136                              ISO-2022.  */
8137                           src = src_end;
8138                           coding.head_ascii = src - coding.source;
8139                         }
                           /* Every category outside the ISO-escape mask is
                              ruled out from here on.  */
8140                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8141                       break;
8142                     }
8143                 }
8144               else if (! c && !inhibit_null_byte_detection)
8145                 {
8146                   null_byte_found = 1;
8147                   if (eight_bit_found)
8148                     break;
8149                 }
8150               if (! eight_bit_found)
8151                 coding.head_ascii++;
8152             }
8153           else if (! eight_bit_found)
8154             coding.head_ascii++;
8155         }
8156
           /* Consult the per-category results only when the scan saw
              something other than a plain run of 7-bit bytes, or when a
              detector already reported a match.  */
8157       if (null_byte_found || eight_bit_found
8158           || coding.head_ascii < coding.src_bytes
8159           || detect_info.found)
8160         {
8161           if (coding.head_ascii == coding.src_bytes)
8162             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
                 /* This leaves CATEGORY and THIS at the first category, in
                    priority order, that is flagged in detect_info.found.  */
8163             for (i = 0; i < coding_category_raw_text; i++)
8164               {
8165                 category = coding_priorities[i];
8166                 this = coding_categories + category;
8167                 if (detect_info.found & (1 << category))
8168                   break;
8169               }
8170           else
8171             {
8172               if (null_byte_found)
8173                 {
                       /* With a null byte present, every category except
                          the UTF-16 ones is marked as already checked and
                          rejected, so only UTF-16 detectors still run.  */
8174                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8175                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8176                 }
                   /* Walk the categories in priority order and call the
                      detector of each one that has not been checked yet.
                      When HIGHEST is nonzero the walk stops at the first
                      category that is found.  */
8177               for (i = 0; i < coding_category_raw_text; i++)
8178                 {
8179                   category = coding_priorities[i];
8180                   this = coding_categories + category;
8181
8182                   if (this->id < 0)
8183                     {
8184                       /* No coding system of this category is defined.  */
8185                       detect_info.rejected |= (1 << category);
8186                     }
8187                   else if (category >= coding_category_raw_text)
8188                     continue;
8189                   else if (detect_info.checked & (1 << category))
8190                     {
8191                       if (highest
8192                           && (detect_info.found & (1 << category)))
8193                         break;
8194                     }
8195                   else if ((*(this->detector)) (&coding, &detect_info)
8196                            && highest
8197                            && (detect_info.found & (1 << category)))
8198                     {
                           /* For the auto UTF-16 category, narrow CATEGORY
                              to the little- or big-endian one according to
                              what the detector flagged.  */
8199                       if (category == coding_category_utf_16_auto)
8200                         {
8201                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8202                             category = coding_category_utf_16_le;
8203                           else
8204                             category = coding_category_utf_16_be;
8205                         }
8206                       break;
8207                     }
8208                 }
8209             }
8210         }
8211
           /* Turn the detection result into VAL, a list of coding-system
              ids (converted to names further below).  */
8212       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8213           || null_byte_found)
8214         {
               /* Everything was rejected, or a null byte was seen: answer
                  `no-conversion'.  */
8215           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8216           id = CODING_SYSTEM_ID (Qno_conversion);
8217           val = Fcons (make_number (id), Qnil);
8218         }
8219       else if (! detect_info.rejected && ! detect_info.found)
8220         {
               /* Nothing was found and nothing was rejected: answer with
                  the coding system of the `undecided' category.  */
8221           detect_info.found = CATEGORY_MASK_ANY;
8222           id = coding_categories[coding_category_undecided].id;
8223           val = Fcons (make_number (id), Qnil);
8224         }
8225       else if (highest)
8226         {
8227           if (detect_info.found)
8228             {
                   /* CATEGORY and THIS are the values left behind by the
                      priority loops above.  */
8229               detect_info.found = 1 << category;
8230               val = Fcons (make_number (this->id), Qnil);
8231             }
8232           else
                 /* Nothing found: take the first category, in priority
                    order, that was not rejected.  */
8233             for (i = 0; i < coding_category_raw_text; i++)
8234               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8235                 {
8236                   detect_info.found = 1 << coding_priorities[i];
8237                   id = coding_categories[coding_priorities[i]].id;
8238                   val = Fcons (make_number (id), Qnil);
8239                   break;
8240                 }
8241         }
8242       else
8243         {
8244           int mask = detect_info.rejected | detect_info.found;
8245           int found = 0;
8246
               /* Both loops run from lowest to highest priority and push
                  onto the front of VAL, so the finished list is in priority
                  order: first the categories that were found, then those
                  that were neither found nor rejected.  */
8247           for (i = coding_category_raw_text - 1; i >= 0; i--)
8248             {
8249               category = coding_priorities[i];
8250               if (! (mask & (1 << category)))
8251                 {
8252                   found |= 1 << category;
8253                   id = coding_categories[category].id;
8254                   if (id >= 0)
8255                     val = Fcons (make_number (id), val);
8256                 }
8257             }
8258           for (i = coding_category_raw_text - 1; i >= 0; i--)
8259             {
8260               category = coding_priorities[i];
8261               if (detect_info.found & (1 << category))
8262                 {
8263                   id = coding_categories[category].id;
8264                   val = Fcons (make_number (id), val);
8265                 }
8266             }
8267           detect_info.found |= found;
8268         }
8269     }
8270   else if (base_category == coding_category_utf_8_auto)
8271     {
           /* Only the with-signature / without-signature choice is open.  */
8272       if (detect_coding_utf_8 (&coding, &detect_info))
8273         {
8274           struct coding_system *this;
8275
8276           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8277             this = coding_categories + coding_category_utf_8_sig;
8278           else
8279             this = coding_categories + coding_category_utf_8_nosig;
8280           val = Fcons (make_number (this->id), Qnil);
8281         }
8282     }
8283   else if (base_category == coding_category_utf_16_auto)
8284     {
           /* Choose among the four UTF-16 categories from the found and
              rejected masks.  */
8285       if (detect_coding_utf_16 (&coding, &detect_info))
8286         {
8287           struct coding_system *this;
8288
8289           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8290             this = coding_categories + coding_category_utf_16_le;
8291           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8292             this = coding_categories + coding_category_utf_16_be;
8293           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8294             this = coding_categories + coding_category_utf_16_be_nosig;
8295           else
8296             this = coding_categories + coding_category_utf_16_le_nosig;
8297           val = Fcons (make_number (this->id), Qnil);
8298         }
8299     }
8300   else
8301     {
           /* The text format is already fixed by CODING_SYSTEM; report it
              as the only candidate.  */
8302       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8303       val = Fcons (make_number (coding.id), Qnil);
8304     }
8305
8306   /* Then, detect eol-format if necessary.  */
8307   {
         /* One EOL_SEEN_* result each for the non-UTF-16 categories, for
            UTF-16BE and for UTF-16LE; -1 is the initial value and means no
            result was stored.  */
8308     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8309     Lisp_Object tail;
8310
8311     if (VECTORP (eol_type))
8312       {
             /* The eol format is still open: scan the source once per
                family that has a found category.  When a null byte was
                seen, the non-UTF-16 result is set to EOL_SEEN_LF without
                scanning.  */
8313         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8314           {
8315             if (null_byte_found)
8316               normal_eol = EOL_SEEN_LF;
8317             else
8318               normal_eol = detect_eol (coding.source, src_bytes,
8319                                        coding_category_raw_text);
8320           }
8321         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8322                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8323           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8324                                       coding_category_utf_16_be);
8325         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8326                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8327           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8328                                       coding_category_utf_16_le);
8329       }
8330     else
8331       {
             /* The eol format is fixed by CODING_SYSTEM: `unix' maps to LF,
                `dos' to CRLF, and anything else to CR.  */
8332         if (EQ (eol_type, Qunix))
8333           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8334         else if (EQ (eol_type, Qdos))
8335           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8336         else
8337           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8338       }
8339
         /* Replace each id in VAL, in place, by a coding-system name.  If
            the coding system's eol type is a vector, its element 0, 1 or 2
            is used for LF, CRLF or CR respectively; otherwise, or when no
            eol result applies, the name registered for the id is used.  */
8340     for (tail = val; CONSP (tail); tail = XCDR (tail))
8341       {
8342         enum coding_category category;
8343         int this_eol;
8344
8345         id = XINT (XCAR (tail));
8346         attrs = CODING_ID_ATTRS (id);
8347         category = XINT (CODING_ATTR_CATEGORY (attrs));
8348         eol_type = CODING_ID_EOL_TYPE (id);
8349         if (VECTORP (eol_type))
8350           {
8351             if (category == coding_category_utf_16_be
8352                 || category == coding_category_utf_16_be_nosig)
8353               this_eol = utf_16_be_eol;
8354             else if (category == coding_category_utf_16_le
8355                      || category == coding_category_utf_16_le_nosig)
8356               this_eol = utf_16_le_eol;
8357             else
8358               this_eol = normal_eol;
8359
8360             if (this_eol == EOL_SEEN_LF)
8361               XSETCAR (tail, AREF (eol_type, 0));
8362             else if (this_eol == EOL_SEEN_CRLF)
8363               XSETCAR (tail, AREF (eol_type, 1));
8364             else if (this_eol == EOL_SEEN_CR)
8365               XSETCAR (tail, AREF (eol_type, 2));
8366             else
8367               XSETCAR (tail, CODING_ID_NAME (id));
8368           }
8369         else
8370           XSETCAR (tail, CODING_ID_NAME (id));
8371       }
8372   }
8373
       /* With HIGHEST nonzero, hand back only the first element of VAL
          (or nil when VAL is empty); otherwise the whole list.  */
8374   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8375 }
8376
8377
8378 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8379        2, 3, 0,
8380        doc: /* Detect coding system of the text in the region between START and END.
8381 Return a list of possible coding systems ordered by priority.
8382 The coding systems to try and their priorities follows what
8383 the function `coding-system-priority-list' (which see) returns.
8384
8385 If only ASCII characters are found (except for such ISO-2022 control
8386 characters as ESC), it returns a list of single element `undecided'
8387 or its subsidiary coding system according to a detected end-of-line
8388 format.
8389
8390 If optional argument HIGHEST is non-nil, return the coding system of
8391 highest priority.  */)
8392   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8393 {
8394   int from, to;
8395   int from_byte, to_byte;
8396
8397   CHECK_NUMBER_COERCE_MARKER (start);
8398   CHECK_NUMBER_COERCE_MARKER (end);
8399
8400   validate_region (&start, &end);
8401   from = XINT (start), to = XINT (end);
8402   from_byte = CHAR_TO_BYTE (from);
8403   to_byte = CHAR_TO_BYTE (to);
8404
8405   if (from < GPT && to >= GPT)
8406     move_gap_both (to, to_byte);
8407
8408   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8409                                to - from, to_byte - from_byte,
8410                                !NILP (highest),
8411                                !NILP (BVAR (current_buffer
8412                                       , enable_multibyte_characters)),
8413                                Qnil);
8414 }
8415
8416 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8417        1, 2, 0,
8418        doc: /* Detect coding system of the text in STRING.
8419 Return a list of possible coding systems ordered by priority.
8420 The coding systems to try and their priorities follows what
8421 the function `coding-system-priority-list' (which see) returns.
8422
8423 If only ASCII characters are found (except for such ISO-2022 control
8424 characters as ESC), it returns a list of single element `undecided'
8425 or its subsidiary coding system according to a detected end-of-line
8426 format.
8427
8428 If optional argument HIGHEST is non-nil, return the coding system of
8429 highest priority.  */)
8430   (Lisp_Object string, Lisp_Object highest)
8431 {
8432   CHECK_STRING (string);
8433
8434   return detect_coding_system (SDATA (string),
8435                                SCHARS (string), SBYTES (string),
8436                                !NILP (highest), STRING_MULTIBYTE (string),
8437                                Qnil);
8438 }
8439
8440
8441 static inline int
8442 char_encodable_p (int c, Lisp_Object attrs)
8443 {
8444   Lisp_Object tail;
8445   struct charset *charset;
8446   Lisp_Object translation_table;
8447
8448   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8449   if (! NILP (translation_table))
8450     c = translate_char (translation_table, c);
8451   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8452        CONSP (tail); tail = XCDR (tail))
8453     {
8454       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8455       if (CHAR_CHARSET_P (c, charset))
8456         break;
8457     }
8458   return (! NILP (tail));
8459 }
8460
8461
8462 /* Return a list of coding systems that safely encode the text between
8463    START and END.  If EXCLUDE is non-nil, it is a list of coding
8464    systems not to check.  The returned list doesn't contain any such
8465    coding systems.  In any case, if the text contains only ASCII or is
8466    unibyte, return t.  */
8467
8468 DEFUN ("find-coding-systems-region-internal",
8469        Ffind_coding_systems_region_internal,
8470        Sfind_coding_systems_region_internal, 2, 3, 0,
8471        doc: /* Internal use only.  */)
8472   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8473 {
8474   Lisp_Object coding_attrs_list, safe_codings;
8475   EMACS_INT start_byte, end_byte;
8476   const unsigned char *p, *pbeg, *pend;
8477   int c;
8478   Lisp_Object tail, elt, work_table;
8479
8480   if (STRINGP (start))
8481     {
8482       if (!STRING_MULTIBYTE (start)
8483           || SCHARS (start) == SBYTES (start))
8484         return Qt;
8485       start_byte = 0;
8486       end_byte = SBYTES (start);
8487     }
8488   else
8489     {
8490       CHECK_NUMBER_COERCE_MARKER (start);
8491       CHECK_NUMBER_COERCE_MARKER (end);
8492       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8493         args_out_of_range (start, end);
8494       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8495         return Qt;
8496       start_byte = CHAR_TO_BYTE (XINT (start));
8497       end_byte = CHAR_TO_BYTE (XINT (end));
8498       if (XINT (end) - XINT (start) == end_byte - start_byte)
8499         return Qt;
8500
8501       if (XINT (start) < GPT && XINT (end) > GPT)
8502         {
8503           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8504             move_gap_both (XINT (start), start_byte);
8505           else
8506             move_gap_both (XINT (end), end_byte);
8507         }
8508     }
8509
8510   coding_attrs_list = Qnil;
8511   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8512     if (NILP (exclude)
8513         || NILP (Fmemq (XCAR (tail), exclude)))
8514       {
8515         Lisp_Object attrs;
8516
8517         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8518         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8519             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8520           {
8521             ASET (attrs, coding_attr_trans_tbl,
8522                   get_translation_table (attrs, 1, NULL));
8523             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8524           }
8525       }
8526
8527   if (STRINGP (start))
8528     p = pbeg = SDATA (start);
8529   else
8530     p = pbeg = BYTE_POS_ADDR (start_byte);
8531   pend = p + (end_byte - start_byte);
8532
8533   while (p < pend && ASCII_BYTE_P (*p)) p++;
8534   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8535
8536   work_table = Fmake_char_table (Qnil, Qnil);
8537   while (p < pend)
8538     {
8539       if (ASCII_BYTE_P (*p))
8540         p++;
8541       else
8542         {
8543           c = STRING_CHAR_ADVANCE (p);
8544           if (!NILP (char_table_ref (work_table, c)))
8545             /* This character was already checked.  Ignore it.  */
8546             continue;
8547
8548           charset_map_loaded = 0;
8549           for (tail = coding_attrs_list; CONSP (tail);)
8550             {
8551               elt = XCAR (tail);
8552               if (NILP (elt))
8553                 tail = XCDR (tail);
8554               else if (char_encodable_p (c, elt))
8555                 tail = XCDR (tail);
8556               else if (CONSP (XCDR (tail)))
8557                 {
8558                   XSETCAR (tail, XCAR (XCDR (tail)));
8559                   XSETCDR (tail, XCDR (XCDR (tail)));
8560                 }
8561               else
8562                 {
8563                   XSETCAR (tail, Qnil);
8564                   tail = XCDR (tail);
8565                 }
8566             }
8567           if (charset_map_loaded)
8568             {
8569               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8570
8571               if (STRINGP (start))
8572                 pbeg = SDATA (start);
8573               else
8574                 pbeg = BYTE_POS_ADDR (start_byte);
8575               p = pbeg + p_offset;
8576               pend = pbeg + pend_offset;
8577             }
8578           char_table_set (work_table, c, Qt);
8579         }
8580     }
8581
8582   safe_codings = list2 (Qraw_text, Qno_conversion);
8583   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8584     if (! NILP (XCAR (tail)))
8585       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8586
8587   return safe_codings;
8588 }
8589
8590
8591 DEFUN ("unencodable-char-position", Funencodable_char_position,
8592        Sunencodable_char_position, 3, 5, 0,
8593        doc: /*
8594 Return position of first un-encodable character in a region.
8595 START and END specify the region and CODING-SYSTEM specifies the
8596 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8597
8598 If optional 4th argument COUNT is non-nil, it specifies at most how
8599 many un-encodable characters to search.  In this case, the value is a
8600 list of positions.
8601
8602 If optional 5th argument STRING is non-nil, it is a string to search
8603 for un-encodable characters.  In that case, START and END are indexes
8604 to the string.  */)
8605   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8606 {
8607   int n;
8608   struct coding_system coding;
8609   Lisp_Object attrs, charset_list, translation_table;
8610   Lisp_Object positions;
8611   int from, to;
8612   const unsigned char *p, *stop, *pend;
8613   int ascii_compatible;
8614
8615   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8616   attrs = CODING_ID_ATTRS (coding.id);
8617   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8618     return Qnil;
8619   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8620   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8621   translation_table = get_translation_table (attrs, 1, NULL);
8622
8623   if (NILP (string))
8624     {
8625       validate_region (&start, &end);
8626       from = XINT (start);
8627       to = XINT (end);
8628       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8629           || (ascii_compatible
8630               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8631         return Qnil;
8632       p = CHAR_POS_ADDR (from);
8633       pend = CHAR_POS_ADDR (to);
8634       if (from < GPT && to >= GPT)
8635         stop = GPT_ADDR;
8636       else
8637         stop = pend;
8638     }
8639   else
8640     {
8641       CHECK_STRING (string);
8642       CHECK_NATNUM (start);
8643       CHECK_NATNUM (end);
8644       from = XINT (start);
8645       to = XINT (end);
8646       if (from > to
8647           || to > SCHARS (string))
8648         args_out_of_range_3 (string, start, end);
8649       if (! STRING_MULTIBYTE (string))
8650         return Qnil;
8651       p = SDATA (string) + string_char_to_byte (string, from);
8652       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8653       if (ascii_compatible && (to - from) == (pend - p))
8654         return Qnil;
8655     }
8656
8657   if (NILP (count))
8658     n = 1;
8659   else
8660     {
8661       CHECK_NATNUM (count);
8662       n = XINT (count);
8663     }
8664
8665   positions = Qnil;
8666   while (1)
8667     {
8668       int c;
8669
8670       if (ascii_compatible)
8671         while (p < stop && ASCII_BYTE_P (*p))
8672           p++, from++;
8673       if (p >= stop)
8674         {
8675           if (p >= pend)
8676             break;
8677           stop = pend;
8678           p = GAP_END_ADDR;
8679         }
8680
8681       c = STRING_CHAR_ADVANCE (p);
8682       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8683           && ! char_charset (translate_char (translation_table, c),
8684                              charset_list, NULL))
8685         {
8686           positions = Fcons (make_number (from), positions);
8687           n--;
8688           if (n == 0)
8689             break;
8690         }
8691
8692       from++;
8693     }
8694
8695   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8696 }
8697
8698
8699 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8700        Scheck_coding_systems_region, 3, 3, 0,
8701        doc: /* Check if the region is encodable by coding systems.
8702
8703 START and END are buffer positions specifying the region.
8704 CODING-SYSTEM-LIST is a list of coding systems to check.
8705
8706 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8707 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8708 whole region, POS0, POS1, ... are buffer positions where non-encodable
8709 characters are found.
8710
8711 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8712 value is nil.
8713
8714 START may be a string.  In that case, check if the string is
8715 encodable, and the value contains indices to the string instead of
8716 buffer positions.  END is ignored.
8717
8718 If the current buffer (or START if it is a string) is unibyte, the value
8719 is nil.  */)
8720   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8721 {
8722   Lisp_Object list;
8723   EMACS_INT start_byte, end_byte;
8724   int pos;
8725   const unsigned char *p, *pbeg, *pend;
8726   int c;
8727   Lisp_Object tail, elt, attrs;
8728
8729   if (STRINGP (start))
8730     {
8731       if (!STRING_MULTIBYTE (start)
8732           || SCHARS (start) == SBYTES (start))
8733         return Qnil;
8734       start_byte = 0;
8735       end_byte = SBYTES (start);
8736       pos = 0;
8737     }
8738   else
8739     {
8740       CHECK_NUMBER_COERCE_MARKER (start);
8741       CHECK_NUMBER_COERCE_MARKER (end);
8742       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8743         args_out_of_range (start, end);
8744       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8745         return Qnil;
8746       start_byte = CHAR_TO_BYTE (XINT (start));
8747       end_byte = CHAR_TO_BYTE (XINT (end));
8748       if (XINT (end) - XINT (start) == end_byte - start_byte)
8749         return Qnil;
8750
8751       if (XINT (start) < GPT && XINT (end) > GPT)
8752         {
8753           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8754             move_gap_both (XINT (start), start_byte);
8755           else
8756             move_gap_both (XINT (end), end_byte);
8757         }
8758       pos = XINT (start);
8759     }
8760
8761   list = Qnil;
8762   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8763     {
8764       elt = XCAR (tail);
8765       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8766       ASET (attrs, coding_attr_trans_tbl,
8767             get_translation_table (attrs, 1, NULL));
8768       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8769     }
8770
8771   if (STRINGP (start))
8772     p = pbeg = SDATA (start);
8773   else
8774     p = pbeg = BYTE_POS_ADDR (start_byte);
8775   pend = p + (end_byte - start_byte);
8776
8777   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8778   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8779
8780   while (p < pend)
8781     {
8782       if (ASCII_BYTE_P (*p))
8783         p++;
8784       else
8785         {
8786           c = STRING_CHAR_ADVANCE (p);
8787
8788           charset_map_loaded = 0;
8789           for (tail = list; CONSP (tail); tail = XCDR (tail))
8790             {
8791               elt = XCDR (XCAR (tail));
8792               if (! char_encodable_p (c, XCAR (elt)))
8793                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8794             }
8795           if (charset_map_loaded)
8796             {
8797               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8798
8799               if (STRINGP (start))
8800                 pbeg = SDATA (start);
8801               else
8802                 pbeg = BYTE_POS_ADDR (start_byte);
8803               p = pbeg + p_offset;
8804               pend = pbeg + pend_offset;
8805             }
8806         }
8807       pos++;
8808     }
8809
8810   tail = list;
8811   list = Qnil;
8812   for (; CONSP (tail); tail = XCDR (tail))
8813     {
8814       elt = XCAR (tail);
8815       if (CONSP (XCDR (XCDR (elt))))
8816         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8817                       list);
8818     }
8819
8820   return list;
8821 }
8822
8823
8824 static Lisp_Object
8825 code_convert_region (Lisp_Object start, Lisp_Object end,
8826                      Lisp_Object coding_system, Lisp_Object dst_object,
8827                      int encodep, int norecord)
8828 {
8829   struct coding_system coding;
8830   EMACS_INT from, from_byte, to, to_byte;
8831   Lisp_Object src_object;
8832
8833   CHECK_NUMBER_COERCE_MARKER (start);
8834   CHECK_NUMBER_COERCE_MARKER (end);
8835   if (NILP (coding_system))
8836     coding_system = Qno_conversion;
8837   else
8838     CHECK_CODING_SYSTEM (coding_system);
8839   src_object = Fcurrent_buffer ();
8840   if (NILP (dst_object))
8841     dst_object = src_object;
8842   else if (! EQ (dst_object, Qt))
8843     CHECK_BUFFER (dst_object);
8844
8845   validate_region (&start, &end);
8846   from = XFASTINT (start);
8847   from_byte = CHAR_TO_BYTE (from);
8848   to = XFASTINT (end);
8849   to_byte = CHAR_TO_BYTE (to);
8850
8851   setup_coding_system (coding_system, &coding);
8852   coding.mode |= CODING_MODE_LAST_BLOCK;
8853
8854   if (encodep)
8855     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8856                           dst_object);
8857   else
8858     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8859                           dst_object);
8860   if (! norecord)
8861     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8862
8863   return (BUFFERP (dst_object)
8864           ? make_number (coding.produced_char)
8865           : coding.dst_object);
8866 }
8867
8868
8869 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8870        3, 4, "r\nzCoding system: ",
8871        doc: /* Decode the current region from the specified coding system.
8872 When called from a program, takes four arguments:
8873         START, END, CODING-SYSTEM, and DESTINATION.
8874 START and END are buffer positions.
8875
8876 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8877 If nil, the region between START and END is replaced by the decoded text.
8878 If buffer, the decoded text is inserted in that buffer after point (point
8879 does not move).
8880 In those cases, the length of the decoded text is returned.
8881 If DESTINATION is t, the decoded text is returned.
8882
8883 This function sets `last-coding-system-used' to the precise coding system
8884 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8885 not fully specified.)  */)
8886   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8887 {
8888   return code_convert_region (start, end, coding_system, destination, 0, 0);
8889 }
8890
8891 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8892        3, 4, "r\nzCoding system: ",
8893        doc: /* Encode the current region by specified coding system.
8894 When called from a program, takes four arguments:
8895         START, END, CODING-SYSTEM and DESTINATION.
8896 START and END are buffer positions.
8897
8898 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8899 If nil, the region between START and END is replace by the encoded text.
8900 If buffer, the encoded text is inserted in that buffer after point (point
8901 does not move).
8902 In those cases, the length of the encoded text is returned.
8903 If DESTINATION is t, the encoded text is returned.
8904
8905 This function sets `last-coding-system-used' to the precise coding system
8906 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8907 not fully specified.)  */)
8908   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8909 {
8910   return code_convert_region (start, end, coding_system, destination, 1, 0);
8911 }
8912
8913 Lisp_Object
8914 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8915                      Lisp_Object dst_object, int encodep, int nocopy, int norecord)
8916 {
8917   struct coding_system coding;
8918   EMACS_INT chars, bytes;
8919
8920   CHECK_STRING (string);
8921   if (NILP (coding_system))
8922     {
8923       if (! norecord)
8924         Vlast_coding_system_used = Qno_conversion;
8925       if (NILP (dst_object))
8926         return (nocopy ? Fcopy_sequence (string) : string);
8927     }
8928
8929   if (NILP (coding_system))
8930     coding_system = Qno_conversion;
8931   else
8932     CHECK_CODING_SYSTEM (coding_system);
8933   if (NILP (dst_object))
8934     dst_object = Qt;
8935   else if (! EQ (dst_object, Qt))
8936     CHECK_BUFFER (dst_object);
8937
8938   setup_coding_system (coding_system, &coding);
8939   coding.mode |= CODING_MODE_LAST_BLOCK;
8940   chars = SCHARS (string);
8941   bytes = SBYTES (string);
8942   if (encodep)
8943     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8944   else
8945     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8946   if (! norecord)
8947     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8948
8949   return (BUFFERP (dst_object)
8950           ? make_number (coding.produced_char)
8951           : coding.dst_object);
8952 }
8953
8954
8955 /* Encode or decode STRING according to CODING_SYSTEM.
8956    Do not set Vlast_coding_system_used.
8957
8958    This function is called only from macros DECODE_FILE and
8959    ENCODE_FILE, thus we ignore character composition.  */
8960
8961 Lisp_Object
8962 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
8963                               int encodep)
8964 {
8965   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8966 }
8967
8968
8969 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8970        2, 4, 0,
8971        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8972
8973 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8974 if the decoding operation is trivial.
8975
8976 Optional fourth arg BUFFER non-nil means that the decoded text is
8977 inserted in that buffer after point (point does not move).  In this
8978 case, the return value is the length of the decoded text.
8979
8980 This function sets `last-coding-system-used' to the precise coding system
8981 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8982 not fully specified.)  */)
8983   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
8984 {
8985   return code_convert_string (string, coding_system, buffer,
8986                               0, ! NILP (nocopy), 0);
8987 }
8988
8989 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8990        2, 4, 0,
8991        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8992
8993 Optional third arg NOCOPY non-nil means it is OK to return STRING
8994 itself if the encoding operation is trivial.
8995
8996 Optional fourth arg BUFFER non-nil means that the encoded text is
8997 inserted in that buffer after point (point does not move).  In this
8998 case, the return value is the length of the encoded text.
8999
9000 This function sets `last-coding-system-used' to the precise coding system
9001 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9002 not fully specified.)  */)
9003   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9004 {
9005   return code_convert_string (string, coding_system, buffer,
9006                               1, ! NILP (nocopy), 0);
9007 }
9008
9009 \f
9010 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9011        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9012 Return the corresponding character.  */)
9013   (Lisp_Object code)
9014 {
9015   Lisp_Object spec, attrs, val;
9016   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9017   EMACS_INT ch;
9018   int c;
9019
9020   CHECK_NATNUM (code);
9021   ch = XFASTINT (code);
9022   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9023   attrs = AREF (spec, 0);
9024
9025   if (ASCII_BYTE_P (ch)
9026       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9027     return code;
9028
9029   val = CODING_ATTR_CHARSET_LIST (attrs);
9030   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9031   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9032   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9033
9034   if (ch <= 0x7F)
9035     {
9036       c = ch;
9037       charset = charset_roman;
9038     }
9039   else if (ch >= 0xA0 && ch < 0xDF)
9040     {
9041       c = ch - 0x80;
9042       charset = charset_kana;
9043     }
9044   else
9045     {
9046       EMACS_INT c1 = ch >> 8;
9047       int c2 = ch & 0xFF;
9048
9049       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9050           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9051         error ("Invalid code: %"pI"d", ch);
9052       c = ch;
9053       SJIS_TO_JIS (c);
9054       charset = charset_kanji;
9055     }
9056   c = DECODE_CHAR (charset, c);
9057   if (c < 0)
9058     error ("Invalid code: %"pI"d", ch);
9059   return make_number (c);
9060 }
9061
9062
9063 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9064        doc: /* Encode a Japanese character CH to shift_jis encoding.
9065 Return the corresponding code in SJIS.  */)
9066   (Lisp_Object ch)
9067 {
9068   Lisp_Object spec, attrs, charset_list;
9069   int c;
9070   struct charset *charset;
9071   unsigned code;
9072
9073   CHECK_CHARACTER (ch);
9074   c = XFASTINT (ch);
9075   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9076   attrs = AREF (spec, 0);
9077
9078   if (ASCII_CHAR_P (c)
9079       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9080     return ch;
9081
9082   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9083   charset = char_charset (c, charset_list, &code);
9084   if (code == CHARSET_INVALID_CODE (charset))
9085     error ("Can't encode by shift_jis encoding: %c", c);
9086   JIS_TO_SJIS (code);
9087
9088   return make_number (code);
9089 }
9090
9091 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9092        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9093 Return the corresponding character.  */)
9094   (Lisp_Object code)
9095 {
9096   Lisp_Object spec, attrs, val;
9097   struct charset *charset_roman, *charset_big5, *charset;
9098   EMACS_INT ch;
9099   int c;
9100
9101   CHECK_NATNUM (code);
9102   ch = XFASTINT (code);
9103   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9104   attrs = AREF (spec, 0);
9105
9106   if (ASCII_BYTE_P (ch)
9107       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9108     return code;
9109
9110   val = CODING_ATTR_CHARSET_LIST (attrs);
9111   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9112   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9113
9114   if (ch <= 0x7F)
9115     {
9116       c = ch;
9117       charset = charset_roman;
9118     }
9119   else
9120     {
9121       EMACS_INT b1 = ch >> 8;
9122       int b2 = ch & 0x7F;
9123       if (b1 < 0xA1 || b1 > 0xFE
9124           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9125         error ("Invalid code: %"pI"d", ch);
9126       c = ch;
9127       charset = charset_big5;
9128     }
9129   c = DECODE_CHAR (charset, c);
9130   if (c < 0)
9131     error ("Invalid code: %"pI"d", ch);
9132   return make_number (c);
9133 }
9134
9135 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9136        doc: /* Encode the Big5 character CH to BIG5 coding system.
9137 Return the corresponding character code in Big5.  */)
9138   (Lisp_Object ch)
9139 {
9140   Lisp_Object spec, attrs, charset_list;
9141   struct charset *charset;
9142   int c;
9143   unsigned code;
9144
9145   CHECK_CHARACTER (ch);
9146   c = XFASTINT (ch);
9147   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9148   attrs = AREF (spec, 0);
9149   if (ASCII_CHAR_P (c)
9150       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9151     return ch;
9152
9153   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9154   charset = char_charset (c, charset_list, &code);
9155   if (code == CHARSET_INVALID_CODE (charset))
9156     error ("Can't encode by Big5 encoding: %c", c);
9157
9158   return make_number (code);
9159 }
9160
9161 \f
9162 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9163        Sset_terminal_coding_system_internal, 1, 2, 0,
9164        doc: /* Internal use only.  */)
9165   (Lisp_Object coding_system, Lisp_Object terminal)
9166 {
9167   struct terminal *term = get_terminal (terminal, 1);
9168   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9169   CHECK_SYMBOL (coding_system);
9170   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9171   /* We had better not send unsafe characters to terminal.  */
9172   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9173   /* Character composition should be disabled.  */
9174   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9175   terminal_coding->src_multibyte = 1;
9176   terminal_coding->dst_multibyte = 0;
9177   if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9178     term->charset_list = coding_charset_list (terminal_coding);
9179   else
9180     term->charset_list = Fcons (make_number (charset_ascii), Qnil);
9181   return Qnil;
9182 }
9183
9184 DEFUN ("set-safe-terminal-coding-system-internal",
9185        Fset_safe_terminal_coding_system_internal,
9186        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9187        doc: /* Internal use only.  */)
9188   (Lisp_Object coding_system)
9189 {
9190   CHECK_SYMBOL (coding_system);
9191   setup_coding_system (Fcheck_coding_system (coding_system),
9192                        &safe_terminal_coding);
9193   /* Character composition should be disabled.  */
9194   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9195   safe_terminal_coding.src_multibyte = 1;
9196   safe_terminal_coding.dst_multibyte = 0;
9197   return Qnil;
9198 }
9199
9200 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9201        Sterminal_coding_system, 0, 1, 0,
9202        doc: /* Return coding system specified for terminal output on the given terminal.
9203 TERMINAL may be a terminal object, a frame, or nil for the selected
9204 frame's terminal device.  */)
9205   (Lisp_Object terminal)
9206 {
9207   struct coding_system *terminal_coding
9208     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9209   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9210
9211   /* For backward compatibility, return nil if it is `undecided'. */
9212   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9213 }
9214
9215 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9216        Sset_keyboard_coding_system_internal, 1, 2, 0,
9217        doc: /* Internal use only.  */)
9218   (Lisp_Object coding_system, Lisp_Object terminal)
9219 {
9220   struct terminal *t = get_terminal (terminal, 1);
9221   CHECK_SYMBOL (coding_system);
9222   if (NILP (coding_system))
9223     coding_system = Qno_conversion;
9224   else
9225     Fcheck_coding_system (coding_system);
9226   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9227   /* Character composition should be disabled.  */
9228   TERMINAL_KEYBOARD_CODING (t)->common_flags
9229     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9230   return Qnil;
9231 }
9232
9233 DEFUN ("keyboard-coding-system",
9234        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9235        doc: /* Return coding system specified for decoding keyboard input.  */)
9236   (Lisp_Object terminal)
9237 {
9238   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9239                          (get_terminal (terminal, 1))->id);
9240 }
9241
9242 \f
9243 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9244        Sfind_operation_coding_system,  1, MANY, 0,
9245        doc: /* Choose a coding system for an operation based on the target name.
9246 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9247 DECODING-SYSTEM is the coding system to use for decoding
9248 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9249 for encoding (in case OPERATION does encoding).
9250
9251 The first argument OPERATION specifies an I/O primitive:
9252   For file I/O, `insert-file-contents' or `write-region'.
9253   For process I/O, `call-process', `call-process-region', or `start-process'.
9254   For network I/O, `open-network-stream'.
9255
9256 The remaining arguments should be the same arguments that were passed
9257 to the primitive.  Depending on which primitive, one of those arguments
9258 is selected as the TARGET.  For example, if OPERATION does file I/O,
9259 whichever argument specifies the file name is TARGET.
9260
9261 TARGET has a meaning which depends on OPERATION:
9262   For file I/O, TARGET is a file name (except for the special case below).
9263   For process I/O, TARGET is a process name.
9264   For network I/O, TARGET is a service name or a port number.
9265
9266 This function looks up what is specified for TARGET in
9267 `file-coding-system-alist', `process-coding-system-alist',
9268 or `network-coding-system-alist' depending on OPERATION.
9269 They may specify a coding system, a cons of coding systems,
9270 or a function symbol to call.
9271 In the last case, we call the function with one argument,
9272 which is a list of all the arguments given to this function.
9273 If the function can't decide a coding system, it can return
9274 `undecided' so that the normal code-detection is performed.
9275
9276 If OPERATION is `insert-file-contents', the argument corresponding to
9277 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9278 file name to look up, and BUFFER is a buffer that contains the file's
9279 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9280 function to call for FILENAME, that function should examine the
9281 contents of BUFFER instead of reading the file.
9282
9283 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9284   (ptrdiff_t nargs, Lisp_Object *args)
9285 {
9286   Lisp_Object operation, target_idx, target, val;
9287   register Lisp_Object chain;
9288
9289   if (nargs < 2)
9290     error ("Too few arguments");
9291   operation = args[0];
9292   if (!SYMBOLP (operation)
9293       || !NATNUMP (target_idx = Fget (operation, Qtarget_idx)))
9294     error ("Invalid first argument");
9295   if (nargs < 1 + XFASTINT (target_idx))
9296     error ("Too few arguments for operation `%s'",
9297            SDATA (SYMBOL_NAME (operation)));
9298   target = args[XFASTINT (target_idx) + 1];
9299   if (!(STRINGP (target)
9300         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9301             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9302         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9303     error ("Invalid argument %"pI"d of operation `%s'",
9304            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9305   if (CONSP (target))
9306     target = XCAR (target);
9307
9308   chain = ((EQ (operation, Qinsert_file_contents)
9309             || EQ (operation, Qwrite_region))
9310            ? Vfile_coding_system_alist
9311            : (EQ (operation, Qopen_network_stream)
9312               ? Vnetwork_coding_system_alist
9313               : Vprocess_coding_system_alist));
9314   if (NILP (chain))
9315     return Qnil;
9316
9317   for (; CONSP (chain); chain = XCDR (chain))
9318     {
9319       Lisp_Object elt;
9320
9321       elt = XCAR (chain);
9322       if (CONSP (elt)
9323           && ((STRINGP (target)
9324                && STRINGP (XCAR (elt))
9325                && fast_string_match (XCAR (elt), target) >= 0)
9326               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9327         {
9328           val = XCDR (elt);
9329           /* Here, if VAL is both a valid coding system and a valid
9330              function symbol, we return VAL as a coding system.  */
9331           if (CONSP (val))
9332             return val;
9333           if (! SYMBOLP (val))
9334             return Qnil;
9335           if (! NILP (Fcoding_system_p (val)))
9336             return Fcons (val, val);
9337           if (! NILP (Ffboundp (val)))
9338             {
9339               /* We use call1 rather than safe_call1
9340                  so as to get bug reports about functions called here
9341                  which don't handle the current interface.  */
9342               val = call1 (val, Flist (nargs, args));
9343               if (CONSP (val))
9344                 return val;
9345               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9346                 return Fcons (val, val);
9347             }
9348           return Qnil;
9349         }
9350     }
9351   return Qnil;
9352 }
9353
9354 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9355        Sset_coding_system_priority, 0, MANY, 0,
9356        doc: /* Assign higher priority to the coding systems given as arguments.
9357 If multiple coding systems belong to the same category,
9358 all but the first one are ignored.
9359
9360 usage: (set-coding-system-priority &rest coding-systems)  */)
9361   (ptrdiff_t nargs, Lisp_Object *args)
9362 {
9363   ptrdiff_t i, j;
9364   int changed[coding_category_max];
9365   enum coding_category priorities[coding_category_max];
9366
9367   memset (changed, 0, sizeof changed);
9368
9369   for (i = j = 0; i < nargs; i++)
9370     {
9371       enum coding_category category;
9372       Lisp_Object spec, attrs;
9373
9374       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9375       attrs = AREF (spec, 0);
9376       category = XINT (CODING_ATTR_CATEGORY (attrs));
9377       if (changed[category])
9378         /* Ignore this coding system because a coding system of the
9379            same category already had a higher priority.  */
9380         continue;
9381       changed[category] = 1;
9382       priorities[j++] = category;
9383       if (coding_categories[category].id >= 0
9384           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9385         setup_coding_system (args[i], &coding_categories[category]);
9386       Fset (AREF (Vcoding_category_table, category), args[i]);
9387     }
9388
9389   /* Now we have decided top J priorities.  Reflect the order of the
9390      original priorities to the remaining priorities.  */
9391
9392   for (i = j, j = 0; i < coding_category_max; i++, j++)
9393     {
9394       while (j < coding_category_max
9395              && changed[coding_priorities[j]])
9396         j++;
9397       if (j == coding_category_max)
9398         abort ();
9399       priorities[i] = coding_priorities[j];
9400     }
9401
9402   memcpy (coding_priorities, priorities, sizeof priorities);
9403
9404   /* Update `coding-category-list'.  */
9405   Vcoding_category_list = Qnil;
9406   for (i = coding_category_max; i-- > 0; )
9407     Vcoding_category_list
9408       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9409                Vcoding_category_list);
9410
9411   return Qnil;
9412 }
9413
9414 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9415        Scoding_system_priority_list, 0, 1, 0,
9416        doc: /* Return a list of coding systems ordered by their priorities.
9417 The list contains a subset of coding systems; i.e. coding systems
9418 assigned to each coding category (see `coding-category-list').
9419
9420 HIGHESTP non-nil means just return the highest priority one.  */)
9421   (Lisp_Object highestp)
9422 {
9423   int i;
9424   Lisp_Object val;
9425
9426   for (i = 0, val = Qnil; i < coding_category_max; i++)
9427     {
9428       enum coding_category category = coding_priorities[i];
9429       int id = coding_categories[category].id;
9430       Lisp_Object attrs;
9431
9432       if (id < 0)
9433         continue;
9434       attrs = CODING_ID_ATTRS (id);
9435       if (! NILP (highestp))
9436         return CODING_ATTR_BASE_NAME (attrs);
9437       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9438     }
9439   return Fnreverse (val);
9440 }
9441
9442 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9443
9444 static Lisp_Object
9445 make_subsidiaries (Lisp_Object base)
9446 {
9447   Lisp_Object subsidiaries;
9448   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9449   char *buf = (char *) alloca (base_name_len + 6);
9450   int i;
9451
9452   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9453   subsidiaries = Fmake_vector (make_number (3), Qnil);
9454   for (i = 0; i < 3; i++)
9455     {
9456       strcpy (buf + base_name_len, suffixes[i]);
9457       ASET (subsidiaries, i, intern (buf));
9458     }
9459   return subsidiaries;
9460 }
9461
9462
9463 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9464        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9465        doc: /* For internal use only.
9466 usage: (define-coding-system-internal ...)  */)
9467   (ptrdiff_t nargs, Lisp_Object *args)
9468 {
9469   Lisp_Object name;
9470   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9471   Lisp_Object attrs;            /* Vector of attributes.  */
9472   Lisp_Object eol_type;
9473   Lisp_Object aliases;
9474   Lisp_Object coding_type, charset_list, safe_charsets;
9475   enum coding_category category;
9476   Lisp_Object tail, val;
9477   int max_charset_id = 0;
9478   int i;
9479
9480   if (nargs < coding_arg_max)
9481     goto short_args;
9482
9483   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9484
9485   name = args[coding_arg_name];
9486   CHECK_SYMBOL (name);
9487   CODING_ATTR_BASE_NAME (attrs) = name;
9488
9489   val = args[coding_arg_mnemonic];
9490   if (! STRINGP (val))
9491     CHECK_CHARACTER (val);
9492   CODING_ATTR_MNEMONIC (attrs) = val;
9493
9494   coding_type = args[coding_arg_coding_type];
9495   CHECK_SYMBOL (coding_type);
9496   CODING_ATTR_TYPE (attrs) = coding_type;
9497
9498   charset_list = args[coding_arg_charset_list];
9499   if (SYMBOLP (charset_list))
9500     {
9501       if (EQ (charset_list, Qiso_2022))
9502         {
9503           if (! EQ (coding_type, Qiso_2022))
9504             error ("Invalid charset-list");
9505           charset_list = Viso_2022_charset_list;
9506         }
9507       else if (EQ (charset_list, Qemacs_mule))
9508         {
9509           if (! EQ (coding_type, Qemacs_mule))
9510             error ("Invalid charset-list");
9511           charset_list = Vemacs_mule_charset_list;
9512         }
9513       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9514         if (max_charset_id < XFASTINT (XCAR (tail)))
9515           max_charset_id = XFASTINT (XCAR (tail));
9516     }
9517   else
9518     {
9519       charset_list = Fcopy_sequence (charset_list);
9520       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9521         {
9522           struct charset *charset;
9523
9524           val = XCAR (tail);
9525           CHECK_CHARSET_GET_CHARSET (val, charset);
9526           if (EQ (coding_type, Qiso_2022)
9527               ? CHARSET_ISO_FINAL (charset) < 0
9528               : EQ (coding_type, Qemacs_mule)
9529               ? CHARSET_EMACS_MULE_ID (charset) < 0
9530               : 0)
9531             error ("Can't handle charset `%s'",
9532                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9533
9534           XSETCAR (tail, make_number (charset->id));
9535           if (max_charset_id < charset->id)
9536             max_charset_id = charset->id;
9537         }
9538     }
9539   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9540
9541   safe_charsets = make_uninit_string (max_charset_id + 1);
9542   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9543   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9544     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9545   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9546
9547   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9548
9549   val = args[coding_arg_decode_translation_table];
9550   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9551     CHECK_SYMBOL (val);
9552   CODING_ATTR_DECODE_TBL (attrs) = val;
9553
9554   val = args[coding_arg_encode_translation_table];
9555   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9556     CHECK_SYMBOL (val);
9557   CODING_ATTR_ENCODE_TBL (attrs) = val;
9558
9559   val = args[coding_arg_post_read_conversion];
9560   CHECK_SYMBOL (val);
9561   CODING_ATTR_POST_READ (attrs) = val;
9562
9563   val = args[coding_arg_pre_write_conversion];
9564   CHECK_SYMBOL (val);
9565   CODING_ATTR_PRE_WRITE (attrs) = val;
9566
9567   val = args[coding_arg_default_char];
9568   if (NILP (val))
9569     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9570   else
9571     {
9572       CHECK_CHARACTER (val);
9573       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9574     }
9575
9576   val = args[coding_arg_for_unibyte];
9577   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9578
9579   val = args[coding_arg_plist];
9580   CHECK_LIST (val);
9581   CODING_ATTR_PLIST (attrs) = val;
9582
9583   if (EQ (coding_type, Qcharset))
9584     {
9585       /* Generate a lisp vector of 256 elements.  Each element is nil,
9586          integer, or a list of charset IDs.
9587
9588          If Nth element is nil, the byte code N is invalid in this
9589          coding system.
9590
9591          If Nth element is a number NUM, N is the first byte of a
9592          charset whose ID is NUM.
9593
9594          If Nth element is a list of charset IDs, N is the first byte
9595          of one of them.  The list is sorted by dimensions of the
9596          charsets.  A charset of smaller dimension comes first. */
9597       val = Fmake_vector (make_number (256), Qnil);
9598
9599       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9600         {
9601           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9602           int dim = CHARSET_DIMENSION (charset);
9603           int idx = (dim - 1) * 4;
9604
9605           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9606             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9607
9608           for (i = charset->code_space[idx];
9609                i <= charset->code_space[idx + 1]; i++)
9610             {
9611               Lisp_Object tmp, tmp2;
9612               int dim2;
9613
9614               tmp = AREF (val, i);
9615               if (NILP (tmp))
9616                 tmp = XCAR (tail);
9617               else if (NUMBERP (tmp))
9618                 {
9619                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9620                   if (dim < dim2)
9621                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9622                   else
9623                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9624                 }
9625               else
9626                 {
9627                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9628                     {
9629                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9630                       if (dim < dim2)
9631                         break;
9632                     }
9633                   if (NILP (tmp2))
9634                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9635                   else
9636                     {
9637                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9638                       XSETCAR (tmp2, XCAR (tail));
9639                     }
9640                 }
9641               ASET (val, i, tmp);
9642             }
9643         }
9644       ASET (attrs, coding_attr_charset_valids, val);
9645       category = coding_category_charset;
9646     }
9647   else if (EQ (coding_type, Qccl))
9648     {
9649       Lisp_Object valids;
9650
9651       if (nargs < coding_arg_ccl_max)
9652         goto short_args;
9653
9654       val = args[coding_arg_ccl_decoder];
9655       CHECK_CCL_PROGRAM (val);
9656       if (VECTORP (val))
9657         val = Fcopy_sequence (val);
9658       ASET (attrs, coding_attr_ccl_decoder, val);
9659
9660       val = args[coding_arg_ccl_encoder];
9661       CHECK_CCL_PROGRAM (val);
9662       if (VECTORP (val))
9663         val = Fcopy_sequence (val);
9664       ASET (attrs, coding_attr_ccl_encoder, val);
9665
9666       val = args[coding_arg_ccl_valids];
9667       valids = Fmake_string (make_number (256), make_number (0));
9668       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9669         {
9670           int from, to;
9671
9672           val = Fcar (tail);
9673           if (INTEGERP (val))
9674             {
9675               from = to = XINT (val);
9676               if (from < 0 || from > 255)
9677                 args_out_of_range_3 (val, make_number (0), make_number (255));
9678             }
9679           else
9680             {
9681               CHECK_CONS (val);
9682               CHECK_NATNUM_CAR (val);
9683               CHECK_NATNUM_CDR (val);
9684               from = XINT (XCAR (val));
9685               if (from > 255)
9686                 args_out_of_range_3 (XCAR (val),
9687                                      make_number (0), make_number (255));
9688               to = XINT (XCDR (val));
9689               if (to < from || to > 255)
9690                 args_out_of_range_3 (XCDR (val),
9691                                      XCAR (val), make_number (255));
9692             }
9693           for (i = from; i <= to; i++)
9694             SSET (valids, i, 1);
9695         }
9696       ASET (attrs, coding_attr_ccl_valids, valids);
9697
9698       category = coding_category_ccl;
9699     }
9700   else if (EQ (coding_type, Qutf_16))
9701     {
9702       Lisp_Object bom, endian;
9703
9704       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9705
9706       if (nargs < coding_arg_utf16_max)
9707         goto short_args;
9708
9709       bom = args[coding_arg_utf16_bom];
9710       if (! NILP (bom) && ! EQ (bom, Qt))
9711         {
9712           CHECK_CONS (bom);
9713           val = XCAR (bom);
9714           CHECK_CODING_SYSTEM (val);
9715           val = XCDR (bom);
9716           CHECK_CODING_SYSTEM (val);
9717         }
9718       ASET (attrs, coding_attr_utf_bom, bom);
9719
9720       endian = args[coding_arg_utf16_endian];
9721       CHECK_SYMBOL (endian);
9722       if (NILP (endian))
9723         endian = Qbig;
9724       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9725         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9726       ASET (attrs, coding_attr_utf_16_endian, endian);
9727
9728       category = (CONSP (bom)
9729                   ? coding_category_utf_16_auto
9730                   : NILP (bom)
9731                   ? (EQ (endian, Qbig)
9732                      ? coding_category_utf_16_be_nosig
9733                      : coding_category_utf_16_le_nosig)
9734                   : (EQ (endian, Qbig)
9735                      ? coding_category_utf_16_be
9736                      : coding_category_utf_16_le));
9737     }
9738   else if (EQ (coding_type, Qiso_2022))
9739     {
9740       Lisp_Object initial, reg_usage, request, flags;
9741
9742       if (nargs < coding_arg_iso2022_max)
9743         goto short_args;
9744
9745       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9746       CHECK_VECTOR (initial);
9747       for (i = 0; i < 4; i++)
9748         {
9749           val = Faref (initial, make_number (i));
9750           if (! NILP (val))
9751             {
9752               struct charset *charset;
9753
9754               CHECK_CHARSET_GET_CHARSET (val, charset);
9755               ASET (initial, i, make_number (CHARSET_ID (charset)));
9756               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9757                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9758             }
9759           else
9760             ASET (initial, i, make_number (-1));
9761         }
9762
9763       reg_usage = args[coding_arg_iso2022_reg_usage];
9764       CHECK_CONS (reg_usage);
9765       CHECK_NUMBER_CAR (reg_usage);
9766       CHECK_NUMBER_CDR (reg_usage);
9767
9768       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9769       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9770         {
9771           int id;
9772           Lisp_Object tmp1;
9773
9774           val = Fcar (tail);
9775           CHECK_CONS (val);
9776           tmp1 = XCAR (val);
9777           CHECK_CHARSET_GET_ID (tmp1, id);
9778           CHECK_NATNUM_CDR (val);
9779           if (XINT (XCDR (val)) >= 4)
9780             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9781           XSETCAR (val, make_number (id));
9782         }
9783
9784       flags = args[coding_arg_iso2022_flags];
9785       CHECK_NATNUM (flags);
9786       i = XINT (flags);
9787       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9788         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9789
9790       ASET (attrs, coding_attr_iso_initial, initial);
9791       ASET (attrs, coding_attr_iso_usage, reg_usage);
9792       ASET (attrs, coding_attr_iso_request, request);
9793       ASET (attrs, coding_attr_iso_flags, flags);
9794       setup_iso_safe_charsets (attrs);
9795
9796       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9797         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9798                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9799                     ? coding_category_iso_7_else
9800                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9801                     ? coding_category_iso_7
9802                     : coding_category_iso_7_tight);
9803       else
9804         {
9805           int id = XINT (AREF (initial, 1));
9806
9807           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9808                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9809                        || id < 0)
9810                       ? coding_category_iso_8_else
9811                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9812                       ? coding_category_iso_8_1
9813                       : coding_category_iso_8_2);
9814         }
9815       if (category != coding_category_iso_8_1
9816           && category != coding_category_iso_8_2)
9817         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9818     }
9819   else if (EQ (coding_type, Qemacs_mule))
9820     {
9821       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9822         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9823       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9824       category = coding_category_emacs_mule;
9825     }
9826   else if (EQ (coding_type, Qshift_jis))
9827     {
9828
9829       struct charset *charset;
9830
9831       if (XINT (Flength (charset_list)) != 3
9832           && XINT (Flength (charset_list)) != 4)
9833         error ("There should be three or four charsets");
9834
9835       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9836       if (CHARSET_DIMENSION (charset) != 1)
9837         error ("Dimension of charset %s is not one",
9838                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9839       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9840         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9841
9842       charset_list = XCDR (charset_list);
9843       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9844       if (CHARSET_DIMENSION (charset) != 1)
9845         error ("Dimension of charset %s is not one",
9846                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9847
9848       charset_list = XCDR (charset_list);
9849       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9850       if (CHARSET_DIMENSION (charset) != 2)
9851         error ("Dimension of charset %s is not two",
9852                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9853
9854       charset_list = XCDR (charset_list);
9855       if (! NILP (charset_list))
9856         {
9857           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9858           if (CHARSET_DIMENSION (charset) != 2)
9859             error ("Dimension of charset %s is not two",
9860                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9861         }
9862
9863       category = coding_category_sjis;
9864       Vsjis_coding_system = name;
9865     }
9866   else if (EQ (coding_type, Qbig5))
9867     {
9868       struct charset *charset;
9869
9870       if (XINT (Flength (charset_list)) != 2)
9871         error ("There should be just two charsets");
9872
9873       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9874       if (CHARSET_DIMENSION (charset) != 1)
9875         error ("Dimension of charset %s is not one",
9876                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9877       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9878         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9879
9880       charset_list = XCDR (charset_list);
9881       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9882       if (CHARSET_DIMENSION (charset) != 2)
9883         error ("Dimension of charset %s is not two",
9884                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9885
9886       category = coding_category_big5;
9887       Vbig5_coding_system = name;
9888     }
9889   else if (EQ (coding_type, Qraw_text))
9890     {
9891       category = coding_category_raw_text;
9892       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9893     }
9894   else if (EQ (coding_type, Qutf_8))
9895     {
9896       Lisp_Object bom;
9897
9898       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9899
9900       if (nargs < coding_arg_utf8_max)
9901         goto short_args;
9902
9903       bom = args[coding_arg_utf8_bom];
9904       if (! NILP (bom) && ! EQ (bom, Qt))
9905         {
9906           CHECK_CONS (bom);
9907           val = XCAR (bom);
9908           CHECK_CODING_SYSTEM (val);
9909           val = XCDR (bom);
9910           CHECK_CODING_SYSTEM (val);
9911         }
9912       ASET (attrs, coding_attr_utf_bom, bom);
9913
9914       category = (CONSP (bom) ? coding_category_utf_8_auto
9915                   : NILP (bom) ? coding_category_utf_8_nosig
9916                   : coding_category_utf_8_sig);
9917     }
9918   else if (EQ (coding_type, Qundecided))
9919     category = coding_category_undecided;
9920   else
9921     error ("Invalid coding system type: %s",
9922            SDATA (SYMBOL_NAME (coding_type)));
9923
9924   CODING_ATTR_CATEGORY (attrs) = make_number (category);
9925   CODING_ATTR_PLIST (attrs)
9926     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9927                                 CODING_ATTR_PLIST (attrs)));
9928   CODING_ATTR_PLIST (attrs)
9929     = Fcons (QCascii_compatible_p,
9930              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9931                     CODING_ATTR_PLIST (attrs)));
9932
9933   eol_type = args[coding_arg_eol_type];
9934   if (! NILP (eol_type)
9935       && ! EQ (eol_type, Qunix)
9936       && ! EQ (eol_type, Qdos)
9937       && ! EQ (eol_type, Qmac))
9938     error ("Invalid eol-type");
9939
9940   aliases = Fcons (name, Qnil);
9941
9942   if (NILP (eol_type))
9943     {
9944       eol_type = make_subsidiaries (name);
9945       for (i = 0; i < 3; i++)
9946         {
9947           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9948
9949           this_name = AREF (eol_type, i);
9950           this_aliases = Fcons (this_name, Qnil);
9951           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9952           this_spec = Fmake_vector (make_number (3), attrs);
9953           ASET (this_spec, 1, this_aliases);
9954           ASET (this_spec, 2, this_eol_type);
9955           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9956           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9957           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9958           if (NILP (val))
9959             Vcoding_system_alist
9960               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9961                        Vcoding_system_alist);
9962         }
9963     }
9964
9965   spec_vec = Fmake_vector (make_number (3), attrs);
9966   ASET (spec_vec, 1, aliases);
9967   ASET (spec_vec, 2, eol_type);
9968
9969   Fputhash (name, spec_vec, Vcoding_system_hash_table);
9970   Vcoding_system_list = Fcons (name, Vcoding_system_list);
9971   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9972   if (NILP (val))
9973     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9974                                   Vcoding_system_alist);
9975
9976   {
9977     int id = coding_categories[category].id;
9978
9979     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9980       setup_coding_system (name, &coding_categories[category]);
9981   }
9982
9983   return Qnil;
9984
9985  short_args:
9986   return Fsignal (Qwrong_number_of_arguments,
9987                   Fcons (intern ("define-coding-system-internal"),
9988                          make_number (nargs)));
9989 }
9990
9991
9992 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9993        3, 3, 0,
9994        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
9995   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
9996 {
9997   Lisp_Object spec, attrs;
9998
9999   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10000   attrs = AREF (spec, 0);
10001   if (EQ (prop, QCmnemonic))
10002     {
10003       if (! STRINGP (val))
10004         CHECK_CHARACTER (val);
10005       CODING_ATTR_MNEMONIC (attrs) = val;
10006     }
10007   else if (EQ (prop, QCdefault_char))
10008     {
10009       if (NILP (val))
10010         val = make_number (' ');
10011       else
10012         CHECK_CHARACTER (val);
10013       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10014     }
10015   else if (EQ (prop, QCdecode_translation_table))
10016     {
10017       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10018         CHECK_SYMBOL (val);
10019       CODING_ATTR_DECODE_TBL (attrs) = val;
10020     }
10021   else if (EQ (prop, QCencode_translation_table))
10022     {
10023       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10024         CHECK_SYMBOL (val);
10025       CODING_ATTR_ENCODE_TBL (attrs) = val;
10026     }
10027   else if (EQ (prop, QCpost_read_conversion))
10028     {
10029       CHECK_SYMBOL (val);
10030       CODING_ATTR_POST_READ (attrs) = val;
10031     }
10032   else if (EQ (prop, QCpre_write_conversion))
10033     {
10034       CHECK_SYMBOL (val);
10035       CODING_ATTR_PRE_WRITE (attrs) = val;
10036     }
10037   else if (EQ (prop, QCascii_compatible_p))
10038     {
10039       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10040     }
10041
10042   CODING_ATTR_PLIST (attrs)
10043     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10044   return val;
10045 }
10046
10047
10048 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10049        Sdefine_coding_system_alias, 2, 2, 0,
10050        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10051   (Lisp_Object alias, Lisp_Object coding_system)
10052 {
10053   Lisp_Object spec, aliases, eol_type, val;
10054
10055   CHECK_SYMBOL (alias);
10056   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10057   aliases = AREF (spec, 1);
10058   /* ALIASES should be a list of length more than zero, and the first
10059      element is a base coding system.  Append ALIAS at the tail of the
10060      list.  */
10061   while (!NILP (XCDR (aliases)))
10062     aliases = XCDR (aliases);
10063   XSETCDR (aliases, Fcons (alias, Qnil));
10064
10065   eol_type = AREF (spec, 2);
10066   if (VECTORP (eol_type))
10067     {
10068       Lisp_Object subsidiaries;
10069       int i;
10070
10071       subsidiaries = make_subsidiaries (alias);
10072       for (i = 0; i < 3; i++)
10073         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10074                                      AREF (eol_type, i));
10075     }
10076
10077   Fputhash (alias, spec, Vcoding_system_hash_table);
10078   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10079   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10080   if (NILP (val))
10081     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10082                                   Vcoding_system_alist);
10083
10084   return Qnil;
10085 }
10086
10087 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10088        1, 1, 0,
10089        doc: /* Return the base of CODING-SYSTEM.
10090 Any alias or subsidiary coding system is not a base coding system.  */)
10091   (Lisp_Object coding_system)
10092 {
10093   Lisp_Object spec, attrs;
10094
10095   if (NILP (coding_system))
10096     return (Qno_conversion);
10097   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10098   attrs = AREF (spec, 0);
10099   return CODING_ATTR_BASE_NAME (attrs);
10100 }
10101
10102 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10103        1, 1, 0,
10104        doc: "Return the property list of CODING-SYSTEM.")
10105   (Lisp_Object coding_system)
10106 {
10107   Lisp_Object spec, attrs;
10108
10109   if (NILP (coding_system))
10110     coding_system = Qno_conversion;
10111   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10112   attrs = AREF (spec, 0);
10113   return CODING_ATTR_PLIST (attrs);
10114 }
10115
10116
10117 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10118        1, 1, 0,
10119        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10120   (Lisp_Object coding_system)
10121 {
10122   Lisp_Object spec;
10123
10124   if (NILP (coding_system))
10125     coding_system = Qno_conversion;
10126   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10127   return AREF (spec, 1);
10128 }
10129
10130 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10131        Scoding_system_eol_type, 1, 1, 0,
10132        doc: /* Return eol-type of CODING-SYSTEM.
10133 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10134
10135 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10136 and CR respectively.
10137
10138 A vector value indicates that a format of end-of-line should be
10139 detected automatically.  Nth element of the vector is the subsidiary
10140 coding system whose eol-type is N.  */)
10141   (Lisp_Object coding_system)
10142 {
10143   Lisp_Object spec, eol_type;
10144   int n;
10145
10146   if (NILP (coding_system))
10147     coding_system = Qno_conversion;
10148   if (! CODING_SYSTEM_P (coding_system))
10149     return Qnil;
10150   spec = CODING_SYSTEM_SPEC (coding_system);
10151   eol_type = AREF (spec, 2);
10152   if (VECTORP (eol_type))
10153     return Fcopy_sequence (eol_type);
10154   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10155   return make_number (n);
10156 }
10157
10158 #endif /* emacs */
10159
10160 \f
/*** 12. Postamble ***/
10162
10163 void
10164 init_coding_once (void)
10165 {
10166   int i;
10167
10168   for (i = 0; i < coding_category_max; i++)
10169     {
10170       coding_categories[i].id = -1;
10171       coding_priorities[i] = i;
10172     }
10173
10174   /* ISO2022 specific initialize routine.  */
10175   for (i = 0; i < 0x20; i++)
10176     iso_code_class[i] = ISO_control_0;
10177   for (i = 0x21; i < 0x7F; i++)
10178     iso_code_class[i] = ISO_graphic_plane_0;
10179   for (i = 0x80; i < 0xA0; i++)
10180     iso_code_class[i] = ISO_control_1;
10181   for (i = 0xA1; i < 0xFF; i++)
10182     iso_code_class[i] = ISO_graphic_plane_1;
10183   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10184   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10185   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10186   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10187   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10188   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10189   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10190   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10191   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10192
10193   for (i = 0; i < 256; i++)
10194     {
10195       emacs_mule_bytes[i] = 1;
10196     }
10197   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10198   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10199   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10200   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10201 }
10202
10203 #ifdef emacs
10204
10205 void
10206 syms_of_coding (void)
10207 {
10208   staticpro (&Vcoding_system_hash_table);
10209   {
10210     Lisp_Object args[2];
10211     args[0] = QCtest;
10212     args[1] = Qeq;
10213     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10214   }
10215
10216   staticpro (&Vsjis_coding_system);
10217   Vsjis_coding_system = Qnil;
10218
10219   staticpro (&Vbig5_coding_system);
10220   Vbig5_coding_system = Qnil;
10221
10222   staticpro (&Vcode_conversion_reused_workbuf);
10223   Vcode_conversion_reused_workbuf = Qnil;
10224
10225   staticpro (&Vcode_conversion_workbuf_name);
10226   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10227
10228   reused_workbuf_in_use = 0;
10229
10230   DEFSYM (Qcharset, "charset");
10231   DEFSYM (Qtarget_idx, "target-idx");
10232   DEFSYM (Qcoding_system_history, "coding-system-history");
10233   Fset (Qcoding_system_history, Qnil);
10234
10235   /* Target FILENAME is the first argument.  */
10236   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10237   /* Target FILENAME is the third argument.  */
10238   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10239
10240   DEFSYM (Qcall_process, "call-process");
10241   /* Target PROGRAM is the first argument.  */
10242   Fput (Qcall_process, Qtarget_idx, make_number (0));
10243
10244   DEFSYM (Qcall_process_region, "call-process-region");
10245   /* Target PROGRAM is the third argument.  */
10246   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10247
10248   DEFSYM (Qstart_process, "start-process");
10249   /* Target PROGRAM is the third argument.  */
10250   Fput (Qstart_process, Qtarget_idx, make_number (2));
10251
10252   DEFSYM (Qopen_network_stream, "open-network-stream");
10253   /* Target SERVICE is the fourth argument.  */
10254   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10255
10256   DEFSYM (Qcoding_system, "coding-system");
10257   DEFSYM (Qcoding_aliases, "coding-aliases");
10258
10259   DEFSYM (Qeol_type, "eol-type");
10260   DEFSYM (Qunix, "unix");
10261   DEFSYM (Qdos, "dos");
10262
10263   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10264   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10265   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10266   DEFSYM (Qdefault_char, "default-char");
10267   DEFSYM (Qundecided, "undecided");
10268   DEFSYM (Qno_conversion, "no-conversion");
10269   DEFSYM (Qraw_text, "raw-text");
10270
10271   DEFSYM (Qiso_2022, "iso-2022");
10272
10273   DEFSYM (Qutf_8, "utf-8");
10274   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10275
10276   DEFSYM (Qutf_16, "utf-16");
10277   DEFSYM (Qbig, "big");
10278   DEFSYM (Qlittle, "little");
10279
10280   DEFSYM (Qshift_jis, "shift-jis");
10281   DEFSYM (Qbig5, "big5");
10282
10283   DEFSYM (Qcoding_system_p, "coding-system-p");
10284
10285   DEFSYM (Qcoding_system_error, "coding-system-error");
10286   Fput (Qcoding_system_error, Qerror_conditions,
10287         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10288   Fput (Qcoding_system_error, Qerror_message,
10289         make_pure_c_string ("Invalid coding system"));
10290
10291   /* Intern this now in case it isn't already done.
10292      Setting this variable twice is harmless.
10293      But don't staticpro it here--that is done in alloc.c.  */
10294   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10295
10296   DEFSYM (Qtranslation_table, "translation-table");
10297   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10298   DEFSYM (Qtranslation_table_id, "translation-table-id");
10299   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10300   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10301
10302   DEFSYM (Qvalid_codes, "valid-codes");
10303
10304   DEFSYM (Qemacs_mule, "emacs-mule");
10305
10306   DEFSYM (QCcategory, ":category");
10307   DEFSYM (QCmnemonic, ":mnemonic");
10308   DEFSYM (QCdefault_char, ":default-char");
10309   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10310   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10311   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10312   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10313   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10314
10315   Vcoding_category_table
10316     = Fmake_vector (make_number (coding_category_max), Qnil);
10317   staticpro (&Vcoding_category_table);
10318   /* Followings are target of code detection.  */
10319   ASET (Vcoding_category_table, coding_category_iso_7,
10320         intern_c_string ("coding-category-iso-7"));
10321   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10322         intern_c_string ("coding-category-iso-7-tight"));
10323   ASET (Vcoding_category_table, coding_category_iso_8_1,
10324         intern_c_string ("coding-category-iso-8-1"));
10325   ASET (Vcoding_category_table, coding_category_iso_8_2,
10326         intern_c_string ("coding-category-iso-8-2"));
10327   ASET (Vcoding_category_table, coding_category_iso_7_else,
10328         intern_c_string ("coding-category-iso-7-else"));
10329   ASET (Vcoding_category_table, coding_category_iso_8_else,
10330         intern_c_string ("coding-category-iso-8-else"));
10331   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10332         intern_c_string ("coding-category-utf-8-auto"));
10333   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10334         intern_c_string ("coding-category-utf-8"));
10335   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10336         intern_c_string ("coding-category-utf-8-sig"));
10337   ASET (Vcoding_category_table, coding_category_utf_16_be,
10338         intern_c_string ("coding-category-utf-16-be"));
10339   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10340         intern_c_string ("coding-category-utf-16-auto"));
10341   ASET (Vcoding_category_table, coding_category_utf_16_le,
10342         intern_c_string ("coding-category-utf-16-le"));
10343   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10344         intern_c_string ("coding-category-utf-16-be-nosig"));
10345   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10346         intern_c_string ("coding-category-utf-16-le-nosig"));
10347   ASET (Vcoding_category_table, coding_category_charset,
10348         intern_c_string ("coding-category-charset"));
10349   ASET (Vcoding_category_table, coding_category_sjis,
10350         intern_c_string ("coding-category-sjis"));
10351   ASET (Vcoding_category_table, coding_category_big5,
10352         intern_c_string ("coding-category-big5"));
10353   ASET (Vcoding_category_table, coding_category_ccl,
10354         intern_c_string ("coding-category-ccl"));
10355   ASET (Vcoding_category_table, coding_category_emacs_mule,
10356         intern_c_string ("coding-category-emacs-mule"));
10357   /* Followings are NOT target of code detection.  */
10358   ASET (Vcoding_category_table, coding_category_raw_text,
10359         intern_c_string ("coding-category-raw-text"));
10360   ASET (Vcoding_category_table, coding_category_undecided,
10361         intern_c_string ("coding-category-undecided"));
10362
10363   DEFSYM (Qinsufficient_source, "insufficient-source");
10364   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10365   DEFSYM (Qinvalid_source, "invalid-source");
10366   DEFSYM (Qinterrupted, "interrupted");
10367   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10368   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10369
10370   defsubr (&Scoding_system_p);
10371   defsubr (&Sread_coding_system);
10372   defsubr (&Sread_non_nil_coding_system);
10373   defsubr (&Scheck_coding_system);
10374   defsubr (&Sdetect_coding_region);
10375   defsubr (&Sdetect_coding_string);
10376   defsubr (&Sfind_coding_systems_region_internal);
10377   defsubr (&Sunencodable_char_position);
10378   defsubr (&Scheck_coding_systems_region);
10379   defsubr (&Sdecode_coding_region);
10380   defsubr (&Sencode_coding_region);
10381   defsubr (&Sdecode_coding_string);
10382   defsubr (&Sencode_coding_string);
10383   defsubr (&Sdecode_sjis_char);
10384   defsubr (&Sencode_sjis_char);
10385   defsubr (&Sdecode_big5_char);
10386   defsubr (&Sencode_big5_char);
10387   defsubr (&Sset_terminal_coding_system_internal);
10388   defsubr (&Sset_safe_terminal_coding_system_internal);
10389   defsubr (&Sterminal_coding_system);
10390   defsubr (&Sset_keyboard_coding_system_internal);
10391   defsubr (&Skeyboard_coding_system);
10392   defsubr (&Sfind_operation_coding_system);
10393   defsubr (&Sset_coding_system_priority);
10394   defsubr (&Sdefine_coding_system_internal);
10395   defsubr (&Sdefine_coding_system_alias);
10396   defsubr (&Scoding_system_put);
10397   defsubr (&Scoding_system_base);
10398   defsubr (&Scoding_system_plist);
10399   defsubr (&Scoding_system_aliases);
10400   defsubr (&Scoding_system_eol_type);
10401   defsubr (&Scoding_system_priority_list);
10402
10403   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10404                doc: /* List of coding systems.
10405
10406 Do not alter the value of this variable manually.  This variable should be
10407 updated by the functions `define-coding-system' and
10408 `define-coding-system-alias'.  */);
10409   Vcoding_system_list = Qnil;
10410
10411   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10412                doc: /* Alist of coding system names.
10413 Each element is one element list of coding system name.
10414 This variable is given to `completing-read' as COLLECTION argument.
10415
10416 Do not alter the value of this variable manually.  This variable should be
10417 updated by the functions `make-coding-system' and
10418 `define-coding-system-alias'.  */);
10419   Vcoding_system_alist = Qnil;
10420
10421   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10422                doc: /* List of coding-categories (symbols) ordered by priority.
10423
10424 On detecting a coding system, Emacs tries code detection algorithms
10425 associated with each coding-category one by one in this order.  When
10426 one algorithm agrees with a byte sequence of source text, the coding
10427 system bound to the corresponding coding-category is selected.
10428
10429 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10430   {
10431     int i;
10432
10433     Vcoding_category_list = Qnil;
10434     for (i = coding_category_max - 1; i >= 0; i--)
10435       Vcoding_category_list
10436         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10437                  Vcoding_category_list);
10438   }
10439
10440   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10441                doc: /* Specify the coding system for read operations.
10442 It is useful to bind this variable with `let', but do not set it globally.
10443 If the value is a coding system, it is used for decoding on read operation.
10444 If not, an appropriate element is used from one of the coding system alists.
10445 There are three such tables: `file-coding-system-alist',
10446 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10447   Vcoding_system_for_read = Qnil;
10448
10449   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10450                doc: /* Specify the coding system for write operations.
10451 Programs bind this variable with `let', but you should not set it globally.
10452 If the value is a coding system, it is used for encoding of output,
10453 when writing it to a file and when sending it to a file or subprocess.
10454
10455 If this does not specify a coding system, an appropriate element
10456 is used from one of the coding system alists.
10457 There are three such tables: `file-coding-system-alist',
10458 `process-coding-system-alist', and `network-coding-system-alist'.
10459 For output to files, if the above procedure does not specify a coding system,
10460 the value of `buffer-file-coding-system' is used.  */);
10461   Vcoding_system_for_write = Qnil;
10462
10463   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10464                doc: /*
10465 Coding system used in the latest file or process I/O.  */);
10466   Vlast_coding_system_used = Qnil;
10467
10468   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10469                doc: /*
10470 Error status of the last code conversion.
10471
10472 When an error was detected in the last code conversion, this variable
10473 is set to one of the following symbols.
10474   `insufficient-source'
10475   `inconsistent-eol'
10476   `invalid-source'
10477   `interrupted'
10478   `insufficient-memory'
10479 When no error was detected, the value doesn't change.  So, to check
10480 the error status of a code conversion by this variable, you must
10481 explicitly set this variable to nil before performing code
10482 conversion.  */);
10483   Vlast_code_conversion_error = Qnil;
10484
10485   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10486                doc: /*
10487 *Non-nil means always inhibit code conversion of end-of-line format.
10488 See info node `Coding Systems' and info node `Text and Binary' concerning
10489 such conversion.  */);
10490   inhibit_eol_conversion = 0;
10491
10492   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10493                doc: /*
10494 Non-nil means process buffer inherits coding system of process output.
10495 Bind it to t if the process output is to be treated as if it were a file
10496 read from some filesystem.  */);
10497   inherit_process_coding_system = 0;
10498
10499   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10500                doc: /*
10501 Alist to decide a coding system to use for a file I/O operation.
10502 The format is ((PATTERN . VAL) ...),
10503 where PATTERN is a regular expression matching a file name,
10504 VAL is a coding system, a cons of coding systems, or a function symbol.
10505 If VAL is a coding system, it is used for both decoding and encoding
10506 the file contents.
10507 If VAL is a cons of coding systems, the car part is used for decoding,
10508 and the cdr part is used for encoding.
10509 If VAL is a function symbol, the function must return a coding system
10510 or a cons of coding systems which are used as above.  The function is
10511 called with an argument that is a list of the arguments with which
10512 `find-operation-coding-system' was called.  If the function can't decide
10513 a coding system, it can return `undecided' so that the normal
10514 code-detection is performed.
10515
10516 See also the function `find-operation-coding-system'
10517 and the variable `auto-coding-alist'.  */);
10518   Vfile_coding_system_alist = Qnil;
10519
10520   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10521                doc: /*
10522 Alist to decide a coding system to use for a process I/O operation.
10523 The format is ((PATTERN . VAL) ...),
10524 where PATTERN is a regular expression matching a program name,
10525 VAL is a coding system, a cons of coding systems, or a function symbol.
10526 If VAL is a coding system, it is used for both decoding what received
10527 from the program and encoding what sent to the program.
10528 If VAL is a cons of coding systems, the car part is used for decoding,
10529 and the cdr part is used for encoding.
10530 If VAL is a function symbol, the function must return a coding system
10531 or a cons of coding systems which are used as above.
10532
10533 See also the function `find-operation-coding-system'.  */);
10534   Vprocess_coding_system_alist = Qnil;
10535
10536   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10537                doc: /*
10538 Alist to decide a coding system to use for a network I/O operation.
10539 The format is ((PATTERN . VAL) ...),
10540 where PATTERN is a regular expression matching a network service name
10541 or is a port number to connect to,
10542 VAL is a coding system, a cons of coding systems, or a function symbol.
10543 If VAL is a coding system, it is used for both decoding what received
10544 from the network stream and encoding what sent to the network stream.
10545 If VAL is a cons of coding systems, the car part is used for decoding,
10546 and the cdr part is used for encoding.
10547 If VAL is a function symbol, the function must return a coding system
10548 or a cons of coding systems which are used as above.
10549
10550 See also the function `find-operation-coding-system'.  */);
10551   Vnetwork_coding_system_alist = Qnil;
10552
10553   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10554                doc: /* Coding system to use with system messages.
10555 Also used for decoding keyboard input on X Window system.  */);
10556   Vlocale_coding_system = Qnil;
10557
10558   /* The eol mnemonics are reset in startup.el system-dependently.  */
10559   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10560                doc: /*
10561 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10562   eol_mnemonic_unix = make_pure_c_string (":");
10563
10564   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10565                doc: /*
10566 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10567   eol_mnemonic_dos = make_pure_c_string ("\\");
10568
10569   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10570                doc: /*
10571 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10572   eol_mnemonic_mac = make_pure_c_string ("/");
10573
10574   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10575                doc: /*
10576 *String displayed in mode line when end-of-line format is not yet determined.  */);
10577   eol_mnemonic_undecided = make_pure_c_string (":");
10578
10579   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10580                doc: /*
10581 *Non-nil enables character translation while encoding and decoding.  */);
10582   Venable_character_translation = Qt;
10583
10584   DEFVAR_LISP ("standard-translation-table-for-decode",
10585                Vstandard_translation_table_for_decode,
10586                doc: /* Table for translating characters while decoding.  */);
10587   Vstandard_translation_table_for_decode = Qnil;
10588
10589   DEFVAR_LISP ("standard-translation-table-for-encode",
10590                Vstandard_translation_table_for_encode,
10591                doc: /* Table for translating characters while encoding.  */);
10592   Vstandard_translation_table_for_encode = Qnil;
10593
10594   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10595                doc: /* Alist of charsets vs revision numbers.
10596 While encoding, if a charset (car part of an element) is found,
10597 designate it with the escape sequence identifying revision (cdr part
10598 of the element).  */);
10599   Vcharset_revision_table = Qnil;
10600
10601   DEFVAR_LISP ("default-process-coding-system",
10602                Vdefault_process_coding_system,
10603                doc: /* Cons of coding systems used for process I/O by default.
10604 The car part is used for decoding a process output,
10605 the cdr part is used for encoding a text to be sent to a process.  */);
10606   Vdefault_process_coding_system = Qnil;
10607
10608   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10609                doc: /*
10610 Table of extra Latin codes in the range 128..159 (inclusive).
10611 This is a vector of length 256.
10612 If Nth element is non-nil, the existence of code N in a file
10613 \(or output of subprocess) doesn't prevent it to be detected as
10614 a coding system of ISO 2022 variant which has a flag
10615 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10616 or reading output of a subprocess.
10617 Only 128th through 159th elements have a meaning.  */);
10618   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10619
10620   DEFVAR_LISP ("select-safe-coding-system-function",
10621                Vselect_safe_coding_system_function,
10622                doc: /*
10623 Function to call to select safe coding system for encoding a text.
10624
10625 If set, this function is called to force a user to select a proper
10626 coding system which can encode the text in the case that a default
10627 coding system used in each operation can't encode the text.  The
10628 function should take care that the buffer is not modified while
10629 the coding system is being selected.
10630
10631 The default value is `select-safe-coding-system' (which see).  */);
10632   Vselect_safe_coding_system_function = Qnil;
10633
10634   DEFVAR_BOOL ("coding-system-require-warning",
10635                coding_system_require_warning,
10636                doc: /* Internal use only.
10637 If non-nil, on writing a file, `select-safe-coding-system-function' is
10638 called even if `coding-system-for-write' is non-nil.  The command
10639 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10640   coding_system_require_warning = 0;
10641
10642
10643   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10644                inhibit_iso_escape_detection,
10645                doc: /*
10646 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10647
10648 When Emacs reads text, it tries to detect how the text is encoded.
10649 This code detection is sensitive to escape sequences.  If Emacs sees
10650 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10651 of the ISO2022 encodings, and decodes text by the corresponding coding
10652 system (e.g. `iso-2022-7bit').
10653
10654 However, there may be a case that you want to read escape sequences in
10655 a file as is.  In such a case, you can set this variable to non-nil.
10656 Then the code detection will ignore any escape sequences, and no text is
10657 detected as encoded in some ISO-2022 encoding.  The result is that all
10658 escape sequences become visible in a buffer.
10659
10660 The default value is nil, and it is strongly recommended not to change
10661 it.  That is because many Emacs Lisp source files that contain
10662 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10663 in Emacs's distribution, and they won't be decoded correctly on
10664 reading if you suppress escape sequence detection.
10665
10666 The other way to read escape sequences in a file without decoding is
10667 to explicitly specify some coding system that doesn't use ISO-2022
10668 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10669   inhibit_iso_escape_detection = 0;
10670
10671   DEFVAR_BOOL ("inhibit-null-byte-detection",
10672                inhibit_null_byte_detection,
10673                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10674 By default, Emacs treats it as binary data, and does not attempt to
10675 decode it.  The effect is as if you specified `no-conversion' for
10676 reading that text.
10677
10678 Set this to non-nil when a regular text happens to include null bytes.
10679 Examples are Index nodes of Info files and null-byte delimited output
10680 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10681 decode text as usual.  */);
10682   inhibit_null_byte_detection = 0;
10683
10684   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10685                doc: /* Char table for translating self-inserting characters.
10686 This is applied to the result of input methods, not their input.
10687 See also `keyboard-translate-table'.
10688
10689 Use of this variable for character code unification was rendered
10690 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10691 internal character representation.  */);
10692     Vtranslation_table_for_input = Qnil;
10693
10694   {
10695     Lisp_Object args[coding_arg_max];
10696     Lisp_Object plist[16];
10697     int i;
10698
10699     for (i = 0; i < coding_arg_max; i++)
10700       args[i] = Qnil;
10701
10702     plist[0] = intern_c_string (":name");
10703     plist[1] = args[coding_arg_name] = Qno_conversion;
10704     plist[2] = intern_c_string (":mnemonic");
10705     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10706     plist[4] = intern_c_string (":coding-type");
10707     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10708     plist[6] = intern_c_string (":ascii-compatible-p");
10709     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10710     plist[8] = intern_c_string (":default-char");
10711     plist[9] = args[coding_arg_default_char] = make_number (0);
10712     plist[10] = intern_c_string (":for-unibyte");
10713     plist[11] = args[coding_arg_for_unibyte] = Qt;
10714     plist[12] = intern_c_string (":docstring");
10715     plist[13] = make_pure_c_string ("Do no conversion.\n\
10716 \n\
10717 When you visit a file with this coding, the file is read into a\n\
10718 unibyte buffer as is, thus each byte of a file is treated as a\n\
10719 character.");
10720     plist[14] = intern_c_string (":eol-type");
10721     plist[15] = args[coding_arg_eol_type] = Qunix;
10722     args[coding_arg_plist] = Flist (16, plist);
10723     Fdefine_coding_system_internal (coding_arg_max, args);
10724
10725     plist[1] = args[coding_arg_name] = Qundecided;
10726     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10727     plist[5] = args[coding_arg_coding_type] = Qundecided;
10728     /* This is already set.
10729        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10730     plist[8] = intern_c_string (":charset-list");
10731     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10732     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10733     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10734     plist[15] = args[coding_arg_eol_type] = Qnil;
10735     args[coding_arg_plist] = Flist (16, plist);
10736     Fdefine_coding_system_internal (coding_arg_max, args);
10737   }
10738
10739   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10740
10741   {
10742     int i;
10743
10744     for (i = 0; i < coding_category_max; i++)
10745       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10746   }
10747 #if defined (DOS_NT)
10748   system_eol_type = Qdos;
10749 #else
10750   system_eol_type = Qunix;
10751 #endif
10752   staticpro (&system_eol_type);
10753 }
10754
10755 char *
10756 emacs_strerror (int error_number)
10757 {
10758   char *str;
10759
10760   synchronize_system_messages_locale ();
10761   str = strerror (error_number);
10762
10763   if (! NILP (Vlocale_coding_system))
10764     {
10765       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10766                                                       Vlocale_coding_system,
10767                                                       0);
10768       str = SSDATA (dec);
10769     }
10770
10771   return str;
10772 }
10773
10774 #endif /* emacs */