src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2011 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
  122   How a text's end-of-line is encoded depends on the operating system.
  123   For instance, Unix's format is just one byte of LF (line-feed) code,
  124   whereas DOS's format is a two-byte sequence of `carriage-return' and
  125   `line-feed' codes.  MacOS's format is usually one byte of
  126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   int multibytep = coding->src_multibyte;
 162   EMACS_INT consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
  190   These functions decode a byte sequence specified as a source by
  191   CODING.  The resulting multibyte text goes to a place pointed to by
  192   CODING->charbuf, the length of which should not exceed
  193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   int multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
  254   DST_BYTES zero means that the source area and the destination area
  255   overlap, which means that we can produce encoded text only until it
  256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   int multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   EMACS_INT produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288 #include <setjmp.h>
 289
 290 #include "lisp.h"
 291 #include "buffer.h"
 292 #include "character.h"
 293 #include "charset.h"
 294 #include "ccl.h"
 295 #include "composite.h"
 296 #include "coding.h"
 297 #include "window.h"
 298 #include "frame.h"
 299 #include "termhooks.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302
 303 static Lisp_Object Qcoding_system, Qeol_type;
 304 static Lisp_Object Qcoding_aliases;
 305 Lisp_Object Qunix, Qdos;
 306 Lisp_Object Qbuffer_file_coding_system;
 307 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 static Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qutf_8;
 311 static Lisp_Object Qiso_2022;
 312 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 313 static Lisp_Object Qbig, Qlittle;
 314 static Lisp_Object Qcoding_system_history;
 315 static Lisp_Object Qvalid_codes;
 316 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 317 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 318 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 319 static Lisp_Object QCascii_compatible_p;
 320
 321 Lisp_Object Qcall_process, Qcall_process_region;
 322 Lisp_Object Qstart_process, Qopen_network_stream;
 323 static Lisp_Object Qtarget_idx;
 324
 325 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 326 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 327
 328 /* If a symbol has this property, evaluate the value to define the
 329    symbol as a coding system.  */
 330 static Lisp_Object Qcoding_system_define_form;
 331
 332 /* Format of end-of-line decided by system.  This is Qunix on
 333    Unix and Mac, Qdos on DOS/Windows.
 334    This has an effect only for external encoding (i.e. for output to
 335    file and process), not for in-buffer or Lisp string encoding.  */
 336 static Lisp_Object system_eol_type;
 337
 338 #ifdef emacs
 339
 340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 341
 342 /* Coding system emacs-mule and raw-text are for converting only
 343    end-of-line format.  */
 344 Lisp_Object Qemacs_mule, Qraw_text;
 345 Lisp_Object Qutf_8_emacs;
 346
  347 /* Coding-systems are handed between Emacs Lisp programs and C internal
  348    routines by the following variable.  */
 349 /* Coding system to be used to encode text for terminal display when
 350    terminal coding system is nil.  */
 351 struct coding_system safe_terminal_coding;
 352
 353 #endif /* emacs */
 354
 355 Lisp_Object Qtranslation_table;
 356 Lisp_Object Qtranslation_table_id;
 357 static Lisp_Object Qtranslation_table_for_decode;
 358 static Lisp_Object Qtranslation_table_for_encode;
 359
 360 /* Two special coding systems.  */
 361 static Lisp_Object Vsjis_coding_system;
 362 static Lisp_Object Vbig5_coding_system;
 363
 364 /* ISO2022 section */
 365
 366 #define CODING_ISO_INITIAL(coding, reg)                 \
 367   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 368                      coding_attr_iso_initial),          \
 369                reg)))
 370
 371
 372 #define CODING_ISO_REQUEST(coding, charset_id)          \
 373   (((charset_id) <= (coding)->max_charset_id            \
 374     ? ((coding)->safe_charsets[charset_id] != 255       \
 375        ? (coding)->safe_charsets[charset_id]            \
 376        : -1)                                            \
 377     : -1))
 378
 379
 380 #define CODING_ISO_FLAGS(coding)        \
 381   ((coding)->spec.iso_2022.flags)
 382 #define CODING_ISO_DESIGNATION(coding, reg)     \
 383   ((coding)->spec.iso_2022.current_designation[reg])
 384 #define CODING_ISO_INVOCATION(coding, plane)    \
 385   ((coding)->spec.iso_2022.current_invocation[plane])
 386 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 387   ((coding)->spec.iso_2022.single_shifting)
 388 #define CODING_ISO_BOL(coding)  \
 389   ((coding)->spec.iso_2022.bol)
 390 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 391   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 392 #define CODING_ISO_CMP_STATUS(coding)   \
 393   (&(coding)->spec.iso_2022.cmp_status)
 394 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 395   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 396 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 397   ((coding)->spec.iso_2022.embedded_utf_8)
 398
 399 /* Control characters of ISO2022.  */
 400                         /* code */      /* function */
 401 #define ISO_CODE_SO     0x0E            /* shift-out */
 402 #define ISO_CODE_SI     0x0F            /* shift-in */
 403 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 404 #define ISO_CODE_ESC    0x1B            /* escape */
 405 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 406 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 407 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 408
  409 /* Every 1-byte code of ISO2022 is classified into one of the
  410    following classes.  */
  411 enum iso_code_class_type
  412   {
  413     ISO_control_0,              /* Control codes in the range
  414                                    0x00..0x1F and 0x7F, except for the
  415                                    following 4 codes.  */
  416     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  417     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  418     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  419     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  420     ISO_control_1,              /* Control codes in the range
  421                                    0x80..0x9F, except for the
  422                                    following 3 codes.  */
  423     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  424     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  425     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  426     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  427     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  428     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  429     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  430   };
 431
  432 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
  433     `iso-flags' attribute of an iso2022 coding system.  */
 434
 435 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 436    instead of the correct short-form sequence (e.g. ESC $ A).  */
 437 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 438
 439 /* If set, reset graphic planes and registers at end-of-line to the
 440    initial state.  */
 441 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 442
 443 /* If set, reset graphic planes and registers before any control
 444    characters to the initial state.  */
 445 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 446
 447 /* If set, encode by 7-bit environment.  */
 448 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 449
 450 /* If set, use locking-shift function.  */
 451 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 452
  453 /* If set, use single-shift function.  Overrides
  454    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 455 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 456
 457 /* If set, use designation escape sequence.  */
 458 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 459
 460 /* If set, produce revision number sequence.  */
 461 #define CODING_ISO_FLAG_REVISION        0x0080
 462
 463 /* If set, produce ISO6429's direction specifying sequence.  */
 464 #define CODING_ISO_FLAG_DIRECTION       0x0100
 465
 466 /* If set, assume designation states are reset at beginning of line on
 467    output.  */
 468 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 469
 470 /* If set, designation sequence should be placed at beginning of line
 471    on output.  */
 472 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 473
 474 /* If set, do not encode unsafe characters on output.  */
 475 #define CODING_ISO_FLAG_SAFE            0x0800
 476
 477 /* If set, extra latin codes (128..159) are accepted as a valid code
 478    on input.  */
 479 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 480
 481 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 482
 483 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 484
 485 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 486
 487 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 488
 489 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 490
 491 /* A character to be produced on output if encoding of the original
 492    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 493 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 494
 495 /* UTF-8 section */
 496 #define CODING_UTF_8_BOM(coding)        \
 497   ((coding)->spec.utf_8_bom)
 498
 499 /* UTF-16 section */
 500 #define CODING_UTF_16_BOM(coding)       \
 501   ((coding)->spec.utf_16.bom)
 502
 503 #define CODING_UTF_16_ENDIAN(coding)    \
 504   ((coding)->spec.utf_16.endian)
 505
 506 #define CODING_UTF_16_SURROGATE(coding) \
 507   ((coding)->spec.utf_16.surrogate)
 508
 509
 510 /* CCL section */
 511 #define CODING_CCL_DECODER(coding)      \
 512   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 513 #define CODING_CCL_ENCODER(coding)      \
 514   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 515 #define CODING_CCL_VALIDS(coding)                                          \
 516   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 517
  518 /* Index for each coding category in `coding_categories'.  Each
  519    CATEGORY_MASK_XXX flag below is 1 shifted left by one of these.  */
  520 enum coding_category
  521   {
  522     coding_category_iso_7,
  523     coding_category_iso_7_tight,
  524     coding_category_iso_8_1,
  525     coding_category_iso_8_2,
  526     coding_category_iso_7_else,
  527     coding_category_iso_8_else,
  528     coding_category_utf_8_auto,
  529     coding_category_utf_8_nosig,
  530     coding_category_utf_8_sig,
  531     coding_category_utf_16_auto,
  532     coding_category_utf_16_be,
  533     coding_category_utf_16_le,
  534     coding_category_utf_16_be_nosig,
  535     coding_category_utf_16_le_nosig,
  536     coding_category_charset,
  537     coding_category_sjis,
  538     coding_category_big5,
  539     coding_category_ccl,
  540     coding_category_emacs_mule,
  541     /* All above are targets of code detection.  */
  542     coding_category_raw_text,
  543     coding_category_undecided,
  544     coding_category_max /* Dimension of coding_priorities and coding_categories.  */
  545   };
 546
 547 /* Definitions of flag bits used in detect_coding_XXXX.  */
 548 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 549 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 550 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 551 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 552 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 553 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 554 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 555 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 556 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 557 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 558 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 559 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 560 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 561 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 562 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 563 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 564 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 565 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 566 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 567 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 568
  569 /* This value is returned if detect_coding_mask () finds nothing other
  570    than ASCII characters.  */
 571 #define CATEGORY_MASK_ANY               \
 572   (CATEGORY_MASK_ISO_7                  \
 573    | CATEGORY_MASK_ISO_7_TIGHT          \
 574    | CATEGORY_MASK_ISO_8_1              \
 575    | CATEGORY_MASK_ISO_8_2              \
 576    | CATEGORY_MASK_ISO_7_ELSE           \
 577    | CATEGORY_MASK_ISO_8_ELSE           \
 578    | CATEGORY_MASK_UTF_8_AUTO           \
 579    | CATEGORY_MASK_UTF_8_NOSIG          \
 580    | CATEGORY_MASK_UTF_8_SIG            \
 581    | CATEGORY_MASK_UTF_16_AUTO          \
 582    | CATEGORY_MASK_UTF_16_BE            \
 583    | CATEGORY_MASK_UTF_16_LE            \
 584    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 585    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 586    | CATEGORY_MASK_CHARSET              \
 587    | CATEGORY_MASK_SJIS                 \
 588    | CATEGORY_MASK_BIG5                 \
 589    | CATEGORY_MASK_CCL                  \
 590    | CATEGORY_MASK_EMACS_MULE)
 591
 592
 593 #define CATEGORY_MASK_ISO_7BIT \
 594   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 595
 596 #define CATEGORY_MASK_ISO_8BIT \
 597   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 598
 599 #define CATEGORY_MASK_ISO_ELSE \
 600   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 601
 602 #define CATEGORY_MASK_ISO_ESCAPE        \
 603   (CATEGORY_MASK_ISO_7                  \
 604    | CATEGORY_MASK_ISO_7_TIGHT          \
 605    | CATEGORY_MASK_ISO_7_ELSE           \
 606    | CATEGORY_MASK_ISO_8_ELSE)
 607
 608 #define CATEGORY_MASK_ISO       \
 609   (  CATEGORY_MASK_ISO_7BIT     \
 610      | CATEGORY_MASK_ISO_8BIT   \
 611      | CATEGORY_MASK_ISO_ELSE)
 612
 613 #define CATEGORY_MASK_UTF_16            \
 614   (CATEGORY_MASK_UTF_16_AUTO            \
 615    | CATEGORY_MASK_UTF_16_BE            \
 616    | CATEGORY_MASK_UTF_16_LE            \
 617    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 618    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 619
 620 #define CATEGORY_MASK_UTF_8     \
 621   (CATEGORY_MASK_UTF_8_AUTO     \
 622    | CATEGORY_MASK_UTF_8_NOSIG  \
 623    | CATEGORY_MASK_UTF_8_SIG)
 624
 625 /* Table of coding categories (Lisp symbols).  This variable is for
 626    internal use only.  */
 627 static Lisp_Object Vcoding_category_table;
 628
 629 /* Table of coding-categories ordered by priority.  */
 630 static enum coding_category coding_priorities[coding_category_max];
 631
 632 /* Nth element is a coding context for the coding system bound to the
 633    Nth coding category.  */
 634 static struct coding_system coding_categories[coding_category_max];
 635
 636 /*** Commonly used macros and functions ***/
 637
  638 #ifndef min
  639 #define min(a, b) ((a) < (b) ? (a) : (b)) /* Evaluates A or B twice.  */
  640 #endif
  641 #ifndef max
  642 #define max(a, b) ((a) > (b) ? (a) : (b)) /* Evaluates A or B twice.  */
  643 #endif
  644 /* Set ATTRS to CODING_ID_ATTRS of CODING's id, then CHARSET_LIST to CODING_ATTR_CHARSET_LIST of ATTRS.  */
  645 #define CODING_GET_INFO(coding, attrs, charset_list)    \
  646   do {                                                  \
  647     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
  648     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  649   } while (0)
 650
 651
  652 /* Safely get one byte from the source text pointed by SRC which ends
  653    at SRC_END, set C to it, and increment CONSUMED_CHARS.  If the
  654    source is exhausted, jump to `no_more_source' (after recording
  655    CODING_RESULT_INSUFFICIENT_SRC if SRC_BASE < SRC).  If multibytep is
  656    nonzero and a multibyte character is found at SRC, set C to the
  657    negative value of the character code and record
  658    CODING_RESULT_INVALID_SRC.  The caller should declare and set:
  659         coding, src, src_base, src_end, multibytep, consumed_chars */
  660 #define ONE_MORE_BYTE(c)                                \
  661   do {                                                  \
  662     if (src == src_end)                                 \
  663       {                                                 \
  664         if (src_base < src)                             \
  665           record_conversion_result                      \
  666             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  667         goto no_more_source;                            \
  668       }                                                 \
  669     c = *src++;                                         \
  670     if (multibytep && (c & 0x80))                       \
  671       {                                                 \
  672         if ((c & 0xFE) == 0xC0)                         \
  673           c = ((c & 1) << 6) | *src++;                  \
  674         else                                            \
  675           {                                             \
  676             src--;                                      \
  677             c = - string_char (src, &src, NULL);        \
  678             record_conversion_result                    \
  679               (coding, CODING_RESULT_INVALID_SRC);      \
  680           }                                             \
  681       }                                                 \
  682     consumed_chars++;                                   \
  683   } while (0)
 684
  685 /* Safely get two bytes from the source text pointed by SRC which ends
  686    at SRC_END, and set C1 and C2 to those bytes while skipping any
  687    multibyte characters found in front of C1.  If there are not
  688    enough bytes in the source, jump to `no_more_source' (no conversion
  689    result is recorded).  If multibytep is nonzero and a multibyte
  690    character is found for C2, set C2 to -1.  The caller should declare
  691    and set these variables appropriately in advance:
  692         src, src_end, multibytep
  693    It is intended that this macro is used in detect_coding_utf_16.  */
  694
  695 #define TWO_MORE_BYTES(c1, c2)                          \
  696   do {                                                  \
  697     do {                                                \
  698       if (src == src_end)                               \
  699         goto no_more_source;                            \
  700       c1 = *src++;                                      \
  701       if (multibytep && (c1 & 0x80))                    \
  702         {                                               \
  703           if ((c1 & 0xFE) == 0xC0)                      \
  704             c1 = ((c1 & 1) << 6) | *src++;              \
  705           else                                          \
  706             {                                           \
  707               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
  708               c1 = -1;                                  \
  709             }                                           \
  710         }                                               \
  711     } while (c1 < 0);                                   \
  712     if (src == src_end)                                 \
  713       goto no_more_source;                              \
  714     c2 = *src++;                                        \
  715     if (multibytep && (c2 & 0x80))                      \
  716       {                                                 \
  717         if ((c2 & 0xFE) == 0xC0)                        \
  718           c2 = ((c2 & 1) << 6) | *src++;                \
  719         else                                            \
  720           c2 = -1;                                      \
  721       }                                                 \
  722   } while (0)
 723
 724
  725 /* Store a byte C in the place pointed by DST and increment DST to the
  726    next free point, and increment PRODUCED_CHARS.  The caller should
  727    assure that C is 0..127, and declare and set the variables `dst'
  728    and `produced_chars' appropriately in advance.
  729 */
  730
  731
  732 #define EMIT_ONE_ASCII_BYTE(c)  \
  733   do {                          \
  734     produced_chars++;           \
  735     *dst++ = (c);               \
  736   } while (0)
 737
 738
 739 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 740
 741 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 742   do {                                  \
 743     produced_chars += 2;                \
 744     *dst++ = (c1), *dst++ = (c2);       \
 745   } while (0)
 746
 747
  748 /* Store a byte C in the place pointed by DST and increment DST to the
  749    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
  750    nonzero, store in an appropriate multibyte form.  The caller should
  751    declare and set the variables `dst', `multibytep' and
  752    `produced_chars' appropriately in advance.  */
  753
  754 #define EMIT_ONE_BYTE(c)                \
  755   do {                                  \
  756     produced_chars++;                   \
  757     if (multibytep)                     \
  758       {                                 \
  759         unsigned ch = (c);              \
  760         if (ch >= 0x80)                 \
  761           ch = BYTE8_TO_CHAR (ch);      \
  762         CHAR_STRING_ADVANCE (ch, dst);  \
  763       }                                 \
  764     else                                \
  765       *dst++ = (c);                     \
  766   } while (0)
 767
 768
 /* Two-byte variant of EMIT_ONE_BYTE: emit C1 and then C2 at DST and
    count two produced characters.  In multibyte mode each byte is
    converted separately (BYTE8_TO_CHAR for values of 0x80 or more,
    then CHAR_STRING_ADVANCE).  */

#define EMIT_TWO_BYTES(c1, c2)          \
  do {                                  \
    produced_chars += 2;                \
    if (! multibytep)                   \
      {                                 \
        *dst++ = (c1);                  \
        *dst++ = (c2);                  \
      }                                 \
    else                                \
      {                                 \
        unsigned ch = (c1);             \
                                        \
        if (ch >= 0x80)                 \
          ch = BYTE8_TO_CHAR (ch);      \
        CHAR_STRING_ADVANCE (ch, dst);  \
        ch = (c2);                      \
        if (ch >= 0x80)                 \
          ch = BYTE8_TO_CHAR (ch);      \
        CHAR_STRING_ADVANCE (ch, dst);  \
      }                                 \
  } while (0)
 793
 794
 /* Emit the three bytes C1, C2 and C3 in that order, with the same
    unibyte/multibyte handling as EMIT_ONE_BYTE.  */

#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_TWO_BYTES (c1, c2);            \
    EMIT_ONE_BYTE (c3);                 \
  } while (0)
 800
 801
 /* Emit the four bytes C1, C2, C3 and C4 in that order, with the same
    unibyte/multibyte handling as EMIT_ONE_BYTE.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_ONE_BYTE (c1);                         \
    EMIT_THREE_BYTES (c2, c3, c4);              \
  } while (0)
 807
 808
 809 /* Prototypes for static functions.  */
 810 static void record_conversion_result (struct coding_system *coding,
 811                                       enum coding_result_code result);
 812 static int detect_coding_utf_8 (struct coding_system *,
 813                                 struct coding_detection_info *info);
 814 static void decode_coding_utf_8 (struct coding_system *);
 815 static int encode_coding_utf_8 (struct coding_system *);
 816
 817 static int detect_coding_utf_16 (struct coding_system *,
 818                                  struct coding_detection_info *info);
 819 static void decode_coding_utf_16 (struct coding_system *);
 820 static int encode_coding_utf_16 (struct coding_system *);
 821
 822 static int detect_coding_iso_2022 (struct coding_system *,
 823                                    struct coding_detection_info *info);
 824 static void decode_coding_iso_2022 (struct coding_system *);
 825 static int encode_coding_iso_2022 (struct coding_system *);
 826
 827 static int detect_coding_emacs_mule (struct coding_system *,
 828                                      struct coding_detection_info *info);
 829 static void decode_coding_emacs_mule (struct coding_system *);
 830 static int encode_coding_emacs_mule (struct coding_system *);
 831
 832 static int detect_coding_sjis (struct coding_system *,
 833                                struct coding_detection_info *info);
 834 static void decode_coding_sjis (struct coding_system *);
 835 static int encode_coding_sjis (struct coding_system *);
 836
 837 static int detect_coding_big5 (struct coding_system *,
 838                                struct coding_detection_info *info);
 839 static void decode_coding_big5 (struct coding_system *);
 840 static int encode_coding_big5 (struct coding_system *);
 841
 842 static int detect_coding_ccl (struct coding_system *,
 843                               struct coding_detection_info *info);
 844 static void decode_coding_ccl (struct coding_system *);
 845 static int encode_coding_ccl (struct coding_system *);
 846
 847 static void decode_coding_raw_text (struct coding_system *);
 848 static int encode_coding_raw_text (struct coding_system *);
 849
 850 static void coding_set_source (struct coding_system *);
 851 static void coding_set_destination (struct coding_system *);
 852 static void coding_alloc_by_realloc (struct coding_system *, EMACS_INT);
 853 static void coding_alloc_by_making_gap (struct coding_system *,
 854                                         EMACS_INT, EMACS_INT);
 855 static unsigned char *alloc_destination (struct coding_system *,
 856                                          EMACS_INT, unsigned char *);
 857 static void setup_iso_safe_charsets (Lisp_Object);
 858 static unsigned char *encode_designation_at_bol (struct coding_system *,
 859                                                  int *, unsigned char *);
 860 static int detect_eol (const unsigned char *,
 861                        EMACS_INT, enum coding_category);
 862 static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
 863 static void decode_eol (struct coding_system *);
 864 static Lisp_Object get_translation_table (Lisp_Object, int, int *);
 865 static Lisp_Object get_translation (Lisp_Object, int *, int *);
 866 static int produce_chars (struct coding_system *, Lisp_Object, int);
 867 static inline void produce_charset (struct coding_system *, int *,
 868                                     EMACS_INT);
 869 static void produce_annotation (struct coding_system *, EMACS_INT);
 870 static int decode_coding (struct coding_system *);
 871 static inline int *handle_composition_annotation (EMACS_INT, EMACS_INT,
 872                                                   struct coding_system *,
 873                                                   int *, EMACS_INT *);
 874 static inline int *handle_charset_annotation (EMACS_INT, EMACS_INT,
 875                                               struct coding_system *,
 876                                               int *, EMACS_INT *);
 877 static void consume_chars (struct coding_system *, Lisp_Object, int);
 878 static int encode_coding (struct coding_system *);
 879 static Lisp_Object make_conversion_work_buffer (int);
 880 static Lisp_Object code_conversion_restore (Lisp_Object);
 881 static inline int char_encodable_p (int, Lisp_Object);
 882 static Lisp_Object make_subsidiaries (Lisp_Object);
 883
 884 static void
 885 record_conversion_result (struct coding_system *coding,
 886                           enum coding_result_code result)
 887 {
 888   coding->result = result;
 889   switch (result)
 890     {
 891     case CODING_RESULT_INSUFFICIENT_SRC:
 892       Vlast_code_conversion_error = Qinsufficient_source;
 893       break;
 894     case CODING_RESULT_INCONSISTENT_EOL:
 895       Vlast_code_conversion_error = Qinconsistent_eol;
 896       break;
 897     case CODING_RESULT_INVALID_SRC:
 898       Vlast_code_conversion_error = Qinvalid_source;
 899       break;
 900     case CODING_RESULT_INTERRUPT:
 901       Vlast_code_conversion_error = Qinterrupted;
 902       break;
 903     case CODING_RESULT_INSUFFICIENT_MEM:
 904       Vlast_code_conversion_error = Qinsufficient_memory;
 905       break;
 906     case CODING_RESULT_INSUFFICIENT_DST:
 907       /* Don't record this error in Vlast_code_conversion_error
 908          because it happens just temporarily and is resolved when the
 909          whole conversion is finished.  */
 910       break;
 911     case CODING_RESULT_SUCCESS:
 912       break;
 913     default:
 914       Vlast_code_conversion_error = intern ("Unknown error");
 915     }
 916 }
 917
 /* Decode CODE of CHARSET into the lvalue C, keeping pointers into
    buffer text valid.  DECODE_CHAR may load a charset map; loading a
    charset map allocates large structures and could therefore cause
    relocation of buffers.  When `charset_map_loaded' reports that a
    map was loaded, CODING->source is recomputed with
    coding_set_source, and SRC, SRC_BASE and SRC_END are shifted by
    the distance the source moved.

    CODING must be a pointer to the coding system; SRC, SRC_BASE,
    SRC_END and C must be modifiable lvalues.  Every use of a macro
    parameter is parenthesized so that arguments which are not plain
    identifiers expand correctly.  */
#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    charset_map_loaded = 0;                                                  \
    (c) = DECODE_CHAR (charset, code);                                       \
    if (charset_map_loaded)                                                  \
      {                                                                      \
        /* Remember the old base so the displacement can be computed.  */   \
        const unsigned char *orig = (coding)->source;                        \
        EMACS_INT offset;                                                    \
                                                                             \
        coding_set_source (coding);                                          \
        offset = (coding)->source - orig;                                    \
        (src) += offset;                                                     \
        (src_base) += offset;                                                \
        (src_end) += offset;                                                 \
      }                                                                      \
  } while (0)
 938
 939
 /* Make sure that more than BYTES bytes of room remain between dst
    and dst_end.  If they do not (dst + BYTES reaches or passes
    dst_end), enlarge coding->destination through alloc_destination,
    asking for BYTES plus one byte per element still pending in
    charbuf, and then refresh dst and dst_end.  We don't have to take
    care of coding->source, which may be relocated by this; that is
    handled by calling coding_set_source in encode_coding.  */

#define ASSURE_DESTINATION(bytes)                                       \
  do {                                                                  \
    if (! (dst + (bytes) < dst_end))                                    \
      {                                                                 \
        dst = alloc_destination (coding,                                \
                                 charbuf_end - charbuf + (bytes),       \
                                 dst);                                  \
        dst_end = coding->destination + coding->dst_bytes;              \
      }                                                                 \
  } while (0)
 955
 956
 /* Store the multibyte form of the character C at P, and advance P to
    the end of that form.  This is like CHAR_STRING_ADVANCE but it
    never calls MAYBE_UNIFY_CHAR.

    The form is one to five bytes long, chosen by comparing C with
    MAX_1_BYTE_CHAR ... MAX_5_BYTE_CHAR; anything above
    MAX_5_BYTE_CHAR is written by BYTE8_STRING from C - 0x3FFF80.

    C is evaluated several times, so it must have no side effects.
    Every use of C is parenthesized so that an expression argument
    (for instance `a | b') is encoded correctly in all branches.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      {                                                 \
        *(p)++ = (0xC0 | ((c) >> 6));                   \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
      }                                                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      {                                                 \
        *(p)++ = (0xE0 | ((c) >> 12));                  \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F));          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
      }                                                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      {                                                 \
        *(p)++ = (0xF0 | ((c) >> 18));                  \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F));         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F));          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
      }                                                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      {                                                 \
        *(p)++ = 0xF8;                                  \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F));         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F));         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F));          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
      }                                                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);          \
  } while (0)
 986
 987
 /* Yield the character code of the character whose multibyte form
    starts at P, and leave P just past that form.  This does the job
    of STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR.

    The length of the form is chosen from the first byte: bit 0x80
    clear means one byte; otherwise bit 0x20 clear means two bytes,
    bit 0x10 clear means three, bit 0x08 clear means four, and
    anything else five.  A two-byte form whose first byte is below
    0xC2 has 0x3FFF80 ORed into its value.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
  (((p)[0] & 0x80) == 0                                         \
   ? *(p)++                                                     \
   : ((p)[0] & 0x20) == 0                                       \
   ? ((p) += 2,                                                 \
      (((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)        \
       | (((p)[-2] & 0x1F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p)[0] & 0x10) == 0                                       \
   ? ((p) += 3,                                                 \
      (((p)[-1] & 0x3F)                                         \
       | (((p)[-2] & 0x3F) << 6)                                \
       | (((p)[-3] & 0x0F) << 12)))                             \
   : ((p)[0] & 0x08) == 0                                       \
   ? ((p) += 4,                                                 \
      (((p)[-1] & 0x3F)                                         \
       | (((p)[-2] & 0x3F) << 6)                                \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-4] & 0xF) << 18)))                              \
   : ((p) += 5,                                                 \
      (((p)[-1] & 0x3F)                                         \
       | (((p)[-2] & 0x3F) << 6)                                \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-4] & 0x3F) << 18))))
1016
1017
1018 static void
1019 coding_set_source (struct coding_system *coding)
1020 {
1021   if (BUFFERP (coding->src_object))
1022     {
1023       struct buffer *buf = XBUFFER (coding->src_object);
1024
1025       if (coding->src_pos < 0)
1026         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1027       else
1028         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1029     }
1030   else if (STRINGP (coding->src_object))
1031     {
1032       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1033     }
1034   else
1035     {
1036       /* Otherwise, the source is C string and is never relocated
1037          automatically.  Thus we don't have to update anything.  */
1038     }
1039 }
1040
1041 static void
1042 coding_set_destination (struct coding_system *coding)
1043 {
1044   if (BUFFERP (coding->dst_object))
1045     {
1046       if (coding->src_pos < 0)
1047         {
1048           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1049           coding->dst_bytes = (GAP_END_ADDR
1050                                - (coding->src_bytes - coding->consumed)
1051                                - coding->destination);
1052         }
1053       else
1054         {
1055           /* We are sure that coding->dst_pos_byte is before the gap
1056              of the buffer. */
1057           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1058                                  + coding->dst_pos_byte - BEG_BYTE);
1059           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1060                                - coding->destination);
1061         }
1062     }
1063   else
1064     {
1065       /* Otherwise, the destination is C string and is never relocated
1066          automatically.  Thus we don't have to update anything.  */
1067     }
1068 }
1069
1070
1071 static void
1072 coding_alloc_by_realloc (struct coding_system *coding, EMACS_INT bytes)
1073 {
1074   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1075     string_overflow ();
1076   coding->destination = (unsigned char *) xrealloc (coding->destination,
1077                                                     coding->dst_bytes + bytes);
1078   coding->dst_bytes += bytes;
1079 }
1080
1081 static void
1082 coding_alloc_by_making_gap (struct coding_system *coding,
1083                             EMACS_INT gap_head_used, EMACS_INT bytes)
1084 {
1085   if (EQ (coding->src_object, coding->dst_object))
1086     {
1087       /* The gap may contain the produced data at the head and not-yet
1088          consumed data at the tail.  To preserve those data, we at
1089          first make the gap size to zero, then increase the gap
1090          size.  */
1091       EMACS_INT add = GAP_SIZE;
1092
1093       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1094       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1095       make_gap (bytes);
1096       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1097       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1098     }
1099   else
1100     {
1101       Lisp_Object this_buffer;
1102
1103       this_buffer = Fcurrent_buffer ();
1104       set_buffer_internal (XBUFFER (coding->dst_object));
1105       make_gap (bytes);
1106       set_buffer_internal (XBUFFER (this_buffer));
1107     }
1108 }
1109
1110
1111 static unsigned char *
1112 alloc_destination (struct coding_system *coding, EMACS_INT nbytes,
1113                    unsigned char *dst)
1114 {
1115   EMACS_INT offset = dst - coding->destination;
1116
1117   if (BUFFERP (coding->dst_object))
1118     {
1119       struct buffer *buf = XBUFFER (coding->dst_object);
1120
1121       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1122     }
1123   else
1124     coding_alloc_by_realloc (coding, nbytes);
1125   coding_set_destination (coding);
1126   dst = coding->destination + offset;
1127   return dst;
1128 }
1129
1130 /** Macros for annotations.  */
1131
1132 /* An annotation data is stored in the array coding->charbuf in this
1133    format:
1134      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1135    LENGTH is the number of elements in the annotation.
1136    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1137    NCHARS is the number of characters in the text annotated.
1138
1139    The format of the following elements depends on ANNOTATION_MASK.
1140
1141    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1142    follow:
1143      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1144
1145    NBYTES is the number of bytes specified in the header part of
1146    old-style emacs-mule encoding, or 0 for the other kind of
1147    composition.
1148
1149    METHOD is one of enum composition_method.
1150
1151    Optional COMPOSITION-COMPONENTS are characters and composition
1152    rules.
1153
1154    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1155    follows.
1156
1157    If ANNOTATION_MASK is 0, this annotation is just a placeholder to
1158    recover from an invalid annotation, and should be skipped by
1159    produce_annotation.  */
1160
1161 /* Maximum length of the header of annotation data.  */
1162 #define MAX_ANNOTATION_LENGTH 5
1163
/* Append the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advancing BUF past it, and set coding->annotated to 1.  The
   macro relies on a variable `coding' in the caller's scope.

   The definition deliberately does not end in a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the unbraced body of an `if' that has an `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1171
/* Append a composition annotation header at BUF, advancing BUF past
   it: the generic header written by ADD_ANNOTATION_DATA (length 5,
   mask CODING_ANNOTATE_COMPOSITION_MASK, NCHARS) followed by NBYTES
   and METHOD.  BUF is parenthesized here exactly as in
   ADD_ANNOTATION_DATA, so that all five stores advance the same
   object even when BUF is not a plain identifier.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1178
1179
/* Append a charset annotation at BUF, advancing BUF past it: the
   generic header written by ADD_ANNOTATION_DATA (length 4, mask
   CODING_ANNOTATE_CHARSET_MASK, NCHARS) followed by the charset ID.
   BUF is parenthesized here exactly as in ADD_ANNOTATION_DATA, so
   that all four stores advance the same object even when BUF is not
   a plain identifier.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1185
1186 \f
1187 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1188
1189
1190
1191 \f
1192 /*** 3. UTF-8 ***/
1193
1194 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1195    Check if a text is encoded in UTF-8.  If it is, return 1, else
1196    return 0.  */
1197
/* Tests on a single octet C of a UTF-8 byte sequence, followed by the
   three octets that the detector and decoder compare against when
   looking for a leading UTF-8 signature (BOM).  */
1198 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)            /* value below 0x80 */
1199 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)  /* bits 10xxxxxx */
1200 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)  /* bits 110xxxxx */
1201 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)  /* bits 1110xxxx */
1202 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)  /* bits 11110xxx */
1203 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)  /* bits 111110xx */
1204
1205 #define UTF_8_BOM_1 0xEF  /* first octet of the signature */
1206 #define UTF_8_BOM_2 0xBB  /* second octet of the signature */
1207 #define UTF_8_BOM_3 0xBF  /* third octet of the signature */
1208
1209 static int
1210 detect_coding_utf_8 (struct coding_system *coding,
1211                      struct coding_detection_info *detect_info)
1212 {
1213   const unsigned char *src = coding->source, *src_base;
1214   const unsigned char *src_end = coding->source + coding->src_bytes;
1215   int multibytep = coding->src_multibyte;
1216   EMACS_INT consumed_chars = 0;
1217   int bom_found = 0;
1218   int found = 0;
1219
1220   detect_info->checked |= CATEGORY_MASK_UTF_8;
1221   /* A coding system of this category is always ASCII compatible.  */
1222   src += coding->head_ascii;
1223
1224   while (1)
1225     {
1226       int c, c1, c2, c3, c4;
1227
1228       src_base = src;
1229       ONE_MORE_BYTE (c);
1230       if (c < 0 || UTF_8_1_OCTET_P (c))
1231         continue;
1232       ONE_MORE_BYTE (c1);
1233       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1234         break;
1235       if (UTF_8_2_OCTET_LEADING_P (c))
1236         {
1237           found = 1;
1238           continue;
1239         }
1240       ONE_MORE_BYTE (c2);
1241       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1242         break;
1243       if (UTF_8_3_OCTET_LEADING_P (c))
1244         {
1245           found = 1;
1246           if (src_base == coding->source
1247               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1248             bom_found = 1;
1249           continue;
1250         }
1251       ONE_MORE_BYTE (c3);
1252       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1253         break;
1254       if (UTF_8_4_OCTET_LEADING_P (c))
1255         {
1256           found = 1;
1257           continue;
1258         }
1259       ONE_MORE_BYTE (c4);
1260       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1261         break;
1262       if (UTF_8_5_OCTET_LEADING_P (c))
1263         {
1264           found = 1;
1265           continue;
1266         }
1267       break;
1268     }
1269   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1270   return 0;
1271
1272  no_more_source:
1273   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1274     {
1275       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1276       return 0;
1277     }
1278   if (bom_found)
1279     {
1280       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1281       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1282     }
1283   else
1284     {
1285       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1286       if (found)
1287         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1288     }
1289   return 1;
1290 }
1291
1292
/* Decode the UTF-8 bytes of CODING's source, from offset
   CODING->consumed up to CODING->src_bytes, into character codes
   appended to CODING->charbuf after its first CODING->charbuf_used
   entries.  Decoding stops when charbuf is full or when control
   reaches the `no_more_source' label (presumably jumped to by
   ONE_MORE_BYTE at the end of the source; that macro is defined
   elsewhere).  A byte that does not start a valid sequence is stored
   on its own -- unchanged if ASCII, through BYTE8_TO_CHAR otherwise --
   and counted in CODING->errors.  On exit CODING->consumed_char,
   CODING->consumed and CODING->charbuf_used are updated.  */
1293 static void
1294 decode_coding_utf_8 (struct coding_system *coding)
1295 {
1296   const unsigned char *src = coding->source + coding->consumed;
1297   const unsigned char *src_end = coding->source + coding->src_bytes;
1298   const unsigned char *src_base;
1299   int *charbuf = coding->charbuf + coding->charbuf_used;
1300   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1301   EMACS_INT consumed_chars = 0, consumed_chars_base = 0;
1302   int multibytep = coding->src_multibyte;
1303   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1304   int eol_dos =
1305     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR (DOS end-of-line only), or -1.  */
1306   int byte_after_cr = -1;
1307
       /* Unless the coding system is known to have no signature, test
          whether the source starts with EF BB BF.  If so, the three
          bytes stay consumed; in every other case SRC is rewound to
          where it was.
          NOTE(review): the rewinds restore SRC only; if ONE_MORE_BYTE
          also advances consumed_chars (its definition is not shown
          here), that count is not restored -- confirm this is
          intended.  */
1308   if (bom != utf_without_bom)
1309     {
1310       int c1, c2, c3;
1311
1312       src_base = src;
1313       ONE_MORE_BYTE (c1);
1314       if (! UTF_8_3_OCTET_LEADING_P (c1))
1315         src = src_base;
1316       else
1317         {
1318           ONE_MORE_BYTE (c2);
1319           if (! UTF_8_EXTRA_OCTET_P (c2))
1320             src = src_base;
1321           else
1322             {
1323               ONE_MORE_BYTE (c3);
1324               if (! UTF_8_EXTRA_OCTET_P (c3))
1325                 src = src_base;
1326               else
1327                 {
1328                   if ((c1 != UTF_8_BOM_1)
1329                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1330                     src = src_base;
1331                   else
1332                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1333                 }
1334             }
1335         }
1336     }
       /* Whatever was found, do not look for a signature again on
          later calls with this coding system.  */
1337   CODING_UTF_8_BOM (coding) = utf_without_bom;
1338
1339   while (1)
1340     {
1341       int c, c1, c2, c3, c4, c5;
1342
           /* Remember where this character starts, so that it can be
              re-read (invalid_code) or left unconsumed (exit paths).  */
1343       src_base = src;
1344       consumed_chars_base = consumed_chars;
1345
           /* Output buffer full.  If a byte was read ahead after a CR,
              step SRC_BASE back by one so that byte is not reported as
              consumed.  */
1346       if (charbuf >= charbuf_end)
1347         {
1348           if (byte_after_cr >= 0)
1349             src_base--;
1350           break;
1351         }
1352
           /* Use the byte read ahead after a CR, if there is one.  */
1353       if (byte_after_cr >= 0)
1354         c1 = byte_after_cr, byte_after_cr = -1;
1355       else
1356         ONE_MORE_BYTE (c1);
1357       if (c1 < 0)
1358         {
               /* A negative value is stored negated (presumably a
                  character ONE_MORE_BYTE already decoded from a
                  multibyte source -- confirm against that macro).  */
1359           c = - c1;
1360         }
1361       else if (UTF_8_1_OCTET_P (c1))
1362         {
               /* With DOS end-of-line, read the byte after a CR right
                  away and keep it for the next iteration.  */
1363           if (eol_dos && c1 == '\r')
1364             ONE_MORE_BYTE (byte_after_cr);
1365           c = c1;
1366         }
1367       else
1368         {
               /* Multi-octet sequence: read one trailing octet per
                  level until the class of the leading octet C1 is
                  matched.  */
1369           ONE_MORE_BYTE (c2);
1370           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1371             goto invalid_code;
1372           if (UTF_8_2_OCTET_LEADING_P (c1))
1373             {
1374               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1375               /* Reject overlong sequences here and below.  Encoders
1376                  producing them are incorrect, they can be misleading,
1377                  and they mess up read/write invariance.  */
1378               if (c < 128)
1379                 goto invalid_code;
1380             }
1381           else
1382             {
1383               ONE_MORE_BYTE (c3);
1384               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1385                 goto invalid_code;
1386               if (UTF_8_3_OCTET_LEADING_P (c1))
1387                 {
1388                   c = (((c1 & 0xF) << 12)
1389                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1390                   if (c < 0x800
1391                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1392                     goto invalid_code;
1393                 }
1394               else
1395                 {
1396                   ONE_MORE_BYTE (c4);
1397                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1398                     goto invalid_code;
1399                   if (UTF_8_4_OCTET_LEADING_P (c1))
1400                     {
1401                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1402                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1403                     if (c < 0x10000)
1404                       goto invalid_code;
1405                     }
1406                   else
1407                     {
1408                       ONE_MORE_BYTE (c5);
1409                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1410                         goto invalid_code;
1411                       if (UTF_8_5_OCTET_LEADING_P (c1))
1412                         {
1413                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1414                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1415                                | (c5 & 0x3F));
                               /* Five-octet forms must be neither
                                  overlong nor above MAX_CHAR.  */
1416                           if ((c > MAX_CHAR) || (c < 0x200000))
1417                             goto invalid_code;
1418                         }
1419                       else
1420                         goto invalid_code;
1421                     }
1422                 }
1423             }
1424         }
1425
1426       *charbuf++ = c;
1427       continue;
1428
1429     invalid_code:
           /* Rewind to the start of the bad sequence and emit a single
              byte from there; the bytes after it are decoded afresh on
              the next iterations.
              NOTE(review): when C1 came from byte_after_cr, SRC_BASE
              already points past that byte, so this re-reads the byte
              following it -- confirm that case is handled as
              intended.  */
1430       src = src_base;
1431       consumed_chars = consumed_chars_base;
1432       ONE_MORE_BYTE (c);
1433       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1434       coding->errors++;
1435     }
1436
        /* Reached both by leaving the loop and by jumps to this label.
           Only input up to SRC_BASE, the start of the last character
           begun, is reported as consumed.  */
1437  no_more_source:
1438   coding->consumed_char += consumed_chars_base;
1439   coding->consumed = src_base - coding->source;
1440   coding->charbuf_used = charbuf - coding->charbuf;
1441 }
1442
1443
1444 static int
1445 encode_coding_utf_8 (struct coding_system *coding)
1446 {
1447   int multibytep = coding->dst_multibyte;
1448   int *charbuf = coding->charbuf;
1449   int *charbuf_end = charbuf + coding->charbuf_used;
1450   unsigned char *dst = coding->destination + coding->produced;
1451   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1452   EMACS_INT produced_chars = 0;
1453   int c;
1454
1455   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1456     {
1457       ASSURE_DESTINATION (3);
1458       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1459       CODING_UTF_8_BOM (coding) = utf_without_bom;
1460     }
1461
1462   if (multibytep)
1463     {
1464       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1465
1466       while (charbuf < charbuf_end)
1467         {
1468           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1469
1470           ASSURE_DESTINATION (safe_room);
1471           c = *charbuf++;
1472           if (CHAR_BYTE8_P (c))
1473             {
1474               c = CHAR_TO_BYTE8 (c);
1475               EMIT_ONE_BYTE (c);
1476             }
1477           else
1478             {
1479               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1480               for (p = str; p < pend; p++)
1481                 EMIT_ONE_BYTE (*p);
1482             }
1483         }
1484     }
1485   else
1486     {
1487       int safe_room = MAX_MULTIBYTE_LENGTH;
1488
1489       while (charbuf < charbuf_end)
1490         {
1491           ASSURE_DESTINATION (safe_room);
1492           c = *charbuf++;
1493           if (CHAR_BYTE8_P (c))
1494             *dst++ = CHAR_TO_BYTE8 (c);
1495           else
1496             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1497           produced_chars++;
1498         }
1499     }
1500   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1501   coding->produced_char += produced_chars;
1502   coding->produced = dst - coding->destination;
1503   return 0;
1504 }
1505
1506
1507 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1508    Check if a text is encoded in one of UTF-16 based coding systems.
1509    If it is, return 1, else return 0.  */
1510
1511 #define UTF_16_HIGH_SURROGATE_P(val) \
1512   (((val) & 0xFC00) == 0xD800)
1513
1514 #define UTF_16_LOW_SURROGATE_P(val) \
1515   (((val) & 0xFC00) == 0xDC00)
1516
1517
/* Examine the source of CODING for UTF-16 and record the verdict in
   DETECT_INFO.

   The whole UTF-16 category is rejected when this is the last block
   and coding->src_chars is odd, or when the second value of the first
   pair is negative.  A leading 0xFF 0xFE marks the text as little
   endian and a leading 0xFE 0xFF as big endian; either one also marks
   the auto-detecting category as found and rejects the other UTF-16
   categories.  Otherwise the three BOM-dependent categories are
   rejected and the dispersion of byte values is measured to judge the
   two no-signature categories.

   Return 0 after an outright rejection or when the dispersion scan
   ends by itself; return 1 when a BOM was found or when control
   reaches the label `no_more_source'.  */
1518 static int
1519 detect_coding_utf_16 (struct coding_system *coding,
1520                       struct coding_detection_info *detect_info)
1521 {
       /* src, src_end and multibytep are not referenced by name below;
          presumably TWO_MORE_BYTES uses them (the macro is defined
          outside this section).  */
1522   const unsigned char *src = coding->source;
1523   const unsigned char *src_end = coding->source + coding->src_bytes;
1524   int multibytep = coding->src_multibyte;
1525   int c1, c2;
1526
1527   detect_info->checked |= CATEGORY_MASK_UTF_16;
       /* An odd coding->src_chars on the last block rejects every
          UTF-16 category (presumably because the text cannot then
          consist of whole two-byte units).  */
1528   if (coding->mode & CODING_MODE_LAST_BLOCK
1529       && (coding->src_chars & 1))
1530     {
1531       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1532       return 0;
1533     }
1534
       /* Read the first pair and compare it with both byte orders of
          the byte order mark.  */
1535   TWO_MORE_BYTES (c1, c2);
1536   if ((c1 == 0xFF) && (c2 == 0xFE))
1537     {
1538       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1539                              | CATEGORY_MASK_UTF_16_AUTO);
1540       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1541                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1542                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1543     }
1544   else if ((c1 == 0xFE) && (c2 == 0xFF))
1545     {
1546       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1547                              | CATEGORY_MASK_UTF_16_AUTO);
1548       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1549                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1550                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1551     }
1552   else if (c2 < 0)
1553     {
1554       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1555       return 0;
1556     }
1557   else
1558     {
1559       /* We check the dispersion of Eth and Oth bytes where E is even and
1560          O is odd.  If both are high, we assume binary data.*/
           /* e[] and o[] flag the values already seen as the first
              and the second byte of a pair; e_num and o_num count the
              distinct values, starting at 1 for the pair read above.  */
1561       unsigned char e[256], o[256];
1562       unsigned e_num = 1, o_num = 1;
1563
1564       memset (e, 0, 256);
1565       memset (o, 0, 256);
1566       e[c1] = 1;
1567       o[c2] = 1;
1568
           /* No BOM was found, so only the two no-signature
              categories remain as candidates.  */
1569       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1570                                 |CATEGORY_MASK_UTF_16_BE
1571                                 | CATEGORY_MASK_UTF_16_LE);
1572
           /* Scan pairs until every UTF-16 category is rejected or a
              negative second value is read.  */
1573       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1574              != CATEGORY_MASK_UTF_16)
1575         {
1576           TWO_MORE_BYTES (c1, c2);
1577           if (c2 < 0)
1578             break;
1579           if (! e[c1])
1580             {
1581               e[c1] = 1;
1582               e_num++;
                   /* 128 or more distinct first bytes reject big
                      endian without signature.  */
1583               if (e_num >= 128)
1584                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1585             }
1586           if (! o[c2])
1587             {
1588               o[c2] = 1;
1589               o_num++;
                   /* 128 or more distinct second bytes reject little
                      endian without signature.  */
1590               if (o_num >= 128)
1591                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1592             }
1593         }
1594       return 0;
1595     }
1596
       /* Reached by falling through after a BOM match and, presumably,
          by a jump from TWO_MORE_BYTES when the source runs out.  */
1597  no_more_source:
1598   return 1;
1599 }
1600
/* Decode the UTF-16 source bytes of CODING, starting at
   coding->consumed, into code points appended to coding->charbuf.

   When the coding system expects a BOM, the first two bytes are
   compared with the byte order mark for the configured endianness; on
   a mismatch coding->errors is incremented and the bytes are decoded
   as ordinary text.  In both the with-BOM and detect-BOM cases the
   BOM setting is then changed to utf_without_bom.

   A high surrogate is kept in CODING_UTF_16_SURROGATE (coding) until
   the next code unit arrives (presumably so that a pair may straddle
   two calls).  A high surrogate that is not followed by a low
   surrogate is stored as its two bytes, counted in coding->errors, and
   then forgotten.

   Nothing is returned; coding->consumed, coding->consumed_char and
   coding->charbuf_used are updated on exit.  */
1601 static void
1602 decode_coding_utf_16 (struct coding_system *coding)
1603 {
       /* src, src_end, multibytep and consumed_chars are presumably
          used by ONE_MORE_BYTE, which is defined outside this
          section.  */
1604   const unsigned char *src = coding->source + coding->consumed;
1605   const unsigned char *src_end = coding->source + coding->src_bytes;
1606   const unsigned char *src_base;
1607   int *charbuf = coding->charbuf + coding->charbuf_used;
1608   /* We may produce at most 3 chars in one loop.  */
1609   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1610   EMACS_INT consumed_chars = 0, consumed_chars_base = 0;
1611   int multibytep = coding->src_multibyte;
1612   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1613   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1614   int surrogate = CODING_UTF_16_SURROGATE (coding);
1615   int eol_dos =
1616     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The two bytes read ahead after a CR, or -1 when there are
          none pending.  */
1617   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1618
1619   if (bom == utf_with_bom)
1620     {
1621       int c, c1, c2;
1622
1623       src_base = src;
1624       ONE_MORE_BYTE (c1);
1625       ONE_MORE_BYTE (c2);
1626       c = (c1 << 8) | c2;
1627
           /* C was assembled high byte first, so a big endian BOM
              reads 0xFEFF and a little endian one 0xFFFE.  */
1628       if (endian == utf_16_big_endian
1629           ? c != 0xFEFF : c != 0xFFFE)
1630         {
1631           /* The first two bytes are not BOM.  Treat them as bytes
1632              for a normal character.  */
1633           src = src_base;
1634           coding->errors++;
1635         }
1636       CODING_UTF_16_BOM (coding) = utf_without_bom;
1637     }
1638   else if (bom == utf_detect_bom)
1639     {
1640       /* We have already tried to detect BOM and failed in
1641          detect_coding.  */
1642       CODING_UTF_16_BOM (coding) = utf_without_bom;
1643     }
1644
1645   while (1)
1646     {
1647       int c, c1, c2;
1648
           /* Remember where this code unit starts; the exit code
              reports everything before this point as consumed.  */
1649       src_base = src;
1650       consumed_chars_base = consumed_chars;
1651
1652       if (charbuf >= charbuf_end)
1653         {
               /* The two bytes read ahead after a CR have not been
                  decoded yet; step back so they are read again.  */
1654           if (byte_after_cr1 >= 0)
1655             src_base -= 2;
1656           break;
1657         }
1658
1659       if (byte_after_cr1 >= 0)
1660         c1 = byte_after_cr1, byte_after_cr1 = -1;
1661       else
1662         ONE_MORE_BYTE (c1);
           /* A negative value is stored negated.  NOTE(review):
              presumably ONE_MORE_BYTE yields a negated character for
              input already in multibyte form -- confirm against the
              macro definition.  */
1663       if (c1 < 0)
1664         {
1665           *charbuf++ = -c1;
1666           continue;
1667         }
1668       if (byte_after_cr2 >= 0)
1669         c2 = byte_after_cr2, byte_after_cr2 = -1;
1670       else
1671         ONE_MORE_BYTE (c2);
1672       if (c2 < 0)
1673         {
1674           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1675           *charbuf++ = -c2;
1676           continue;
1677         }
           /* Assemble the 16-bit code unit in the configured byte
              order.  */
1678       c = (endian == utf_16_big_endian
1679            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1680
1681       if (surrogate)
1682         {
1683           if (! UTF_16_LOW_SURROGATE_P (c))
1684             {
                   /* The pending high surrogate has no partner: store
                      its two bytes in source order and count an
                      error.  */
1685               if (endian == utf_16_big_endian)
1686                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1687               else
1688                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1689               *charbuf++ = c1;
1690               *charbuf++ = c2;
1691               coding->errors++;
                   /* The unpaired surrogate has been flushed.  Replace
                      it by C if C is itself a high surrogate;
                      otherwise clear it, so that it is neither flushed
                      again before every following code unit nor
                      combined with a later low surrogate, and store
                      C.  */
1692               CODING_UTF_16_SURROGATE (coding) = surrogate
1693                 = UTF_16_HIGH_SURROGATE_P (c) ? c : 0;
1694               if (! surrogate)
1695                 *charbuf++ = c;
1696             }
1697           else
1698             {
                   /* Combine the pair into a code point at or above
                      0x10000.  */
1699               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1700               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1701               *charbuf++ = 0x10000 + c;
1702             }
1703         }
1704       else
1705         {
1706           if (UTF_16_HIGH_SURROGATE_P (c))
1707             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1708           else
1709             {
                   /* For DOS end-of-line, read the two bytes that
                      follow a CR before storing it.  Presumably
                      ONE_MORE_BYTE jumps to `no_more_source' when the
                      source is exhausted; src_base then still points
                      at the CR, which is left unconsumed.  */
1710               if (eol_dos && c == '\r')
1711                 {
1712                   ONE_MORE_BYTE (byte_after_cr1);
1713                   ONE_MORE_BYTE (byte_after_cr2);
1714                 }
1715               *charbuf++ = c;
1716             }
1717         }
1718     }
1719
1720  no_more_source:
1721   coding->consumed_char += consumed_chars_base;
1722   coding->consumed = src_base - coding->source;
1723   coding->charbuf_used = charbuf - coding->charbuf;
1724 }
1725
/* Encode the code points in coding->charbuf as UTF-16 and append the
   bytes to the destination of CODING.

   Unless the BOM setting is utf_without_bom, a byte order mark for the
   configured endianness is emitted first and the setting is changed to
   utf_without_bom.  A character above MAX_UNICODE_CHAR is replaced by
   coding->default_char.  A character below 0x10000 yields one two-byte
   unit; any other character yields a surrogate pair.

   Record CODING_RESULT_SUCCESS, update coding->produced and
   coding->produced_char, and return 0.  */
1726 static int
1727 encode_coding_utf_16 (struct coding_system *coding)
1728 {
       /* multibytep, dst, dst_end and produced_chars are presumably
          used by ASSURE_DESTINATION and the EMIT_* macros, which are
          defined outside this section.  */
1729   int multibytep = coding->dst_multibyte;
1730   int *charbuf = coding->charbuf;
1731   int *charbuf_end = charbuf + coding->charbuf_used;
1732   unsigned char *dst = coding->destination + coding->produced;
1733   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each emission.  */
1734   int safe_room = 8;
1735   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1736   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1737   EMACS_INT produced_chars = 0;
1738   int c;
1739
1740   if (bom != utf_without_bom)
1741     {
1742       ASSURE_DESTINATION (safe_room);
1743       if (big_endian)
1744         EMIT_TWO_BYTES (0xFE, 0xFF);
1745       else
1746         EMIT_TWO_BYTES (0xFF, 0xFE);
           /* Emit the BOM only once.  */
1747       CODING_UTF_16_BOM (coding) = utf_without_bom;
1748     }
1749
1750   while (charbuf < charbuf_end)
1751     {
1752       ASSURE_DESTINATION (safe_room);
1753       c = *charbuf++;
1754       if (c > MAX_UNICODE_CHAR)
1755         c = coding->default_char;
1756
1757       if (c < 0x10000)
1758         {
1759           if (big_endian)
1760             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1761           else
1762             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1763         }
1764       else
1765         {
1766           int c1, c2;
1767
               /* Split the offset from 0x10000 into a high surrogate
                  (bits above the low ten, plus 0xD800) and a low
                  surrogate (the low ten bits, plus 0xDC00).  */
1768           c -= 0x10000;
1769           c1 = (c >> 10) + 0xD800;
1770           c2 = (c & 0x3FF) + 0xDC00;
1771           if (big_endian)
1772             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1773           else
1774             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1775         }
1776     }
1777   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1778   coding->produced = dst - coding->destination;
1779   coding->produced_char += produced_chars;
1780   return 0;
1781 }
1782
1783 \f
1784 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1785
1786 /* Emacs' internal format for representation of multiple character
1787    sets is a kind of multi-byte encoding, i.e. characters are
1788    represented by variable-length sequences of one-byte codes.
1789
1790    ASCII characters and control characters (e.g. `tab', `newline') are
1791    represented by one-byte sequences which are their ASCII codes, in
1792    the range 0x00 through 0x7F.
1793
1794    8-bit characters of the range 0x80..0x9F are represented by
1795    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1796    code + 0x20).
1797
1798    8-bit characters of the range 0xA0..0xFF are represented by
1799    one-byte sequences which are their 8-bit code.
1800
1801    The other characters are represented by a sequence of `base
1802    leading-code', optional `extended leading-code', and one or two
1803    `position-code's.  The length of the sequence is determined by the
1804    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1805    whereas extended leading-code and position-code take the range 0xA0
1806    through 0xFF.  See `charset.h' for more details about leading-code
1807    and position-code.
1808
1809    --- CODE RANGE of Emacs' internal format ---
1810    character set        range
1811    -------------        -----
1812    ascii                0x00..0x7F
1813    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1814    eight-bit-graphic    0xA0..0xFF
1815    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1816    ---------------------------------------------
1817
1818    As this is the internal character representation, the format is
1819    usually not used externally (i.e. in a file or in data sent to a
1820    process).  But, it is possible to have a text externally in this
1821    format (i.e. by encoding by the coding system `emacs-mule').
1822
1823    In that case, a sequence of one-byte codes has a slightly different
1824    form.
1825
1826    At first, all characters in eight-bit-control are represented by
1827    one-byte sequences which are their 8-bit code.
1828
1829    Next, character composition data are represented by the byte
1830    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1831    where,
1832         METHOD is 0xF2 plus one of composition method (enum
1833         composition_method),
1834
1835         BYTES is 0xA0 plus a byte length of this composition data,
1836
1837         CHARS is 0xA0 plus a number of characters composed by this
1838         data,
1839
1840         COMPONENTs are characters of multibyte form or composition
1841         rules encoded by two bytes of ASCII codes.
1842
1843    In addition, for backward compatibility, the following formats are
1844    also recognized as composition data on decoding.
1845
1846    0x80 MSEQ ...
1847    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1848
1849    Here,
1850         MSEQ is a multibyte form but in this special format:
1851           ASCII: 0xA0 ASCII_CODE+0x80,
1852           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1853         RULE is a one byte code of the range 0xA0..0xF0 that
1854         represents a composition rule.
1855   */
1856
/* Table indexed by the first byte of an emacs-mule sequence.  As used
   in this section, emacs_mule_bytes[C] is the total number of bytes
   (1 to 4) of the sequence that starts with byte C.  The table is
   filled in outside this section.  */
1857 char emacs_mule_bytes[256];
1858
1859
1860 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1861    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1862    else return 0.

        The scan starts after the first coding->head_ascii bytes of the
        source.  The category is rejected, and 0 returned, when an ESC,
        SI or SO byte is found, when a sequence introduced by 0x80 is
        too short, when a multibyte sequence is cut short by a byte
        below 0xA0, or when the last block ends in the middle of a
        sequence.  Otherwise 1 is returned, and the category is also
        reported as found if at least one composition or multibyte
        sequence was seen.  */
1863
1864 static int
1865 detect_coding_emacs_mule (struct coding_system *coding,
1866                           struct coding_detection_info *detect_info)
1867 {
       /* src_end, multibytep and consumed_chars are presumably used by
          ONE_MORE_BYTE, which is defined outside this section.  */
1868   const unsigned char *src = coding->source, *src_base;
1869   const unsigned char *src_end = coding->source + coding->src_bytes;
1870   int multibytep = coding->src_multibyte;
1871   EMACS_INT consumed_chars = 0;
1872   int c;
       /* Becomes CATEGORY_MASK_EMACS_MULE once positive evidence has
          been seen.  */
1873   int found = 0;
1874
1875   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1876   /* A coding system of this category is always ASCII compatible.  */
1877   src += coding->head_ascii;
1878
1879   while (1)
1880     {
           /* Start of the current sequence; compared with SRC at
              `no_more_source' to tell whether a sequence was cut.  */
1881       src_base = src;
1882       ONE_MORE_BYTE (c);
1883       if (c < 0)
1884         continue;
1885       if (c == 0x80)
1886         {
1887           /* Perhaps the start of composite character.  We simply skip
1888              it because analyzing it is too heavy for detecting.  But,
1889              at least, we check that the composite character
1890              consists of more than 4 bytes.  */
1891           const unsigned char *src_start;
1892
1893         repeat:
1894           src_start = src;
               /* Skip the bytes >= 0xA0; C ends up holding the first
                  byte below 0xA0.  */
1895           do
1896             {
1897               ONE_MORE_BYTE (c);
1898             }
1899           while (c >= 0xA0);
1900
               /* Four bytes or fewer were read after the 0x80,
                  counting the terminating one: leave the loop and
                  reject.  */
1901           if (src - src_start <= 4)
1902             break;
1903           found = CATEGORY_MASK_EMACS_MULE;
1904           if (c == 0x80)
1905             goto repeat;
1906         }
1907
1908       if (c < 0x80)
1909         {
               /* ESC, SI and SO reject the category.  */
1910           if (c < 0x20
1911               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1912             break;
1913         }
1914       else
1915         {
               /* C starts a sequence; every remaining byte of it must
                  be >= 0xA0.  */
1916           int more_bytes = emacs_mule_bytes[c] - 1;
1917
1918           while (more_bytes > 0)
1919             {
1920               ONE_MORE_BYTE (c);
1921               if (c < 0xA0)
1922                 {
1923                   src--;        /* Unread the last byte.  */
1924                   break;
1925                 }
1926               more_bytes--;
1927             }
1928           if (more_bytes != 0)
1929             break;
1930           found = CATEGORY_MASK_EMACS_MULE;
1931         }
1932     }
       /* Reached only through a `break' above.  */
1933   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1934   return 0;
1935
       /* Presumably the target of ONE_MORE_BYTE when the source runs
          out.  If some bytes of the current sequence had already been
          read and this is the last block, the text ends inside a
          sequence and the category is rejected.  */
1936  no_more_source:
1937   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1938     {
1939       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1940       return 0;
1941     }
1942   detect_info->found |= found;
1943   return 1;
1944 }
1945
1946
1947 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1948    character.  If CMP_STATUS indicates that we must expect MSEQ or
1949    RULE described above, decode it and return the negative value of
1950    the decoded character or rule.  If an invalid byte is found, return
1951    -1.  If SRC is too short, return -2.

        On success, store in *NBYTES the number of source bytes between
        SRC and the end of the sequence, in *NCHARS the value of the
        local counter consumed_chars (presumably maintained by
        ONE_MORE_BYTE), and, when ID is non-null, the charset ID in *ID.
        When a byte >= 0xA0 is returned as a RULE, *NBYTES and *NCHARS
        are set but *ID is not.  */
1952
1953 static int
1954 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1955                  int *nbytes, int *nchars, int *id,
1956                  struct composition_status *cmp_status)
1957 {
1958   const unsigned char *src_end = coding->source + coding->src_bytes;
1959   const unsigned char *src_base = src;
1960   int multibytep = coding->src_multibyte;
1961   int charset_ID;
1962   unsigned code;
1963   int c;
1964   int consumed_chars = 0;
       /* Set when the sequence was read in the old-form MSEQ
          encoding; the result is then returned negated.  */
1965   int mseq_found = 0;
1966
1967   ONE_MORE_BYTE (c);
1968   if (c < 0)
1969     {
           /* A negative value is returned negated, with the charset
              taken from emacs_mule_charset[0].  NOTE(review):
              presumably ONE_MORE_BYTE yields a negated character for
              input already in multibyte form -- confirm against the
              macro definition.  */
1970       c = -c;
1971       charset_ID = emacs_mule_charset[0];
1972     }
1973   else
1974     {
1975       if (c >= 0xA0)
1976         {
               /* A first byte >= 0xA0 is accepted only inside an
                  old-form composition.  */
1977           if (cmp_status->state != COMPOSING_NO
1978               && cmp_status->old_form)
1979             {
1980               if (cmp_status->state == COMPOSING_CHAR)
1981                 {
                       /* MSEQ: 0xA0 is followed by an ASCII code plus
                          0x80; any other byte is a leading code plus
                          0x20.  */
1982                   if (c == 0xA0)
1983                     {
1984                       ONE_MORE_BYTE (c);
1985                       c -= 0x80;
1986                       if (c < 0)
1987                         goto invalid_code;
1988                     }
1989                   else
1990                     c -= 0x20;
1991                   mseq_found = 1;
1992                 }
1993               else
1994                 {
                       /* Not expecting a character: hand the byte
                          back negated, as a RULE.  */
1995                   *nbytes = src - src_base;
1996                   *nchars = consumed_chars;
1997                   return -c;
1998                 }
1999             }
2000           else
2001             goto invalid_code;
2002         }
2003
           /* Dispatch on the total length of the sequence that starts
              with C.  */
2004       switch (emacs_mule_bytes[c])
2005         {
2006         case 2:
               /* Leading code and one position code.  */
2007           if ((charset_ID = emacs_mule_charset[c]) < 0)
2008             goto invalid_code;
2009           ONE_MORE_BYTE (c);
2010           if (c < 0xA0)
2011             goto invalid_code;
2012           code = c & 0x7F;
2013           break;
2014
2015         case 3:
2016           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2017               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2018             {
                   /* Private leading code, a byte that selects the
                      charset, and one position code.  */
2019               ONE_MORE_BYTE (c);
2020               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2021                 goto invalid_code;
2022               ONE_MORE_BYTE (c);
2023               if (c < 0xA0)
2024                 goto invalid_code;
2025               code = c & 0x7F;
2026             }
2027           else
2028             {
                   /* Leading code and two position codes.  */
2029               if ((charset_ID = emacs_mule_charset[c]) < 0)
2030                 goto invalid_code;
2031               ONE_MORE_BYTE (c);
2032               if (c < 0xA0)
2033                 goto invalid_code;
2034               code = (c & 0x7F) << 8;
2035               ONE_MORE_BYTE (c);
2036               if (c < 0xA0)
2037                 goto invalid_code;
2038               code |= c & 0x7F;
2039             }
2040           break;
2041
2042         case 4:
               /* Leading code, a byte that selects the charset, and
                  two position codes.  */
2043           ONE_MORE_BYTE (c);
2044           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2045             goto invalid_code;
2046           ONE_MORE_BYTE (c);
2047           if (c < 0xA0)
2048             goto invalid_code;
2049           code = (c & 0x7F) << 8;
2050           ONE_MORE_BYTE (c);
2051           if (c < 0xA0)
2052             goto invalid_code;
2053           code |= c & 0x7F;
2054           break;
2055
2056         case 1:
               /* A single byte stands for itself.  */
2057           code = c;
2058           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2059           break;
2060
2061         default:
2062           abort ();
2063         }
           /* Map (charset, code) to a character; a negative result
              is treated as an invalid sequence.  */
2064       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2065                           CHARSET_FROM_ID (charset_ID), code, c);
2066       if (c < 0)
2067         goto invalid_code;
2068     }
2069   *nbytes = src - src_base;
2070   *nchars = consumed_chars;
2071   if (id)
2072     *id = charset_ID;
2073   return (mseq_found ? -c : c);
2074
       /* Presumably the target of ONE_MORE_BYTE when the source runs
          out.  */
2075  no_more_source:
2076   return -2;
2077
2078  invalid_code:
2079   return -1;
2080 }
2081
2082
2083 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2084
2085 /* Handle these composition sequences ('|': the end of header elements,
2086    BYTES and CHARS >= 0xA0):
2087
2088    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2089    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2090    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2091
2092    and these old forms:
2093
2094    (4) relative composition: 0x80 | MSEQ ... MSEQ
2095    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2096
2097    When the starter 0x80 and the following header elements are found,
2098    this annotation header is produced.
2099
2100         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2101
2102    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2103    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2104
2105    Then, upon reading the following elements, these codes are produced
2106    until the composition end is found:
2107
2108    (1) CHAR ... CHAR
2109    (2) ALT ... ALT CHAR ... CHAR
2110    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2111    (4) CHAR ... CHAR
2112    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2113
2114    When the composition end is found, LENGTH and NCHARS in the
2115    annotation header are updated as below:
2116
2117    (1) LENGTH: unchanged, NCHARS: unchanged
2118    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2119    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2120    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2121    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2122
2123    If an error is found while composing, the annotation header is
2124    changed to the original composition header (plus filler -1s) as
2125    below:
2126
2127    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2128    (5)          [ 0x80 0xFF -1 -1 -1 ]
2129
2130    and the sequence [ -2 DECODED-RULE ] is changed to the original
2131    byte sequence as below:
2132         o the original byte sequence is B: [ B -1 ]
2133         o the original byte sequence is B1 B2: [ B1 B2 ]
2134
2135    Most of the routines are implemented by macros because many
2136    variables and labels in the caller decode_coding_emacs_mule must be
2137    accessible, and they are usually called just once (thus they don't
2138    increase the size of the compiled object).  */
2139
2140 /* Decode a composition rule represented by C as a component of
2141    composition sequence of Emacs 20 style.  Set RULE to the decoded
2142    rule.

        C is modified: 0xA0 is subtracted from it, and control jumps to
        the caller's label `invalid_code' unless the result lies in
        0..80.  The result is split into GREF = C / 9 and NREF = C % 9,
        a value of 4 in either one is replaced by 10, and the two are
        combined with COMPOSITION_ENCODE_RULE.  */
2143
2144 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2145   do {                                                  \
2146     int gref, nref;                                     \
2147                                                         \
2148     c -= 0xA0;                                          \
2149     if (c < 0 || c >= 81)                               \
2150       goto invalid_code;                                \
2151     gref = c / 9, nref = c % 9;                         \
2152     if (gref == 4) gref = 10;                           \
2153     if (nref == 4) nref = 10;                           \
2154     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2155   } while (0)
2156
2157
2158 /* Decode a composition rule represented by C and the following byte
2159    at SRC as a component of composition sequence of Emacs 21 style.
2160    Set RULE to the decoded rule.

        GREF is C - 0x20; the next byte is then read into C with
        ONE_MORE_BYTE and NREF is that byte minus 0x20.  Control jumps
        to the caller's label `invalid_code' unless both values lie in
        0..80.  The two are combined with COMPOSITION_ENCODE_RULE.  */
2161
2162 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2163   do {                                                  \
2164     int gref, nref;                                     \
2165                                                         \
2166     gref = c - 0x20;                                    \
2167     if (gref < 0 || gref >= 81)                         \
2168       goto invalid_code;                                \
2169     ONE_MORE_BYTE (c);                                  \
2170     nref = c - 0x20;                                    \
2171     if (nref < 0 || nref >= 81)                         \
2172       goto invalid_code;                                \
2173     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2174   } while (0)
2175
2176
2177 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2178    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2179    byte length of this composition information, CHARS is the number of
2180    characters composed by this composition.

        On entry C already holds the method byte; the BYTES and CHARS
        bytes are read here with ONE_MORE_BYTE.  Control jumps to the
        caller's label `invalid_code' when the BYTES byte is negative,
        when BYTES - 0xA0 is below 3, when the method is
        COMPOSITION_RELATIVE and BYTES - 0xA0 is not 4, or when
        CHARS - 0xA0 is not in 1..MAX_COMPOSITION_COMPONENTS - 1.
        Otherwise *cmp_status is initialized for the new composition
        and ADD_COMPOSITION_DATA is invoked on the caller's CHARBUF.  */
2181
2182 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2183   do {                                                                  \
2184     enum composition_method method = c - 0xF2;                          \
2185     int nbytes, nchars;                                                 \
2186                                                                         \
2187     ONE_MORE_BYTE (c);                                                  \
2188     if (c < 0)                                                          \
2189       goto invalid_code;                                                \
2190     nbytes = c - 0xA0;                                                  \
2191     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2192       goto invalid_code;                                                \
2193     ONE_MORE_BYTE (c);                                                  \
2194     nchars = c - 0xA0;                                                  \
2195     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2196       goto invalid_code;                                                \
2197     cmp_status->old_form = 0;                                           \
2198     cmp_status->method = method;                                        \
2199     if (method == COMPOSITION_RELATIVE)                                 \
2200       cmp_status->state = COMPOSING_CHAR;                               \
2201     else                                                                \
2202       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2203     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2204     cmp_status->nchars = nchars;                                        \
2205     cmp_status->ncomps = nbytes - 4;                                    \
2206     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2207   } while (0)
2208
2209
2210 /* Start of Emacs 20 style format for relative composition.
        Mark *cmp_status as an old-form COMPOSITION_RELATIVE composition
        in state COMPOSING_CHAR with no characters or components yet,
        and invoke ADD_COMPOSITION_DATA on the caller's CHARBUF with
        zero for both counts.  */
2211
2212 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2213   do {                                                          \
2214     cmp_status->old_form = 1;                                   \
2215     cmp_status->method = COMPOSITION_RELATIVE;                  \
2216     cmp_status->state = COMPOSING_CHAR;                         \
2217     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2218     cmp_status->nchars = cmp_status->ncomps = 0;                \
2219     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2220   } while (0)
2221
2222
2223 /* Start of Emacs 20 style format for rule-base composition.
        Mark *cmp_status as an old-form COMPOSITION_WITH_RULE
        composition in state COMPOSING_CHAR with no characters or
        components yet, and invoke ADD_COMPOSITION_DATA on the caller's
        CHARBUF with zero for both counts.  */
2224
2225 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2226   do {                                                          \
2227     cmp_status->old_form = 1;                                   \
2228     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2229     cmp_status->state = COMPOSING_CHAR;                         \
2230     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2231     cmp_status->nchars = cmp_status->ncomps = 0;                \
2232     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2233   } while (0)
2234
2235
/* Handle the byte that follows the composition starter 0x80.  The
   byte is read into the caller's C and dispatched as follows:

     0xF2 plus a method from COMPOSITION_RELATIVE through
       COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 style header;
     0xA0..0xBF: Emacs 20 style relative composition; SRC is moved
       back so that the byte is read again as a component;
     0xFF: Emacs 20 style rule-base composition;
     anything else, including a negative value: jump to the caller's
       label `invalid_code'.  */
2236 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2237   do {                                                  \
2238     const unsigned char *current_src = src;             \
2239                                                         \
2240     ONE_MORE_BYTE (c);                                  \
2241     if (c < 0)                                          \
2242       goto invalid_code;                                \
2243     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2244         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2245       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2246     else if (c < 0xA0)                                  \
2247       goto invalid_code;                                \
2248     else if (c < 0xC0)                                  \
2249       {                                                 \
2250         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2251         /* Re-read C as a composition component.  */    \
2252         src = current_src;                              \
2253       }                                                 \
2254     else if (c == 0xFF)                                 \
2255       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2256     else                                                \
2257       goto invalid_code;                                \
2258   } while (0)
2259
/* Finalize the annotation header of the current composition and leave
   the composing state.  The header starts cmp_status->length elements
   before the caller's CHARBUF.  For an old-form composition, store
   cmp_status->nchars in the third header element (NCHARS); otherwise,
   for a method beyond COMPOSITION_RELATIVE, set the first header
   element to the third one minus cmp_status->length.  */
2260 #define EMACS_MULE_COMPOSITION_END()                            \
2261   do {                                                          \
2262     int idx = - cmp_status->length;                             \
2263                                                                 \
2264     if (cmp_status->old_form)                                   \
2265       charbuf[idx + 2] = cmp_status->nchars;                    \
2266     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2267       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2268     cmp_status->state = COMPOSING_NO;                           \
2269   } while (0)
2270
2271
/* Close the composition described by CMP_STATUS, whose annotation
   header starts cmp_status->length elements before CHARBUF, and set
   the state to COMPOSING_NO.

   An old-form composition that already has at least one character is
   kept: its NCHARS header element is set to cmp_status->nchars, and a
   dangling rule of a rule-base composition is turned back into its
   original byte.  In every other case the header is overwritten with
   the original composition header bytes.

   Return the count that EMACS_MULE_MAYBE_FINISH_COMPOSITION adds to
   the caller's char_offset.  */
2272 static int
2273 emacs_mule_finish_composition (int *charbuf,
2274                                struct composition_status *cmp_status)
2275 {
       /* Index of the first header element, relative to CHARBUF.  */
2276   int idx = - cmp_status->length;
2277   int new_chars;
2278
2279   if (cmp_status->old_form && cmp_status->nchars > 0)
2280     {
2281       charbuf[idx + 2] = cmp_status->nchars;
2282       new_chars = 0;
2283       if (cmp_status->method == COMPOSITION_WITH_RULE
2284           && cmp_status->state == COMPOSING_CHAR)
2285         {
2286           /* The last rule was invalid.  */
               /* Rebuild the rule byte from the last element and store
                  it, followed by a -1 filler, over the last two
                  elements.  */
2287           int rule = charbuf[-1] + 0xA0;
2288
2289           charbuf[-2] = BYTE8_TO_CHAR (rule);
2290           charbuf[-1] = -1;
2291           new_chars = 1;
2292         }
2293     }
2294   else
2295     {
           /* Put the starter byte 0x80 back as a raw byte.  */
2296       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2297
2298       if (cmp_status->method == COMPOSITION_WITH_RULE)
2299         {
2300           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2301           charbuf[idx++] = -3;
2302           charbuf[idx++] = 0;
2303           new_chars = 1;
2304         }
2305       else
2306         {
               /* IDX now addresses the second header element, so
                  these read the third and the fourth one before they
                  are overwritten below.  */
2307           int nchars = charbuf[idx + 1] + 0xA0;
2308           int nbytes = charbuf[idx + 2] + 0xA0;
2309
2310           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2311           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2312           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2313           charbuf[idx++] = -1;
2314           new_chars = 4;
2315         }
2316     }
2317   cmp_status->state = COMPOSING_NO;
2318   return new_chars;
2319 }
2320
/* If a composition is in progress, close it with
   emacs_mule_finish_composition and add the value it returns to the
   caller's char_offset.  Uses the caller's CHARBUF, CMP_STATUS and
   CHAR_OFFSET.  */
2321 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2322   do {                                                                    \
2323     if (cmp_status->state != COMPOSING_NO)                                \
2324       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2325   } while (0)
2326
2327
     /* Decode the emacs-mule bytes of CODING->source, starting at offset
        CODING->consumed, into character codes and annotations appended
        to CODING->charbuf at index CODING->charbuf_used.

        Decoding stops when the source is exhausted, when the charbuf
        headroom computed below is used up, or when emacs_mule_char
        returns -2.  On exit CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated to describe what was completely
        processed.  A composition that is still open at that point is
        either finished (when CODING_MODE_LAST_BLOCK is set) or saved in
        cmp_status->carryover so that the next call can resume it.  */
2328 static void
2329 decode_coding_emacs_mule (struct coding_system *coding)
2330 {
2331   const unsigned char *src = coding->source + coding->consumed;
2332   const unsigned char *src_end = coding->source + coding->src_bytes;
2333   const unsigned char *src_base;
2334   int *charbuf = coding->charbuf + coding->charbuf_used;
2335   /* We may produce two annotations (charset and composition) in one
2336      loop and one more charset annotation at the end.  */
2337   int *charbuf_end
2338     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2339       /* We can produce up to 2 characters in a loop.  */
2340       - 1;
2341   EMACS_INT consumed_chars = 0, consumed_chars_base;
2342   int multibytep = coding->src_multibyte;
2343   EMACS_INT char_offset = coding->produced_char;
2344   EMACS_INT last_offset = char_offset;
2345   int last_id = charset_ascii;
2346   int eol_dos =
2347     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2348   int byte_after_cr = -1;
2349   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2350
       /* Resume a composition left open by the previous call: put its
          saved elements back at the front of the output.  */
2351   if (cmp_status->state != COMPOSING_NO)
2352     {
2353       int i;
2354
2355       if (charbuf_end - charbuf < cmp_status->length)
2356         abort ();
2357       for (i = 0; i < cmp_status->length; i++)
2358         *charbuf++ = cmp_status->carryover[i];
2359       coding->annotated = 1;
2360     }
2361
2362   while (1)
2363     {
2364       int c, id IF_LINT (= 0);
2365
           /* Remember where this unit starts.  These base values are
              what is reported as consumed on exit and what the
              invalid_code path rewinds to.  */
2366       src_base = src;
2367       consumed_chars_base = consumed_chars;
2368
2369       if (charbuf >= charbuf_end)
2370         {
               /* Out of room.  A byte that was pre-read after a CR has
                  not been processed yet, so give it back.  */
2371           if (byte_after_cr >= 0)
2372             src_base--;
2373           break;
2374         }
2375
2376       if (byte_after_cr >= 0)
2377         c = byte_after_cr, byte_after_cr = -1;
2378       else
2379         ONE_MORE_BYTE (c);
2380
           /* 0x80 introduces a composition.  NOTE(review): a negative C
              presumably denotes a raw byte delivered by ONE_MORE_BYTE
              from multibyte source -- confirm against that macro.  */
2381       if (c < 0 || c == 0x80)
2382         {
2383           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2384           if (c < 0)
2385             {
2386               *charbuf++ = -c;
2387               char_offset++;
2388             }
2389           else
2390             DECODE_EMACS_MULE_COMPOSITION_START ();
2391           continue;
2392         }
2393
2394       if (c < 0x80)
2395         {
               /* With DOS end-of-line conversion, pre-read the byte that
                  follows a CR; it becomes C on the next iteration.  */
2396           if (eol_dos && c == '\r')
2397             ONE_MORE_BYTE (byte_after_cr);
2398           id = charset_ascii;
               /* An ASCII byte inside a composition ends an old-form
                  composition; otherwise, while components are being
                  read, it is counted as one component.  */
2399           if (cmp_status->state != COMPOSING_NO)
2400             {
2401               if (cmp_status->old_form)
2402                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2403               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2404                 cmp_status->ncomps--;
2405             }
2406         }
2407       else
2408         {
2409           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2410           /* emacs_mule_char can load a charset map from a file, which
2411              allocates a large structure and might cause buffer text
2412              to be relocated as result.  Thus, we need to remember the
2413              original pointer to buffer text, and fix up all related
2414              pointers after the call.  */
2415           const unsigned char *orig = coding->source;
2416           EMACS_INT offset;
2417
2418           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2419                                cmp_status);
2420           offset = coding->source - orig;
2421           if (offset)
2422             {
2423               src += offset;
2424               src_base += offset;
2425               src_end += offset;
2426             }
               /* -1 is treated as an invalid sequence; -2 stops decoding
                  without consuming the current unit.  Other negative
                  values are handled by the composition states below.  */
2427           if (c < 0)
2428             {
2429               if (c == -1)
2430                 goto invalid_code;
2431               if (c == -2)
2432                 break;
2433             }
               /* Commit the byte and character counts that
                  emacs_mule_char reported.  */
2434           src = src_base + nbytes;
2435           consumed_chars = consumed_chars_base + nchars;
2436           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2437             cmp_status->ncomps -= nchars;
2438         }
2439
2440       /* Now if C >= 0, we found a normally encoded character, if C <
2441          0, we found an old-style composition component character or
2442          rule.  */
2443
2444       if (cmp_status->state == COMPOSING_NO)
2445         {
               /* Plain text.  When the charset changes, close the
                  previous non-ASCII run with a charset annotation and
                  start a new run at the current offset.  */
2446           if (last_id != id)
2447             {
2448               if (last_id != charset_ascii)
2449                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2450                                   last_id);
2451               last_id = id;
2452               last_offset = char_offset;
2453             }
2454           *charbuf++ = c;
2455           char_offset++;
2456         }
2457       else if (cmp_status->state == COMPOSING_CHAR)
2458         {
2459           if (cmp_status->old_form)
2460             {
                   /* Old form: a normally encoded character ends the
                      composition; a negated one is a component.  */
2461               if (c >= 0)
2462                 {
2463                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2464                   *charbuf++ = c;
2465                   char_offset++;
2466                 }
2467               else
2468                 {
2469                   *charbuf++ = -c;
2470                   cmp_status->nchars++;
2471                   cmp_status->length++;
2472                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2473                     EMACS_MULE_COMPOSITION_END ();
2474                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2475                     cmp_status->state = COMPOSING_RULE;
2476                 }
2477             }
2478           else
2479             {
                   /* New form: nchars counts down the characters still
                      expected; the composition ends when it reaches 0.  */
2480               *charbuf++ = c;
2481               cmp_status->length++;
2482               cmp_status->nchars--;
2483               if (cmp_status->nchars == 0)
2484                 EMACS_MULE_COMPOSITION_END ();
2485             }
2486         }
2487       else if (cmp_status->state == COMPOSING_RULE)
2488         {
2489           int rule;
2490
2491           if (c >= 0)
2492             {
2493               EMACS_MULE_COMPOSITION_END ();
2494               *charbuf++ = c;
2495               char_offset++;
2496             }
2497           else
2498             {
                   /* Store the rule as the pair (-2, RULE) and go back
                      to expecting a character.  */
2499               c = -c;
2500               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2501               if (rule < 0)
2502                 goto invalid_code;
2503               *charbuf++ = -2;
2504               *charbuf++ = rule;
2505               cmp_status->length += 2;
2506               cmp_status->state = COMPOSING_CHAR;
2507             }
2508         }
2509       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2510         {
               /* ncomps was already decremented above for this
                  character: 0 means the components are complete, a
                  negative value means more were read than announced.  */
2511           *charbuf++ = c;
2512           cmp_status->length++;
2513           if (cmp_status->ncomps == 0)
2514             cmp_status->state = COMPOSING_CHAR;
2515           else if (cmp_status->ncomps > 0)
2516             {
2517               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2518                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2519             }
2520           else
2521             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2522         }
2523       else                      /* COMPOSING_COMPONENT_RULE */
2524         {
2525           int rule;
2526
2527           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2528           if (rule < 0)
2529             goto invalid_code;
2530           *charbuf++ = -2;
2531           *charbuf++ = rule;
2532           cmp_status->length += 2;
2533           cmp_status->ncomps--;
2534           if (cmp_status->ncomps > 0)
2535             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2536           else
2537             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2538         }
2539       continue;
2540
2541     invalid_code:
           /* Flush any composition, rewind to the start of the offending
              unit, and emit its first byte as a character by itself (a
              BYTE8 character when it is not ASCII), counting one error.  */
2542       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2543       src = src_base;
2544       consumed_chars = consumed_chars_base;
2545       ONE_MORE_BYTE (c);
2546       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2547       char_offset++;
2548       coding->errors++;
2549     }
2550
2551  no_more_source:
2552   if (cmp_status->state != COMPOSING_NO)
2553     {
2554       if (coding->mode & CODING_MODE_LAST_BLOCK)
2555         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2556       else
2557         {
2558           int i;
2559
               /* Save the unfinished composition: take its elements back
                  out of the output and keep them in carryover.  */
2560           charbuf -= cmp_status->length;
2561           for (i = 0; i < cmp_status->length; i++)
2562             cmp_status->carryover[i] = charbuf[i];
2563         }
2564     }
2565   if (last_id != charset_ascii)
2566     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Report the *_base values so that a unit that was only partly
          read is not counted as consumed.  */
2567   coding->consumed_char += consumed_chars_base;
2568   coding->consumed = src_base - coding->source;
2569   coding->charbuf_used = charbuf - coding->charbuf;
2570 }
2571
2572
/* Store in CODES the leading code(s) that emacs-mule uses for the
   charset whose emacs-mule id is ID.  An ID below 0xA0 is itself the
   only leading code, and CODES[1] is set to 0.  Any other ID is
   stored in CODES[1], preceded in CODES[0] by 0x9A, 0x9B, 0x9C or
   0x9D for the ranges 0xA0..0xDF, 0xE0..0xEF, 0xF0..0xF4, and 0xF5
   and above respectively.

   ID and CODES are evaluated more than once, so they must not have
   side effects.  The expansion deliberately has no trailing
   semicolon: the caller supplies it, which keeps the macro usable as
   the unbraced body of an if/else.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2586
2587
     /* Encode the codes in CODING->charbuf (CODING->charbuf_used
        elements) into emacs-mule bytes written at CODING->destination +
        CODING->produced.

        A negative element introduces an annotation; only the charset
        annotation is acted upon, and it selects a preferred charset for
        the characters that follow.  ASCII and raw 8-bit characters are
        emitted as a single byte.  Any other character is emitted as the
        one or two leading codes of its charset followed by one or two
        code bytes with the high bit set.

        When the loop completes, record CODING_RESULT_SUCCESS, update
        CODING->produced_char and CODING->produced, and return 0.  */
2588 static int
2589 encode_coding_emacs_mule (struct coding_system *coding)
2590 {
2591   int multibytep = coding->dst_multibyte;
2592   int *charbuf = coding->charbuf;
2593   int *charbuf_end = charbuf + coding->charbuf_used;
2594   unsigned char *dst = coding->destination + coding->produced;
2595   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2596   int safe_room = 8;
2597   EMACS_INT produced_chars = 0;
2598   Lisp_Object attrs, charset_list;
2599   int c;
2600   int preferred_charset_id = -1;
2601
2602   CODING_GET_INFO (coding, attrs, charset_list);
       /* This coding system always works with Vemacs_mule_charset_list;
          install it in the attributes if something else is there.  */
2603   if (! EQ (charset_list, Vemacs_mule_charset_list))
2604     {
2605       CODING_ATTR_CHARSET_LIST (attrs)
2606         = charset_list = Vemacs_mule_charset_list;
2607     }
2608
2609   while (charbuf < charbuf_end)
2610     {
           /* NOTE(review): presumably makes sure SAFE_ROOM bytes are
              available at DST before one element is encoded -- confirm
              against the definition of ASSURE_DESTINATION.  */
2611       ASSURE_DESTINATION (safe_room);
2612       c = *charbuf++;
2613
2614       if (c < 0)
2615         {
2616           /* Handle an annotation.  */
               /* -C is the total length of the annotation including the
                  length element just read; CHARBUF now points at the
                  element after it, which holds the annotation type.  */
2617           switch (*charbuf)
2618             {
2619             case CODING_ANNOTATE_COMPOSITION_MASK:
2620               /* Not yet implemented.  */
2621               break;
2622             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF points at the type element
                      here.  If a charset annotation is four elements
                      long (length, type, nchars, id), the id would be at
                      charbuf[2], and charbuf[3] would be the element
                      that follows the annotation -- confirm against
                      ADD_CHARSET_DATA.  */
2623               preferred_charset_id = charbuf[3];
2624               if (preferred_charset_id >= 0
2625                   && NILP (Fmemq (make_number (preferred_charset_id),
2626                                   charset_list)))
2627                 preferred_charset_id = -1;
2628               break;
2629             default:
2630               abort ();
2631             }
               /* Skip the rest of the annotation.  */
2632           charbuf += -c - 1;
2633           continue;
2634         }
2635
2636       if (ASCII_CHAR_P (c))
2637         EMIT_ONE_ASCII_BYTE (c);
2638       else if (CHAR_BYTE8_P (c))
2639         {
2640           c = CHAR_TO_BYTE8 (c);
2641           EMIT_ONE_BYTE (c);
2642         }
2643       else
2644         {
2645           struct charset *charset;
2646           unsigned code;
2647           int dimension;
2648           int emacs_mule_id;
2649           unsigned char leading_codes[2];
2650
               /* Use the preferred charset when it contains C; otherwise
                  search CHARSET_LIST.  */
2651           if (preferred_charset_id >= 0)
2652             {
2653               charset = CHARSET_FROM_ID (preferred_charset_id);
2654               if (CHAR_CHARSET_P (c, charset))
2655                 code = ENCODE_CHAR (charset, c);
2656               else
2657                 charset = char_charset (c, charset_list, &code);
2658             }
2659           else
2660             charset = char_charset (c, charset_list, &code);
2661           if (! charset)
2662             {
                   /* No charset can encode C: substitute the coding
                      system's default character.  */
2663               c = coding->default_char;
2664               if (ASCII_CHAR_P (c))
2665                 {
2666                   EMIT_ONE_ASCII_BYTE (c);
2667                   continue;
2668                 }
                   /* NOTE(review): the result of this second lookup is
                      used below without a check for a null pointer --
                      confirm that default_char is always encodable.  */
2669               charset = char_charset (c, charset_list, &code);
2670             }
2671           dimension = CHARSET_DIMENSION (charset);
2672           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2673           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2674           EMIT_ONE_BYTE (leading_codes[0]);
2675           if (leading_codes[1])
2676             EMIT_ONE_BYTE (leading_codes[1]);
               /* Code bytes are emitted with the high bit set.  */
2677           if (dimension == 1)
2678             EMIT_ONE_BYTE (code | 0x80);
2679           else
2680             {
2681               code |= 0x8080;
2682               EMIT_ONE_BYTE (code >> 8);
2683               EMIT_ONE_BYTE (code & 0xFF);
2684             }
2685         }
2686     }
2687   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2688   coding->produced_char += produced_chars;
2689   coding->produced = dst - coding->destination;
2690   return 0;
2691 }
2692
2693 \f
2694 /*** 7. ISO2022 handlers ***/
2695
2696 /* The following note describes the coding system ISO2022 briefly.
2697    Since the intention of this note is to help understand the
2698    functions in this file, some parts are NOT ACCURATE or are OVERLY
2699    SIMPLIFIED.  For thorough understanding, please refer to the
2700    original document of ISO2022.  This is equivalent to the standard
2701    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2702
2703    ISO2022 provides many mechanisms to encode several character sets
2704    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2705    is encoded using bytes less than 128.  This may make the encoded
2706    text a little bit longer, but the text passes more easily through
2707    several types of gateway, some of which strip off the MSB (Most
2708    Significant Bit).
2709
2710    There are two kinds of character sets: control character sets and
2711    graphic character sets.  The former contain control characters such
2712    as `newline' and `escape' to provide control functions (control
2713    functions are also provided by escape sequences).  The latter
2714    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2715    two control character sets and many graphic character sets.
2716
2717    Graphic character sets are classified into one of the following
2718    four classes, according to the number of bytes (DIMENSION) and
2719    number of characters in one dimension (CHARS) of the set:
2720    - DIMENSION1_CHARS94
2721    - DIMENSION1_CHARS96
2722    - DIMENSION2_CHARS94
2723    - DIMENSION2_CHARS96
2724
2725    In addition, each character set is assigned an identification tag,
2726    unique for each set, called the "final character" (denoted as <F>
2727    hereafter).  The <F> of each character set is decided by ECMA(*)
2728    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2729    (0x30..0x3F are for private use only).
2730
2731    Note (*): ECMA = European Computer Manufacturers Association
2732
2733    Here are examples of graphic character sets [NAME(<F>)]:
2734         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2735         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2736         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2737         o DIMENSION2_CHARS96 -- none for the moment
2738
2739    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2740         C0 [0x00..0x1F] -- control character plane 0
2741         GL [0x20..0x7F] -- graphic character plane 0
2742         C1 [0x80..0x9F] -- control character plane 1
2743         GR [0xA0..0xFF] -- graphic character plane 1
2744
2745    A control character set is directly designated and invoked to C0 or
2746    C1 by an escape sequence.  The most common case is that:
2747    - ISO646's  control character set is designated/invoked to C0, and
2748    - ISO6429's control character set is designated/invoked to C1,
2749    and usually these designations/invocations are omitted in encoded
2750    text.  In a 7-bit environment, only C0 can be used, and a control
2751    character for C1 is encoded by an appropriate escape sequence to
2752    fit into the environment.  All control characters for C1 are
2753    defined to have corresponding escape sequences.
2754
2755    A graphic character set is at first designated to one of four
2756    graphic registers (G0 through G3), then these graphic registers are
2757    invoked to GL or GR.  These designations and invocations can be
2758    done independently.  The most common case is that G0 is invoked to
2759    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2760    these invocations and designations are omitted in encoded text.
2761    In a 7-bit environment, only GL can be used.
2762
2763    When a graphic character set of CHARS94 is invoked to GL, codes
2764    0x20 and 0x7F of the GL area work as control characters SPACE and
2765    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2766    be used.
2767
2768    There are two ways of invocation: locking-shift and single-shift.
2769    With locking-shift, the invocation lasts until the next different
2770    invocation, whereas with single-shift, the invocation affects the
2771    following character only and doesn't affect the locking-shift
2772    state.  Invocations are done by the following control characters or
2773    escape sequences:
2774
2775    ----------------------------------------------------------------------
2776    abbrev  function                  cntrl escape seq   description
2777    ----------------------------------------------------------------------
2778    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2779    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2780    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2781    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2782    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2783    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2784    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2785    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2786    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2787    ----------------------------------------------------------------------
2788    (*) These are not used by any known coding system.
2789
2790    Control characters for these functions are defined by macros
2791    ISO_CODE_XXX in `coding.h'.
2792
2793    Designations are done by the following escape sequences:
2794    ----------------------------------------------------------------------
2795    escape sequence      description
2796    ----------------------------------------------------------------------
2797    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2798    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2799    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2800    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2801    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2802    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2803    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2804    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2805    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2806    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2807    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2808    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2809    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2810    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2811    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2812    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2813    ----------------------------------------------------------------------
2814
2815    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2816    of dimension 1, chars 94, and final character <F>, etc...
2817
2818    Note (*): Although these designations are not allowed in ISO2022,
2819    Emacs accepts them on decoding, and produces them on encoding
2820    CHARS96 character sets in a coding system which is characterized as
2821    7-bit environment, non-locking-shift, and non-single-shift.
2822
2823    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2824    '(' must be omitted.  We refer to this as "short-form" hereafter.
2825
2826    Now you may notice that there are a lot of ways of encoding the
2827    same multilingual text in ISO2022.  Actually, there exist many
2828    coding systems such as Compound Text (used in X11's inter-client
2829    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2830    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2831    localized platforms), and all of these are variants of ISO2022.
2832
2833    In addition to the above, Emacs handles two more kinds of escape
2834    sequences: ISO6429's direction specification and Emacs' private
2835    sequence for specifying character composition.
2836
2837    ISO6429's direction specification takes the following form:
2838         o CSI ']'      -- end of the current direction
2839         o CSI '0' ']'  -- end of the current direction
2840         o CSI '1' ']'  -- start of left-to-right text
2841         o CSI '2' ']'  -- start of right-to-left text
2842    The control character CSI (0x9B: control sequence introducer) is
2843    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2844
2845    Character composition specification takes the following form:
2846         o ESC '0' -- start relative composition
2847         o ESC '1' -- end composition
2848         o ESC '2' -- start rule-base composition (*)
2849         o ESC '3' -- start relative composition with alternate chars  (**)
2850         o ESC '4' -- start rule-base composition with alternate chars  (**)
2851   Since these are not standard escape sequences of any ISO standard,
2852   the use of them with these meanings is restricted to Emacs only.
2853
2854   (*) This form is used only in Emacs 20.7 and older versions,
2855   but newer versions can safely decode it.
2856   (**) This form is used only in Emacs 21.1 and newer versions,
2857   and older versions can't decode it.
2858
2859   Here's a list of example usages of these composition escape
2860   sequences (categorized by `enum composition_method').
2861
2862   COMPOSITION_RELATIVE:
2863         ESC 0 CHAR [ CHAR ] ESC 1
2864   COMPOSITION_WITH_RULE:
2865         ESC 2 CHAR [ RULE CHAR ] ESC 1
2866   COMPOSITION_WITH_ALTCHARS:
2867         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2868   COMPOSITION_WITH_RULE_ALTCHARS:
2869         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2870
     /* Table of 256 `enum iso_code_class_type' values.  It is not
        referenced in this part of the file; presumably it is indexed by
        a byte value and filled in during initialization -- confirm
        where it is set up.  */
2871 static enum iso_code_class_type iso_code_class[256];
2872
/* Return nonzero if the charset whose id is ID is marked safe in
   CODING, i.e. ID is within 0..CODING->max_charset_id and the
   corresponding byte of CODING->safe_charsets is not 255 (the value
   that setup_iso_safe_charsets stores for a charset that has no
   register).  A negative ID, which a lookup in iso_charset_table can
   yield, is reported as not safe instead of indexing before the
   start of the table.  ID and CODING are evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2876
     /* Compute the safe-charsets table of the ISO-2022 coding system
        whose attribute vector is ATTRS and store it in the
        coding_attr_safe_charsets slot.

        The table is a string with one byte per charset id, from 0 up to
        the largest id in the charset list.  A byte of 255 marks a
        charset that has no entry; any other value is taken from the
        coding system's request list or, failing that, from its default
        for 94- or 96-character sets (presumably the number of the
        graphic register to designate the charset to -- confirm).

        Nothing is done when the slot already holds a string, unless the
        coding system has CODING_ISO_FLAG_FULL_SUPPORT and its charset
        list is not Viso_2022_charset_list, in which case the list is
        replaced and the table is rebuilt.  */
2877 static void
2878 setup_iso_safe_charsets (Lisp_Object attrs)
2879 {
2880   Lisp_Object charset_list, safe_charsets;
2881   Lisp_Object request;
2882   Lisp_Object reg_usage;
2883   Lisp_Object tail;
2884   int reg94, reg96;
2885   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2886   int max_charset_id;
2887
2888   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
       /* A full-support coding system always uses
          Viso_2022_charset_list; switching to it invalidates any table
          computed for the old list.  */
2889   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2890       && ! EQ (charset_list, Viso_2022_charset_list))
2891     {
2892       CODING_ATTR_CHARSET_LIST (attrs)
2893         = charset_list = Viso_2022_charset_list;
2894       ASET (attrs, coding_attr_safe_charsets, Qnil);
2895     }
2896
       /* The table has already been computed.  */
2897   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2898     return;
2899
       /* The table must be large enough to be indexed by the largest
          charset id in the list.  */
2900   max_charset_id = 0;
2901   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2902     {
2903       int id = XINT (XCAR (tail));
2904       if (max_charset_id < id)
2905         max_charset_id = id;
2906     }
2907
       /* Start with every charset marked as having no entry (255).  */
2908   safe_charsets = make_uninit_string (max_charset_id + 1);
2909   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2910   request = AREF (attrs, coding_attr_iso_request);
2911   reg_usage = AREF (attrs, coding_attr_iso_usage);
2912   reg94 = XINT (XCAR (reg_usage));
2913   reg96 = XINT (XCDR (reg_usage));
2914
2915   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2916     {
2917       Lisp_Object id;
2918       Lisp_Object reg;
2919       struct charset *charset;
2920
2921       id = XCAR (tail);
2922       charset = CHARSET_FROM_ID (XINT (id));
           /* An explicit entry in the request list takes precedence.
              Otherwise use the default for the charset's kind, but only
              when that default is below 4.  */
2923       reg = Fcdr (Fassq (id, request));
2924       if (! NILP (reg))
2925         SSET (safe_charsets, XINT (id), XINT (reg));
2926       else if (charset->iso_chars_96)
2927         {
2928           if (reg96 < 4)
2929             SSET (safe_charsets, XINT (id), reg96);
2930         }
2931       else
2932         {
2933           if (reg94 < 4)
2934             SSET (safe_charsets, XINT (id), reg94);
2935         }
2936     }
2937   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2938 }
2939
2940
2941 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2942    Check if a text is encoded in one of ISO-2022 based coding systems.
2943    If it is, return 1, else return 0.  */
     /* More precisely: scan CODING->source, accumulating in `found' and
        `rejected' the ISO category masks that the bytes support or rule
        out.  When the source is exhausted, merge both sets into
        DETECT_INFO and return 1.  If every ISO category gets rejected
        before that, mark them all rejected in DETECT_INFO and return
        0.  */
2944
2945 static int
2946 detect_coding_iso_2022 (struct coding_system *coding,
2947                         struct coding_detection_info *detect_info)
2948 {
2949   const unsigned char *src = coding->source, *src_base = src;
2950   const unsigned char *src_end = coding->source + coding->src_bytes;
2951   int multibytep = coding->src_multibyte;
2952   int single_shifting = 0;
2953   int id;
2954   int c, c1;
2955   EMACS_INT consumed_chars = 0;
2956   int i;
2957   int rejected = 0;
2958   int found = 0;
2959   int composition_count = -1;
2960
2961   detect_info->checked |= CATEGORY_MASK_ISO;
2962
       /* Refresh the safe-charsets table of every configured ISO
          category, since SAFE_CHARSET_P below reads max_charset_id and
          safe_charsets from the category's coding system.  */
2963   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2964     {
2965       struct coding_system *this = &(coding_categories[i]);
2966       Lisp_Object attrs, val;
2967
2968       if (this->id < 0)
2969         continue;
2970       attrs = CODING_ID_ATTRS (this->id);
2971       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2972           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2973         setup_iso_safe_charsets (attrs);
2974       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2975       this->max_charset_id = SCHARS (val) - 1;
2976       this->safe_charsets = SDATA (val);
2977     }
2978
2979   /* A coding system of this category is always ASCII compatible.  */
2980   src += coding->head_ascii;
2981
2982   while (rejected != CATEGORY_MASK_ISO)
2983     {
2984       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE presumably jumps to
              no_more_source when the input is exhausted -- confirm
              against its definition.  */
2985       ONE_MORE_BYTE (c);
2986       switch (c)
2987         {
2988         case ISO_CODE_ESC:
2989           if (inhibit_iso_escape_detection)
2990             break;
2991           single_shifting = 0;
2992           ONE_MORE_BYTE (c);
2993           if (c == 'N' || c == 'O')
2994             {
2995               /* ESC <Fe> for SS2 or SS3.  */
2996               single_shifting = 1;
2997               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2998             }
2999           else if (c == '1')
3000             {
3001               /* End of composition.  */
3002               if (composition_count < 0
3003                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3004                 /* Invalid */
3005                 break;
3006               composition_count = -1;
3007               found |= CATEGORY_MASK_ISO;
3008             }
3009           else if (c >= '0' && c <= '4')
3010             {
3011               /* ESC <Fp> for start/end composition.  */
3012               composition_count = 0;
3013             }
3014           else
3015             {
3016               if (c >= '(' && c <= '/')
3017                 {
3018                   /* Designation sequence for a charset of dimension 1.  */
3019                   ONE_MORE_BYTE (c1);
3020                   if (c1 < ' ' || c1 >= 0x80
3021                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3022                     /* Invalid designation sequence.  Just ignore.  */
3023                     break;
3024                 }
3025               else if (c == '$')
3026                 {
3027                   /* Designation sequence for a charset of dimension 2.  */
3028                   ONE_MORE_BYTE (c);
3029                   if (c >= '@' && c <= 'B')
3030                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the other two table
                            lookups, this result is not tested for < 0
                            before it reaches SAFE_CHARSET_P below --
                            confirm that a negative id is handled
                            there.  */
3031                     id = iso_charset_table[1][0][c];
3032                   else if (c >= '(' && c <= '/')
3033                     {
3034                       ONE_MORE_BYTE (c1);
3035                       if (c1 < ' ' || c1 >= 0x80
3036                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3037                         /* Invalid designation sequence.  Just ignore.  */
3038                         break;
3039                     }
3040                   else
3041                     /* Invalid designation sequence.  Just ignore it.  */
3042                     break;
3043                 }
3044               else
3045                 {
3046                   /* Invalid escape sequence.  Just ignore it.  */
3047                   break;
3048                 }
3049
3050               /* We found a valid designation sequence for CHARSET.  */
                   /* Each of the four categories below is kept as found
                      only if the designated charset is safe for it;
                      otherwise it is rejected.  */
3051               rejected |= CATEGORY_MASK_ISO_8BIT;
3052               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3053                                   id))
3054                 found |= CATEGORY_MASK_ISO_7;
3055               else
3056                 rejected |= CATEGORY_MASK_ISO_7;
3057               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3058                                   id))
3059                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3060               else
3061                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3062               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3063                                   id))
3064                 found |= CATEGORY_MASK_ISO_7_ELSE;
3065               else
3066                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3067               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3068                                   id))
3069                 found |= CATEGORY_MASK_ISO_8_ELSE;
3070               else
3071                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3072             }
3073           break;
3074
3075         case ISO_CODE_SO:
3076         case ISO_CODE_SI:
3077           /* Locking shift out/in.  */
3078           if (inhibit_iso_escape_detection)
3079             break;
3080           single_shifting = 0;
3081           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3082           break;
3083
3084         case ISO_CODE_CSI:
3085           /* Control sequence introducer.  */
3086           single_shifting = 0;
3087           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3088           found |= CATEGORY_MASK_ISO_8_ELSE;
3089           goto check_extra_latin;
3090
3091         case ISO_CODE_SS2:
3092         case ISO_CODE_SS3:
3093           /* Single shift.   */
3094           if (inhibit_iso_escape_detection)
3095             break;
3096           single_shifting = 0;
3097           rejected |= CATEGORY_MASK_ISO_7BIT;
3098           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3099               & CODING_ISO_FLAG_SINGLE_SHIFT)
3100             {
3101               found |= CATEGORY_MASK_ISO_8_1;
3102               single_shifting = 1;
3103             }
3104           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3105               & CODING_ISO_FLAG_SINGLE_SHIFT)
3106             {
3107               found |= CATEGORY_MASK_ISO_8_2;
3108               single_shifting = 1;
3109             }
3110           if (single_shifting)
3111             break;
               /* Reached by falling through when neither 8-bit category
                  uses single shift, and by goto from the CSI case: the
                  byte is acceptable only if Vlatin_extra_code_table has
                  a non-nil entry for it.  */
3112         check_extra_latin:
3113           if (! VECTORP (Vlatin_extra_code_table)
3114               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3115             {
3116               rejected = CATEGORY_MASK_ISO;
3117               break;
3118             }
3119           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3120               & CODING_ISO_FLAG_LATIN_EXTRA)
3121             found |= CATEGORY_MASK_ISO_8_1;
3122           else
3123             rejected |= CATEGORY_MASK_ISO_8_1;
3124           rejected |= CATEGORY_MASK_ISO_8_2;
3125           break;
3126
3127         default:
3128           if (c < 0)
3129             continue;
3130           if (c < 0x80)
3131             {
3132               if (composition_count >= 0)
3133                 composition_count++;
3134               single_shifting = 0;
3135               break;
3136             }
3137           if (c >= 0xA0)
3138             {
3139               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3140               found |= CATEGORY_MASK_ISO_8_1;
3141               /* Check the length of succeeding codes of the range
3142                  0xA0..0xFF.  If the byte length is even, we include
3143                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3144                  only when we are not single shifting.  */
3145               if (! single_shifting
3146                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3147                 {
3148                   int len = 1;
3149                   while (src < src_end)
3150                     {
3151                       src_base = src;
3152                       ONE_MORE_BYTE (c);
3153                       if (c < 0xA0)
3154                         {
                               /* Not part of the run: put the byte back
                                  for the main loop.  */
3155                           src = src_base;
3156                           break;
3157                         }
3158                       len++;
3159                     }
3160
                       /* An odd-length run rules out the two-byte
                          category, unless the run was cut short by the
                          end of the input.  */
3161                   if (len & 1 && src < src_end)
3162                     {
3163                       rejected |= CATEGORY_MASK_ISO_8_2;
3164                       if (composition_count >= 0)
3165                         composition_count += len;
3166                     }
3167                   else
3168                     {
3169                       found |= CATEGORY_MASK_ISO_8_2;
3170                       if (composition_count >= 0)
3171                         composition_count += len / 2;
3172                     }
3173                 }
3174               break;
3175             }
               /* NOTE(review): a byte in 0x80..0x9F that matched no case
                  label above reaches this point and leaves the switch
                  without changing `found' or `rejected' -- confirm that
                  this is intended.  */
3176         }
3177     }
       /* The loop ended because every ISO category was rejected.  */
3178   detect_info->rejected |= CATEGORY_MASK_ISO;
3179   return 0;
3180
3181  no_more_source:
3182   detect_info->rejected |= rejected;
3183   detect_info->found |= (found & ~rejected);
3184   return 1;
3185 }
3186
3187
3188 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3189    escape sequence should be kept.

        REG selects the slot of CODING_ISO_DESIGNATION to update; DIM,
        CHARS_96 and FINAL are the arguments of the ISO_CHARSET_TABLE
        lookup.  CHARS_96 must be an lvalue because the macro assigns to
        it, and FINAL and CHARS_96 are expanded more than once, so pass
        side-effect-free expressions.  A variable named `coding' must be
        visible where the macro is expanded.

        When FINAL is below '0' or not below 128, when the table lookup
        yields a negative id, or when SAFE_CHARSET_P rejects the id for
        CODING, the slot is set to -2, CHARS_96 is set to -1, and the
        `break' leaves the macro's own do-while.

        Otherwise the id is stored in the slot, after replacing
        charset_jisx0201_roman by charset_ascii if
        CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 by
        charset_jisx0208 if CODING_ISO_FLAG_USE_OLDJIS is set.  */
3190 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3191   do {                                                                  \
3192     int id, prev;                                                       \
3193                                                                         \
3194     if (final < '0' || final >= 128                                     \
3195         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3196         || !SAFE_CHARSET_P (coding, id))                                \
3197       {                                                                 \
3198         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3199         chars_96 = -1;                                                  \
3200         break;                                                          \
3201       }                                                                 \
3202     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3203     if (id == charset_jisx0201_roman)                                   \
3204       {                                                                 \
3205         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3206           id = charset_ascii;                                           \
3207       }                                                                 \
3208     else if (id == charset_jisx0208_1978)                               \
3209       {                                                                 \
3210         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3211           id = charset_jisx0208;                                        \
3212       }                                                                 \
3213     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3214     /* If there was an invalid designation to REG previously, and this  \
3215        designation is ASCII to REG, we should keep this designation     \
3216        sequence.  */                                                    \
3217     if (prev == -2 && id == charset_ascii)                              \
3218       chars_96 = -1;                                                    \
3219   } while (0)
3220
3221
3222 /* Handle these composition sequences (ALT: alternate char):
3223
3224    (1) relative composition: ESC 0 CHAR ... ESC 1
3225    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3226    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3227    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3228
3229    When the start sequence (ESC 0/2/3/4) is found, this annotation
3230    header is produced.
3231
3232         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3233
3234    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3235    produced until the end sequence (ESC 1) is found:
3236
3237    (1) CHAR ... CHAR
3238    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3239    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3240    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3241
3242    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3243    annotation header are updated as below:
3244
3245    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3246    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3247    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3248    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3249
3250    If an error is found while composing, the annotation header is
3251    changed to:
3252
3253         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3254
3255    and the sequence [ -2 DECODED-RULE ] is changed to the original
3256    byte sequence as below:
3257         o the original byte sequence is B: [ B -1 ]
3258         o the original byte sequence is B1 B2: [ B1 B2 ]
3259    and the sequence [ -1 -1 ] is changed to the original byte
3260    sequence:
3261         [ ESC '0' ]
3262 */
3263
3264 /* Decode a composition rule C1 and maybe one more byte from the
3265    source, and set RULE to the encoded composition rule.  If the rule
3266    is invalid, goto invalid_code.

        C1 is not a parameter: the macro reads the variable `c1' of the
        expansion scope, and RULE must be an assignable lvalue.  With
        R = C1 - 32:

          R < 0    invalid.
          R < 81   old one-byte format: gref is R / 9 and nref is R % 9,
                   where a value of 4 is replaced by 10 before
                   COMPOSITION_ENCODE_RULE combines the two.
          R >= 81  new two-byte format: a second byte B is fetched with
                   ONE_MORE_BYTE, the pair (R - 81, B - 32) is checked by
                   COMPOSITION_ENCODE_RULE_VALID and then combined by
                   COMPOSITION_ENCODE_RULE, and 0x100 is added so that
                   the result can be told apart from the old format.

        NOTE(review): ONE_MORE_BYTE is defined elsewhere; presumably it
        can jump out of the macro when the source is exhausted --
        confirm.  */
3267
3268 #define DECODE_COMPOSITION_RULE(rule)                                   \
3269   do {                                                                  \
3270     rule = c1 - 32;                                                     \
3271     if (rule < 0)                                                       \
3272       goto invalid_code;                                                \
3273     if (rule < 81)              /* old format (before ver.21) */        \
3274       {                                                                 \
3275         int gref = (rule) / 9;                                          \
3276         int nref = (rule) % 9;                                          \
3277         if (gref == 4) gref = 10;                                       \
3278         if (nref == 4) nref = 10;                                       \
3279         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3280       }                                                                 \
3281     else                        /* new format (after ver.21) */         \
3282       {                                                                 \
3283         int b;                                                          \
3284                                                                         \
3285         ONE_MORE_BYTE (b);                                              \
3286         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3287           goto invalid_code;                                            \
3288         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3289         rule += 0x100;   /* Distinguish it from the old format.  */     \
3290       }                                                                 \
3291   } while (0)
3292
/* Turn the decoded composition rule RULE back into the values it was
   decoded from; finish_composition uses this while undoing a
   composition.

   The macro uses `charbuf', `idx' and `new_chars' of the expansion
   scope: the two slots charbuf[idx] and charbuf[idx + 1] are
   overwritten and new_chars is advanced.  RULE is expanded several
   times, but it is only read before either slot is written.

   RULE < 0x100 (old format): the slots become [ B -1 ], where B is
   32 + gref * 9 + nref after replacing a value of 10 by 4, and
   new_chars grows by one.
   Otherwise (new format): the slots become
   [ 32 + 81 + gref, 32 + nref ] and new_chars grows by two.

   NOTE(review): splitting RULE % 0x100 by 12 presumably mirrors how
   COMPOSITION_ENCODE_RULE packs gref and nref; that macro is defined
   elsewhere -- confirm.  */
3293 #define ENCODE_COMPOSITION_RULE(rule)                           \
3294   do {                                                          \
3295     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3296                                                                 \
3297     if (rule < 0x100)           /* old format */                \
3298       {                                                         \
3299         if (gref == 10) gref = 4;                               \
3300         if (nref == 10) nref = 4;                               \
3301         charbuf[idx] = 32 + gref * 9 + nref;                    \
3302         charbuf[idx + 1] = -1;                                  \
3303         new_chars++;                                            \
3304       }                                                         \
3305     else                                /* new format */        \
3306       {                                                         \
3307         charbuf[idx] = 32 + 81 + gref;                          \
3308         charbuf[idx + 1] = 32 + nref;                           \
3309         new_chars += 2;                                         \
3310       }                                                         \
3311   } while (0)
3312
3313 /* Finish the current composition as invalid.

        CHARBUF must point just past the data stored for the composition
        in progress, so that CHARBUF[-CMP_STATUS->length] is the first
        element of its annotation header.  The header and the internal
        markers are rewritten in place:

          o the first five elements become ESC, the start character
            '0'/'2'/'3'/'4' chosen from CMP_STATUS->method, -2, 0, -1;
          o for methods >= COMPOSITION_WITH_RULE, each [ -2 RULE ] pair
            is rewritten by ENCODE_COMPOSITION_RULE, and the two
            elements starting at a -1 become ESC '0'.

        CMP_STATUS->state is reset to COMPOSING_NO.  The return value is
        CMP_STATUS->nchars plus the counts added for restored rules and
        ESC '0' sequences; MAYBE_FINISH_COMPOSITION adds it to
        char_offset.  */
3314
3315 static int finish_composition (int *, struct composition_status *);
3316
3317 static int
3318 finish_composition (int *charbuf, struct composition_status *cmp_status)
3319 {
       /* Negative index of the annotation header relative to CHARBUF.  */
3320   int idx = - cmp_status->length;
3321   int new_chars;
3322
3323   /* Recover the original ESC sequence */
3324   charbuf[idx++] = ISO_CODE_ESC;
3325   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3326                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3327                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3328                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3329                     : '4');
3330   charbuf[idx++] = -2;
3331   charbuf[idx++] = 0;
3332   charbuf[idx++] = -1;
       /* Start from the composed characters already counted.
          NOTE(review): cmp_status->ncomps is not added; confirm that
          component characters need not be counted here.  */
3333   new_chars = cmp_status->nchars;
       /* Scan the data after the header for -2 and -1 markers.  The
          test presumably excludes only COMPOSITION_RELATIVE (the enum
          order is not visible here).  */
3334   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3335     for (; idx < 0; idx++)
3336       {
3337         int elt = charbuf[idx];
3338
3339         if (elt == -2)
3340           {
                 /* Rewrites charbuf[idx] and charbuf[idx + 1] and
                    updates new_chars; the extra idx++ skips the second
                    slot.  */
3341             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3342             idx++;
3343           }
3344         else if (elt == -1)
3345           {
                 /* Turn the two slots starting here back into ESC '0'.  */
3346             charbuf[idx++] = ISO_CODE_ESC;
3347             charbuf[idx] = '0';
3348             new_chars += 2;
3349           }
3350       }
3351   cmp_status->state = COMPOSING_NO;
3352   return new_chars;
3353 }
3354
3355 /* If characters are under composition, finish the composition.
        That is, when cmp_status->state is not COMPOSING_NO, call
        finish_composition and add its return value to char_offset.
        `cmp_status', `charbuf' and `char_offset' are taken from the
        expansion scope.  */
3356 #define MAYBE_FINISH_COMPOSITION()                              \
3357   do {                                                          \
3358     if (cmp_status->state != COMPOSING_NO)                      \
3359       char_offset += finish_composition (charbuf, cmp_status);  \
3360   } while (0)
3361
3362 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3363
3364    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3365    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3366    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3367    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3368
3369    Produce this annotation sequence now:
3370
3371    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        NOTE(review): the five-element layout follows the
        ADD_COMPOSITION_DATA (charbuf, 0, 0, method) call below and the
        five header slots that finish_composition rewrites;
        ADD_COMPOSITION_DATA itself is defined elsewhere -- confirm.

        C1 is the character after ESC.  If C1 is '0' while the
        alternate components of an ESC 3 or ESC 4 composition are
        being read (state COMPOSING_COMPONENT_CHAR with
        COMPOSITION_WITH_ALTCHARS, or COMPOSING_COMPONENT_RULE with
        COMPOSITION_WITH_RULE_ALTCHARS), the ESC 0 only separates the
        components from the composed characters: [ -1 -1 ] is stored,
        the state becomes COMPOSING_CHAR, the length grows by two, and
        no header is produced.

        Otherwise any composition still in progress is finished as
        invalid, the method is chosen from C1, the state starts as
        COMPOSING_CHAR for '0' and '2' or COMPOSING_COMPONENT_CHAR for
        '3' and '4', the length is reset to MAX_ANNOTATION_LENGTH, and
        nchars and ncomps are cleared.

        The macro uses `cmp_status', `charbuf', `char_offset' and
        `coding' of the expansion scope.
3372 */
3373
3374 #define DECODE_COMPOSITION_START(c1)                                       \
3375   do {                                                                     \
3376     if (c1 == '0'                                                          \
3377         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3378              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3379             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3380                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3381       {                                                                    \
3382         *charbuf++ = -1;                                                   \
3383         *charbuf++= -1;                                                    \
3384         cmp_status->state = COMPOSING_CHAR;                                \
3385         cmp_status->length += 2;                                           \
3386       }                                                                    \
3387     else                                                                   \
3388       {                                                                    \
3389         MAYBE_FINISH_COMPOSITION ();                                       \
3390         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3391                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3392                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3393                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3394         cmp_status->state                                                  \
3395           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3396         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3397         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3398         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3399         coding->annotated = 1;                                             \
3400       }                                                                    \
3401   } while (0)
3402
3403
3404 /* Handle composition end sequence ESC 1.

        The end is rejected -- the composition is finished as invalid and
        control goes to invalid_code -- when no composed character has
        been stored (nchars == 0), when a COMPOSITION_WITH_RULE
        composition is in state COMPOSING_CHAR, or when any other method
        is not in state COMPOSING_CHAR.

        Otherwise the annotation header at charbuf[-cmp_status->length]
        is completed: its first element (the negated length) is decreased
        by ncomps + 2 for COMPOSITION_WITH_ALTCHARS or by ncomps * 3 for
        COMPOSITION_WITH_RULE_ALTCHARS, its third element receives
        nchars, char_offset advances by nchars, and the state returns to
        COMPOSING_NO.  Uses `cmp_status', `charbuf' and `char_offset' of
        the expansion scope.  */
3405
3406 #define DECODE_COMPOSITION_END()                                        \
3407   do {                                                                  \
3408     if (cmp_status->nchars == 0                                         \
3409         || ((cmp_status->state == COMPOSING_CHAR)                       \
3410             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3411       {                                                                 \
3412         MAYBE_FINISH_COMPOSITION ();                                    \
3413         goto invalid_code;                                              \
3414       }                                                                 \
3415     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3416       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3417     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3418       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3419     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3420     char_offset += cmp_status->nchars;                                  \
3421     cmp_status->state = COMPOSING_NO;                                   \
3422   } while (0)
3423
3424 /* Store a composition rule RULE in charbuf, and update cmp_status.
        The rule is stored as the pair [ -2 RULE ], cmp_status->length
        grows by two and cmp_status->state is decremented.
        NOTE(review): the decrement presumably moves from a ..._RULE
        state back to the matching ..._CHAR state; the enum order is not
        visible here -- confirm.  */
3425
3426 #define STORE_COMPOSITION_RULE(rule)    \
3427   do {                                  \
3428     *charbuf++ = -2;                    \
3429     *charbuf++ = rule;                  \
3430     cmp_status->length += 2;            \
3431     cmp_status->state--;                \
3432   } while (0)
3433
3434 /* Store a composed char or a component char C in charbuf, and update
3435    cmp_status.

        cmp_status->length grows by one.  C counts towards nchars when
        the state is COMPOSING_CHAR and towards ncomps otherwise.  For
        COMPOSITION_WITH_RULE, and for COMPOSITION_WITH_RULE_ALTCHARS
        while in state COMPOSING_COMPONENT_CHAR, the state is then
        incremented (presumably to the state that expects a rule --
        confirm against the enum definition).  */
3436
3437 #define STORE_COMPOSITION_CHAR(c)                                       \
3438   do {                                                                  \
3439     *charbuf++ = (c);                                                   \
3440     cmp_status->length++;                                               \
3441     if (cmp_status->state == COMPOSING_CHAR)                            \
3442       cmp_status->nchars++;                                             \
3443     else                                                                \
3444       cmp_status->ncomps++;                                             \
3445     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3446         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3447             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3448       cmp_status->state++;                                              \
3449   } while (0)
3450
3451
3452 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3453
3454 static void
3455 decode_coding_iso_2022 (struct coding_system *coding)
3456 {
3457   const unsigned char *src = coding->source + coding->consumed;
3458   const unsigned char *src_end = coding->source + coding->src_bytes;
3459   const unsigned char *src_base;
3460   int *charbuf = coding->charbuf + coding->charbuf_used;
3461   /* We may produce two annotations (charset and composition) in one
3462      loop and one more charset annotation at the end.  */
3463   int *charbuf_end
3464     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3465   EMACS_INT consumed_chars = 0, consumed_chars_base;
3466   int multibytep = coding->src_multibyte;
3467   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3468   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3469   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3470   int charset_id_2, charset_id_3;
3471   struct charset *charset;
3472   int c;
3473   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3474   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3475   EMACS_INT char_offset = coding->produced_char;
3476   EMACS_INT last_offset = char_offset;
3477   int last_id = charset_ascii;
3478   int eol_dos =
3479     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3480   int byte_after_cr = -1;
3481   int i;
3482
3483   setup_iso_safe_charsets (attrs);
3484   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3485
3486   if (cmp_status->state != COMPOSING_NO)
3487     {
3488       if (charbuf_end - charbuf < cmp_status->length)
3489         abort ();
3490       for (i = 0; i < cmp_status->length; i++)
3491         *charbuf++ = cmp_status->carryover[i];
3492       coding->annotated = 1;
3493     }
3494
3495   while (1)
3496     {
3497       int c1, c2, c3;
3498
3499       src_base = src;
3500       consumed_chars_base = consumed_chars;
3501
3502       if (charbuf >= charbuf_end)
3503         {
3504           if (byte_after_cr >= 0)
3505             src_base--;
3506           break;
3507         }
3508
3509       if (byte_after_cr >= 0)
3510         c1 = byte_after_cr, byte_after_cr = -1;
3511       else
3512         ONE_MORE_BYTE (c1);
3513       if (c1 < 0)
3514         goto invalid_code;
3515
3516       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3517         {
3518           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3519           char_offset++;
3520           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3521           continue;
3522         }
3523
3524       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3525         {
3526           if (c1 == ISO_CODE_ESC)
3527             {
3528               if (src + 1 >= src_end)
3529                 goto no_more_source;
3530               *charbuf++ = ISO_CODE_ESC;
3531               char_offset++;
3532               if (src[0] == '%' && src[1] == '@')
3533                 {
3534                   src += 2;
3535                   consumed_chars += 2;
3536                   char_offset += 2;
3537                   /* We are sure charbuf can contain two more chars. */
3538                   *charbuf++ = '%';
3539                   *charbuf++ = '@';
3540                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3541                 }
3542             }
3543           else
3544             {
3545               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3546               char_offset++;
3547             }
3548           continue;
3549         }
3550
3551       if ((cmp_status->state == COMPOSING_RULE
3552            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3553           && c1 != ISO_CODE_ESC)
3554         {
3555           int rule;
3556
3557           DECODE_COMPOSITION_RULE (rule);
3558           STORE_COMPOSITION_RULE (rule);
3559           continue;
3560         }
3561
3562       /* We produce at most one character.  */
3563       switch (iso_code_class [c1])
3564         {
3565         case ISO_0x20_or_0x7F:
3566           if (charset_id_0 < 0
3567               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3568             /* This is SPACE or DEL.  */
3569             charset = CHARSET_FROM_ID (charset_ascii);
3570           else
3571             charset = CHARSET_FROM_ID (charset_id_0);
3572           break;
3573
3574         case ISO_graphic_plane_0:
3575           if (charset_id_0 < 0)
3576             charset = CHARSET_FROM_ID (charset_ascii);
3577           else
3578             charset = CHARSET_FROM_ID (charset_id_0);
3579           break;
3580
3581         case ISO_0xA0_or_0xFF:
3582           if (charset_id_1 < 0
3583               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3584               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3585             goto invalid_code;
3586           /* This is a graphic character, we fall down ... */
3587
3588         case ISO_graphic_plane_1:
3589           if (charset_id_1 < 0)
3590             goto invalid_code;
3591           charset = CHARSET_FROM_ID (charset_id_1);
3592           break;
3593
3594         case ISO_control_0:
3595           if (eol_dos && c1 == '\r')
3596             ONE_MORE_BYTE (byte_after_cr);
3597           MAYBE_FINISH_COMPOSITION ();
3598           charset = CHARSET_FROM_ID (charset_ascii);
3599           break;
3600
3601         case ISO_control_1:
3602           goto invalid_code;
3603
3604         case ISO_shift_out:
3605           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3606               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3607             goto invalid_code;
3608           CODING_ISO_INVOCATION (coding, 0) = 1;
3609           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3610           continue;
3611
3612         case ISO_shift_in:
3613           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3614             goto invalid_code;
3615           CODING_ISO_INVOCATION (coding, 0) = 0;
3616           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3617           continue;
3618
3619         case ISO_single_shift_2_7:
3620           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3621             goto invalid_code;
3622         case ISO_single_shift_2:
3623           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3624             goto invalid_code;
3625           /* SS2 is handled as an escape sequence of ESC 'N' */
3626           c1 = 'N';
3627           goto label_escape_sequence;
3628
3629         case ISO_single_shift_3:
3630           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3631             goto invalid_code;
3632           /* SS2 is handled as an escape sequence of ESC 'O' */
3633           c1 = 'O';
3634           goto label_escape_sequence;
3635
3636         case ISO_control_sequence_introducer:
3637           /* CSI is handled as an escape sequence of ESC '[' ...  */
3638           c1 = '[';
3639           goto label_escape_sequence;
3640
3641         case ISO_escape:
3642           ONE_MORE_BYTE (c1);
3643         label_escape_sequence:
3644           /* Escape sequences handled here are invocation,
3645              designation, direction specification, and character
3646              composition specification.  */
3647           switch (c1)
3648             {
3649             case '&':           /* revision of following character set */
3650               ONE_MORE_BYTE (c1);
3651               if (!(c1 >= '@' && c1 <= '~'))
3652                 goto invalid_code;
3653               ONE_MORE_BYTE (c1);
3654               if (c1 != ISO_CODE_ESC)
3655                 goto invalid_code;
3656               ONE_MORE_BYTE (c1);
3657               goto label_escape_sequence;
3658
3659             case '$':           /* designation of 2-byte character set */
3660               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3661                 goto invalid_code;
3662               {
3663                 int reg, chars96;
3664
3665                 ONE_MORE_BYTE (c1);
3666                 if (c1 >= '@' && c1 <= 'B')
3667                   {     /* designation of JISX0208.1978, GB2312.1980,
3668                            or JISX0208.1980 */
3669                     reg = 0, chars96 = 0;
3670                   }
3671                 else if (c1 >= 0x28 && c1 <= 0x2B)
3672                   { /* designation of DIMENSION2_CHARS94 character set */
3673                     reg = c1 - 0x28, chars96 = 0;
3674                     ONE_MORE_BYTE (c1);
3675                   }
3676                 else if (c1 >= 0x2C && c1 <= 0x2F)
3677                   { /* designation of DIMENSION2_CHARS96 character set */
3678                     reg = c1 - 0x2C, chars96 = 1;
3679                     ONE_MORE_BYTE (c1);
3680                   }
3681                 else
3682                   goto invalid_code;
3683                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3684                 /* We must update these variables now.  */
3685                 if (reg == 0)
3686                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3687                 else if (reg == 1)
3688                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3689                 if (chars96 < 0)
3690                   goto invalid_code;
3691               }
3692               continue;
3693
3694             case 'n':           /* invocation of locking-shift-2 */
3695               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3696                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3697                 goto invalid_code;
3698               CODING_ISO_INVOCATION (coding, 0) = 2;
3699               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3700               continue;
3701
3702             case 'o':           /* invocation of locking-shift-3 */
3703               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3704                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3705                 goto invalid_code;
3706               CODING_ISO_INVOCATION (coding, 0) = 3;
3707               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3708               continue;
3709
3710             case 'N':           /* invocation of single-shift-2 */
3711               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3712                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3713                 goto invalid_code;
3714               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3715               if (charset_id_2 < 0)
3716                 charset = CHARSET_FROM_ID (charset_ascii);
3717               else
3718                 charset = CHARSET_FROM_ID (charset_id_2);
3719               ONE_MORE_BYTE (c1);
3720               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3721                 goto invalid_code;
3722               break;
3723
3724             case 'O':           /* invocation of single-shift-3 */
3725               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3726                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3727                 goto invalid_code;
3728               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3729               if (charset_id_3 < 0)
3730                 charset = CHARSET_FROM_ID (charset_ascii);
3731               else
3732                 charset = CHARSET_FROM_ID (charset_id_3);
3733               ONE_MORE_BYTE (c1);
3734               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3735                 goto invalid_code;
3736               break;
3737
3738             case '0': case '2': case '3': case '4': /* start composition */
3739               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3740                 goto invalid_code;
3741               if (last_id != charset_ascii)
3742                 {
3743                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3744                   last_id = charset_ascii;
3745                   last_offset = char_offset;
3746                 }
3747               DECODE_COMPOSITION_START (c1);
3748               continue;
3749
3750             case '1':           /* end composition */
3751               if (cmp_status->state == COMPOSING_NO)
3752                 goto invalid_code;
3753               DECODE_COMPOSITION_END ();
3754               continue;
3755
3756             case '[':           /* specification of direction */
3757               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3758                 goto invalid_code;
3759               /* For the moment, nested direction is not supported.
3760                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3761                  left-to-right, and nonzero means right-to-left.  */
3762               ONE_MORE_BYTE (c1);
3763               switch (c1)
3764                 {
3765                 case ']':       /* end of the current direction */
3766                   coding->mode &= ~CODING_MODE_DIRECTION;
3767
3768                 case '0':       /* end of the current direction */
3769                 case '1':       /* start of left-to-right direction */
3770                   ONE_MORE_BYTE (c1);
3771                   if (c1 == ']')
3772                     coding->mode &= ~CODING_MODE_DIRECTION;
3773                   else
3774                     goto invalid_code;
3775                   break;
3776
3777                 case '2':       /* start of right-to-left direction */
3778                   ONE_MORE_BYTE (c1);
3779                   if (c1 == ']')
3780                     coding->mode |= CODING_MODE_DIRECTION;
3781                   else
3782                     goto invalid_code;
3783                   break;
3784
3785                 default:
3786                   goto invalid_code;
3787                 }
3788               continue;
3789
3790             case '%':
3791               ONE_MORE_BYTE (c1);
3792               if (c1 == '/')
3793                 {
3794                   /* CTEXT extended segment:
3795                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3796                      We keep these bytes as is for the moment.
3797                      They may be decoded by post-read-conversion.  */
3798                   int dim, M, L;
3799                   int size;
3800
3801                   ONE_MORE_BYTE (dim);
3802                   if (dim < '0' || dim > '4')
3803                     goto invalid_code;
3804                   ONE_MORE_BYTE (M);
3805                   if (M < 128)
3806                     goto invalid_code;
3807                   ONE_MORE_BYTE (L);
3808                   if (L < 128)
3809                     goto invalid_code;
3810                   size = ((M - 128) * 128) + (L - 128);
3811                   if (charbuf + 6 > charbuf_end)
3812                     goto break_loop;
3813                   *charbuf++ = ISO_CODE_ESC;
3814                   *charbuf++ = '%';
3815                   *charbuf++ = '/';
3816                   *charbuf++ = dim;
3817                   *charbuf++ = BYTE8_TO_CHAR (M);
3818                   *charbuf++ = BYTE8_TO_CHAR (L);
3819                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3820                 }
3821               else if (c1 == 'G')
3822                 {
3823                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3824                      ESC % G --UTF-8-BYTES-- ESC % @
3825                      We keep these bytes as is for the moment.
3826                      They may be decoded by post-read-conversion.  */
3827                   if (charbuf + 3 > charbuf_end)
3828                     goto break_loop;
3829                   *charbuf++ = ISO_CODE_ESC;
3830                   *charbuf++ = '%';
3831                   *charbuf++ = 'G';
3832                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3833                 }
3834               else
3835                 goto invalid_code;
3836               continue;
3837               break;
3838
3839             default:
3840               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3841                 goto invalid_code;
3842               {
3843                 int reg, chars96;
3844
3845                 if (c1 >= 0x28 && c1 <= 0x2B)
3846                   { /* designation of DIMENSION1_CHARS94 character set */
3847                     reg = c1 - 0x28, chars96 = 0;
3848                     ONE_MORE_BYTE (c1);
3849                   }
3850                 else if (c1 >= 0x2C && c1 <= 0x2F)
3851                   { /* designation of DIMENSION1_CHARS96 character set */
3852                     reg = c1 - 0x2C, chars96 = 1;
3853                     ONE_MORE_BYTE (c1);
3854                   }
3855                 else
3856                   goto invalid_code;
3857                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3858                 /* We must update these variables now.  */
3859                 if (reg == 0)
3860                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3861                 else if (reg == 1)
3862                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3863                 if (chars96 < 0)
3864                   goto invalid_code;
3865               }
3866               continue;
3867             }
3868           break;
3869
3870         default:
3871           abort ();
3872         }
3873
3874       if (cmp_status->state == COMPOSING_NO
3875           && charset->id != charset_ascii
3876           && last_id != charset->id)
3877         {
3878           if (last_id != charset_ascii)
3879             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3880           last_id = charset->id;
3881           last_offset = char_offset;
3882         }
3883
3884       /* Now we know CHARSET and 1st position code C1 of a character.
3885          Produce a decoded character while getting 2nd and 3rd
3886          position codes C2, C3 if necessary.  */
3887       if (CHARSET_DIMENSION (charset) > 1)
3888         {
3889           ONE_MORE_BYTE (c2);
3890           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3891               || ((c1 & 0x80) != (c2 & 0x80)))
3892             /* C2 is not in a valid range.  */
3893             goto invalid_code;
3894           if (CHARSET_DIMENSION (charset) == 2)
3895             c1 = (c1 << 8) | c2;
3896           else
3897             {
3898               ONE_MORE_BYTE (c3);
3899               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3900                   || ((c1 & 0x80) != (c3 & 0x80)))
3901                 /* C3 is not in a valid range.  */
3902                 goto invalid_code;
3903               c1 = (c1 << 16) | (c2 << 8) | c2;
3904             }
3905         }
3906       c1 &= 0x7F7F7F;
3907       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3908       if (c < 0)
3909         {
3910           MAYBE_FINISH_COMPOSITION ();
3911           for (; src_base < src; src_base++, char_offset++)
3912             {
3913               if (ASCII_BYTE_P (*src_base))
3914                 *charbuf++ = *src_base;
3915               else
3916                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3917             }
3918         }
3919       else if (cmp_status->state == COMPOSING_NO)
3920         {
3921           *charbuf++ = c;
3922           char_offset++;
3923         }
3924       else if ((cmp_status->state == COMPOSING_CHAR
3925                 ? cmp_status->nchars
3926                 : cmp_status->ncomps)
3927                >= MAX_COMPOSITION_COMPONENTS)
3928         {
3929           /* Too long composition.  */
3930           MAYBE_FINISH_COMPOSITION ();
3931           *charbuf++ = c;
3932           char_offset++;
3933         }
3934       else
3935         STORE_COMPOSITION_CHAR (c);
3936       continue;
3937
3938     invalid_code:
3939       MAYBE_FINISH_COMPOSITION ();
3940       src = src_base;
3941       consumed_chars = consumed_chars_base;
3942       ONE_MORE_BYTE (c);
3943       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3944       char_offset++;
3945       coding->errors++;
3946       continue;
3947
3948     break_loop:
3949       break;
3950     }
3951
3952  no_more_source:
3953   if (cmp_status->state != COMPOSING_NO)
3954     {
3955       if (coding->mode & CODING_MODE_LAST_BLOCK)
3956         MAYBE_FINISH_COMPOSITION ();
3957       else
3958         {
3959           charbuf -= cmp_status->length;
3960           for (i = 0; i < cmp_status->length; i++)
3961             cmp_status->carryover[i] = charbuf[i];
3962         }
3963     }
3964   else if (last_id != charset_ascii)
3965     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3966   coding->consumed_char += consumed_chars_base;
3967   coding->consumed = src_base - coding->source;
3968   coding->charbuf_used = charbuf - coding->charbuf;
3969 }
3970
3971
3972 /* ISO2022 encoding stuff.  */
3973
/*
   Saying just "ISO2022" is not enough to determine an encoding; we
   have to specify more details.  In Emacs, each coding system of the
   ISO2022 variant has the following specifications:
        1. Initial designation to G0 through G3.
        2. Allows short-form designation?
        3. ASCII should be designated to G0 before control characters?
        4. ASCII should be designated to G0 at end of line?
        5. 7-bit environment or 8-bit environment?
        6. Use locking-shift?
        7. Use single-shift?
   And the following two are only for Japanese:
        8. Use JISX0201-Roman in place of ASCII?
        9. Use JISX0208-1978 in place of JISX0208-1983?
   These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
   defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
   details.
*/
3992
/* Emit, at DST, the escape sequence that designates CHARSET to
   graphic register REG, and record the new designation in CODING.

   The sequence is built from these parts, in order:
     - `ESC & <'@' + revision>', only when CODING has
       CODING_ISO_FLAG_REVISION set and CHARSET's revision is
       non-negative;
     - `ESC';
     - `$', when the dimension of CHARSET is not 1;
     - an intermediate byte selected by REG: one of "()*+" for a
       94-character set, one of ",-./" for a 96-character set;
     - the final byte of CHARSET.
   The intermediate byte is omitted (short form) only for a
   multi-dimensional 94-character set designated to register 0 whose
   final byte is '@', 'A', or 'B', and only when CODING does not have
   CODING_ISO_FLAG_LONG_FORM set.

   The EMIT_* macros used here are defined elsewhere in this file;
   they appear to operate on variables named `dst', `produced_chars'
   and `multibytep' in the scope of the expansion.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char iso_final = CHARSET_ISO_FINAL (charset);              \
    /* Intermediate bytes, indexed by graphic register number.  Set    \
       to NULL below when the short form applies.  */                   \
    const char *iso_inter                                               \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    int iso_rev = -1;                                                   \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      iso_rev = CHARSET_ISO_REVISION (charset);                         \
    if (iso_rev >= 0)                                                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + iso_rev);                                  \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (! CHARSET_ISO_CHARS_96 (charset)                            \
            && ! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM) \
            && (reg) == 0                                               \
            && iso_final >= '@' && iso_final <= 'B')                    \
          iso_inter = NULL;                                             \
      }                                                                 \
    if (iso_inter)                                                      \
      EMIT_ONE_ASCII_BYTE (iso_inter[reg]);                             \
    EMIT_ONE_ASCII_BYTE (iso_final);                                    \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4040
4041
/* This macro and ENCODE_SINGLE_SHIFT_3 produce the codes (control
   character or escape sequence) for the ISO2022 single-shift
   functions.

   ENCODE_SINGLE_SHIFT_2 emits single-shift-2: the control byte
   ISO_CODE_SS2, or the escape sequence `ESC N' when the coding system
   has CODING_ISO_FLAG_SEVEN_BITS set.  It then marks CODING as being
   in single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4054
4055
/* Emit single-shift-3: the control byte ISO_CODE_SS3, or the escape
   sequence `ESC O' when the coding system has
   CODING_ISO_FLAG_SEVEN_BITS set.  Then mark CODING as being in
   single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4064
4065
/* This macro and the three that follow it produce the codes (control
   character or escape sequence) for the ISO2022 locking-shift
   functions (shift-in, shift-out, locking-shift-2, and
   locking-shift-3).

   ENCODE_SHIFT_IN records that graphic register 0 is invoked to
   graphic plane 0 and emits the control byte ISO_CODE_SI.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4075
4076
/* Record that graphic register 1 is invoked to graphic plane 0 and
   emit the control byte ISO_CODE_SO (shift-out).  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4082
4083
/* Record that graphic register 2 is invoked to graphic plane 0 and
   emit the escape sequence `ESC n' (locking-shift-2).  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4089
4090
/* Emit the escape sequence `ESC o' (locking-shift-3) and record that
   graphic register 3 is invoked to graphic plane 0.

   The final byte must be 'o': `ESC n' is locking-shift-2, so emitting
   it here would make the byte stream invoke G2 while CODING records
   G3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4096
4097
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when the coding system has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the JISX0201-Roman charset.

   The body is a loop.  Each pass emits the byte and leaves the loop
   if the coding system is in single-shifting state or CHARSET is
   currently invoked to graphic plane 0 or 1; otherwise it calls
   encode_invocation_designation and tries again.  The byte is emitted
   with the high bit cleared for plane 0 (and for a single-shifted
   character in a 7-bit environment), and with the high bit set
   otherwise.

   C1 is parenthesized in the expansion so that an argument which is
   itself an expression is masked as a whole.

   This macro refers to `coding', `dst' and `produced_chars' in the
   scope of the expansion.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        /* A single shift affects only one character.  */               \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4140
4141
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   CHARSET must be an lvalue: when the coding system has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is JISX0208, it is
   replaced by the JISX0208-1978 charset.

   Each pass of the loop decides whether the two bytes go out with the
   high bit set or cleared; if CHARSET is neither single-shifted nor
   invoked to graphic plane 0 or 1, encode_invocation_designation is
   called and the decision is retried.

   This macro refers to `coding', `dst' and `produced_chars' in the
   scope of the expansion.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
    int was_shifting, set_msb;                                          \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    was_shifting = CODING_ISO_SINGLE_SHIFTING (coding);                 \
    if (was_shifting)                                                   \
      set_msb = ! (CODING_ISO_FLAGS (coding)                            \
                   & CODING_ISO_FLAG_SEVEN_BITS);                       \
    else if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))           \
      set_msb = 0;                                                      \
    else if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))           \
      set_msb = 1;                                                      \
    else                                                                \
      {                                                                 \
        /* CHARSET is not yet invoked to any graphic plane: invoke     \
           it (designating it to a register first if needed), then     \
           go around the loop again to produce the character.  */       \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
        continue;                                                       \
      }                                                                 \
                                                                        \
    if (set_msb)                                                        \
      EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
    else                                                                \
      EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
    /* A single shift affects only one character.  */                   \
    if (was_shifting)                                                   \
      CODING_ISO_SINGLE_SHIFTING (coding) = 0;                          \
    break;                                                              \
  } while (1)
4184
4185
/* Produce codes for character C of character set CHARSET.  The code
   point returned by ENCODE_CHAR is passed whole to
   ENCODE_ISO_CHARACTER_DIMENSION1 when the dimension of CHARSET is 1;
   otherwise it is split into two position-codes (the bits above the
   low 8, and the low 8 bits) for ENCODE_ISO_CHARACTER_DIMENSION2.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int iso_code = ENCODE_CHAR ((charset), (c));                           \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset),                          \
                                       iso_code >> 8, iso_code & 0xFF);    \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), iso_code);               \
  } while (0)
4195
4196
4197 /* Produce designation and invocation codes at a place pointed by DST
4198    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4199    Return new DST.  */
4200
4201 static unsigned char *
4202 encode_invocation_designation (struct charset *charset,
4203                                struct coding_system *coding,
4204                                unsigned char *dst, EMACS_INT *p_nchars)
4205 {
4206   int multibytep = coding->dst_multibyte;
4207   EMACS_INT produced_chars = *p_nchars;
4208   int reg;                      /* graphic register number */
4209   int id = CHARSET_ID (charset);
4210
4211   /* At first, check designations.  */
4212   for (reg = 0; reg < 4; reg++)
4213     if (id == CODING_ISO_DESIGNATION (coding, reg))
4214       break;
4215
4216   if (reg >= 4)
4217     {
4218       /* CHARSET is not yet designated to any graphic registers.  */
4219       /* At first check the requested designation.  */
4220       reg = CODING_ISO_REQUEST (coding, id);
4221       if (reg < 0)
4222         /* Since CHARSET requests no special designation, designate it
4223            to graphic register 0.  */
4224         reg = 0;
4225
4226       ENCODE_DESIGNATION (charset, reg, coding);
4227     }
4228
4229   if (CODING_ISO_INVOCATION (coding, 0) != reg
4230       && CODING_ISO_INVOCATION (coding, 1) != reg)
4231     {
4232       /* Since the graphic register REG is not invoked to any graphic
4233          planes, invoke it to graphic plane 0.  */
4234       switch (reg)
4235         {
4236         case 0:                 /* graphic register 0 */
4237           ENCODE_SHIFT_IN;
4238           break;
4239
4240         case 1:                 /* graphic register 1 */
4241           ENCODE_SHIFT_OUT;
4242           break;
4243
4244         case 2:                 /* graphic register 2 */
4245           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4246             ENCODE_SINGLE_SHIFT_2;
4247           else
4248             ENCODE_LOCKING_SHIFT_2;
4249           break;
4250
4251         case 3:                 /* graphic register 3 */
4252           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4253             ENCODE_SINGLE_SHIFT_3;
4254           else
4255             ENCODE_LOCKING_SHIFT_3;
4256           break;
4257         }
4258     }
4259
4260   *p_nchars = produced_chars;
4261   return dst;
4262 }
4263
4264
/* Produce codes for designation and invocation that bring the graphic
   planes and registers back to their initial state: emit shift-in
   unless graphic register 0 is already invoked to graphic plane 0,
   then re-designate every graphic register that has an initial
   charset (a non-negative CODING_ISO_INITIAL value) different from
   its current designation.  Refers to `coding' in the scope of the
   expansion.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int gr;                                                             \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (gr = 0; gr < 4; gr++)                                          \
      {                                                                 \
        struct charset *initial_charset;                                \
                                                                        \
        /* Nothing to do for a register without an initial charset,    \
           or one that still holds it.  */                              \
        if (CODING_ISO_INITIAL (coding, gr) < 0                         \
            || (CODING_ISO_DESIGNATION (coding, gr)                     \
                == CODING_ISO_INITIAL (coding, gr)))                    \
          continue;                                                     \
        initial_charset                                                 \
          = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, gr));          \
        ENCODE_DESIGNATION (initial_charset, gr, coding);               \
      }                                                                 \
  } while (0)
4283
4284
4285 /* Produce designation sequences of charsets in the line started from
4286    SRC to a place pointed by DST, and return updated DST.
4287
4288    If the current block ends before any end-of-line, we may fail to
4289    find all the necessary designations.  */
4290
4291 static unsigned char *
4292 encode_designation_at_bol (struct coding_system *coding, int *charbuf,
4293                            unsigned char *dst)
4294 {
4295   struct charset *charset;
4296   /* Table of charsets to be designated to each graphic register.  */
4297   int r[4];
4298   int c, found = 0, reg;
4299   EMACS_INT produced_chars = 0;
4300   int multibytep = coding->dst_multibyte;
4301   Lisp_Object attrs;
4302   Lisp_Object charset_list;
4303
4304   attrs = CODING_ID_ATTRS (coding->id);
4305   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4306   if (EQ (charset_list, Qiso_2022))
4307     charset_list = Viso_2022_charset_list;
4308
4309   for (reg = 0; reg < 4; reg++)
4310     r[reg] = -1;
4311
4312   while (found < 4)
4313     {
4314       int id;
4315
4316       c = *charbuf++;
4317       if (c == '\n')
4318         break;
4319       charset = char_charset (c, charset_list, NULL);
4320       id = CHARSET_ID (charset);
4321       reg = CODING_ISO_REQUEST (coding, id);
4322       if (reg >= 0 && r[reg] < 0)
4323         {
4324           found++;
4325           r[reg] = id;
4326         }
4327     }
4328
4329   if (found)
4330     {
4331       for (reg = 0; reg < 4; reg++)
4332         if (r[reg] >= 0
4333             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4334           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4335     }
4336
4337   return dst;
4338 }
4339
4340 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4341
4342 static int
4343 encode_coding_iso_2022 (struct coding_system *coding)
4344 {
4345   int multibytep = coding->dst_multibyte;
4346   int *charbuf = coding->charbuf;
4347   int *charbuf_end = charbuf + coding->charbuf_used;
4348   unsigned char *dst = coding->destination + coding->produced;
4349   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4350   int safe_room = 16;
4351   int bol_designation
4352     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4353        && CODING_ISO_BOL (coding));
4354   EMACS_INT produced_chars = 0;
4355   Lisp_Object attrs, eol_type, charset_list;
4356   int ascii_compatible;
4357   int c;
4358   int preferred_charset_id = -1;
4359
4360   CODING_GET_INFO (coding, attrs, charset_list);
4361   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4362   if (VECTORP (eol_type))
4363     eol_type = Qunix;
4364
4365   setup_iso_safe_charsets (attrs);
4366   /* Charset list may have been changed.  */
4367   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4368   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4369
4370   ascii_compatible
4371     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4372        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4373                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4374
4375   while (charbuf < charbuf_end)
4376     {
4377       ASSURE_DESTINATION (safe_room);
4378
4379       if (bol_designation)
4380         {
4381           unsigned char *dst_prev = dst;
4382
4383           /* We have to produce designation sequences if any now.  */
4384           dst = encode_designation_at_bol (coding, charbuf, dst);
4385           bol_designation = 0;
4386           /* We are sure that designation sequences are all ASCII bytes.  */
4387           produced_chars += dst - dst_prev;
4388         }
4389
4390       c = *charbuf++;
4391
4392       if (c < 0)
4393         {
4394           /* Handle an annotation.  */
4395           switch (*charbuf)
4396             {
4397             case CODING_ANNOTATE_COMPOSITION_MASK:
4398               /* Not yet implemented.  */
4399               break;
4400             case CODING_ANNOTATE_CHARSET_MASK:
4401               preferred_charset_id = charbuf[2];
4402               if (preferred_charset_id >= 0
4403                   && NILP (Fmemq (make_number (preferred_charset_id),
4404                                   charset_list)))
4405                 preferred_charset_id = -1;
4406               break;
4407             default:
4408               abort ();
4409             }
4410           charbuf += -c - 1;
4411           continue;
4412         }
4413
4414       /* Now encode the character C.  */
4415       if (c < 0x20 || c == 0x7F)
4416         {
4417           if (c == '\n'
4418               || (c == '\r' && EQ (eol_type, Qmac)))
4419             {
4420               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4421                 ENCODE_RESET_PLANE_AND_REGISTER ();
4422               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4423                 {
4424                   int i;
4425
4426                   for (i = 0; i < 4; i++)
4427                     CODING_ISO_DESIGNATION (coding, i)
4428                       = CODING_ISO_INITIAL (coding, i);
4429                 }
4430               bol_designation
4431                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4432             }
4433           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4434             ENCODE_RESET_PLANE_AND_REGISTER ();
4435           EMIT_ONE_ASCII_BYTE (c);
4436         }
4437       else if (ASCII_CHAR_P (c))
4438         {
4439           if (ascii_compatible)
4440             EMIT_ONE_ASCII_BYTE (c);
4441           else
4442             {
4443               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4444               ENCODE_ISO_CHARACTER (charset, c);
4445             }
4446         }
4447       else if (CHAR_BYTE8_P (c))
4448         {
4449           c = CHAR_TO_BYTE8 (c);
4450           EMIT_ONE_BYTE (c);
4451         }
4452       else
4453         {
4454           struct charset *charset;
4455
4456           if (preferred_charset_id >= 0)
4457             {
4458               charset = CHARSET_FROM_ID (preferred_charset_id);
4459               if (! CHAR_CHARSET_P (c, charset))
4460                 charset = char_charset (c, charset_list, NULL);
4461             }
4462           else
4463             charset = char_charset (c, charset_list, NULL);
4464           if (!charset)
4465             {
4466               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4467                 {
4468                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4469                   charset = CHARSET_FROM_ID (charset_ascii);
4470                 }
4471               else
4472                 {
4473                   c = coding->default_char;
4474                   charset = char_charset (c, charset_list, NULL);
4475                 }
4476             }
4477           ENCODE_ISO_CHARACTER (charset, c);
4478         }
4479     }
4480
4481   if (coding->mode & CODING_MODE_LAST_BLOCK
4482       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4483     {
4484       ASSURE_DESTINATION (safe_room);
4485       ENCODE_RESET_PLANE_AND_REGISTER ();
4486     }
4487   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4488   CODING_ISO_BOL (coding) = bol_designation;
4489   coding->produced_char += produced_chars;
4490   coding->produced = dst - coding->destination;
4491   return 0;
4492 }
4493
4494 \f
/*** 8. SJIS and BIG5 handlers ***/

/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them in the bare
   C code.  But, in the future, they may be supported only by CCL.  */
4500
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
4507
4508    --- CODE RANGE of SJIS ---
4509    (character set)      (range)
4510    ASCII                0x00 .. 0x7F
4511    KATAKANA-JISX0201    0xA0 .. 0xDF
4512    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4513             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4514    -------------------------------
4515
4516 */
4517
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and each of its characters is encoded in two bytes.
4521
4522    --- CODE RANGE of BIG5 ---
4523    (character set)      (range)
4524    ASCII                0x00 .. 0x7F
4525    Big5 (1st byte)      0xA1 .. 0xFE
4526         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4527    --------------------------
4528
4529   */
4530
4531 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4532    Check if a text is encoded in SJIS.  If it is, return
4533    CATEGORY_MASK_SJIS, else return 0.  */
4534
4535 static int
4536 detect_coding_sjis (struct coding_system *coding,
4537                     struct coding_detection_info *detect_info)
4538 {
4539   const unsigned char *src = coding->source, *src_base;
4540   const unsigned char *src_end = coding->source + coding->src_bytes;
4541   int multibytep = coding->src_multibyte;
4542   EMACS_INT consumed_chars = 0;
4543   int found = 0;
4544   int c;
4545   Lisp_Object attrs, charset_list;
4546   int max_first_byte_of_2_byte_code;
4547
4548   CODING_GET_INFO (coding, attrs, charset_list);
4549   max_first_byte_of_2_byte_code
4550     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4551
4552   detect_info->checked |= CATEGORY_MASK_SJIS;
4553   /* A coding system of this category is always ASCII compatible.  */
4554   src += coding->head_ascii;
4555
4556   while (1)
4557     {
4558       src_base = src;
4559       ONE_MORE_BYTE (c);
4560       if (c < 0x80)
4561         continue;
4562       if ((c >= 0x81 && c <= 0x9F)
4563           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4564         {
4565           ONE_MORE_BYTE (c);
4566           if (c < 0x40 || c == 0x7F || c > 0xFC)
4567             break;
4568           found = CATEGORY_MASK_SJIS;
4569         }
4570       else if (c >= 0xA0 && c < 0xE0)
4571         found = CATEGORY_MASK_SJIS;
4572       else
4573         break;
4574     }
4575   detect_info->rejected |= CATEGORY_MASK_SJIS;
4576   return 0;
4577
4578  no_more_source:
4579   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4580     {
4581       detect_info->rejected |= CATEGORY_MASK_SJIS;
4582       return 0;
4583     }
4584   detect_info->found |= found;
4585   return 1;
4586 }
4587
4588 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4589    Check if a text is encoded in BIG5.  If it is, return
4590    CATEGORY_MASK_BIG5, else return 0.  */
4591
4592 static int
4593 detect_coding_big5 (struct coding_system *coding,
4594                     struct coding_detection_info *detect_info)
4595 {
4596   const unsigned char *src = coding->source, *src_base;
4597   const unsigned char *src_end = coding->source + coding->src_bytes;
4598   int multibytep = coding->src_multibyte;
4599   EMACS_INT consumed_chars = 0;
4600   int found = 0;
4601   int c;
4602
4603   detect_info->checked |= CATEGORY_MASK_BIG5;
4604   /* A coding system of this category is always ASCII compatible.  */
4605   src += coding->head_ascii;
4606
4607   while (1)
4608     {
4609       src_base = src;
4610       ONE_MORE_BYTE (c);
4611       if (c < 0x80)
4612         continue;
4613       if (c >= 0xA1)
4614         {
4615           ONE_MORE_BYTE (c);
4616           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4617             return 0;
4618           found = CATEGORY_MASK_BIG5;
4619         }
4620       else
4621         break;
4622     }
4623   detect_info->rejected |= CATEGORY_MASK_BIG5;
4624   return 0;
4625
4626  no_more_source:
4627   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4628     {
4629       detect_info->rejected |= CATEGORY_MASK_BIG5;
4630       return 0;
4631     }
4632   detect_info->found |= found;
4633   return 1;
4634 }
4635
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
   Decode SJIS text.

   Bytes are read from CODING->source, starting CODING->consumed bytes
   in, and the decoded characters are appended to CODING->charbuf
   after the CODING->charbuf_used entries already there.  Decoding
   stops when the source is exhausted or when CODING->charbuf has no
   room left (room for two annotations is kept in reserve).  On
   return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated; a byte sequence that was only
   partially read is not counted as consumed.  Each invalid byte is
   passed through as a single element and counted in
   CODING->errors.  */

static void
decode_coding_sjis (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the sequence currently being decoded.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  EMACS_INT consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_kanji, *charset_kana;
  struct charset *charset_kanji2;
  Lisp_Object attrs, charset_list, val;
  /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
     LAST_ID describe the current run of non-ASCII characters that
     share one charset.  */
  EMACS_INT char_offset = coding->produced_char;
  EMACS_INT last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_dos =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_DOS, or -1 if none is
     pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);

  /* The charset list holds, in this order: the roman charset, the
     kana charset, the kanji charset and, optionally, a second kanji
     charset.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The pending read-ahead byte has not been decoded yet, so
             do not report it as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);
      /* A negative value is never a valid SJIS byte.  */
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* With DOS end-of-line conversion, read the byte following
             a CR right away and keep it for the next iteration.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else if (c == 0x80 || c == 0xA0)
        goto invalid_code;
      else if (c >= 0xA1 && c <= 0xDF)
        {
          /* SJIS -> JISX0201-Kana */
          c &= 0x7F;
          charset = charset_kana;
        }
      else if (c <= 0xEF)
        {
          /* SJIS -> JISX0208 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS (c);
          charset = charset_kanji;
        }
      else if (c <= 0xFC && charset_kanji2)
        {
          /* SJIS -> JISX0213-2 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS2 (c);
          charset = charset_kanji2;
        }
      else
        goto invalid_code;
      /* On a switch to a different non-ASCII charset, close the
         previous run with an annotation and start a new run.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the offending sequence and pass its
         first byte through on its own.
         NOTE(review): when C was taken from BYTE_AFTER_CR, SRC_BASE
         already points past that byte (see the `src_base--'
         compensation above), so the byte re-read here looks like the
         one after it -- confirm whether the pending byte is lost in
         that case.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Close the run that is still open, then report progress up to the
     start of the last, possibly incomplete, sequence.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4756
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
   Decode BIG5 text.

   Bytes are read from CODING->source, starting CODING->consumed bytes
   in, and the decoded characters are appended to CODING->charbuf
   after the CODING->charbuf_used entries already there.  Decoding
   stops when the source is exhausted or when CODING->charbuf has no
   room left (room for two annotations is kept in reserve).  On
   return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated; a byte sequence that was only
   partially read is not counted as consumed.  Each invalid byte is
   passed through as a single element and counted in
   CODING->errors.  */

static void
decode_coding_big5 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the sequence currently being decoded.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  EMACS_INT consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_big5;
  Lisp_Object attrs, charset_list, val;
  /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
     LAST_ID describe the current run of non-ASCII characters that
     share one charset.  */
  EMACS_INT char_offset = coding->produced_char;
  EMACS_INT last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_dos =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_DOS, or -1 if none is
     pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  /* The charset list holds the roman charset first and the Big5
     charset second.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The pending read-ahead byte has not been decoded yet, so
             do not report it as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);

      /* A negative value is never a valid BIG5 byte.  */
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* With DOS end-of-line conversion, read the byte following
             a CR right away and keep it for the next iteration.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else
        {
          /* BIG5 -> Big5 */
          /* First byte must be in 0xA1..0xFE; second byte in
             0x40..0x7E or 0xA1..0xFE.  */
          if (c < 0xA1 || c > 0xFE)
            goto invalid_code;
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
            goto invalid_code;
          c = c << 8 | c1;
          charset = charset_big5;
        }
      /* On a switch to a different non-ASCII charset, close the
         previous run with an annotation and start a new run.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the offending sequence and pass its
         first byte through on its own.
         NOTE(review): when C was taken from BYTE_AFTER_CR, SRC_BASE
         already points past that byte (see the `src_base--'
         compensation above), so the byte re-read here looks like the
         one after it -- confirm whether the pending byte is lost in
         that case.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Close the run that is still open, then report progress up to the
     start of the last, possibly incomplete, sequence.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4852
4853 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4854    This function can encode charsets `ascii', `katakana-jisx0201',
4855    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4856    are sure that all these charsets are registered as official charset
4857    (i.e. do not have extended leading-codes).  Characters of other
4858    charsets are produced without any encoding.  If SJIS_P is 1, encode
4859    SJIS text, else encode BIG5 text.  */
4860
4861 static int
4862 encode_coding_sjis (struct coding_system *coding)
4863 {
4864   int multibytep = coding->dst_multibyte;
4865   int *charbuf = coding->charbuf;
4866   int *charbuf_end = charbuf + coding->charbuf_used;
4867   unsigned char *dst = coding->destination + coding->produced;
4868   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4869   int safe_room = 4;
4870   EMACS_INT produced_chars = 0;
4871   Lisp_Object attrs, charset_list, val;
4872   int ascii_compatible;
4873   struct charset *charset_kanji, *charset_kana;
4874   struct charset *charset_kanji2;
4875   int c;
4876
4877   CODING_GET_INFO (coding, attrs, charset_list);
4878   val = XCDR (charset_list);
4879   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4880   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4881   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4882
4883   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4884
4885   while (charbuf < charbuf_end)
4886     {
4887       ASSURE_DESTINATION (safe_room);
4888       c = *charbuf++;
4889       /* Now encode the character C.  */
4890       if (ASCII_CHAR_P (c) && ascii_compatible)
4891         EMIT_ONE_ASCII_BYTE (c);
4892       else if (CHAR_BYTE8_P (c))
4893         {
4894           c = CHAR_TO_BYTE8 (c);
4895           EMIT_ONE_BYTE (c);
4896         }
4897       else
4898         {
4899           unsigned code;
4900           struct charset *charset = char_charset (c, charset_list, &code);
4901
4902           if (!charset)
4903             {
4904               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4905                 {
4906                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4907                   charset = CHARSET_FROM_ID (charset_ascii);
4908                 }
4909               else
4910                 {
4911                   c = coding->default_char;
4912                   charset = char_charset (c, charset_list, &code);
4913                 }
4914             }
4915           if (code == CHARSET_INVALID_CODE (charset))
4916             abort ();
4917           if (charset == charset_kanji)
4918             {
4919               int c1, c2;
4920               JIS_TO_SJIS (code);
4921               c1 = code >> 8, c2 = code & 0xFF;
4922               EMIT_TWO_BYTES (c1, c2);
4923             }
4924           else if (charset == charset_kana)
4925             EMIT_ONE_BYTE (code | 0x80);
4926           else if (charset_kanji2 && charset == charset_kanji2)
4927             {
4928               int c1, c2;
4929
4930               c1 = code >> 8;
4931               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4932                   || c1 == 0x28
4933                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4934                 {
4935                   JIS_TO_SJIS2 (code);
4936                   c1 = code >> 8, c2 = code & 0xFF;
4937                   EMIT_TWO_BYTES (c1, c2);
4938                 }
4939               else
4940                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4941             }
4942           else
4943             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4944         }
4945     }
4946   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4947   coding->produced_char += produced_chars;
4948   coding->produced = dst - coding->destination;
4949   return 0;
4950 }
4951
4952 static int
4953 encode_coding_big5 (struct coding_system *coding)
4954 {
4955   int multibytep = coding->dst_multibyte;
4956   int *charbuf = coding->charbuf;
4957   int *charbuf_end = charbuf + coding->charbuf_used;
4958   unsigned char *dst = coding->destination + coding->produced;
4959   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4960   int safe_room = 4;
4961   EMACS_INT produced_chars = 0;
4962   Lisp_Object attrs, charset_list, val;
4963   int ascii_compatible;
4964   struct charset *charset_big5;
4965   int c;
4966
4967   CODING_GET_INFO (coding, attrs, charset_list);
4968   val = XCDR (charset_list);
4969   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4970   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4971
4972   while (charbuf < charbuf_end)
4973     {
4974       ASSURE_DESTINATION (safe_room);
4975       c = *charbuf++;
4976       /* Now encode the character C.  */
4977       if (ASCII_CHAR_P (c) && ascii_compatible)
4978         EMIT_ONE_ASCII_BYTE (c);
4979       else if (CHAR_BYTE8_P (c))
4980         {
4981           c = CHAR_TO_BYTE8 (c);
4982           EMIT_ONE_BYTE (c);
4983         }
4984       else
4985         {
4986           unsigned code;
4987           struct charset *charset = char_charset (c, charset_list, &code);
4988
4989           if (! charset)
4990             {
4991               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4992                 {
4993                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4994                   charset = CHARSET_FROM_ID (charset_ascii);
4995                 }
4996               else
4997                 {
4998                   c = coding->default_char;
4999                   charset = char_charset (c, charset_list, &code);
5000                 }
5001             }
5002           if (code == CHARSET_INVALID_CODE (charset))
5003             abort ();
5004           if (charset == charset_big5)
5005             {
5006               int c1, c2;
5007
5008               c1 = code >> 8, c2 = code & 0xFF;
5009               EMIT_TWO_BYTES (c1, c2);
5010             }
5011           else
5012             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5013         }
5014     }
5015   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5016   coding->produced_char += produced_chars;
5017   coding->produced = dst - coding->destination;
5018   return 0;
5019 }
5020
5021 \f
/*** 9. CCL handlers ***/
5023
5024 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5025    Check if a text is encoded in a coding system of which
5026    encoder/decoder are written in CCL program.  If it is, return
5027    CATEGORY_MASK_CCL, else return 0.  */
5028
5029 static int
5030 detect_coding_ccl (struct coding_system *coding,
5031                    struct coding_detection_info *detect_info)
5032 {
5033   const unsigned char *src = coding->source, *src_base;
5034   const unsigned char *src_end = coding->source + coding->src_bytes;
5035   int multibytep = coding->src_multibyte;
5036   EMACS_INT consumed_chars = 0;
5037   int found = 0;
5038   unsigned char *valids;
5039   EMACS_INT head_ascii = coding->head_ascii;
5040   Lisp_Object attrs;
5041
5042   detect_info->checked |= CATEGORY_MASK_CCL;
5043
5044   coding = &coding_categories[coding_category_ccl];
5045   valids = CODING_CCL_VALIDS (coding);
5046   attrs = CODING_ID_ATTRS (coding->id);
5047   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5048     src += head_ascii;
5049
5050   while (1)
5051     {
5052       int c;
5053
5054       src_base = src;
5055       ONE_MORE_BYTE (c);
5056       if (c < 0 || ! valids[c])
5057         break;
5058       if ((valids[c] > 1))
5059         found = CATEGORY_MASK_CCL;
5060     }
5061   detect_info->rejected |= CATEGORY_MASK_CCL;
5062   return 0;
5063
5064  no_more_source:
5065   detect_info->found |= found;
5066   return 1;
5067 }
5068
5069 static void
5070 decode_coding_ccl (struct coding_system *coding)
5071 {
5072   const unsigned char *src = coding->source + coding->consumed;
5073   const unsigned char *src_end = coding->source + coding->src_bytes;
5074   int *charbuf = coding->charbuf + coding->charbuf_used;
5075   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5076   EMACS_INT consumed_chars = 0;
5077   int multibytep = coding->src_multibyte;
5078   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5079   int source_charbuf[1024];
5080   int source_byteidx[1025];
5081   Lisp_Object attrs, charset_list;
5082
5083   CODING_GET_INFO (coding, attrs, charset_list);
5084
5085   while (1)
5086     {
5087       const unsigned char *p = src;
5088       int i = 0;
5089
5090       if (multibytep)
5091         {
5092           while (i < 1024 && p < src_end)
5093             {
5094               source_byteidx[i] = p - src;
5095               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5096             }
5097           source_byteidx[i] = p - src;
5098         }
5099       else
5100         while (i < 1024 && p < src_end)
5101           source_charbuf[i++] = *p++;
5102
5103       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5104         ccl->last_block = 1;
5105       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5106                   charset_list);
5107       charbuf += ccl->produced;
5108       if (multibytep)
5109         src += source_byteidx[ccl->consumed];
5110       else
5111         src += ccl->consumed;
5112       consumed_chars += ccl->consumed;
5113       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5114         break;
5115     }
5116
5117   switch (ccl->status)
5118     {
5119     case CCL_STAT_SUSPEND_BY_SRC:
5120       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5121       break;
5122     case CCL_STAT_SUSPEND_BY_DST:
5123       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5124       break;
5125     case CCL_STAT_QUIT:
5126     case CCL_STAT_INVALID_CMD:
5127       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5128       break;
5129     default:
5130       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5131       break;
5132     }
5133   coding->consumed_char += consumed_chars;
5134   coding->consumed = src - coding->source;
5135   coding->charbuf_used = charbuf - coding->charbuf;
5136 }
5137
5138 static int
5139 encode_coding_ccl (struct coding_system *coding)
5140 {
5141   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5142   int multibytep = coding->dst_multibyte;
5143   int *charbuf = coding->charbuf;
5144   int *charbuf_end = charbuf + coding->charbuf_used;
5145   unsigned char *dst = coding->destination + coding->produced;
5146   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5147   int destination_charbuf[1024];
5148   EMACS_INT produced_chars = 0;
5149   int i;
5150   Lisp_Object attrs, charset_list;
5151
5152   CODING_GET_INFO (coding, attrs, charset_list);
5153   if (coding->consumed_char == coding->src_chars
5154       && coding->mode & CODING_MODE_LAST_BLOCK)
5155     ccl->last_block = 1;
5156
5157   while (charbuf < charbuf_end)
5158     {
5159       ccl_driver (ccl, charbuf, destination_charbuf,
5160                   charbuf_end - charbuf, 1024, charset_list);
5161       if (multibytep)
5162         {
5163           ASSURE_DESTINATION (ccl->produced * 2);
5164           for (i = 0; i < ccl->produced; i++)
5165             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5166         }
5167       else
5168         {
5169           ASSURE_DESTINATION (ccl->produced);
5170           for (i = 0; i < ccl->produced; i++)
5171             *dst++ = destination_charbuf[i] & 0xFF;
5172           produced_chars += ccl->produced;
5173         }
5174       charbuf += ccl->consumed;
5175       if (ccl->status == CCL_STAT_QUIT
5176           || ccl->status == CCL_STAT_INVALID_CMD)
5177         break;
5178     }
5179
5180   switch (ccl->status)
5181     {
5182     case CCL_STAT_SUSPEND_BY_SRC:
5183       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5184       break;
5185     case CCL_STAT_SUSPEND_BY_DST:
5186       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5187       break;
5188     case CCL_STAT_QUIT:
5189     case CCL_STAT_INVALID_CMD:
5190       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5191       break;
5192     default:
5193       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5194       break;
5195     }
5196
5197   coding->produced_char += produced_chars;
5198   coding->produced = dst - coding->destination;
5199   return 0;
5200 }
5201
5202
5203 \f
5204 /*** 10, 11. no-conversion handlers ***/
5205
5206 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5207
5208 static void
5209 decode_coding_raw_text (struct coding_system *coding)
5210 {
5211   int eol_dos =
5212     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5213
5214   coding->chars_at_source = 1;
5215   coding->consumed_char = coding->src_chars;
5216   coding->consumed = coding->src_bytes;
5217   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5218     {
5219       coding->consumed_char--;
5220       coding->consumed--;
5221       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5222     }
5223   else
5224     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5225 }
5226
5227 static int
5228 encode_coding_raw_text (struct coding_system *coding)
5229 {
5230   int multibytep = coding->dst_multibyte;
5231   int *charbuf = coding->charbuf;
5232   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5233   unsigned char *dst = coding->destination + coding->produced;
5234   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5235   EMACS_INT produced_chars = 0;
5236   int c;
5237
5238   if (multibytep)
5239     {
5240       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5241
5242       if (coding->src_multibyte)
5243         while (charbuf < charbuf_end)
5244           {
5245             ASSURE_DESTINATION (safe_room);
5246             c = *charbuf++;
5247             if (ASCII_CHAR_P (c))
5248               EMIT_ONE_ASCII_BYTE (c);
5249             else if (CHAR_BYTE8_P (c))
5250               {
5251                 c = CHAR_TO_BYTE8 (c);
5252                 EMIT_ONE_BYTE (c);
5253               }
5254             else
5255               {
5256                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5257
5258                 CHAR_STRING_ADVANCE (c, p1);
5259                 do
5260                   {
5261                     EMIT_ONE_BYTE (*p0);
5262                     p0++;
5263                   }
5264                 while (p0 < p1);
5265               }
5266           }
5267       else
5268         while (charbuf < charbuf_end)
5269           {
5270             ASSURE_DESTINATION (safe_room);
5271             c = *charbuf++;
5272             EMIT_ONE_BYTE (c);
5273           }
5274     }
5275   else
5276     {
5277       if (coding->src_multibyte)
5278         {
5279           int safe_room = MAX_MULTIBYTE_LENGTH;
5280
5281           while (charbuf < charbuf_end)
5282             {
5283               ASSURE_DESTINATION (safe_room);
5284               c = *charbuf++;
5285               if (ASCII_CHAR_P (c))
5286                 *dst++ = c;
5287               else if (CHAR_BYTE8_P (c))
5288                 *dst++ = CHAR_TO_BYTE8 (c);
5289               else
5290                 CHAR_STRING_ADVANCE (c, dst);
5291             }
5292         }
5293       else
5294         {
5295           ASSURE_DESTINATION (charbuf_end - charbuf);
5296           while (charbuf < charbuf_end && dst < dst_end)
5297             *dst++ = *charbuf++;
5298         }
5299       produced_chars = dst - (coding->destination + coding->produced);
5300     }
5301   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5302   coding->produced_char += produced_chars;
5303   coding->produced = dst - coding->destination;
5304   return 0;
5305 }
5306
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in a charset-based coding system.  If it
   is, return 1, else return 0.

   The check is made against the coding system registered for the
   charset category, not against CODING itself; only the source text
   and the length of its leading ASCII run are taken from CODING.
   CATEGORY_MASK_CHARSET is always added to DETECT_INFO->checked.  On
   failure it is also added to DETECT_INFO->rejected.  On success it
   is added to DETECT_INFO->found provided some byte >= 0x80 was
   accepted.  */

static int
detect_coding_charset (struct coding_system *coding,
                       struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  EMACS_INT consumed_chars = 0;
  Lisp_Object attrs, valids, name;
  int found = 0;
  EMACS_INT head_ascii = coding->head_ascii;
  int check_latin_extra = 0;

  detect_info->checked |= CATEGORY_MASK_CHARSET;

  /* From here on CODING is the coding system of the charset
     category, not the one passed in.  */
  coding = &coding_categories[coding_category_charset];
  attrs = CODING_ID_ATTRS (coding->id);
  /* VALIDS is indexed by the first byte of a character.  As used
     below, an entry is nil for a byte that cannot start a character,
     an integer charset ID, or a list of candidate charset IDs.  */
  valids = AREF (attrs, coding_attr_charset_valids);
  name = CODING_ID_NAME (coding->id);
  /* Coding systems named iso-8859-* or iso-latin-* get an extra check
     on bytes in the range 0x80..0x9F (see below).  */
  if (strncmp (SSDATA (SYMBOL_NAME (name)),
               "iso-8859-", sizeof ("iso-8859-") - 1) == 0
      || strncmp (SSDATA (SYMBOL_NAME (name)),
                  "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
    check_latin_extra = 1;

  if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
    src += head_ascii;

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim, idx;

      src_base = src;
      ONE_MORE_BYTE (c);
      /* A negative value cannot index VALIDS; it is skipped rather
         than rejected.  */
      if (c < 0)
        continue;
      val = AREF (valids, c);
      if (NILP (val))
        break;
      if (c >= 0x80)
        {
          /* For the Latin coding systems, a byte in 0x80..0x9F is
             acceptable only if Vlatin_extra_code_table is a vector
             with a non-nil entry for it.  */
          if (c < 0xA0
              && check_latin_extra
              && (!VECTORP (Vlatin_extra_code_table)
                  || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
            break;
          found = CATEGORY_MASK_CHARSET;
        }
      if (INTEGERP (val))
        {
          /* Exactly one charset can start with this byte: each of the
             remaining DIM - 1 bytes must lie within the range that
             charset's code_space gives for its position.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          for (idx = 1; idx < dim; idx++)
            {
              if (src == src_end)
                goto too_short;
              ONE_MORE_BYTE (c);
              if (c < charset->code_space[(dim - 1 - idx) * 4]
                  || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
                break;
            }
          /* The inner loop stopped early on an out-of-range byte.  */
          if (idx < dim)
            break;
        }
      else
        {
          /* Several charsets can start with this byte: try them in
             list order.  IDX is not reset between candidates, so
             bytes already accepted for one candidate are not checked
             again for the next.
             NOTE(review): a byte that fails one candidate's range
             check has already been read, and the next candidate
             reads a fresh byte instead of re-testing it -- confirm
             this is intended.  */
          idx = 1;
          for (; CONSP (val); val = XCDR (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              while (idx < dim)
                {
                  if (src == src_end)
                    goto too_short;
                  ONE_MORE_BYTE (c);
                  if (c < charset->code_space[(dim - 1 - idx) * 4]
                      || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
                    break;
                  idx++;
                }
              if (idx == dim)
                {
                  /* This candidate matched; VAL is set to nil to mark
                     success.  */
                  val = Qnil;
                  break;
                }
            }
          /* NOTE(review): for a proper list VAL is no longer a cons
             here whether or not a candidate matched (the loop above
             only ends on a match or when CONSP (val) fails), so this
             rejection looks unreachable -- confirm.  */
          if (CONSP (val))
            break;
        }
    }
  /* Reached by `break' on an invalid byte and by `goto' when the
     source ends in the middle of a multi-byte character.  */
 too_short:
  detect_info->rejected |= CATEGORY_MASK_CHARSET;
  return 0;

 no_more_source:
  detect_info->found |= found;
  return 1;
}
5413
5414 static void
5415 decode_coding_charset (struct coding_system *coding)
5416 {
5417   const unsigned char *src = coding->source + coding->consumed;
5418   const unsigned char *src_end = coding->source + coding->src_bytes;
5419   const unsigned char *src_base;
5420   int *charbuf = coding->charbuf + coding->charbuf_used;
5421   /* We may produce one charset annotation in one loop and one more at
5422      the end.  */
5423   int *charbuf_end
5424     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5425   EMACS_INT consumed_chars = 0, consumed_chars_base;
5426   int multibytep = coding->src_multibyte;
5427   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5428   Lisp_Object valids;
5429   EMACS_INT char_offset = coding->produced_char;
5430   EMACS_INT last_offset = char_offset;
5431   int last_id = charset_ascii;
5432   int eol_dos =
5433     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5434   int byte_after_cr = -1;
5435
5436   valids = AREF (attrs, coding_attr_charset_valids);
5437
5438   while (1)
5439     {
5440       int c;
5441       Lisp_Object val;
5442       struct charset *charset;
5443       int dim;
5444       int len = 1;
5445       unsigned code;
5446
5447       src_base = src;
5448       consumed_chars_base = consumed_chars;
5449
5450       if (charbuf >= charbuf_end)
5451         {
5452           if (byte_after_cr >= 0)
5453             src_base--;
5454           break;
5455         }
5456
5457       if (byte_after_cr >= 0)
5458         {
5459           c = byte_after_cr;
5460           byte_after_cr = -1;
5461         }
5462       else
5463         {
5464           ONE_MORE_BYTE (c);
5465           if (eol_dos && c == '\r')
5466             ONE_MORE_BYTE (byte_after_cr);
5467         }
5468       if (c < 0)
5469         goto invalid_code;
5470       code = c;
5471
5472       val = AREF (valids, c);
5473       if (! INTEGERP (val) && ! CONSP (val))
5474         goto invalid_code;
5475       if (INTEGERP (val))
5476         {
5477           charset = CHARSET_FROM_ID (XFASTINT (val));
5478           dim = CHARSET_DIMENSION (charset);
5479           while (len < dim)
5480             {
5481               ONE_MORE_BYTE (c);
5482               code = (code << 8) | c;
5483               len++;
5484             }
5485           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5486                               charset, code, c);
5487         }
5488       else
5489         {
5490           /* VAL is a list of charset IDs.  It is assured that the
5491              list is sorted by charset dimensions (smaller one
5492              comes first).  */
5493           while (CONSP (val))
5494             {
5495               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5496               dim = CHARSET_DIMENSION (charset);
5497               while (len < dim)
5498                 {
5499                   ONE_MORE_BYTE (c);
5500                   code = (code << 8) | c;
5501                   len++;
5502                 }
5503               CODING_DECODE_CHAR (coding, src, src_base,
5504                                   src_end, charset, code, c);
5505               if (c >= 0)
5506                 break;
5507               val = XCDR (val);
5508             }
5509         }
5510       if (c < 0)
5511         goto invalid_code;
5512       if (charset->id != charset_ascii
5513           && last_id != charset->id)
5514         {
5515           if (last_id != charset_ascii)
5516             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5517           last_id = charset->id;
5518           last_offset = char_offset;
5519         }
5520
5521       *charbuf++ = c;
5522       char_offset++;
5523       continue;
5524
5525     invalid_code:
5526       src = src_base;
5527       consumed_chars = consumed_chars_base;
5528       ONE_MORE_BYTE (c);
5529       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5530       char_offset++;
5531       coding->errors++;
5532     }
5533
5534  no_more_source:
5535   if (last_id != charset_ascii)
5536     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5537   coding->consumed_char += consumed_chars_base;
5538   coding->consumed = src_base - coding->source;
5539   coding->charbuf_used = charbuf - coding->charbuf;
5540 }
5541
5542 static int
5543 encode_coding_charset (struct coding_system *coding)
5544 {
5545   int multibytep = coding->dst_multibyte;
5546   int *charbuf = coding->charbuf;
5547   int *charbuf_end = charbuf + coding->charbuf_used;
5548   unsigned char *dst = coding->destination + coding->produced;
5549   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5550   int safe_room = MAX_MULTIBYTE_LENGTH;
5551   EMACS_INT produced_chars = 0;
5552   Lisp_Object attrs, charset_list;
5553   int ascii_compatible;
5554   int c;
5555
5556   CODING_GET_INFO (coding, attrs, charset_list);
5557   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5558
5559   while (charbuf < charbuf_end)
5560     {
5561       struct charset *charset;
5562       unsigned code;
5563
5564       ASSURE_DESTINATION (safe_room);
5565       c = *charbuf++;
5566       if (ascii_compatible && ASCII_CHAR_P (c))
5567         EMIT_ONE_ASCII_BYTE (c);
5568       else if (CHAR_BYTE8_P (c))
5569         {
5570           c = CHAR_TO_BYTE8 (c);
5571           EMIT_ONE_BYTE (c);
5572         }
5573       else
5574         {
5575           charset = char_charset (c, charset_list, &code);
5576           if (charset)
5577             {
5578               if (CHARSET_DIMENSION (charset) == 1)
5579                 EMIT_ONE_BYTE (code);
5580               else if (CHARSET_DIMENSION (charset) == 2)
5581                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5582               else if (CHARSET_DIMENSION (charset) == 3)
5583                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5584               else
5585                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5586                                  (code >> 8) & 0xFF, code & 0xFF);
5587             }
5588           else
5589             {
5590               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5591                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5592               else
5593                 c = coding->default_char;
5594               EMIT_ONE_BYTE (c);
5595             }
5596         }
5597     }
5598
5599   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5600   coding->produced_char += produced_chars;
5601   coding->produced = dst - coding->destination;
5602   return 0;
5603 }
5604
5605 \f
/*** 10. C library functions ***/
5607
5608 /* Setup coding context CODING from information about CODING_SYSTEM.
5609    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5610    CODING_SYSTEM is invalid, signal an error.  */
5611
5612 void
5613 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5614 {
5615   Lisp_Object attrs;
5616   Lisp_Object eol_type;
5617   Lisp_Object coding_type;
5618   Lisp_Object val;
5619
5620   if (NILP (coding_system))
5621     coding_system = Qundecided;
5622
5623   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5624
5625   attrs = CODING_ID_ATTRS (coding->id);
5626   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5627
5628   coding->mode = 0;
5629   coding->head_ascii = -1;
5630   if (VECTORP (eol_type))
5631     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5632                             | CODING_REQUIRE_DETECTION_MASK);
5633   else if (! EQ (eol_type, Qunix))
5634     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5635                             | CODING_REQUIRE_ENCODING_MASK);
5636   else
5637     coding->common_flags = 0;
5638   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5639     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5640   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5641     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5642   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5643     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5644
5645   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5646   coding->max_charset_id = SCHARS (val) - 1;
5647   coding->safe_charsets = SDATA (val);
5648   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5649   coding->carryover_bytes = 0;
5650
5651   coding_type = CODING_ATTR_TYPE (attrs);
5652   if (EQ (coding_type, Qundecided))
5653     {
5654       coding->detector = NULL;
5655       coding->decoder = decode_coding_raw_text;
5656       coding->encoder = encode_coding_raw_text;
5657       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5658     }
5659   else if (EQ (coding_type, Qiso_2022))
5660     {
5661       int i;
5662       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5663
5664       /* Invoke graphic register 0 to plane 0.  */
5665       CODING_ISO_INVOCATION (coding, 0) = 0;
5666       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5667       CODING_ISO_INVOCATION (coding, 1)
5668         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5669       /* Setup the initial status of designation.  */
5670       for (i = 0; i < 4; i++)
5671         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5672       /* Not single shifting initially.  */
5673       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5674       /* Beginning of buffer should also be regarded as bol. */
5675       CODING_ISO_BOL (coding) = 1;
5676       coding->detector = detect_coding_iso_2022;
5677       coding->decoder = decode_coding_iso_2022;
5678       coding->encoder = encode_coding_iso_2022;
5679       if (flags & CODING_ISO_FLAG_SAFE)
5680         coding->mode |= CODING_MODE_SAFE_ENCODING;
5681       coding->common_flags
5682         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5683             | CODING_REQUIRE_FLUSHING_MASK);
5684       if (flags & CODING_ISO_FLAG_COMPOSITION)
5685         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5686       if (flags & CODING_ISO_FLAG_DESIGNATION)
5687         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5688       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5689         {
5690           setup_iso_safe_charsets (attrs);
5691           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5692           coding->max_charset_id = SCHARS (val) - 1;
5693           coding->safe_charsets = SDATA (val);
5694         }
5695       CODING_ISO_FLAGS (coding) = flags;
5696       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5697       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5698       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5699       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5700     }
5701   else if (EQ (coding_type, Qcharset))
5702     {
5703       coding->detector = detect_coding_charset;
5704       coding->decoder = decode_coding_charset;
5705       coding->encoder = encode_coding_charset;
5706       coding->common_flags
5707         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5708     }
5709   else if (EQ (coding_type, Qutf_8))
5710     {
5711       val = AREF (attrs, coding_attr_utf_bom);
5712       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5713                                    : EQ (val, Qt) ? utf_with_bom
5714                                    : utf_without_bom);
5715       coding->detector = detect_coding_utf_8;
5716       coding->decoder = decode_coding_utf_8;
5717       coding->encoder = encode_coding_utf_8;
5718       coding->common_flags
5719         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5720       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5721         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5722     }
5723   else if (EQ (coding_type, Qutf_16))
5724     {
5725       val = AREF (attrs, coding_attr_utf_bom);
5726       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5727                                     : EQ (val, Qt) ? utf_with_bom
5728                                     : utf_without_bom);
5729       val = AREF (attrs, coding_attr_utf_16_endian);
5730       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5731                                        : utf_16_little_endian);
5732       CODING_UTF_16_SURROGATE (coding) = 0;
5733       coding->detector = detect_coding_utf_16;
5734       coding->decoder = decode_coding_utf_16;
5735       coding->encoder = encode_coding_utf_16;
5736       coding->common_flags
5737         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5738       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5739         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5740     }
5741   else if (EQ (coding_type, Qccl))
5742     {
5743       coding->detector = detect_coding_ccl;
5744       coding->decoder = decode_coding_ccl;
5745       coding->encoder = encode_coding_ccl;
5746       coding->common_flags
5747         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5748             | CODING_REQUIRE_FLUSHING_MASK);
5749     }
5750   else if (EQ (coding_type, Qemacs_mule))
5751     {
5752       coding->detector = detect_coding_emacs_mule;
5753       coding->decoder = decode_coding_emacs_mule;
5754       coding->encoder = encode_coding_emacs_mule;
5755       coding->common_flags
5756         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5757       coding->spec.emacs_mule.full_support = 1;
5758       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5759           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5760         {
5761           Lisp_Object tail, safe_charsets;
5762           int max_charset_id = 0;
5763
5764           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5765                tail = XCDR (tail))
5766             if (max_charset_id < XFASTINT (XCAR (tail)))
5767               max_charset_id = XFASTINT (XCAR (tail));
5768           safe_charsets = make_uninit_string (max_charset_id + 1);
5769           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5770           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5771                tail = XCDR (tail))
5772             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5773           coding->max_charset_id = max_charset_id;
5774           coding->safe_charsets = SDATA (safe_charsets);
5775           coding->spec.emacs_mule.full_support = 1;
5776         }
5777       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5778       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5779     }
5780   else if (EQ (coding_type, Qshift_jis))
5781     {
5782       coding->detector = detect_coding_sjis;
5783       coding->decoder = decode_coding_sjis;
5784       coding->encoder = encode_coding_sjis;
5785       coding->common_flags
5786         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5787     }
5788   else if (EQ (coding_type, Qbig5))
5789     {
5790       coding->detector = detect_coding_big5;
5791       coding->decoder = decode_coding_big5;
5792       coding->encoder = encode_coding_big5;
5793       coding->common_flags
5794         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5795     }
5796   else                          /* EQ (coding_type, Qraw_text) */
5797     {
5798       coding->detector = NULL;
5799       coding->decoder = decode_coding_raw_text;
5800       coding->encoder = encode_coding_raw_text;
5801       if (! EQ (eol_type, Qunix))
5802         {
5803           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5804           if (! VECTORP (eol_type))
5805             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5806         }
5807
5808     }
5809
5810   return;
5811 }
5812
5813 /* Return a list of charsets supported by CODING.  */
5814
5815 Lisp_Object
5816 coding_charset_list (struct coding_system *coding)
5817 {
5818   Lisp_Object attrs, charset_list;
5819
5820   CODING_GET_INFO (coding, attrs, charset_list);
5821   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5822     {
5823       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5824
5825       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5826         charset_list = Viso_2022_charset_list;
5827     }
5828   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5829     {
5830       charset_list = Vemacs_mule_charset_list;
5831     }
5832   return charset_list;
5833 }
5834
5835
5836 /* Return a list of charsets supported by CODING-SYSTEM.  */
5837
5838 Lisp_Object
5839 coding_system_charset_list (Lisp_Object coding_system)
5840 {
5841   int id;
5842   Lisp_Object attrs, charset_list;
5843
5844   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5845   attrs = CODING_ID_ATTRS (id);
5846
5847   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5848     {
5849       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5850
5851       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5852         charset_list = Viso_2022_charset_list;
5853       else
5854         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5855     }
5856   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5857     {
5858       charset_list = Vemacs_mule_charset_list;
5859     }
5860   else
5861     {
5862       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5863     }
5864   return charset_list;
5865 }
5866
5867
5868 /* Return raw-text or one of its subsidiaries that has the same
5869    eol_type as CODING-SYSTEM.  */
5870
5871 Lisp_Object
5872 raw_text_coding_system (Lisp_Object coding_system)
5873 {
5874   Lisp_Object spec, attrs;
5875   Lisp_Object eol_type, raw_text_eol_type;
5876
5877   if (NILP (coding_system))
5878     return Qraw_text;
5879   spec = CODING_SYSTEM_SPEC (coding_system);
5880   attrs = AREF (spec, 0);
5881
5882   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5883     return coding_system;
5884
5885   eol_type = AREF (spec, 2);
5886   if (VECTORP (eol_type))
5887     return Qraw_text;
5888   spec = CODING_SYSTEM_SPEC (Qraw_text);
5889   raw_text_eol_type = AREF (spec, 2);
5890   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5891           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5892           : AREF (raw_text_eol_type, 2));
5893 }
5894
5895
5896 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5897    the subsidiary that has the same eol-spec as PARENT (if it is not
5898    nil and specifies end-of-line format) or the system's setting
5899    (system_eol_type).  */
5900
5901 Lisp_Object
5902 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5903 {
5904   Lisp_Object spec, eol_type;
5905
5906   if (NILP (coding_system))
5907     coding_system = Qraw_text;
5908   spec = CODING_SYSTEM_SPEC (coding_system);
5909   eol_type = AREF (spec, 2);
5910   if (VECTORP (eol_type))
5911     {
5912       Lisp_Object parent_eol_type;
5913
5914       if (! NILP (parent))
5915         {
5916           Lisp_Object parent_spec;
5917
5918           parent_spec = CODING_SYSTEM_SPEC (parent);
5919           parent_eol_type = AREF (parent_spec, 2);
5920           if (VECTORP (parent_eol_type))
5921             parent_eol_type = system_eol_type;
5922         }
5923       else
5924         parent_eol_type = system_eol_type;
5925       if (EQ (parent_eol_type, Qunix))
5926         coding_system = AREF (eol_type, 0);
5927       else if (EQ (parent_eol_type, Qdos))
5928         coding_system = AREF (eol_type, 1);
5929       else if (EQ (parent_eol_type, Qmac))
5930         coding_system = AREF (eol_type, 2);
5931     }
5932   return coding_system;
5933 }
5934
5935
5936 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5937    decided for writing to a process.  If not, complement them, and
5938    return a new coding system.  */
5939
5940 Lisp_Object
5941 complement_process_encoding_system (Lisp_Object coding_system)
5942 {
5943   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5944   Lisp_Object spec, attrs;
5945   int i;
5946
5947   for (i = 0; i < 3; i++)
5948     {
5949       if (i == 1)
5950         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5951       else if (i == 2)
5952         coding_system = preferred_coding_system ();
5953       spec = CODING_SYSTEM_SPEC (coding_system);
5954       if (NILP (spec))
5955         continue;
5956       attrs = AREF (spec, 0);
5957       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5958         coding_base = CODING_ATTR_BASE_NAME (attrs);
5959       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5960         eol_base = coding_system;
5961       if (! NILP (coding_base) && ! NILP (eol_base))
5962         break;
5963     }
5964
5965   if (i > 0)
5966     /* The original CODING_SYSTEM didn't specify text-conversion or
5967        eol-conversion.  Be sure that we return a fully complemented
5968        coding system.  */
5969     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5970   return coding_system;
5971 }
5972
5973
5974 /* Emacs has a mechanism to automatically detect a coding system if it
5975    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5976    it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are grouped into the following categories:
5979
5980    o coding-category-emacs-mule
5981
5982         The category for a coding system which has the same code range
5983         as Emacs' internal format.  Assigned the coding-system (Lisp
5984         symbol) `emacs-mule' by default.
5985
5986    o coding-category-sjis
5987
5988         The category for a coding system which has the same code range
5989         as SJIS.  Assigned the coding-system (Lisp
5990         symbol) `japanese-shift-jis' by default.
5991
5992    o coding-category-iso-7
5993
5994         The category for a coding system which has the same code range
5995         as ISO2022 of 7-bit environment.  This doesn't use any locking
5996         shift and single shift functions.  This can encode/decode all
5997         charsets.  Assigned the coding-system (Lisp symbol)
5998         `iso-2022-7bit' by default.
5999
6000    o coding-category-iso-7-tight
6001
6002         Same as coding-category-iso-7 except that this can
6003         encode/decode only the specified charsets.
6004
6005    o coding-category-iso-8-1
6006
6007         The category for a coding system which has the same code range
6008         as ISO2022 of 8-bit environment and graphic plane 1 used only
6009         for DIMENSION1 charset.  This doesn't use any locking shift
6010         and single shift functions.  Assigned the coding-system (Lisp
6011         symbol) `iso-latin-1' by default.
6012
6013    o coding-category-iso-8-2
6014
6015         The category for a coding system which has the same code range
6016         as ISO2022 of 8-bit environment and graphic plane 1 used only
6017         for DIMENSION2 charset.  This doesn't use any locking shift
6018         and single shift functions.  Assigned the coding-system (Lisp
6019         symbol) `japanese-iso-8bit' by default.
6020
6021    o coding-category-iso-7-else
6022
6023         The category for a coding system which has the same code range
6024         as ISO2022 of 7-bit environment but uses locking shift or
6025         single shift functions.  Assigned the coding-system (Lisp
6026         symbol) `iso-2022-7bit-lock' by default.
6027
6028    o coding-category-iso-8-else
6029
6030         The category for a coding system which has the same code range
6031         as ISO2022 of 8-bit environment but uses locking shift or
6032         single shift functions.  Assigned the coding-system (Lisp
6033         symbol) `iso-2022-8bit-ss2' by default.
6034
6035    o coding-category-big5
6036
6037         The category for a coding system which has the same code range
6038         as BIG5.  Assigned the coding-system (Lisp symbol)
6039         `cn-big5' by default.
6040
6041    o coding-category-utf-8
6042
6043         The category for a coding system which has the same code range
6044         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6045         symbol) `utf-8' by default.
6046
6047    o coding-category-utf-16-be
6048
6049         The category for a coding system in which a text has an
6050         Unicode signature (cf. Unicode Standard) in the order of BIG
6051         endian at the head.  Assigned the coding-system (Lisp symbol)
6052         `utf-16-be' by default.
6053
6054    o coding-category-utf-16-le
6055
6056         The category for a coding system in which a text has an
6057         Unicode signature (cf. Unicode Standard) in the order of
6058         LITTLE endian at the head.  Assigned the coding-system (Lisp
6059         symbol) `utf-16-le' by default.
6060
6061    o coding-category-ccl
6062
6063         The category for a coding system of which encoder/decoder is
6064         written in CCL programs.  The default value is nil, i.e., no
6065         coding system is assigned.
6066
6067    o coding-category-binary
6068
6069         The category for a coding system not categorized in any of the
6070         above.  Assigned the coding-system (Lisp symbol)
6071         `no-conversion' by default.
6072
6073    Each of them is a Lisp symbol and the value is an actual
6074    `coding-system's (this is also a Lisp symbol) assigned by a user.
6075    What Emacs does actually is to detect a category of coding system.
6076    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6077    decide only one possible category, it selects a category of the
6078    highest priority.  Priorities of categories are also specified by a
6079    user in a Lisp variable `coding-category-list'.
6080
6081 */
6082
6083 #define EOL_SEEN_NONE   0
6084 #define EOL_SEEN_LF     1
6085 #define EOL_SEEN_CR     2
6086 #define EOL_SEEN_CRLF   4
6087
6088 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6089    SOURCE is encoded.  If CATEGORY is one of
6090    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6091    two-byte, else they are encoded by one-byte.
6092
6093    Return one of EOL_SEEN_XXX.  */
6094
6095 #define MAX_EOL_CHECK_COUNT 3
6096
6097 static int
6098 detect_eol (const unsigned char *source, EMACS_INT src_bytes,
6099             enum coding_category category)
6100 {
6101   const unsigned char *src = source, *src_end = src + src_bytes;
6102   unsigned char c;
6103   int total  = 0;
6104   int eol_seen = EOL_SEEN_NONE;
6105
6106   if ((1 << category) & CATEGORY_MASK_UTF_16)
6107     {
6108       int msb, lsb;
6109
6110       msb = category == (coding_category_utf_16_le
6111                          | coding_category_utf_16_le_nosig);
6112       lsb = 1 - msb;
6113
6114       while (src + 1 < src_end)
6115         {
6116           c = src[lsb];
6117           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6118             {
6119               int this_eol;
6120
6121               if (c == '\n')
6122                 this_eol = EOL_SEEN_LF;
6123               else if (src + 3 >= src_end
6124                        || src[msb + 2] != 0
6125                        || src[lsb + 2] != '\n')
6126                 this_eol = EOL_SEEN_CR;
6127               else
6128                 {
6129                   this_eol = EOL_SEEN_CRLF;
6130                   src += 2;
6131                 }
6132
6133               if (eol_seen == EOL_SEEN_NONE)
6134                 /* This is the first end-of-line.  */
6135                 eol_seen = this_eol;
6136               else if (eol_seen != this_eol)
6137                 {
6138                   /* The found type is different from what found before.
6139                      Allow for stray ^M characters in DOS EOL files.  */
6140                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6141                       || (eol_seen == EOL_SEEN_CRLF
6142                           && this_eol == EOL_SEEN_CR))
6143                     eol_seen = EOL_SEEN_CRLF;
6144                   else
6145                     {
6146                       eol_seen = EOL_SEEN_LF;
6147                       break;
6148                     }
6149                 }
6150               if (++total == MAX_EOL_CHECK_COUNT)
6151                 break;
6152             }
6153           src += 2;
6154         }
6155     }
6156   else
6157     while (src < src_end)
6158       {
6159         c = *src++;
6160         if (c == '\n' || c == '\r')
6161           {
6162             int this_eol;
6163
6164             if (c == '\n')
6165               this_eol = EOL_SEEN_LF;
6166             else if (src >= src_end || *src != '\n')
6167               this_eol = EOL_SEEN_CR;
6168             else
6169               this_eol = EOL_SEEN_CRLF, src++;
6170
6171             if (eol_seen == EOL_SEEN_NONE)
6172               /* This is the first end-of-line.  */
6173               eol_seen = this_eol;
6174             else if (eol_seen != this_eol)
6175               {
6176                 /* The found type is different from what found before.
6177                    Allow for stray ^M characters in DOS EOL files.  */
6178                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6179                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6180                   eol_seen = EOL_SEEN_CRLF;
6181                 else
6182                   {
6183                     eol_seen = EOL_SEEN_LF;
6184                     break;
6185                   }
6186               }
6187             if (++total == MAX_EOL_CHECK_COUNT)
6188               break;
6189           }
6190       }
6191   return eol_seen;
6192 }
6193
6194
6195 static Lisp_Object
6196 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6197 {
6198   Lisp_Object eol_type;
6199
6200   eol_type = CODING_ID_EOL_TYPE (coding->id);
6201   if (eol_seen & EOL_SEEN_LF)
6202     {
6203       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6204       eol_type = Qunix;
6205     }
6206   else if (eol_seen & EOL_SEEN_CRLF)
6207     {
6208       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6209       eol_type = Qdos;
6210     }
6211   else if (eol_seen & EOL_SEEN_CR)
6212     {
6213       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6214       eol_type = Qmac;
6215     }
6216   return eol_type;
6217 }
6218
6219 /* Detect how a text specified in CODING is encoded.  If a coding
6220    system is detected, update fields of CODING by the detected coding
6221    system.

        Detection is attempted only when the current coding system of
        CODING has type `undecided', or has category utf-8-auto or
        utf-16-auto; in any other case no coding system is set up.

        On every call the consumed and produced counters of CODING are
        reset to zero, CODING->head_ascii is reset to zero (and counted
        afresh in the `undecided' case), and CODING->mode is restored to
        the value it had on entry before returning.  */
6222
6223 static void
6224 detect_coding (struct coding_system *coding)
6225 {
6226   const unsigned char *src, *src_end;
6227   int saved_mode = coding->mode;
6228
6229   coding->consumed = coding->consumed_char = 0;
6230   coding->produced = coding->produced_char = 0;
6231   coding_set_source (coding);
6232
6233   src_end = coding->source + coding->src_bytes;
6234   coding->head_ascii = 0;
6235
6236   /* If we have not yet decided the text encoding type, detect it
6237      now.  */
6238   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6239     {
6240       int c, i;
6241       struct coding_detection_info detect_info;
6242       int null_byte_found = 0, eight_bit_found = 0;
6243
6244       detect_info.checked = detect_info.found = detect_info.rejected = 0;
           /* Scan the source once.  head_ascii counts the 7-bit bytes
              seen before the first 8-bit byte.  The scan stops early as
              soon as both a null byte and an 8-bit byte have been seen,
              or when the ISO-2022 detector returns nonzero.  */
6245       for (src = coding->source; src < src_end; src++)
6246         {
6247           c = *src;
6248           if (c & 0x80)
6249             {
6250               eight_bit_found = 1;
6251               if (null_byte_found)
6252                 break;
6253             }
6254           else if (c < 0x20)
6255             {
                   /* Run the ISO-2022 detector on an ESC, SI or SO,
                      unless escape detection is inhibited or some
                      category has already been marked as checked.  */
6256               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6257                   && ! inhibit_iso_escape_detection
6258                   && ! detect_info.checked)
6259                 {
6260                   if (detect_coding_iso_2022 (coding, &detect_info))
6261                     {
6262                       /* We have scanned the whole data.  */
6263                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6264                         {
6265                           /* We didn't find an 8-bit code.  We may
6266                              have found a null-byte, but it's very
6267                              rare that a binary file conforms to
6268                              ISO-2022.  */
6269                           src = src_end;
6270                           coding->head_ascii = src - coding->source;
6271                         }
                           /* Reject every category outside
                              CATEGORY_MASK_ISO_ESCAPE.  */
6272                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6273                       break;
6274                     }
6275                 }
6276               else if (! c && !inhibit_null_byte_detection)
6277                 {
6278                   null_byte_found = 1;
6279                   if (eight_bit_found)
6280                     break;
6281                 }
6282               if (! eight_bit_found)
6283                 coding->head_ascii++;
6284             }
6285           else if (! eight_bit_found)
6286             coding->head_ascii++;
6287         }
6288
           /* Nothing more is done when the whole text was counted in
              head_ascii, neither a null byte nor an 8-bit byte was
              noted, and no category was marked as found.  */
6289       if (null_byte_found || eight_bit_found
6290           || coding->head_ascii < coding->src_bytes
6291           || detect_info.found)
6292         {
6293           enum coding_category category;
6294           struct coding_system *this;
6295
6296           if (coding->head_ascii == coding->src_bytes)
6297             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6298             for (i = 0; i < coding_category_raw_text; i++)
6299               {
6300                 category = coding_priorities[i];
6301                 this = coding_categories + category;
6302                 if (detect_info.found & (1 << category))
6303                   break;
6304               }
6305           else
6306             {
6307               if (null_byte_found)
6308                 {
                       /* Mark every category outside CATEGORY_MASK_UTF_16
                          as already checked and rejected, so that no
                          detector is called for them below.  */
6309                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6310                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6311                 }
                   /* Walk the categories in coding_priorities order and
                      leave the loop at the first one that is marked as
                      found, calling its detector if it has not been
                      checked yet.  */
6312               for (i = 0; i < coding_category_raw_text; i++)
6313                 {
6314                   category = coding_priorities[i];
6315                   this = coding_categories + category;
6316                   if (this->id < 0)
6317                     {
6318                       /* No coding system of this category is defined.  */
6319                       detect_info.rejected |= (1 << category);
6320                     }
6321                   else if (category >= coding_category_raw_text)
6322                     continue;
6323                   else if (detect_info.checked & (1 << category))
6324                     {
6325                       if (detect_info.found & (1 << category))
6326                         break;
6327                     }
6328                   else if ((*(this->detector)) (coding, &detect_info)
6329                            && detect_info.found & (1 << category))
6330                     {
                           /* NOTE(review): CATEGORY is not read again
                              after this loop (only I and THIS are), so
                              the reassignment below has no visible
                              effect in this function -- confirm.  */
6331                       if (category == coding_category_utf_16_auto)
6332                         {
6333                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6334                             category = coding_category_utf_16_le;
6335                           else
6336                             category = coding_category_utf_16_be;
6337                         }
6338                       break;
6339                     }
6340                 }
6341             }
6342
               /* I is below coding_category_raw_text only when one of
                  the loops above stopped at a found category; THIS then
                  points to that category's coding system.  Otherwise
                  fall back, in this order, to no-conversion (a null byte
                  was seen), to raw-text (every category in
                  CATEGORY_MASK_ANY was rejected), or to the first
                  category in priority order that was not rejected.  If
                  nothing was rejected at all, CODING is not set up
                  again.  */
6343           if (i < coding_category_raw_text)
6344             setup_coding_system (CODING_ID_NAME (this->id), coding);
6345           else if (null_byte_found)
6346             setup_coding_system (Qno_conversion, coding);
6347           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6348                    == CATEGORY_MASK_ANY)
6349             setup_coding_system (Qraw_text, coding);
6350           else if (detect_info.rejected)
6351             for (i = 0; i < coding_category_raw_text; i++)
6352               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6353                 {
6354                   this = coding_categories + coding_priorities[i];
6355                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6356                   break;
6357                 }
6358         }
6359     }
6360   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6361            == coding_category_utf_8_auto)
6362     {
6363       Lisp_Object coding_systems;
6364       struct coding_detection_info detect_info;
6365
           /* The coding_attr_utf_bom attribute must be a cons for
              anything to happen here: its car is used when the UTF-8
              detector marks CATEGORY_MASK_UTF_8_SIG as found, its cdr
              otherwise.  */
6366       coding_systems
6367         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6368       detect_info.found = detect_info.rejected = 0;
6369       coding->head_ascii = 0;
6370       if (CONSP (coding_systems)
6371           && detect_coding_utf_8 (coding, &detect_info))
6372         {
6373           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6374             setup_coding_system (XCAR (coding_systems), coding);
6375           else
6376             setup_coding_system (XCDR (coding_systems), coding);
6377         }
6378     }
6379   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6380            == coding_category_utf_16_auto)
6381     {
6382       Lisp_Object coding_systems;
6383       struct coding_detection_info detect_info;
6384
           /* As above, but the car is used when CATEGORY_MASK_UTF_16_LE
              is found and the cdr when CATEGORY_MASK_UTF_16_BE is found;
              if neither is found, CODING is not set up again.  */
6385       coding_systems
6386         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6387       detect_info.found = detect_info.rejected = 0;
6388       coding->head_ascii = 0;
6389       if (CONSP (coding_systems)
6390           && detect_coding_utf_16 (coding, &detect_info))
6391         {
6392           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6393             setup_coding_system (XCAR (coding_systems), coding);
6394           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6395             setup_coding_system (XCDR (coding_systems), coding);
6396         }
6397     }
6398   coding->mode = saved_mode;
6399 }
6400
6401
     /* Convert the end-of-line format of the text CODING has just
        produced to LF, according to the eol-type of CODING's coding
        system.  The text is the CODING->produced bytes that start at
        CODING->destination when CODING->dst_object is nil, and at byte
        position CODING->dst_pos_byte (via BYTE_POS_ADDR) otherwise.

        Nothing is done when the eol-type is `unix' or when
        inhibit_eol_conversion is set.  When the eol-type is still a
        vector, the text is first scanned for LF, CRLF and lone CR, and
        the result is passed to adjust_coding_eol_type, whose return
        value is then used as the eol-type.  For `mac', every CR is
        replaced by LF in place.  For `dos', CRs are deleted and
        CODING->produced and CODING->produced_char are decreased by the
        number of deletions.  */
6402 static void
6403 decode_eol (struct coding_system *coding)
6404 {
6405   Lisp_Object eol_type;
6406   unsigned char *p, *pbeg, *pend;
6407
6408   eol_type = CODING_ID_EOL_TYPE (coding->id);
6409   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6410     return;
6411
6412   if (NILP (coding->dst_object))
6413     pbeg = coding->destination;
6414   else
6415     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6416   pend = pbeg + coding->produced;
6417
6418   if (VECTORP (eol_type))
6419     {
6420       int eol_seen = EOL_SEEN_NONE;
6421
           /* Collect which end-of-line conventions occur.  A CR that is
              immediately followed by LF counts as CRLF only.  */
6422       for (p = pbeg; p < pend; p++)
6423         {
6424           if (*p == '\n')
6425             eol_seen |= EOL_SEEN_LF;
6426           else if (*p == '\r')
6427             {
6428               if (p + 1 < pend && *(p + 1) == '\n')
6429                 {
6430                   eol_seen |= EOL_SEEN_CRLF;
6431                   p++;
6432                 }
6433               else
6434                 eol_seen |= EOL_SEEN_CR;
6435             }
6436         }
6437       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6438       if ((eol_seen & EOL_SEEN_CRLF) != 0
6439           && (eol_seen & EOL_SEEN_CR) != 0
6440           && (eol_seen & EOL_SEEN_LF) == 0)
6441         eol_seen = EOL_SEEN_CRLF;
           /* Any other mixture of conventions is treated as LF.  */
6442       else if (eol_seen != EOL_SEEN_NONE
6443           && eol_seen != EOL_SEEN_LF
6444           && eol_seen != EOL_SEEN_CRLF
6445           && eol_seen != EOL_SEEN_CR)
6446         eol_seen = EOL_SEEN_LF;
6447       if (eol_seen != EOL_SEEN_NONE)
6448         eol_type = adjust_coding_eol_type (coding, eol_seen);
6449     }
6450
6451   if (EQ (eol_type, Qmac))
6452     {
6453       for (p = pbeg; p < pend; p++)
6454         if (*p == '\r')
6455           *p = '\n';
6456     }
6457   else if (EQ (eol_type, Qdos))
6458     {
           /* N counts the CRs deleted.  */
6459       EMACS_INT n = 0;
6460
6461       if (NILP (coding->dst_object))
6462         {
6463           /* Start deleting '\r' from the tail to minimize the memory
6464              movement.  */
               /* NOTE(review): this branch removes every CR it finds from
                  PEND - 2 down to PBEG, whether or not an LF follows,
                  while the buffer branch below removes a CR only when
                  the next byte is LF -- confirm the difference is
                  intended.  */
6465           for (p = pend - 2; p >= pbeg; p--)
6466             if (*p == '\r')
6467               {
6468                 memmove (p, p + 1, pend-- - p - 1);
6469                 n++;
6470               }
6471         }
6472       else
6473         {
6474           EMACS_INT pos_byte = coding->dst_pos_byte;
6475           EMACS_INT pos = coding->dst_pos;
               /* Position of the last produced character; it is never
                  examined as the start of a CRLF pair.  */
6476           EMACS_INT pos_end = pos + coding->produced_char - 1;
6477
6478           while (pos < pos_end)
6479             {
6480               p = BYTE_POS_ADDR (pos_byte);
6481               if (*p == '\r' && p[1] == '\n')
6482                 {
6483                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6484                   n++;
6485                   pos_end--;
6486                 }
6487               pos++;
                   /* NOTE(review): *P is read here after del_range_2 may
                      have changed the buffer text -- verify that P is
                      still meaningful in that case.  */
6488               if (coding->dst_multibyte)
6489                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6490               else
6491                 pos_byte++;
6492             }
6493         }
6494       coding->produced -= n;
6495       coding->produced_char -= n;
6496     }
6497 }
6498
6499
6500 /* Return a translation table (or list of them) from coding system
6501    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6502    decoding (ENCODEP is zero).

        If Venable_character_translation is nil, return Qnil.  If the
        relevant attribute of ATTRS is nil, return the standard
        translation table for the requested direction.  Otherwise symbols
        are replaced by their `translation-table' property, and when the
        standard table is a char-table it is appended, so that the result
        is a list.

        If MAX_LOOKUP is not NULL, store there 0 when translation is
        disabled, and otherwise the largest natural number found in
        extra slot 1 of the char-table(s) of the result, but at
        least 1.  */
6503
6504 static Lisp_Object
6505 get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
6506 {
6507   Lisp_Object standard, translation_table;
6508   Lisp_Object val;
6509
6510   if (NILP (Venable_character_translation))
6511     {
6512       if (max_lookup)
6513         *max_lookup = 0;
6514       return Qnil;
6515     }
6516   if (encodep)
6517     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6518       standard = Vstandard_translation_table_for_encode;
6519   else
6520     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6521       standard = Vstandard_translation_table_for_decode;
6522   if (NILP (translation_table))
6523     translation_table = standard;
6524   else
6525     {
6526       if (SYMBOLP (translation_table))
6527         translation_table = Fget (translation_table, Qtranslation_table);
6528       else if (CONSP (translation_table))
6529         {
               /* Work on a copy: the XSETCAR below, and nconc2 further
                  down, modify the list they are given.  */
6530           translation_table = Fcopy_sequence (translation_table);
6531           for (val = translation_table; CONSP (val); val = XCDR (val))
6532             if (SYMBOLP (XCAR (val)))
6533               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6534         }
           /* Put the standard table last.  */
6535       if (CHAR_TABLE_P (standard))
6536         {
6537           if (CONSP (translation_table))
6538             translation_table = nconc2 (translation_table,
6539                                         Fcons (standard, Qnil));
6540           else
6541             translation_table = Fcons (translation_table,
6542                                        Fcons (standard, Qnil));
6543         }
6544     }
6545
6546   if (max_lookup)
6547     {
6548       *max_lookup = 1;
           /* Only char-tables with more than one extra slot are
              consulted, and only natural-number values can raise
              *MAX_LOOKUP.  */
6549       if (CHAR_TABLE_P (translation_table)
6550           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6551         {
6552           val = XCHAR_TABLE (translation_table)->extras[1];
6553           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6554             *max_lookup = XFASTINT (val);
6555         }
6556       else if (CONSP (translation_table))
6557         {
6558           Lisp_Object tail;
6559
6560           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6561             if (CHAR_TABLE_P (XCAR (tail))
6562                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6563               {
6564                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6565                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6566                   *max_lookup = XFASTINT (tailval);
6567               }
6568         }
6569     }
6570   return translation_table;
6571 }
6572
     /* Look up character C in TABLE, which is a char-table or a list,
        and set TRANS to the result.  TRANS is first set to Qnil.

        If TABLE is a char-table and its entry for C is a character, C
        is replaced by that character and TRANS is set back to Qnil; any
        other entry is left in TRANS.

        If TABLE is a list, each element that is a char-table is
        consulted in order.  A character entry replaces C and the lookup
        goes on with the next table; any other non-nil entry ends the
        lookup and is left in TRANS.

        C and TRANS must be lvalues.  TABLE and C are evaluated more
        than once.  */
6573 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6574   do {                                                          \
6575     trans = Qnil;                                               \
6576     if (CHAR_TABLE_P (table))                                   \
6577       {                                                         \
6578         trans = CHAR_TABLE_REF (table, c);                      \
6579         if (CHARACTERP (trans))                                 \
6580           c = XFASTINT (trans), trans = Qnil;                   \
6581       }                                                         \
6582     else if (CONSP (table))                                     \
6583       {                                                         \
6584         Lisp_Object tail;                                       \
6585                                                                 \
6586         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6587           if (CHAR_TABLE_P (XCAR (tail)))                       \
6588             {                                                   \
6589               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6590               if (CHARACTERP (trans))                           \
6591                 c = XFASTINT (trans), trans = Qnil;             \
6592               else if (! NILP (trans))                          \
6593                 break;                                          \
6594             }                                                   \
6595       }                                                         \
6596   } while (0)
6597
6598
6599 /* Return a translation of character(s) at BUF according to TRANS.
6600    TRANS is TO-CHAR or ((FROM . TO) ...) where
6601    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6602    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6603    translation is found, and Qnil if not found.
6604    If BUF is too short to lookup characters in FROM, return Qt.
        BUF_END points just past the last character available at BUF.  */
6605
6606 static Lisp_Object
6607 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6608 {
6609
6610   if (INTEGERP (trans))
6611     return trans;
       /* Try each (FROM . TO) element in order and return the first one
          whose FROM matches the characters at BUF.  */
6612   for (; CONSP (trans); trans = XCDR (trans))
6613     {
6614       Lisp_Object val = XCAR (trans);
6615       Lisp_Object from = XCAR (val);
6616       int len = ASIZE (from);
6617       int i;
6618
6619       for (i = 0; i < len; i++)
6620         {
               /* BUF ran out while FROM still matched: the caller needs
                  more characters to decide.  */
6621           if (buf + i == buf_end)
6622             return Qt;
6623           if (XINT (AREF (from, i)) != buf[i])
6624             break;
6625         }
6626       if (i == len)
6627         return val;
6628     }
6629   return Qnil;
6630 }
6631
6632
     /* Write the characters decoded by CODING to its destination and
        return the number of elements of CODING->charbuf that were left
        unprocessed.

        If CODING->chars_at_source is zero, the characters are the first
        CODING->charbuf_used elements of CODING->charbuf; each
        non-negative element is translated by TRANSLATION_TABLE and
        stored, and each negative element is annotation data that is
        skipped.  A nonzero LAST_BLOCK means that a translation which
        would need more characters than are left must not wait for them.

        If CODING->chars_at_source is nonzero, the first CODING->consumed
        bytes at CODING->source are copied, converting between multibyte
        and unibyte form when the source and the destination differ in
        that respect; the return value is then 0.

        CODING->produced and CODING->produced_char are increased by what
        was written.  If CODING->dst_object is a buffer and at least one
        character was produced, insert_from_gap is called.  */
6633 static int
6634 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6635                int last_block)
6636 {
6637   unsigned char *dst = coding->destination + coding->produced;
6638   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6639   EMACS_INT produced;
6640   EMACS_INT produced_chars = 0;
6641   int carryover = 0;
6642
6643   if (! coding->chars_at_source)
6644     {
6645       /* Source characters are in coding->charbuf.  */
6646       int *buf = coding->charbuf;
6647       int *buf_end = buf + coding->charbuf_used;
6648
           /* When source and destination are the same object, the
              destination is limited to the part of the source that has
              already been consumed.  */
6649       if (EQ (coding->src_object, coding->dst_object))
6650         {
6651           coding_set_source (coding);
6652           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6653         }
6654
6655       while (buf < buf_end)
6656         {
6657           int c = *buf, i;
6658
6659           if (c >= 0)
6660             {
                   /* Number of characters taken from BUF and number of
                      characters written for them; a translation may
                      change either.  */
6661               EMACS_INT from_nchars = 1, to_nchars = 1;
6662               Lisp_Object trans = Qnil;
6663
6664               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6665               if (! NILP (trans))
6666                 {
6667                   trans = get_translation (trans, buf, buf_end);
6668                   if (INTEGERP (trans))
6669                     c = XINT (trans);
6670                   else if (CONSP (trans))
6671                     {
6672                       from_nchars = ASIZE (XCAR (trans));
6673                       trans = XCDR (trans);
6674                       if (INTEGERP (trans))
6675                         c = XINT (trans);
6676                       else
6677                         {
6678                           to_nchars = ASIZE (trans);
6679                           c = XINT (AREF (trans, 0));
6680                         }
6681                     }
                       /* More characters are needed to decide.  Unless
                          this is the last block, stop here and report
                          the rest of BUF as carryover.  */
6682                   else if (EQ (trans, Qt) && ! last_block)
6683                     break;
6684                 }
6685
                   /* Make sure there is room for TO_NCHARS characters.
                      alloc_destination is defined elsewhere; presumably
                      it enlarges the destination and returns the
                      updated write pointer -- confirm.  */
6686               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6687                 {
6688                   dst = alloc_destination (coding,
6689                                            buf_end - buf
6690                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6691                                            dst);
6692                   if (EQ (coding->src_object, coding->dst_object))
6693                     {
6694                       coding_set_source (coding);
6695                       dst_end = (((unsigned char *) coding->source)
6696                                  + coding->consumed);
6697                     }
6698                   else
6699                     dst_end = coding->destination + coding->dst_bytes;
6700                 }
6701
                   /* C already holds the first character to write; the
                      remaining ones are taken from the vector TRANS.  */
6702               for (i = 0; i < to_nchars; i++)
6703                 {
6704                   if (i > 0)
6705                     c = XINT (AREF (trans, i));
6706                   if (coding->dst_multibyte
6707                       || ! CHAR_BYTE8_P (c))
6708                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6709                   else
6710                     *dst++ = CHAR_TO_BYTE8 (c);
6711                 }
6712               produced_chars += to_nchars;
6713               buf += from_nchars;
6714             }
6715           else
6716             /* This is an annotation datum.  (-C) is the length.  */
6717             buf += -c;
6718         }
6719       carryover = buf_end - buf;
6720     }
6721   else
6722     {
6723       /* Source characters are at coding->source.  */
6724       const unsigned char *src = coding->source;
6725       const unsigned char *src_end = src + coding->consumed;
6726
6727       if (EQ (coding->dst_object, coding->src_object))
6728         dst_end = (unsigned char *) src;
6729       if (coding->src_multibyte != coding->dst_multibyte)
6730         {
6731           if (coding->src_multibyte)
6732             {
                   /* Multibyte source, unibyte destination: one byte is
                      stored per character.  ONE_MORE_BYTE is defined
                      elsewhere; presumably it uses MULTIBYTEP,
                      CONSUMED_CHARS and SRC_BASE and jumps to
                      no_more_source when the source is exhausted --
                      confirm.  */
6733               int multibytep = 1;
6734               EMACS_INT consumed_chars = 0;
6735
6736               while (1)
6737                 {
6738                   const unsigned char *src_base = src;
6739                   int c;
6740
6741                   ONE_MORE_BYTE (c);
6742                   if (dst == dst_end)
6743                     {
6744                       if (EQ (coding->src_object, coding->dst_object))
6745                         dst_end = (unsigned char *) src;
6746                       if (dst == dst_end)
6747                         {
                               /* Keep the offset: coding_set_source may
                                  change coding->source, so SRC and
                                  SRC_END are recomputed from it.  */
6748                           EMACS_INT offset = src - coding->source;
6749
6750                           dst = alloc_destination (coding, src_end - src + 1,
6751                                                    dst);
6752                           dst_end = coding->destination + coding->dst_bytes;
6753                           coding_set_source (coding);
6754                           src = coding->source + offset;
6755                           src_end = coding->source + coding->src_bytes;
6756                           if (EQ (coding->src_object, coding->dst_object))
6757                             dst_end = (unsigned char *) src;
6758                         }
6759                     }
6760                   *dst++ = c;
6761                   produced_chars++;
6762                 }
6763             no_more_source:
6764               ;
6765             }
6766           else
                 /* Unibyte source, multibyte destination.  EMIT_ONE_BYTE
                    is defined elsewhere; the check below keeps two bytes
                    of room for each source byte.  */
6767             while (src < src_end)
6768               {
6769                 int multibytep = 1;
6770                 int c = *src++;
6771
6772                 if (dst >= dst_end - 1)
6773                   {
6774                     if (EQ (coding->src_object, coding->dst_object))
6775                       dst_end = (unsigned char *) src;
6776                     if (dst >= dst_end - 1)
6777                       {
6778                         EMACS_INT offset = src - coding->source;
6779                         EMACS_INT more_bytes;
6780
6781                         if (EQ (coding->src_object, coding->dst_object))
6782                           more_bytes = ((src_end - src) / 2) + 2;
6783                         else
6784                           more_bytes = src_end - src + 2;
6785                         dst = alloc_destination (coding, more_bytes, dst);
6786                         dst_end = coding->destination + coding->dst_bytes;
6787                         coding_set_source (coding);
6788                         src = coding->source + offset;
6789                         src_end = coding->source + coding->src_bytes;
6790                         if (EQ (coding->src_object, coding->dst_object))
6791                           dst_end = (unsigned char *) src;
6792                       }
6793                   }
6794                 EMIT_ONE_BYTE (c);
6795               }
6796         }
6797       else
6798         {
               /* Source and destination have the same multibyteness:
                  copy the bytes as they are, and take the character
                  count from coding->consumed_char.  */
6799           if (!EQ (coding->src_object, coding->dst_object))
6800             {
6801               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6802
6803               if (require > 0)
6804                 {
6805                   EMACS_INT offset = src - coding->source;
6806
6807                   dst = alloc_destination (coding, require, dst);
6808                   coding_set_source (coding);
6809                   src = coding->source + offset;
6810                   src_end = coding->source + coding->src_bytes;
6811                 }
6812             }
6813           produced_chars = coding->consumed_char;
6814           while (src < src_end)
6815             *dst++ = *src++;
6816         }
6817     }
6818
       /* Number of bytes written by this call.  */
6819   produced = dst - (coding->destination + coding->produced);
6820   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6821     insert_from_gap (produced_chars, produced);
6822   coding->produced += produced;
6823   coding->produced_char += produced_chars;
6824   return carryover;
6825 }
6826
6827 /* Compose text in CODING->object according to the annotation data at
6828    CHARBUF.  CHARBUF is an array:
6829      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
        POS is the start of the text to compose; the composition covers
        the NCHARS characters from POS and is made in
        CODING->dst_object by compose_text.
6830  */
6831
6832 static inline void
6833 produce_composition (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6834 {
6835   int len;
6836   EMACS_INT to;
6837   enum composition_method method;
6838   Lisp_Object components;
6839
       /* LEN is the number of elements that follow the
          MAX_ANNOTATION_LENGTH leading elements of the annotation.  */
6840   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6841   to = pos + charbuf[2];
6842   method = (enum composition_method) (charbuf[4]);
6843
6844   if (method == COMPOSITION_RELATIVE)
6845     components = Qnil;
6846   else
6847     {
6848       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6849       int i, j;
6850
6851       if (method == COMPOSITION_WITH_RULE)
6852         len = charbuf[2] * 3 - 2;
6853       charbuf += MAX_ANNOTATION_LENGTH;
6854       /* charbuf = [ CHAR ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
           /* I indexes CHARBUF and J indexes ARGS.  A negative element
              is skipped and the element after it is stored modulo 0x100,
              so I runs ahead of J as soon as such an element occurs.  An
              element equal to -1 ends the components.  */
6855       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6856         {
6857           if (charbuf[i] >= 0)
6858             args[j] = make_number (charbuf[i]);
6859           else
6860             {
6861               i++;
6862               args[j] = make_number (charbuf[i] % 0x100);
6863             }
6864         }
           /* A string when no element was skipped, a vector otherwise.  */
6865       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6866     }
6867   compose_text (pos, to, components, Qnil, coding->dst_object);
6868 }
6869
6870
6871 /* Put `charset' property on text in CODING->object according to
6872    the annotation data at CHARBUF.  CHARBUF is an array:
6873      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
        POS is the end of the text: the property is put on the NCHARS
        characters that end at POS in CODING->dst_object, and its value
        is the name of the charset whose ID is CHARSET-ID.
6874  */
6875
6876 static inline void
6877 produce_charset (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6878 {
6879   EMACS_INT from = pos - charbuf[2];
6880   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6881
6882   Fput_text_property (make_number (from), make_number (pos),
6883                       Qcharset, CHARSET_NAME (charset),
6884                       coding->dst_object);
6885 }
6886
6887
     /* Number of `int' elements first requested for the conversion work
        area (CODING->charbuf).  */
6888 #define CHARBUF_SIZE 0x4000
6889
     /* Allocate the work area CODING->charbuf with alloca and record its
        size, in elements, in CODING->charbuf_size.  CHARBUF_SIZE
        elements are requested first, and the request is halved while it
        is larger than 1024 and the allocation yields NULL.

        Because alloca is used, the area belongs to the stack frame of
        the function in which this macro is expanded.  If no area was
        obtained, CODING_RESULT_INSUFFICIENT_MEM is recorded and the
        enclosing function returns CODING->result; the macro therefore
        contains a `return' and can be used only in a function whose
        return type accepts that value.

        NOTE(review): alloca is not usually specified to report failure
        by returning NULL, so the retry loop and the failure branch may
        never take effect -- confirm.  */
6890 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
6891   do {                                                                  \
6892     int size = CHARBUF_SIZE;                                            \
6893                                                                         \
6894     coding->charbuf = NULL;                                             \
6895     while (size > 1024)                                                 \
6896       {                                                                 \
6897         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
6898         if (coding->charbuf)                                            \
6899           break;                                                        \
6900         size >>= 1;                                                     \
6901       }                                                                 \
6902     if (! coding->charbuf)                                              \
6903       {                                                                 \
6904         record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6905         return coding->result;                                          \
6906       }                                                                 \
6907     coding->charbuf_size = size;                                        \
6908   } while (0)
6909
6910
     /* Apply the annotation data found in CODING->charbuf to the text in
        CODING->dst_object.  POS is the position that corresponds to the
        first element of CODING->charbuf; it is advanced by one for each
        non-negative element.  Nothing is done if CODING->dst_object is
        nil.  */
6911 static void
6912 produce_annotation (struct coding_system *coding, EMACS_INT pos)
6913 {
6914   int *charbuf = coding->charbuf;
6915   int *charbuf_end = charbuf + coding->charbuf_used;
6916
6917   if (NILP (coding->dst_object))
6918     return;
6919
6920   while (charbuf < charbuf_end)
6921     {
6922       if (*charbuf >= 0)
6923         pos++, charbuf++;
6924       else
6925         {
               /* A negative element starts an annotation; its negation
                  is the length of the annotation in elements.  */
6926           int len = -*charbuf;
6927
               /* The second element selects the kind of annotation;
                  unknown kinds are skipped.  */
6928           if (len > 2)
6929             switch (charbuf[1])
6930               {
6931               case CODING_ANNOTATE_COMPOSITION_MASK:
6932                 produce_composition (coding, charbuf, pos);
6933                 break;
6934               case CODING_ANNOTATE_CHARSET_MASK:
6935                 produce_charset (coding, charbuf, pos);
6936                 break;
6937               }
6938           charbuf += len;
6939         }
6940     }
6941 }
6942
6943 /* Decode the data at CODING->src_object into CODING->dst_object.
6944    CODING->src_object is a buffer, a string, or nil.
6945    CODING->dst_object is a buffer.
6946
6947    If CODING->src_object is a buffer, it must be the current buffer.
6948    In this case, if CODING->src_pos is positive, it is a position of
6949    the source text in the buffer, otherwise, the source text is in the
6950    gap area of the buffer, and CODING->src_pos specifies the offset of
6951    the text from GPT (which must be the same as PT).  If this is the
6952    same buffer as CODING->dst_object, CODING->src_pos must be
6953    negative.
6954
6955    If CODING->src_object is a string, CODING->src_pos is an index to
6956    that string.
6957
6958    If CODING->src_object is nil, CODING->source must already point to
6959    the non-relocatable memory area.  In this case, CODING->src_pos is
6960    an offset from CODING->source.
6961
6962    The decoded data is inserted at the current point of the buffer
6963    CODING->dst_object.

        The return value is CODING->result.  On return CODING->consumed
        is equal to CODING->src_bytes whenever some source bytes were
        left undecoded; such bytes are either flushed as binary
        characters (CODING_MODE_LAST_BLOCK is set in CODING->mode) or
        saved in CODING->carryover.
6964 */
6965
6966 static int
6967 decode_coding (struct coding_system *coding)
6968 {
6969   Lisp_Object attrs;
6970   Lisp_Object undo_list;
6971   Lisp_Object translation_table;
6972   struct ccl_spec cclspec;
6973   int carryover;
6974   int i;
6975
       /* If the source text in the buffer straddles the gap, move the
          gap to the start of the source text.  */
6976   if (BUFFERP (coding->src_object)
6977       && coding->src_pos > 0
6978       && coding->src_pos < GPT
6979       && coding->src_pos + coding->src_chars > GPT)
6980     move_gap_both (coding->src_pos, coding->src_pos_byte);
6981
       /* While decoding into a buffer, its undo list is replaced by Qt;
          it is put back at the end of this function, followed by a
          single record_insert for the whole decoded text.  */
6982   undo_list = Qt;
6983   if (BUFFERP (coding->dst_object))
6984     {
6985       if (current_buffer != XBUFFER (coding->dst_object))
6986         set_buffer_internal (XBUFFER (coding->dst_object));
6987       if (GPT != PT)
6988         move_gap_both (PT, PT_BYTE);
6989       undo_list = BVAR (current_buffer, undo_list);
6990       BVAR (current_buffer, undo_list) = Qt;
6991     }
6992
6993   coding->consumed = coding->consumed_char = 0;
6994   coding->produced = coding->produced_char = 0;
6995   coding->chars_at_source = 0;
6996   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6997   coding->errors = 0;
6998
       /* This macro returns from this function when it fails.  */
6999   ALLOC_CONVERSION_WORK_AREA (coding);
7000
7001   attrs = CODING_ID_ATTRS (coding->id);
7002   translation_table = get_translation_table (attrs, 0, NULL);
7003
7004   carryover = 0;
7005   if (coding->decoder == decode_coding_ccl)
7006     {
7007       coding->spec.ccl = &cclspec;
7008       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7009     }
       /* Each round calls the decoder, which fills coding->charbuf after
          the CARRYOVER elements kept from the previous round, writes the
          characters out with produce_chars, applies the annotations, and
          moves the elements produce_chars left unprocessed to the start
          of coding->charbuf.  */
7010   do
7011     {
           /* Position that corresponds to the first element of
              coding->charbuf in this round.  */
7012       EMACS_INT pos = coding->dst_pos + coding->produced_char;
7013
7014       coding_set_source (coding);
7015       coding->annotated = 0;
7016       coding->charbuf_used = carryover;
7017       (*(coding->decoder)) (coding);
7018       coding_set_destination (coding);
7019       carryover = produce_chars (coding, translation_table, 0);
7020       if (coding->annotated)
7021         produce_annotation (coding, pos);
7022       for (i = 0; i < carryover; i++)
7023         coding->charbuf[i]
7024           = coding->charbuf[coding->charbuf_used - carryover + i];
7025     }
       /* Go on while the destination was too small, or while source
          bytes remain and the last round ended with success or with an
          invalid-source result.  */
7026   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7027          || (coding->consumed < coding->src_bytes
7028              && (coding->result == CODING_RESULT_SUCCESS
7029                  || coding->result == CODING_RESULT_INVALID_SRC)));
7030
       /* Write out the characters that were still waiting for more
          input, this time as the last block.  */
7031   if (carryover > 0)
7032     {
7033       coding_set_destination (coding);
7034       coding->charbuf_used = carryover;
7035       produce_chars (coding, translation_table, 1);
7036     }
7037
7038   coding->carryover_bytes = 0;
7039   if (coding->consumed < coding->src_bytes)
7040     {
           /* Number of source bytes the decoder did not consume.  */
7041       int nbytes = coding->src_bytes - coding->consumed;
7042       const unsigned char *src;
7043
7044       coding_set_source (coding);
7045       coding_set_destination (coding);
7046       src = coding->source + coding->consumed;
7047
7048       if (coding->mode & CODING_MODE_LAST_BLOCK)
7049         {
7050           /* Flush out unprocessed data as binary chars.  We are sure
7051              that the number of data is less than the size of
7052              coding->charbuf.  */
7053           coding->charbuf_used = 0;
7054           coding->chars_at_source = 0;
7055
7056           while (nbytes-- > 0)
7057             {
7058               int c = *src++;
7059
7060               if (c & 0x80)
7061                 c = BYTE8_TO_CHAR (c);
7062               coding->charbuf[coding->charbuf_used++] = c;
7063             }
               /* No translation table is applied to these bytes.  */
7064           produce_chars (coding, Qnil, 1);
7065         }
7066       else
7067         {
7068           /* Record unprocessed bytes in coding->carryover.  We are
7069              sure that the number of data is less than the size of
7070              coding->carryover.  */
7071           unsigned char *p = coding->carryover;
7072
               /* Still, never copy more than coding->carryover holds.  */
7073           if (nbytes > sizeof coding->carryover)
7074             nbytes = sizeof coding->carryover;
7075           coding->carryover_bytes = nbytes;
7076           while (nbytes-- > 0)
7077             *p++ = *src++;
7078         }
7079       coding->consumed = coding->src_bytes;
7080     }
7081
7082   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7083       && !inhibit_eol_conversion)
7084     decode_eol (coding);
7085   if (BUFFERP (coding->dst_object))
7086     {
7087       BVAR (current_buffer, undo_list) = undo_list;
7088       record_insert (coding->dst_pos, coding->produced_char);
7089     }
7090   return coding->result;
7091 }
7092
7093
7094 /* Extract an annotation datum from a composition starting at POS and
7095    ending before LIMIT of CODING->src_object (buffer or string), store
7096    the data in BUF, set *STOP to a starting position of the next
7097    composition (if any) or to LIMIT, and return the address of the
7098    next element of BUF.
7099
7100    If such an annotation is not found, set *STOP to a starting
7101    position of a composition after POS (if any) or to LIMIT, and
7102    return BUF.
     
        When the composition method is not COMPOSITION_RELATIVE, the
        component characters are appended to BUF right after the data
        written by ADD_COMPOSITION_DATA.  The components may be a vector,
        a string, an integer or a list; any other type aborts.  */
7103
7104 static inline int *
7105 handle_composition_annotation (EMACS_INT pos, EMACS_INT limit,
7106                                struct coding_system *coding, int *buf,
7107                                EMACS_INT *stop)
7108 {
7109   EMACS_INT start, end;
7110   Lisp_Object prop;
7111
       /* Either find_composition reported nothing, or what it found runs
          past LIMIT: there is nothing to annotate before LIMIT.  */
7112   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7113       || end > limit)
7114     *stop = limit;
7115   else if (start > pos)
         /* The composition begins after POS; stop again when we get there.  */
7116     *stop = start;
7117   else
7118     {
           /* Here START <= POS.  Only a composition that begins exactly at
              POS is annotated; otherwise just look for the following one.  */
7119       if (start == pos)
7120         {
7121           /* We found a composition.  Store the corresponding
7122              annotation data in BUF.  */
               /* HEAD remembers where the annotation starts so that its
                  first element can be corrected below.  */
7123           int *head = buf;
7124           enum composition_method method = COMPOSITION_METHOD (prop);
7125           int nchars = COMPOSITION_LENGTH (prop);
7126
7127           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7128           if (method != COMPOSITION_RELATIVE)
7129             {
7130               Lisp_Object components;
7131               int len, i, i_byte;
7132
                   /* Append every component to BUF; LEN ends up as the
                      number of elements appended.  */
7133               components = COMPOSITION_COMPONENTS (prop);
7134               if (VECTORP (components))
7135                 {
7136                   len = ASIZE (components);
7137                   for (i = 0; i < len; i++)
7138                     *buf++ = XINT (AREF (components, i));
7139                 }
7140               else if (STRINGP (components))
7141                 {
7142                   len = SCHARS (components);
7143                   i = i_byte = 0;
                       /* FETCH_STRING_CHAR_ADVANCE is relied on to advance
                          I (and I_BYTE), which terminates the loop.  */
7144                   while (i < len)
7145                     {
7146                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7147                       buf++;
7148                     }
7149                 }
7150               else if (INTEGERP (components))
7151                 {
7152                   len = 1;
7153                   *buf++ = XINT (components);
7154                 }
7155               else if (CONSP (components))
7156                 {
7157                   for (len = 0; CONSP (components);
7158                        len++, components = XCDR (components))
7159                     *buf++ = XINT (XCAR (components));
7160                 }
7161               else
7162                 abort ();
                   /* Fold the LEN appended elements into the first element
                      of the annotation.  NOTE(review): presumably that
                      element holds the negated annotation length written
                      by ADD_COMPOSITION_DATA -- confirm against the macro.  */
7163               *head -= len;
7164             }
7165         }
7166
           /* Look for the composition that follows the current one, again
              ignoring any that would run past LIMIT.  */
7167       if (find_composition (end, limit, &start, &end, &prop,
7168                             coding->src_object)
7169           && end <= limit)
7170         *stop = start;
7171       else
7172         *stop = limit;
7173     }
7174   return buf;
7175 }
7176
7177
7178 /* Append to BUF an annotation datum describing the `charset' text
7179    property found at POS of CODING->src_object (buffer or string).
7180    The datum is written by ADD_CHARSET_DATA and carries the charset
7181    ID of the property value, or -1 when the value is nil or is not
7182    a charset.  Set *STOP to the next position at which the `charset'
7183    property changes, as reported by Fnext_single_property_change
7184    with LIMIT as its limit.  Return BUF as left by ADD_CHARSET_DATA
7185    (presumably the address of the next free element).  */
7186
7187 static inline int *
7188 handle_charset_annotation (EMACS_INT pos, EMACS_INT limit,
7189                            struct coding_system *coding, int *buf,
7190                            EMACS_INT *stop)
7191 {
7192   Lisp_Object position = make_number (pos);
7193   Lisp_Object charset
7194     = Fget_text_property (position, Qcharset, coding->src_object);
7195   Lisp_Object change;
7196   int charset_id = -1;
7197
7198   /* Only a real charset has an ID; anything else is recorded as -1.  */
7199   if (! NILP (charset) && CHARSETP (charset))
7200     charset_id = XINT (CHARSET_SYMBOL_ID (charset));
7201   ADD_CHARSET_DATA (buf, 0, charset_id);
7202   change = Fnext_single_property_change (position, Qcharset,
7203                                          coding->src_object, make_number (limit));
7204   *stop = XINT (change);
7205   return buf;
7206 }
7207
7208 /* Load characters from CODING->source into CODING->charbuf for the
        encoder, starting at byte offset CODING->consumed and stopping
        when the source is exhausted or the work buffer is nearly full.

        While loading:
        - if CODING->src_object is non-nil and CODING_ANNOTATE_CHARSET_MASK
          is set, charset annotations are inserted via
          handle_charset_annotation (composition annotation is disabled
          here);
        - '\r' becomes '\n' under CODING_MODE_SELECTIVE_DISPLAY;
        - '\n' is converted according to the EOL type: "\r\n" for Qdos,
          '\r' for any other type that is not Qunix;
        - each character is looked up in TRANSLATION_TABLE if that is
          non-nil.  MAX_LOOKUP is the number of characters (the current
          one included) that may be handed to get_translation.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used describe the progress made, and
        CODING->chars_at_source is 0.  */
7209 static void
7210 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7211                int max_lookup)
7212 {
7213   int *buf = coding->charbuf;
7214   int *buf_end = coding->charbuf + coding->charbuf_size;
7215   const unsigned char *src = coding->source + coding->consumed;
7216   const unsigned char *src_end = coding->source + coding->src_bytes;
7217   EMACS_INT pos = coding->src_pos + coding->consumed_char;
7218   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7219   int multibytep = coding->src_multibyte;
7220   Lisp_Object eol_type;
7221   int c;
7222   EMACS_INT stop, stop_composition, stop_charset;
7223   int *lookup_buf = NULL;
7224
       /* The look-ahead buffer is needed only when translating.  */
7225   if (! NILP (translation_table))
7226     lookup_buf = alloca (sizeof (int) * max_lookup);
7227
       /* No EOL conversion is done when it is inhibited or when the EOL
          type is a vector; both cases are handled as Qunix below.  */
7228   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7229   if (VECTORP (eol_type))
7230     eol_type = Qunix;
7231
7232   /* Note: composition handling is not yet implemented.  */
       /* With the flag cleared, the composition test below is always
          false and STOP_COMPOSITION stays at END_POS.  */
7233   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7234
       /* STOP is the next position at which an annotation handler has to
          run.  With a nil source object no handler ever runs.  */
7235   if (NILP (coding->src_object))
7236     stop = stop_composition = stop_charset = end_pos;
7237   else
7238     {
7239       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7240         stop = stop_composition = pos;
7241       else
7242         stop = stop_composition = end_pos;
7243       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7244         stop = stop_charset = pos;
7245       else
7246         stop_charset = end_pos;
7247     }
7248
7249   /* Compensate for CRLF and conversion.  */
       /* One slot is kept back for the extra '\r' stored by the Qdos case
          below.  NOTE(review): MAX_ANNOTATION_LENGTH presumably bounds
          what an annotation handler may append -- confirm.  */
7250   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7251   while (buf < buf_end)
7252     {
7253       Lisp_Object trans;
7254
7255       if (pos == stop)
7256         {
               /* All source characters have been read.  */
7257           if (pos == end_pos)
7258             break;
7259           if (pos == stop_composition)
7260             buf = handle_composition_annotation (pos, end_pos, coding,
7261                                                  buf, &stop_composition);
7262           if (pos == stop_charset)
7263             buf = handle_charset_annotation (pos, end_pos, coding,
7264                                              buf, &stop_charset);
               /* Next stop is whichever handler is due first.  */
7265           stop = (stop_composition < stop_charset
7266                   ? stop_composition : stop_charset);
7267         }
7268
           /* Fetch the next character C and advance SRC and POS.  */
7269       if (! multibytep)
7270         {
7271           EMACS_INT bytes;
7272
               /* Unibyte source: the raw-text and CCL encoders take every
                  byte as is.  Otherwise a valid multibyte sequence is read
                  as one character (POS advancing by its byte length), and
                  any other byte becomes a BYTE8 character.  */
7273           if (coding->encoder == encode_coding_raw_text
7274               || coding->encoder == encode_coding_ccl)
7275             c = *src++, pos++;
7276           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7277             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7278           else
7279             c = BYTE8_TO_CHAR (*src), src++, pos++;
7280         }
7281       else
7282         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7283       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7284         c = '\n';
7285       if (! EQ (eol_type, Qunix))
7286         {
7287           if (c == '\n')
7288             {
                   /* Qdos: store '\r' now, the '\n' follows as C.
                      Any other type: the newline itself becomes '\r'.  */
7289               if (EQ (eol_type, Qdos))
7290                 *buf++ = '\r';
7291               else
7292                 c = '\r';
7293             }
7294         }
7295
7296       trans = Qnil;
7297       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7298       if (NILP (trans))
7299         *buf++ = c;
7300       else
7301         {
7302           int from_nchars = 1, to_nchars = 1;
7303           int *lookup_buf_end;
7304           const unsigned char *p = src;
7305           int i;
7306
               /* Collect C and up to MAX_LOOKUP - 1 following characters.
                  They are read through P, so SRC is not advanced yet.  */
7307           lookup_buf[0] = c;
7308           for (i = 1; i < max_lookup && p < src_end; i++)
7309             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7310           lookup_buf_end = lookup_buf + i;
7311           trans = get_translation (trans, lookup_buf, lookup_buf_end);
               /* The result is handled as an integer (one replacement
                  character) or as a cons whose car is a vector (its size
                  is the number of source characters replaced) and whose
                  cdr is an integer or a vector of replacements.  */
7312           if (INTEGERP (trans))
7313             c = XINT (trans);
7314           else if (CONSP (trans))
7315             {
7316               from_nchars = ASIZE (XCAR (trans));
7317               trans = XCDR (trans);
7318               if (INTEGERP (trans))
7319                 c = XINT (trans);
7320               else
7321                 {
7322                   to_nchars = ASIZE (trans);
                       /* NOTE(review): on this break, and on the one
                          below, C has already been read (SRC and POS are
                          past it) but has not been stored in BUF; confirm
                          that the character is not lost.  */
7323                   if (buf + to_nchars > buf_end)
7324                     break;
7325                   c = XINT (AREF (trans, 0));
7326                 }
7327             }
7328           else
7329             break;
7330           *buf++ = c;
7331           for (i = 1; i < to_nchars; i++)
7332             *buf++ = XINT (AREF (trans, i));
               /* Skip the remaining source characters that the translation
                  has replaced.  */
7333           for (i = 1; i < from_nchars; i++, pos++)
7334             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7335         }
7336     }
7337
       /* Publish the progress made back to CODING.  */
7338   coding->consumed = src - coding->source;
7339   coding->consumed_char = pos - coding->src_pos;
7340   coding->charbuf_used = buf - coding->charbuf;
7341   coding->chars_at_source = 0;
7342 }
7343
7344
7345 /* Encode the text at CODING->src_object into CODING->dst_object.
7346    CODING->src_object is a buffer or a string.
7347    CODING->dst_object is a buffer or nil.
7348
7349    If CODING->src_object is a buffer, it must be the current buffer.
7350    In this case, if CODING->src_pos is positive, it is a position of
7351    the source text in the buffer, otherwise, the source text is in the
7352    gap area of the buffer, and coding->src_pos specifies the offset of
7353    the text from GPT (which must be the same as PT).  If this is the
7354    same buffer as CODING->dst_object, CODING->src_pos must be
7355    negative and CODING should not have `pre-write-conversion'.
7356
7357    If CODING->src_object is a string, CODING should not have
7358    `pre-write-conversion'.
7359
7360    If CODING->dst_object is a buffer, the encoded data is inserted at
7361    the current point of that buffer.
7362
7363    If CODING->dst_object is nil, the encoded data is placed at the
7364    memory area specified by CODING->destination.
     
        The value returned is CODING->result.  */
7365
7366 static int
7367 encode_coding (struct coding_system *coding)
7368 {
7369   Lisp_Object attrs;
7370   Lisp_Object translation_table;
7371   int max_lookup;
7372   struct ccl_spec cclspec;
7373
7374   attrs = CODING_ID_ATTRS (coding->id);
       /* The raw-text encoder never translates.  */
7375   if (coding->encoder == encode_coding_raw_text)
7376     translation_table = Qnil, max_lookup = 0;
7377   else
7378     translation_table = get_translation_table (attrs, 1, &max_lookup);
7379
       /* A destination buffer becomes the current buffer and decides
          whether the output is multibyte.  */
7380   if (BUFFERP (coding->dst_object))
7381     {
7382       set_buffer_internal (XBUFFER (coding->dst_object));
7383       coding->dst_multibyte
7384         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7385     }
7386
       /* Start from a clean state.  */
7387   coding->consumed = coding->consumed_char = 0;
7388   coding->produced = coding->produced_char = 0;
7389   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7390   coding->errors = 0;
7391
       /* NOTE(review): presumably sets up CODING->charbuf, which
          consume_chars fills -- confirm against the macro.  */
7392   ALLOC_CONVERSION_WORK_AREA (coding);
7393
       /* The CCL state lives in the local CCLSPEC, so CODING->spec.ccl is
          only meaningful while this function is running.  */
7394   if (coding->encoder == encode_coding_ccl)
7395     {
7396       coding->spec.ccl = &cclspec;
7397       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7398     }
       /* Alternate between loading characters into the work area and
          encoding them, until every source character has been consumed.  */
7399   do {
7400     coding_set_source (coding);
7401     consume_chars (coding, translation_table, max_lookup);
7402     coding_set_destination (coding);
7403     (*(coding->encoder)) (coding);
7404   } while (coding->consumed_char < coding->src_chars);
7405
       /* For a buffer destination, account for the produced text with
          insert_from_gap.  */
7406   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7407     insert_from_gap (coding->produced_char, coding->produced);
7408
7409   return (coding->result);
7410 }
7411
7412
7413 /* Name (or base name) of work buffer for code conversion.  */
7414 static Lisp_Object Vcode_conversion_workbuf_name;
7415
7416 /* A working buffer used by the top level conversion.  Once it is
7417    created, it is never destroyed by the conversion code; if it is
7418    found dead, make_conversion_work_buffer creates it again.  It has
7419    the name Vcode_conversion_workbuf_name.  The other working buffers
7420    are destroyed after the use is finished, and their names are
        modified versions of Vcode_conversion_workbuf_name.  */
7421 static Lisp_Object Vcode_conversion_reused_workbuf;
7422
7423 /* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
        make_conversion_work_buffer increments it on every call, so it
        can exceed 1 while conversions are nested;
        code_conversion_restore resets it to 0 when the reused buffer is
        released.  */
7424 static int reused_workbuf_in_use;
7425
7426
7427 /* Return a buffer to serve as the work area of a code conversion.
7428    It is erased and is made multibyte iff MULTIBYTE is nonzero.  */
7429
7430 static Lisp_Object
7431 make_conversion_work_buffer (int multibyte)
7432 {
7433   int nested = reused_workbuf_in_use++;
7434   struct buffer *previous;
7435   Lisp_Object workbuf;
7436
7437   if (nested)
7438     /* The shared buffer is taken: make one under a fresh name.  */
7439     workbuf = Fget_buffer_create
7440       (Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil));
7441   else
7442     {
7443       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7444         Vcode_conversion_reused_workbuf
7445           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7446       workbuf = Vcode_conversion_reused_workbuf;
7447     }
7448   previous = current_buffer;
7449   set_buffer_internal (XBUFFER (workbuf));
7450   /* Modification hooks must not be allowed to run in the work
7451      buffer.  For instance, directory_files_internal assumes that
7452      file decoding doesn't compile new regexps.  */
7453   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7454   Ferase_buffer ();
7455   BVAR (current_buffer, undo_list) = Qt;
7456   BVAR (current_buffer, enable_multibyte_characters) = multibyte ? Qt : Qnil;
7457   set_buffer_internal (previous);
7458   return workbuf;
7459 }
7460
7461 /* Unwind handler registered by code_conversion_save.  ARG is the cons
        (BUFFER . WORKBUF) built there: release WORKBUF unless it is nil,
        then make BUFFER current again.  Always return Qnil.  */
7462 static Lisp_Object
7463 code_conversion_restore (Lisp_Object arg)
7464 {
7465   struct gcpro gcpro1;
7466   Lisp_Object saved_buffer, work_buffer;
7467
7468   GCPRO1 (arg);
7469   saved_buffer = XCAR (arg);
7470   work_buffer = XCDR (arg);
7471   if (NILP (work_buffer))
7472     ;                           /* No work buffer to release.  */
7473   else if (EQ (work_buffer, Vcode_conversion_reused_workbuf))
7474     /* The shared buffer is kept; just mark it as free.  */
7475     reused_workbuf_in_use = 0;
7476   else if (! NILP (Fbuffer_live_p (work_buffer)))
7477     Fkill_buffer (work_buffer);
7478   set_buffer_internal (XBUFFER (saved_buffer));
7479   UNGCPRO;
7480   return Qnil;
7481 }
7482 /* Register code_conversion_restore as an unwind handler for the
        current buffer.  If WITH_WORK_BUF is nonzero, also obtain a work
        buffer from make_conversion_work_buffer (passing MULTIBYTE on) and
        return it; otherwise return Qnil.  */
7483 Lisp_Object
7484 code_conversion_save (int with_work_buf, int multibyte)
7485 {
7486   Lisp_Object workbuf
7487     = with_work_buf ? make_conversion_work_buffer (multibyte) : Qnil;
7488   /* code_conversion_restore receives (CURRENT-BUFFER . WORKBUF).  */
7489   Lisp_Object unwind_data = Fcons (Fcurrent_buffer (), workbuf);
7490
7491   record_unwind_protect (code_conversion_restore, unwind_data);
7492   return workbuf;
7493 }
7494 /* Decode CHARS characters (BYTES bytes) of source text with CODING,
        using the current buffer both as source and as destination; the
        decoded text goes to point (PT/PT_BYTE).  The source position is
        given as the negative offsets -CHARS/-BYTES.  NOTE(review):
        presumably this means the text sits in the gap, as described for
        encode_coding -- confirm with the callers.

        The coding system is detected first if CODING requires it, and
        the whole text is treated as the last block.  If the coding
        system has a post-read function, it is called with the number of
        characters produced and the produced counts are then adjusted by
        the change in buffer size.  Return CODING->result.  */
7495 int
7496 decode_coding_gap (struct coding_system *coding,
7497                    EMACS_INT chars, EMACS_INT bytes)
7498 {
7499   int count = SPECPDL_INDEX ();
7500   Lisp_Object attrs;
7501
       /* No work buffer is needed; this only arranges for the current
          buffer to be restored by unbind_to below.  */
7502   code_conversion_save (0, 0);
7503
7504   coding->src_object = Fcurrent_buffer ();
7505   coding->src_chars = chars;
7506   coding->src_bytes = bytes;
7507   coding->src_pos = -chars;
7508   coding->src_pos_byte = -bytes;
       /* More bytes than characters means the source is multibyte.  */
7509   coding->src_multibyte = chars < bytes;
7510   coding->dst_object = coding->src_object;
7511   coding->dst_pos = PT;
7512   coding->dst_pos_byte = PT_BYTE;
7513   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7514
7515   if (CODING_REQUIRE_DETECTION (coding))
7516     detect_coding (coding);
7517
7518   coding->mode |= CODING_MODE_LAST_BLOCK;
       /* The buffer text must not be shrunk while decode_coding runs.  */
7519   current_buffer->text->inhibit_shrinking = 1;
7520   decode_coding (coding);
7521   current_buffer->text->inhibit_shrinking = 0;
7522
7523   attrs = CODING_ID_ATTRS (coding->id);
7524   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7525     {
7526       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7527       Lisp_Object val;
7528
           /* Run the post-read function with point at the start of the
              decoded text.  Its return value must be a natural number,
              but the size change is taken from Z and Z_BYTE instead.  */
7529       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7530       val = call1 (CODING_ATTR_POST_READ (attrs),
7531                    make_number (coding->produced_char));
7532       CHECK_NATNUM (val);
7533       coding->produced_char += Z - prev_Z;
7534       coding->produced += Z_BYTE - prev_Z_BYTE;
7535     }
7536
7537   unbind_to (count, Qnil);
7538   return coding->result;
7539 }
7540
7541
7542 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7543    SRC_OBJECT into DST_OBJECT by coding context CODING.
7544
7545    SRC_OBJECT is a buffer, a string, or Qnil.
7546
7547    If it is a buffer, the text is at point of the buffer.  FROM and TO
7548    are positions in the buffer.
7549
7550    If it is a string, the text is at the beginning of the string.
7551    FROM and TO are indices to the string.
7552
7553    If it is nil, the text is at coding->source.  FROM and TO are
7554    indices to coding->source.
7555
7556    DST_OBJECT is a buffer, Qt, or Qnil.
7557
7558    If it is a buffer, the decoded text is inserted at point of the
7559    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7560    is deleted.
7561
7562    If it is Qt, a string is made from the decoded text, and
7563    set in CODING->dst_object.
7564
7565    If it is Qnil, the decoded text is stored at CODING->destination.
7566    The caller must allocate CODING->dst_bytes bytes at
7567    CODING->destination by xmalloc.  If the decoded text is longer than
7568    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
     
        When SRC_OBJECT and DST_OBJECT are the same buffer, point and the
        markers that sat at the edges of the replaced range are restored
        after decoding.  Vdeactivate_mark is preserved across the call.
7569  */
7570
7571 void
7572 decode_coding_object (struct coding_system *coding,
7573                       Lisp_Object src_object,
7574                       EMACS_INT from, EMACS_INT from_byte,
7575                       EMACS_INT to, EMACS_INT to_byte,
7576                       Lisp_Object dst_object)
7577 {
7578   int count = SPECPDL_INDEX ();
7579   unsigned char *destination IF_LINT (= NULL);
7580   EMACS_INT dst_bytes IF_LINT (= 0);
7581   EMACS_INT chars = to - from;
7582   EMACS_INT bytes = to_byte - from_byte;
7583   Lisp_Object attrs;
       /* These hold PT and PT_BYTE, so they must be as wide as buffer
          positions; plain `int' would truncate them where EMACS_INT is
          wider.  SAVED_PT stays negative unless point was moved.  */
7584   EMACS_INT saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7585   int need_marker_adjustment = 0;
7586   Lisp_Object old_deactivate_mark;
7587
7588   old_deactivate_mark = Vdeactivate_mark;
7589
       /* Remember the caller's memory area; it is needed again after
          decoding if the result had to go through a work buffer.  */
7590   if (NILP (dst_object))
7591     {
7592       destination = coding->destination;
7593       dst_bytes = coding->dst_bytes;
7594     }
7595
7596   coding->src_object = src_object;
7597   coding->src_chars = chars;
7598   coding->src_bytes = bytes;
       /* More bytes than characters means the source is multibyte.  */
7599   coding->src_multibyte = chars < bytes;
7600
7601   if (STRINGP (src_object))
7602     {
7603       coding->src_pos = from;
7604       coding->src_pos_byte = from_byte;
7605     }
7606   else if (BUFFERP (src_object))
7607     {
7608       set_buffer_internal (XBUFFER (src_object));
7609       if (from != GPT)
7610         move_gap_both (from, from_byte);
7611       if (EQ (src_object, dst_object))
7612         {
7613           struct Lisp_Marker *tail;
7614
               /* Flag the markers sitting at the edge of the range (FROM
                  or TO depending on the insertion type) so that they can
                  be put back once the text has been replaced.  */
7615           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7616             {
7617               tail->need_adjustment
7618                 = tail->charpos == (tail->insertion_type ? from : to);
7619               need_marker_adjustment |= tail->need_adjustment;
7620             }
               /* Delete the source text from the buffer and hand it to
                  the decoder through negative source offsets.  */
7621           saved_pt = PT, saved_pt_byte = PT_BYTE;
7622           TEMP_SET_PT_BOTH (from, from_byte);
7623           current_buffer->text->inhibit_shrinking = 1;
7624           del_range_both (from, from_byte, to, to_byte, 1);
7625           coding->src_pos = -chars;
7626           coding->src_pos_byte = -bytes;
7627         }
7628       else
7629         {
7630           coding->src_pos = from;
7631           coding->src_pos_byte = from_byte;
7632         }
7633     }
7634
7635   if (CODING_REQUIRE_DETECTION (coding))
7636     detect_coding (coding);
7637   attrs = CODING_ID_ATTRS (coding->id);
7638
       /* Decode into a work buffer when a string is wanted, and also when
          a post-read function has to run on a memory-area result.  */
7639   if (EQ (dst_object, Qt)
7640       || (! NILP (CODING_ATTR_POST_READ (attrs))
7641           && NILP (dst_object)))
7642     {
7643       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7644       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7645       coding->dst_pos = BEG;
7646       coding->dst_pos_byte = BEG_BYTE;
7647     }
7648   else if (BUFFERP (dst_object))
7649     {
7650       code_conversion_save (0, 0);
7651       coding->dst_object = dst_object;
7652       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7653       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7654       coding->dst_multibyte
7655         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7656     }
7657   else
7658     {
7659       code_conversion_save (0, 0);
7660       coding->dst_object = Qnil;
7661       /* Most callers presume this will return a multibyte result, and they
7662          won't use `binary' or `raw-text' anyway, so let's not worry about
7663          CODING_FOR_UNIBYTE.  */
7664       coding->dst_multibyte = 1;
7665     }
7666
7667   decode_coding (coding);
7668
7669   if (BUFFERP (coding->dst_object))
7670     set_buffer_internal (XBUFFER (coding->dst_object));
7671
7672   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7673     {
7674       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7675       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7676       Lisp_Object val;
7677
           /* Run the post-read function with point at the start of the
              decoded text.  Its return value must be a natural number,
              but the size change is taken from Z and Z_BYTE instead.  */
7678       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7679       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7680               old_deactivate_mark);
7681       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7682                         make_number (coding->produced_char));
7683       UNGCPRO;
7684       CHECK_NATNUM (val);
7685       coding->produced_char += Z - prev_Z;
7686       coding->produced += Z_BYTE - prev_Z_BYTE;
7687     }
7688
7689   if (EQ (dst_object, Qt))
7690     {
7691       coding->dst_object = Fbuffer_string ();
7692     }
7693   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7694     {
           /* The result was produced in a work buffer; copy it out to the
              caller's memory area.  NOTE(review): the copy happens only
              when the area has to grow -- confirm that callers never
              pass an area that is already large enough.  */
7695       set_buffer_internal (XBUFFER (coding->dst_object));
7696       if (dst_bytes < coding->produced)
7697         {
7698           destination = xrealloc (destination, coding->produced);
7699           if (! destination)
7700             {
7701               record_conversion_result (coding,
7702                                         CODING_RESULT_INSUFFICIENT_MEM);
7703               unbind_to (count, Qnil);
7704               return;
7705             }
               /* Make the produced text contiguous before copying.  */
7706           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7707             move_gap_both (BEGV, BEGV_BYTE);
7708           memcpy (destination, BEGV_ADDR, coding->produced);
7709           coding->destination = destination;
7710         }
7711     }
7712
7713   if (saved_pt >= 0)
7714     {
7715       /* This is the case of:
7716          (BUFFERP (src_object) && EQ (src_object, dst_object))
7717          As we have moved PT while replacing the original buffer
7718          contents, we must recover it now.  */
7719       set_buffer_internal (XBUFFER (src_object));
7720       current_buffer->text->inhibit_shrinking = 0;
           /* Point before the range is unchanged, point inside it goes to
              FROM, and point after it shifts by the change in size.  */
7721       if (saved_pt < from)
7722         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7723       else if (saved_pt < from + chars)
7724         TEMP_SET_PT_BOTH (from, from_byte);
7725       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7726         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7727                           saved_pt_byte + (coding->produced - bytes));
7728       else
7729         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7730                           saved_pt_byte + (coding->produced - bytes));
7731
7732       if (need_marker_adjustment)
7733         {
7734           struct Lisp_Marker *tail;
7735
               /* Put every flagged marker back: at FROM if it has the
                  insertion type set, otherwise just after the decoded
                  text.  */
7736           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7737             if (tail->need_adjustment)
7738               {
7739                 tail->need_adjustment = 0;
7740                 if (tail->insertion_type)
7741                   {
7742                     tail->bytepos = from_byte;
7743                     tail->charpos = from;
7744                   }
7745                 else
7746                   {
7747                     tail->bytepos = from_byte + coding->produced;
7748                     tail->charpos
7749                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7750                          ? tail->bytepos : from + coding->produced_char);
7751                   }
7752               }
7753         }
7754     }
7755
7756   Vdeactivate_mark = old_deactivate_mark;
7757   unbind_to (count, coding->dst_object);
7758 }
7759
7760 /* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
        SRC_OBJECT into DST_OBJECT by coding context CODING.

        SRC_OBJECT is a buffer or a string; FROM and TO are then positions
        in the buffer or indices to the string.  Any other value is only
        recorded in CODING->src_object.

        If the coding system has a pre-write function, the text is first
        copied into a work buffer and the function is called there with
        BEG and Z; whatever buffer is current afterwards supplies the
        source.  If that is not the work buffer, it is killed before
        returning.

        If DST_OBJECT is a buffer, it is recorded as the destination: at
        FROM when it is the same buffer as SRC_OBJECT (the source range
        is then deleted first), otherwise at point of that buffer.  If it
        is Qt, the result is returned in CODING->dst_object as a unibyte
        string.  Otherwise CODING->dst_object is set to Qnil.
        NOTE(review): in that last case the caller presumably provides
        CODING->destination -- confirm.

        When SRC_OBJECT and DST_OBJECT are the same buffer, point and the
        markers that sat at the edges of the replaced range are restored
        after encoding.  Vdeactivate_mark is preserved across the call.  */
7761 void
7762 encode_coding_object (struct coding_system *coding,
7763                       Lisp_Object src_object,
7764                       EMACS_INT from, EMACS_INT from_byte,
7765                       EMACS_INT to, EMACS_INT to_byte,
7766                       Lisp_Object dst_object)
7767 {
7768   int count = SPECPDL_INDEX ();
7769   EMACS_INT chars = to - from;
7770   EMACS_INT bytes = to_byte - from_byte;
7771   Lisp_Object attrs;
       /* These hold PT and PT_BYTE, so they must be as wide as buffer
          positions; plain `int' would truncate them where EMACS_INT is
          wider.  SAVED_PT stays negative unless point was saved.  */
7772   EMACS_INT saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7773   int need_marker_adjustment = 0;
7774   int kill_src_buffer = 0;
7775   Lisp_Object old_deactivate_mark;
7776
7777   old_deactivate_mark = Vdeactivate_mark;
7778
7779   coding->src_object = src_object;
7780   coding->src_chars = chars;
7781   coding->src_bytes = bytes;
       /* More bytes than characters means the source is multibyte.  */
7782   coding->src_multibyte = chars < bytes;
7783
7784   attrs = CODING_ID_ATTRS (coding->id);
7785
       /* In-place conversion of a buffer: flag the markers sitting at the
          edge of the range so that they can be put back afterwards.  The
          BUFFERP test matters because two Qnil objects are EQ as well;
          flags set in that case would never be cleared.  The markers are
          those of SRC_OBJECT, the buffer the fix-up loop below works on.  */
7786   if (BUFFERP (src_object) && EQ (src_object, dst_object))
7787     {
7788       struct Lisp_Marker *tail;
7789
7790       for (tail = BUF_MARKERS (XBUFFER (src_object)); tail; tail = tail->next)
7791         {
7792           tail->need_adjustment
7793             = tail->charpos == (tail->insertion_type ? from : to);
7794           need_marker_adjustment |= tail->need_adjustment;
7795         }
7796     }
7797
7798   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7799     {
           /* Copy the source text into a work buffer for the pre-write
              function to operate on.  */
7800       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7801       set_buffer_internal (XBUFFER (coding->src_object));
7802       if (STRINGP (src_object))
7803         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7804       else if (BUFFERP (src_object))
7805         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7806       else
7807         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7808
           /* In-place conversion: the original text can go now that it
              has been copied.  Guarded by BUFFERP so that two Qnil
              objects never reach XBUFFER.  */
7809       if (BUFFERP (src_object) && EQ (src_object, dst_object))
7810         {
7811           set_buffer_internal (XBUFFER (src_object));
7812           saved_pt = PT, saved_pt_byte = PT_BYTE;
7813           del_range_both (from, from_byte, to, to_byte, 1);
7814           set_buffer_internal (XBUFFER (coding->src_object));
7815         }
7816
7817       {
7818         Lisp_Object args[3];
7819         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7820
7821         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7822                 old_deactivate_mark);
7823         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7824         args[1] = make_number (BEG);
7825         args[2] = make_number (Z);
7826         safe_call (3, args);
7827         UNGCPRO;
7828       }
           /* The function may have left another buffer current; that
              buffer becomes the source and is killed at the end.  */
7829       if (XBUFFER (coding->src_object) != current_buffer)
7830         kill_src_buffer = 1;
7831       coding->src_object = Fcurrent_buffer ();
7832       if (BEG != GPT)
7833         move_gap_both (BEG, BEG_BYTE);
7834       coding->src_chars = Z - BEG;
7835       coding->src_bytes = Z_BYTE - BEG_BYTE;
7836       coding->src_pos = BEG;
7837       coding->src_pos_byte = BEG_BYTE;
7838       coding->src_multibyte = Z < Z_BYTE;
7839     }
7840   else if (STRINGP (src_object))
7841     {
7842       code_conversion_save (0, 0);
7843       coding->src_pos = from;
7844       coding->src_pos_byte = from_byte;
7845     }
7846   else if (BUFFERP (src_object))
7847     {
7848       code_conversion_save (0, 0);
7849       set_buffer_internal (XBUFFER (src_object));
7850       if (EQ (src_object, dst_object))
7851         {
               /* In-place conversion: take the text out of the buffer and
                  encode from the object del_range_1 returns.  */
7852           saved_pt = PT, saved_pt_byte = PT_BYTE;
7853           coding->src_object = del_range_1 (from, to, 1, 1);
7854           coding->src_pos = 0;
7855           coding->src_pos_byte = 0;
7856         }
7857       else
7858         {
               /* Keep the source range on one side of the gap.  */
7859           if (from < GPT && to >= GPT)
7860             move_gap_both (from, from_byte);
7861           coding->src_pos = from;
7862           coding->src_pos_byte = from_byte;
7863         }
7864     }
7865   else
7866     code_conversion_save (0, 0);
7867
7868   if (BUFFERP (dst_object))
7869     {
7870       coding->dst_object = dst_object;
7871       if (EQ (src_object, dst_object))
7872         {
7873           coding->dst_pos = from;
7874           coding->dst_pos_byte = from_byte;
7875         }
7876       else
7877         {
7878           struct buffer *current = current_buffer;
7879
               /* Use point of the destination buffer, with the gap moved
                  there.  */
7880           set_buffer_temp (XBUFFER (dst_object));
7881           coding->dst_pos = PT;
7882           coding->dst_pos_byte = PT_BYTE;
7883           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7884           set_buffer_temp (current);
7885         }
7886       coding->dst_multibyte
7887         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7888     }
7889   else if (EQ (dst_object, Qt))
7890     {
           /* Encode into a memory area allocated here, sized initially
              from the number of source characters (at least one byte).  */
7891       coding->dst_object = Qnil;
7892       coding->dst_bytes = coding->src_chars;
7893       if (coding->dst_bytes == 0)
7894         coding->dst_bytes = 1;
7895       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7896       coding->dst_multibyte = 0;
7897     }
7898   else
7899     {
7900       coding->dst_object = Qnil;
7901       coding->dst_multibyte = 0;
7902     }
7903
7904   encode_coding (coding);
7905
7906   if (EQ (dst_object, Qt))
7907     {
7908       if (BUFFERP (coding->dst_object))
7909         coding->dst_object = Fbuffer_string ();
7910       else
7911         {
               /* Turn the memory area into the result string and release
                  it.  */
7912           coding->dst_object
7913             = make_unibyte_string ((char *) coding->destination,
7914                                    coding->produced);
7915           xfree (coding->destination);
7916         }
7917     }
7918
7919   if (saved_pt >= 0)
7920     {
7921       /* This is the case of:
7922          (BUFFERP (src_object) && EQ (src_object, dst_object))
7923          As we have moved PT while replacing the original buffer
7924          contents, we must recover it now.  */
7925       set_buffer_internal (XBUFFER (src_object));
           /* Point before the range is unchanged, point inside it goes to
              FROM, and point after it shifts by the change in size.  */
7926       if (saved_pt < from)
7927         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7928       else if (saved_pt < from + chars)
7929         TEMP_SET_PT_BOTH (from, from_byte);
7930       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7931         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7932                           saved_pt_byte + (coding->produced - bytes));
7933       else
7934         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7935                           saved_pt_byte + (coding->produced - bytes));
7936
7937       if (need_marker_adjustment)
7938         {
7939           struct Lisp_Marker *tail;
7940
               /* Put every flagged marker back: at FROM if it has the
                  insertion type set, otherwise just after the encoded
                  text.  */
7941           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7942             if (tail->need_adjustment)
7943               {
7944                 tail->need_adjustment = 0;
7945                 if (tail->insertion_type)
7946                   {
7947                     tail->bytepos = from_byte;
7948                     tail->charpos = from;
7949                   }
7950                 else
7951                   {
7952                     tail->bytepos = from_byte + coding->produced;
7953                     tail->charpos
7954                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7955                          ? tail->bytepos : from + coding->produced_char);
7956                   }
7957               }
7958         }
7959     }
7960
7961   if (kill_src_buffer)
7962     Fkill_buffer (coding->src_object);
7963
7964   Vdeactivate_mark = old_deactivate_mark;
7965   unbind_to (count, Qnil);
7966 }
7967
7968
7969 Lisp_Object
7970 preferred_coding_system (void)
7971 {
7972   int id = coding_categories[coding_priorities[0]].id;
7973
7974   return CODING_ID_NAME (id);
7975 }
7976
7977 \f
7978 #ifdef emacs
/*** 11. Emacs Lisp library functions ***/
7980
7981 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7982        doc: /* Return t if OBJECT is nil or a coding-system.
7983 See the documentation of `define-coding-system' for information
7984 about coding-system objects.  */)
7985   (Lisp_Object object)
7986 {
7987   if (NILP (object)
7988       || CODING_SYSTEM_ID (object) >= 0)
7989     return Qt;
7990   if (! SYMBOLP (object)
7991       || NILP (Fget (object, Qcoding_system_define_form)))
7992     return Qnil;
7993   return Qt;
7994 }
7995
7996 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7997        Sread_non_nil_coding_system, 1, 1, 0,
7998        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7999   (Lisp_Object prompt)
8000 {
8001   Lisp_Object val;
8002   do
8003     {
8004       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8005                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8006     }
8007   while (SCHARS (val) == 0);
8008   return (Fintern (val, Qnil));
8009 }
8010
8011 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8012        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8013 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8014 Ignores case when completing coding systems (all Emacs coding systems
8015 are lower-case).  */)
8016   (Lisp_Object prompt, Lisp_Object default_coding_system)
8017 {
8018   Lisp_Object val;
8019   int count = SPECPDL_INDEX ();
8020
8021   if (SYMBOLP (default_coding_system))
8022     default_coding_system = SYMBOL_NAME (default_coding_system);
8023   specbind (Qcompletion_ignore_case, Qt);
8024   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8025                           Qt, Qnil, Qcoding_system_history,
8026                           default_coding_system, Qnil);
8027   unbind_to (count, Qnil);
8028   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8029 }
8030
8031 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8032        1, 1, 0,
8033        doc: /* Check validity of CODING-SYSTEM.
8034 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8035 It is valid if it is nil or a symbol defined as a coding system by the
8036 function `define-coding-system'.  */)
8037   (Lisp_Object coding_system)
8038 {
8039   Lisp_Object define_form;
8040
8041   define_form = Fget (coding_system, Qcoding_system_define_form);
8042   if (! NILP (define_form))
8043     {
8044       Fput (coding_system, Qcoding_system_define_form, Qnil);
8045       safe_eval (define_form);
8046     }
8047   if (!NILP (Fcoding_system_p (coding_system)))
8048     return coding_system;
8049   xsignal1 (Qcoding_system_error, coding_system);
8050 }
8051
8052 \f
8053 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8054    HIGHEST is nonzero, return the coding system of the highest
8055    priority among the detected coding systems.  Otherwise return a
8056    list of detected coding systems sorted by their priorities.  If
8057    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8058    multibyte form but contains only ASCII and eight-bit chars.
8059    Otherwise, the bytes are raw bytes.
8060
8061    CODING-SYSTEM controls the detection as below:
8062
8063    If it is nil, detect both text-format and eol-format.  If the
8064    text-format part of CODING-SYSTEM is already specified
8065    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8066    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8067    detect only text-format.  */
8068
8069 Lisp_Object
8070 detect_coding_system (const unsigned char *src,
8071                       EMACS_INT src_chars, EMACS_INT src_bytes,
8072                       int highest, int multibytep,
8073                       Lisp_Object coding_system)
8074 {
8075   const unsigned char *src_end = src + src_bytes;
8076   Lisp_Object attrs, eol_type;
8077   Lisp_Object val = Qnil;
8078   struct coding_system coding;
8079   int id;
8080   struct coding_detection_info detect_info;
8081   enum coding_category base_category;
8082   int null_byte_found = 0, eight_bit_found = 0;
8083
8084   if (NILP (coding_system))
8085     coding_system = Qundecided;
8086   setup_coding_system (coding_system, &coding);
8087   attrs = CODING_ID_ATTRS (coding.id);
8088   eol_type = CODING_ID_EOL_TYPE (coding.id);
8089   coding_system = CODING_ATTR_BASE_NAME (attrs);
8090
8091   coding.source = src;
8092   coding.src_chars = src_chars;
8093   coding.src_bytes = src_bytes;
8094   coding.src_multibyte = multibytep;
8095   coding.consumed = 0;
8096   coding.mode |= CODING_MODE_LAST_BLOCK;
8097   coding.head_ascii = 0;
8098
8099   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8100
8101   /* At first, detect text-format if necessary.  */
8102   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8103   if (base_category == coding_category_undecided)
8104     {
8105       enum coding_category category IF_LINT (= 0);
8106       struct coding_system *this IF_LINT (= NULL);
8107       int c, i;
8108
8109       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8110       for (; src < src_end; src++)
8111         {
8112           c = *src;
8113           if (c & 0x80)
8114             {
8115               eight_bit_found = 1;
8116               if (null_byte_found)
8117                 break;
8118             }
8119           else if (c < 0x20)
8120             {
8121               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8122                   && ! inhibit_iso_escape_detection
8123                   && ! detect_info.checked)
8124                 {
8125                   if (detect_coding_iso_2022 (&coding, &detect_info))
8126                     {
8127                       /* We have scanned the whole data.  */
8128                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8129                         {
8130                           /* We didn't find an 8-bit code.  We may
8131                              have found a null-byte, but it's very
8132                              rare that a binary file confirm to
8133                              ISO-2022.  */
8134                           src = src_end;
8135                           coding.head_ascii = src - coding.source;
8136                         }
8137                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8138                       break;
8139                     }
8140                 }
8141               else if (! c && !inhibit_null_byte_detection)
8142                 {
8143                   null_byte_found = 1;
8144                   if (eight_bit_found)
8145                     break;
8146                 }
8147               if (! eight_bit_found)
8148                 coding.head_ascii++;
8149             }
8150           else if (! eight_bit_found)
8151             coding.head_ascii++;
8152         }
8153
8154       if (null_byte_found || eight_bit_found
8155           || coding.head_ascii < coding.src_bytes
8156           || detect_info.found)
8157         {
8158           if (coding.head_ascii == coding.src_bytes)
8159             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8160             for (i = 0; i < coding_category_raw_text; i++)
8161               {
8162                 category = coding_priorities[i];
8163                 this = coding_categories + category;
8164                 if (detect_info.found & (1 << category))
8165                   break;
8166               }
8167           else
8168             {
8169               if (null_byte_found)
8170                 {
8171                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8172                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8173                 }
8174               for (i = 0; i < coding_category_raw_text; i++)
8175                 {
8176                   category = coding_priorities[i];
8177                   this = coding_categories + category;
8178
8179                   if (this->id < 0)
8180                     {
8181                       /* No coding system of this category is defined.  */
8182                       detect_info.rejected |= (1 << category);
8183                     }
8184                   else if (category >= coding_category_raw_text)
8185                     continue;
8186                   else if (detect_info.checked & (1 << category))
8187                     {
8188                       if (highest
8189                           && (detect_info.found & (1 << category)))
8190                         break;
8191                     }
8192                   else if ((*(this->detector)) (&coding, &detect_info)
8193                            && highest
8194                            && (detect_info.found & (1 << category)))
8195                     {
8196                       if (category == coding_category_utf_16_auto)
8197                         {
8198                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8199                             category = coding_category_utf_16_le;
8200                           else
8201                             category = coding_category_utf_16_be;
8202                         }
8203                       break;
8204                     }
8205                 }
8206             }
8207         }
8208
8209       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8210           || null_byte_found)
8211         {
8212           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8213           id = CODING_SYSTEM_ID (Qno_conversion);
8214           val = Fcons (make_number (id), Qnil);
8215         }
8216       else if (! detect_info.rejected && ! detect_info.found)
8217         {
8218           detect_info.found = CATEGORY_MASK_ANY;
8219           id = coding_categories[coding_category_undecided].id;
8220           val = Fcons (make_number (id), Qnil);
8221         }
8222       else if (highest)
8223         {
8224           if (detect_info.found)
8225             {
8226               detect_info.found = 1 << category;
8227               val = Fcons (make_number (this->id), Qnil);
8228             }
8229           else
8230             for (i = 0; i < coding_category_raw_text; i++)
8231               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8232                 {
8233                   detect_info.found = 1 << coding_priorities[i];
8234                   id = coding_categories[coding_priorities[i]].id;
8235                   val = Fcons (make_number (id), Qnil);
8236                   break;
8237                 }
8238         }
8239       else
8240         {
8241           int mask = detect_info.rejected | detect_info.found;
8242           int found = 0;
8243
8244           for (i = coding_category_raw_text - 1; i >= 0; i--)
8245             {
8246               category = coding_priorities[i];
8247               if (! (mask & (1 << category)))
8248                 {
8249                   found |= 1 << category;
8250                   id = coding_categories[category].id;
8251                   if (id >= 0)
8252                     val = Fcons (make_number (id), val);
8253                 }
8254             }
8255           for (i = coding_category_raw_text - 1; i >= 0; i--)
8256             {
8257               category = coding_priorities[i];
8258               if (detect_info.found & (1 << category))
8259                 {
8260                   id = coding_categories[category].id;
8261                   val = Fcons (make_number (id), val);
8262                 }
8263             }
8264           detect_info.found |= found;
8265         }
8266     }
8267   else if (base_category == coding_category_utf_8_auto)
8268     {
8269       if (detect_coding_utf_8 (&coding, &detect_info))
8270         {
8271           struct coding_system *this;
8272
8273           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8274             this = coding_categories + coding_category_utf_8_sig;
8275           else
8276             this = coding_categories + coding_category_utf_8_nosig;
8277           val = Fcons (make_number (this->id), Qnil);
8278         }
8279     }
8280   else if (base_category == coding_category_utf_16_auto)
8281     {
8282       if (detect_coding_utf_16 (&coding, &detect_info))
8283         {
8284           struct coding_system *this;
8285
8286           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8287             this = coding_categories + coding_category_utf_16_le;
8288           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8289             this = coding_categories + coding_category_utf_16_be;
8290           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8291             this = coding_categories + coding_category_utf_16_be_nosig;
8292           else
8293             this = coding_categories + coding_category_utf_16_le_nosig;
8294           val = Fcons (make_number (this->id), Qnil);
8295         }
8296     }
8297   else
8298     {
8299       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8300       val = Fcons (make_number (coding.id), Qnil);
8301     }
8302
8303   /* Then, detect eol-format if necessary.  */
8304   {
8305     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8306     Lisp_Object tail;
8307
8308     if (VECTORP (eol_type))
8309       {
8310         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8311           {
8312             if (null_byte_found)
8313               normal_eol = EOL_SEEN_LF;
8314             else
8315               normal_eol = detect_eol (coding.source, src_bytes,
8316                                        coding_category_raw_text);
8317           }
8318         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8319                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8320           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8321                                       coding_category_utf_16_be);
8322         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8323                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8324           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8325                                       coding_category_utf_16_le);
8326       }
8327     else
8328       {
8329         if (EQ (eol_type, Qunix))
8330           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8331         else if (EQ (eol_type, Qdos))
8332           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8333         else
8334           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8335       }
8336
8337     for (tail = val; CONSP (tail); tail = XCDR (tail))
8338       {
8339         enum coding_category category;
8340         int this_eol;
8341
8342         id = XINT (XCAR (tail));
8343         attrs = CODING_ID_ATTRS (id);
8344         category = XINT (CODING_ATTR_CATEGORY (attrs));
8345         eol_type = CODING_ID_EOL_TYPE (id);
8346         if (VECTORP (eol_type))
8347           {
8348             if (category == coding_category_utf_16_be
8349                 || category == coding_category_utf_16_be_nosig)
8350               this_eol = utf_16_be_eol;
8351             else if (category == coding_category_utf_16_le
8352                      || category == coding_category_utf_16_le_nosig)
8353               this_eol = utf_16_le_eol;
8354             else
8355               this_eol = normal_eol;
8356
8357             if (this_eol == EOL_SEEN_LF)
8358               XSETCAR (tail, AREF (eol_type, 0));
8359             else if (this_eol == EOL_SEEN_CRLF)
8360               XSETCAR (tail, AREF (eol_type, 1));
8361             else if (this_eol == EOL_SEEN_CR)
8362               XSETCAR (tail, AREF (eol_type, 2));
8363             else
8364               XSETCAR (tail, CODING_ID_NAME (id));
8365           }
8366         else
8367           XSETCAR (tail, CODING_ID_NAME (id));
8368       }
8369   }
8370
8371   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8372 }
8373
8374
8375 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8376        2, 3, 0,
8377        doc: /* Detect coding system of the text in the region between START and END.
8378 Return a list of possible coding systems ordered by priority.
8379 The coding systems to try and their priorities follows what
8380 the function `coding-system-priority-list' (which see) returns.
8381
8382 If only ASCII characters are found (except for such ISO-2022 control
8383 characters as ESC), it returns a list of single element `undecided'
8384 or its subsidiary coding system according to a detected end-of-line
8385 format.
8386
8387 If optional argument HIGHEST is non-nil, return the coding system of
8388 highest priority.  */)
8389   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8390 {
8391   int from, to;
8392   int from_byte, to_byte;
8393
8394   CHECK_NUMBER_COERCE_MARKER (start);
8395   CHECK_NUMBER_COERCE_MARKER (end);
8396
8397   validate_region (&start, &end);
8398   from = XINT (start), to = XINT (end);
8399   from_byte = CHAR_TO_BYTE (from);
8400   to_byte = CHAR_TO_BYTE (to);
8401
8402   if (from < GPT && to >= GPT)
8403     move_gap_both (to, to_byte);
8404
8405   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8406                                to - from, to_byte - from_byte,
8407                                !NILP (highest),
8408                                !NILP (BVAR (current_buffer
8409                                       , enable_multibyte_characters)),
8410                                Qnil);
8411 }
8412
8413 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8414        1, 2, 0,
8415        doc: /* Detect coding system of the text in STRING.
8416 Return a list of possible coding systems ordered by priority.
8417 The coding systems to try and their priorities follows what
8418 the function `coding-system-priority-list' (which see) returns.
8419
8420 If only ASCII characters are found (except for such ISO-2022 control
8421 characters as ESC), it returns a list of single element `undecided'
8422 or its subsidiary coding system according to a detected end-of-line
8423 format.
8424
8425 If optional argument HIGHEST is non-nil, return the coding system of
8426 highest priority.  */)
8427   (Lisp_Object string, Lisp_Object highest)
8428 {
8429   CHECK_STRING (string);
8430
8431   return detect_coding_system (SDATA (string),
8432                                SCHARS (string), SBYTES (string),
8433                                !NILP (highest), STRING_MULTIBYTE (string),
8434                                Qnil);
8435 }
8436
8437
8438 static inline int
8439 char_encodable_p (int c, Lisp_Object attrs)
8440 {
8441   Lisp_Object tail;
8442   struct charset *charset;
8443   Lisp_Object translation_table;
8444
8445   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8446   if (! NILP (translation_table))
8447     c = translate_char (translation_table, c);
8448   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8449        CONSP (tail); tail = XCDR (tail))
8450     {
8451       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8452       if (CHAR_CHARSET_P (c, charset))
8453         break;
8454     }
8455   return (! NILP (tail));
8456 }
8457
8458
8459 /* Return a list of coding systems that safely encode the text between
8460    START and END.  If EXCLUDE is non-nil, it is a list of coding
8461    systems not to check.  The returned list doesn't contain any such
8462    coding systems.  In any case, if the text contains only ASCII or is
8463    unibyte, return t.  */
8464
8465 DEFUN ("find-coding-systems-region-internal",
8466        Ffind_coding_systems_region_internal,
8467        Sfind_coding_systems_region_internal, 2, 3, 0,
8468        doc: /* Internal use only.  */)
8469   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8470 {
8471   Lisp_Object coding_attrs_list, safe_codings;
8472   EMACS_INT start_byte, end_byte;
8473   const unsigned char *p, *pbeg, *pend;
8474   int c;
8475   Lisp_Object tail, elt, work_table;
8476
8477   if (STRINGP (start))
8478     {
8479       if (!STRING_MULTIBYTE (start)
8480           || SCHARS (start) == SBYTES (start))
8481         return Qt;
8482       start_byte = 0;
8483       end_byte = SBYTES (start);
8484     }
8485   else
8486     {
8487       CHECK_NUMBER_COERCE_MARKER (start);
8488       CHECK_NUMBER_COERCE_MARKER (end);
8489       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8490         args_out_of_range (start, end);
8491       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8492         return Qt;
8493       start_byte = CHAR_TO_BYTE (XINT (start));
8494       end_byte = CHAR_TO_BYTE (XINT (end));
8495       if (XINT (end) - XINT (start) == end_byte - start_byte)
8496         return Qt;
8497
8498       if (XINT (start) < GPT && XINT (end) > GPT)
8499         {
8500           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8501             move_gap_both (XINT (start), start_byte);
8502           else
8503             move_gap_both (XINT (end), end_byte);
8504         }
8505     }
8506
8507   coding_attrs_list = Qnil;
8508   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8509     if (NILP (exclude)
8510         || NILP (Fmemq (XCAR (tail), exclude)))
8511       {
8512         Lisp_Object attrs;
8513
8514         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8515         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8516             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8517           {
8518             ASET (attrs, coding_attr_trans_tbl,
8519                   get_translation_table (attrs, 1, NULL));
8520             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8521           }
8522       }
8523
8524   if (STRINGP (start))
8525     p = pbeg = SDATA (start);
8526   else
8527     p = pbeg = BYTE_POS_ADDR (start_byte);
8528   pend = p + (end_byte - start_byte);
8529
8530   while (p < pend && ASCII_BYTE_P (*p)) p++;
8531   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8532
8533   work_table = Fmake_char_table (Qnil, Qnil);
8534   while (p < pend)
8535     {
8536       if (ASCII_BYTE_P (*p))
8537         p++;
8538       else
8539         {
8540           c = STRING_CHAR_ADVANCE (p);
8541           if (!NILP (char_table_ref (work_table, c)))
8542             /* This character was already checked.  Ignore it.  */
8543             continue;
8544
8545           charset_map_loaded = 0;
8546           for (tail = coding_attrs_list; CONSP (tail);)
8547             {
8548               elt = XCAR (tail);
8549               if (NILP (elt))
8550                 tail = XCDR (tail);
8551               else if (char_encodable_p (c, elt))
8552                 tail = XCDR (tail);
8553               else if (CONSP (XCDR (tail)))
8554                 {
8555                   XSETCAR (tail, XCAR (XCDR (tail)));
8556                   XSETCDR (tail, XCDR (XCDR (tail)));
8557                 }
8558               else
8559                 {
8560                   XSETCAR (tail, Qnil);
8561                   tail = XCDR (tail);
8562                 }
8563             }
8564           if (charset_map_loaded)
8565             {
8566               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8567
8568               if (STRINGP (start))
8569                 pbeg = SDATA (start);
8570               else
8571                 pbeg = BYTE_POS_ADDR (start_byte);
8572               p = pbeg + p_offset;
8573               pend = pbeg + pend_offset;
8574             }
8575           char_table_set (work_table, c, Qt);
8576         }
8577     }
8578
8579   safe_codings = list2 (Qraw_text, Qno_conversion);
8580   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8581     if (! NILP (XCAR (tail)))
8582       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8583
8584   return safe_codings;
8585 }
8586
8587
8588 DEFUN ("unencodable-char-position", Funencodable_char_position,
8589        Sunencodable_char_position, 3, 5, 0,
8590        doc: /*
8591 Return position of first un-encodable character in a region.
8592 START and END specify the region and CODING-SYSTEM specifies the
8593 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8594
8595 If optional 4th argument COUNT is non-nil, it specifies at most how
8596 many un-encodable characters to search.  In this case, the value is a
8597 list of positions.
8598
8599 If optional 5th argument STRING is non-nil, it is a string to search
8600 for un-encodable characters.  In that case, START and END are indexes
8601 to the string.  */)
8602   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8603 {
8604   int n;
8605   struct coding_system coding;
8606   Lisp_Object attrs, charset_list, translation_table;
8607   Lisp_Object positions;
8608   int from, to;
8609   const unsigned char *p, *stop, *pend;
8610   int ascii_compatible;
8611
8612   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8613   attrs = CODING_ID_ATTRS (coding.id);
8614   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8615     return Qnil;
8616   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8617   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8618   translation_table = get_translation_table (attrs, 1, NULL);
8619
8620   if (NILP (string))
8621     {
8622       validate_region (&start, &end);
8623       from = XINT (start);
8624       to = XINT (end);
8625       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8626           || (ascii_compatible
8627               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8628         return Qnil;
8629       p = CHAR_POS_ADDR (from);
8630       pend = CHAR_POS_ADDR (to);
8631       if (from < GPT && to >= GPT)
8632         stop = GPT_ADDR;
8633       else
8634         stop = pend;
8635     }
8636   else
8637     {
8638       CHECK_STRING (string);
8639       CHECK_NATNUM (start);
8640       CHECK_NATNUM (end);
8641       from = XINT (start);
8642       to = XINT (end);
8643       if (from > to
8644           || to > SCHARS (string))
8645         args_out_of_range_3 (string, start, end);
8646       if (! STRING_MULTIBYTE (string))
8647         return Qnil;
8648       p = SDATA (string) + string_char_to_byte (string, from);
8649       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8650       if (ascii_compatible && (to - from) == (pend - p))
8651         return Qnil;
8652     }
8653
8654   if (NILP (count))
8655     n = 1;
8656   else
8657     {
8658       CHECK_NATNUM (count);
8659       n = XINT (count);
8660     }
8661
8662   positions = Qnil;
8663   while (1)
8664     {
8665       int c;
8666
8667       if (ascii_compatible)
8668         while (p < stop && ASCII_BYTE_P (*p))
8669           p++, from++;
8670       if (p >= stop)
8671         {
8672           if (p >= pend)
8673             break;
8674           stop = pend;
8675           p = GAP_END_ADDR;
8676         }
8677
8678       c = STRING_CHAR_ADVANCE (p);
8679       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8680           && ! char_charset (translate_char (translation_table, c),
8681                              charset_list, NULL))
8682         {
8683           positions = Fcons (make_number (from), positions);
8684           n--;
8685           if (n == 0)
8686             break;
8687         }
8688
8689       from++;
8690     }
8691
8692   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8693 }
8694
8695
8696 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8697        Scheck_coding_systems_region, 3, 3, 0,
8698        doc: /* Check if the region is encodable by coding systems.
8699
8700 START and END are buffer positions specifying the region.
8701 CODING-SYSTEM-LIST is a list of coding systems to check.
8702
8703 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8704 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8705 whole region, POS0, POS1, ... are buffer positions where non-encodable
8706 characters are found.
8707
8708 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8709 value is nil.
8710
8711 START may be a string.  In that case, check if the string is
8712 encodable, and the value contains indices to the string instead of
8713 buffer positions.  END is ignored.
8714
8715 If the current buffer (or START if it is a string) is unibyte, the value
8716 is nil.  */)
8717   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8718 {
8719   Lisp_Object list;
8720   EMACS_INT start_byte, end_byte;
8721   int pos;
8722   const unsigned char *p, *pbeg, *pend;
8723   int c;
8724   Lisp_Object tail, elt, attrs;
8725
8726   if (STRINGP (start))
8727     {
8728       if (!STRING_MULTIBYTE (start)
8729           || SCHARS (start) == SBYTES (start))
8730         return Qnil;
8731       start_byte = 0;
8732       end_byte = SBYTES (start);
8733       pos = 0;
8734     }
8735   else
8736     {
8737       CHECK_NUMBER_COERCE_MARKER (start);
8738       CHECK_NUMBER_COERCE_MARKER (end);
8739       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8740         args_out_of_range (start, end);
8741       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8742         return Qnil;
8743       start_byte = CHAR_TO_BYTE (XINT (start));
8744       end_byte = CHAR_TO_BYTE (XINT (end));
8745       if (XINT (end) - XINT (start) == end_byte - start_byte)
8746         return Qnil;
8747
8748       if (XINT (start) < GPT && XINT (end) > GPT)
8749         {
8750           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8751             move_gap_both (XINT (start), start_byte);
8752           else
8753             move_gap_both (XINT (end), end_byte);
8754         }
8755       pos = XINT (start);
8756     }
8757
8758   list = Qnil;
8759   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8760     {
8761       elt = XCAR (tail);
8762       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8763       ASET (attrs, coding_attr_trans_tbl,
8764             get_translation_table (attrs, 1, NULL));
8765       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8766     }
8767
8768   if (STRINGP (start))
8769     p = pbeg = SDATA (start);
8770   else
8771     p = pbeg = BYTE_POS_ADDR (start_byte);
8772   pend = p + (end_byte - start_byte);
8773
8774   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8775   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8776
8777   while (p < pend)
8778     {
8779       if (ASCII_BYTE_P (*p))
8780         p++;
8781       else
8782         {
8783           c = STRING_CHAR_ADVANCE (p);
8784
8785           charset_map_loaded = 0;
8786           for (tail = list; CONSP (tail); tail = XCDR (tail))
8787             {
8788               elt = XCDR (XCAR (tail));
8789               if (! char_encodable_p (c, XCAR (elt)))
8790                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8791             }
8792           if (charset_map_loaded)
8793             {
8794               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8795
8796               if (STRINGP (start))
8797                 pbeg = SDATA (start);
8798               else
8799                 pbeg = BYTE_POS_ADDR (start_byte);
8800               p = pbeg + p_offset;
8801               pend = pbeg + pend_offset;
8802             }
8803         }
8804       pos++;
8805     }
8806
8807   tail = list;
8808   list = Qnil;
8809   for (; CONSP (tail); tail = XCDR (tail))
8810     {
8811       elt = XCAR (tail);
8812       if (CONSP (XCDR (XCDR (elt))))
8813         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8814                       list);
8815     }
8816
8817   return list;
8818 }
8819
8820
     /* Encode (if ENCODEP is nonzero) or decode (if ENCODEP is zero) the
        text between START and END of the current buffer according to
        CODING_SYSTEM; a nil CODING_SYSTEM is treated as `no-conversion'.
        DST_OBJECT selects the destination: nil means the current buffer
        itself, t is handed to the conversion routine unchanged, and any
        other value must be a buffer.
        Unless NORECORD is nonzero, store the name of the coding system
        actually used in Vlast_coding_system_used.
        Return the number of produced characters if the destination is a
        buffer, otherwise the destination object of the conversion.  */
8821 static Lisp_Object
8822 code_convert_region (Lisp_Object start, Lisp_Object end,
8823                      Lisp_Object coding_system, Lisp_Object dst_object,
8824                      int encodep, int norecord)
8825 {
8826   struct coding_system coding;
8827   EMACS_INT from, from_byte, to, to_byte;
8828   Lisp_Object src_object;
8829
8830   CHECK_NUMBER_COERCE_MARKER (start);
8831   CHECK_NUMBER_COERCE_MARKER (end);
8832   if (NILP (coding_system))
8833     coding_system = Qno_conversion;
8834   else
8835     CHECK_CODING_SYSTEM (coding_system);
       /* The source is always the current buffer.  */
8836   src_object = Fcurrent_buffer ();
8837   if (NILP (dst_object))
8838     dst_object = src_object;
8839   else if (! EQ (dst_object, Qt))
8840     CHECK_BUFFER (dst_object);
8841
       /* validate_region receives START and END by address, so the
          positions are read only after it has run.  */
8842   validate_region (&start, &end);
8843   from = XFASTINT (start);
8844   from_byte = CHAR_TO_BYTE (from);
8845   to = XFASTINT (end);
8846   to_byte = CHAR_TO_BYTE (to);
8847
8848   setup_coding_system (coding_system, &coding);
       /* The whole region is converted in one call, so this is also the
          last block of the conversion.  */
8849   coding.mode |= CODING_MODE_LAST_BLOCK;
8850
8851   if (encodep)
8852     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8853                           dst_object);
8854   else
8855     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8856                           dst_object);
8857   if (! norecord)
8858     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8859
8860   return (BUFFERP (dst_object)
8861           ? make_number (coding.produced_char)
8862           : coding.dst_object);
8863 }
8864
8865
8866 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8867        3, 4, "r\nzCoding system: ",
8868        doc: /* Decode the current region from the specified coding system.
8869 When called from a program, takes four arguments:
8870         START, END, CODING-SYSTEM, and DESTINATION.
8871 START and END are buffer positions.
8872
8873 Optional 4th argument DESTINATION specifies where the decoded text goes.
8874 If nil, the region between START and END is replaced by the decoded text.
8875 If buffer, the decoded text is inserted in that buffer after point (point
8876 does not move).
8877 In those cases, the length of the decoded text is returned.
8878 If DESTINATION is t, the decoded text is returned.
8879
8880 This function sets `last-coding-system-used' to the precise coding system
8881 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8882 not fully specified.)  */)
8883   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8884 {
       /* encodep = 0 selects decoding; norecord = 0 lets the helper
          update Vlast_coding_system_used.  */
8885   return code_convert_region (start, end, coding_system, destination, 0, 0);
8886 }
8887
8888 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8889        3, 4, "r\nzCoding system: ",
8890        doc: /* Encode the current region by specified coding system.
8891 When called from a program, takes four arguments:
8892         START, END, CODING-SYSTEM and DESTINATION.
8893 START and END are buffer positions.
8894
8895 Optional 4th argument DESTINATION specifies where the encoded text goes.
8896 If nil, the region between START and END is replaced by the encoded text.
8897 If buffer, the encoded text is inserted in that buffer after point (point
8898 does not move).
8899 In those cases, the length of the encoded text is returned.
8900 If DESTINATION is t, the encoded text is returned.
8901
8902 This function sets `last-coding-system-used' to the precise coding system
8903 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8904 not fully specified.)  */)
8905   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8906 {
       /* encodep = 1 selects encoding; norecord = 0 lets the helper
          update Vlast_coding_system_used.  */
8907   return code_convert_region (start, end, coding_system, destination, 1, 0);
8908 }
8909
     /* Encode (if ENCODEP is nonzero) or decode (if ENCODEP is zero)
        STRING according to CODING_SYSTEM; a nil CODING_SYSTEM is treated
        as `no-conversion'.
        DST_OBJECT nil or t means the result is returned as a Lisp object;
        any other value must be a buffer, in which case the number of
        produced characters is returned.
        If both CODING_SYSTEM and DST_OBJECT are nil there is nothing to
        convert: return STRING itself when NOCOPY is nonzero, otherwise a
        fresh copy of it.
        Unless NORECORD is nonzero, store the name of the coding system
        actually used in Vlast_coding_system_used.  */
8910 Lisp_Object
8911 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8912                      Lisp_Object dst_object, int encodep, int nocopy, int norecord)
8913 {
8914   struct coding_system coding;
8915   EMACS_INT chars, bytes;
8916
8917   CHECK_STRING (string);
8918   if (NILP (coding_system))
8919     {
8920       if (! norecord)
8921         Vlast_coding_system_used = Qno_conversion;
8922       if (NILP (dst_object))
             /* NOCOPY means the caller accepts STRING itself as the
                result; without it the caller must get a distinct
                string.  */
8923         return (nocopy ? string : Fcopy_sequence (string));
8924     }
8925
8926   if (NILP (coding_system))
8927     coding_system = Qno_conversion;
8928   else
8929     CHECK_CODING_SYSTEM (coding_system);
8930   if (NILP (dst_object))
8931     dst_object = Qt;
8932   else if (! EQ (dst_object, Qt))
8933     CHECK_BUFFER (dst_object);
8934
8935   setup_coding_system (coding_system, &coding);
       /* The whole string is converted in one call, so this is also the
          last block of the conversion.  */
8936   coding.mode |= CODING_MODE_LAST_BLOCK;
8937   chars = SCHARS (string);
8938   bytes = SBYTES (string);
8939   if (encodep)
8940     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8941   else
8942     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8943   if (! norecord)
8944     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8945
8946   return (BUFFERP (dst_object)
8947           ? make_number (coding.produced_char)
8948           : coding.dst_object);
8949 }
8950
8951
8952 /* Encode or decode STRING according to CODING_SYSTEM.
        ENCODEP nonzero means encode, zero means decode.
8953    Do not set Vlast_coding_system_used.
8954
8955    This function is called only from macros DECODE_FILE and
8956    ENCODE_FILE, thus we ignore character composition.  */
8957
8958 Lisp_Object
8959 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
8960                               int encodep)
8961 {
       /* Pass t as the destination (not a buffer), nocopy = 0, and
          norecord = 1 so that Vlast_coding_system_used is left alone.  */
8962   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8963 }
8964
8965
8966 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8967        2, 4, 0,
8968        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8969
8970 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8971 if the decoding operation is trivial.
8972
8973 Optional fourth arg BUFFER non-nil means that the decoded text is
8974 inserted in that buffer after point (point does not move).  In this
8975 case, the return value is the length of the decoded text.
8976
8977 This function sets `last-coding-system-used' to the precise coding system
8978 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8979 not fully specified.)  */)
8980   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
8981 {
       /* BUFFER is the destination; the trailing arguments are
          encodep = 0 (decode), NOCOPY reduced to a C boolean, and
          norecord = 0.  */
8982   return code_convert_string (string, coding_system, buffer,
8983                               0, ! NILP (nocopy), 0);
8984 }
8985
8986 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8987        2, 4, 0,
8988        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8989
8990 Optional third arg NOCOPY non-nil means it is OK to return STRING
8991 itself if the encoding operation is trivial.
8992
8993 Optional fourth arg BUFFER non-nil means that the encoded text is
8994 inserted in that buffer after point (point does not move).  In this
8995 case, the return value is the length of the encoded text.
8996
8997 This function sets `last-coding-system-used' to the precise coding system
8998 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8999 not fully specified.)  */)
9000   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9001 {
       /* BUFFER is the destination; the trailing arguments are
          encodep = 1 (encode), NOCOPY reduced to a C boolean, and
          norecord = 0.  */
9002   return code_convert_string (string, coding_system, buffer,
9003                               1, ! NILP (nocopy), 0);
9004 }
9005
9006 \f
9007 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9008        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9009 Return the corresponding character.
     Signal an error if CODE is not a valid shift_jis code.  */)
9010   (Lisp_Object code)
9011 {
9012   Lisp_Object spec, attrs, val;
9013   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9014   EMACS_INT ch;
9015   int c;
9016
9017   CHECK_NATNUM (code);
9018   ch = XFASTINT (code);
9019   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9020   attrs = AREF (spec, 0);
9021
       /* An ASCII byte decodes to itself when the coding system is
          ASCII compatible.  */
9022   if (ASCII_BYTE_P (ch)
9023       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9024     return code;
9025
       /* The charset list is read in the order roman, kana, kanji.  */
9026   val = CODING_ATTR_CHARSET_LIST (attrs);
9027   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9028   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9029   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9030
9031   if (ch <= 0x7F)
9032     {
9033       c = ch;
9034       charset = charset_roman;
9035     }
       /* Single-byte kana.  The upper bound is inclusive: 0xDF is the
          last half-width katakana byte.  0xA0 maps to 0x20, which is
          left to DECODE_CHAR below to reject.  */
9036   else if (ch >= 0xA0 && ch <= 0xDF)
9037     {
9038       c = ch - 0x80;
9039       charset = charset_kana;
9040     }
9041   else
9042     {
           /* Two-byte code: C1 is the lead byte, C2 the trail byte.  */
9043       EMACS_INT c1 = ch >> 8;
9044       int c2 = ch & 0xFF;
9045
9046       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9047           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9048         error ("Invalid code: %"pI"d", ch);
9049       c = ch;
9050       SJIS_TO_JIS (c);
9051       charset = charset_kanji;
9052     }
9053   c = DECODE_CHAR (charset, c);
9054   if (c < 0)
9055     error ("Invalid code: %"pI"d", ch);
9056   return make_number (c);
9057 }
9058
9059
9060 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9061        doc: /* Encode a Japanese character CH to shift_jis encoding.
9062 Return the corresponding code in SJIS.  */)
9063   (Lisp_Object ch)
9064 {
9065   Lisp_Object spec, attrs, charset_list;
9066   int c;
9067   struct charset *charset;
9068   unsigned code;
9069
9070   CHECK_CHARACTER (ch);
9071   c = XFASTINT (ch);
9072   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9073   attrs = AREF (spec, 0);
9074
       /* An ASCII character encodes to itself when the coding system is
          ASCII compatible.  */
9075   if (ASCII_CHAR_P (c)
9076       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9077     return ch;
9078
       /* char_charset stores C's code in CODE.  NOTE(review): an invalid
          code presumably means that none of the listed charsets contains
          C -- confirm against char_charset.  */
9079   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9080   charset = char_charset (c, charset_list, &code);
9081   if (code == CHARSET_INVALID_CODE (charset))
9082     error ("Can't encode by shift_jis encoding: %c", c);
       /* JIS_TO_SJIS rewrites CODE in place.  */
9083   JIS_TO_SJIS (code);
9084
9085   return make_number (code);
9086 }
9087
9088 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9089        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9090 Return the corresponding character.
     Signal an error if CODE is not a valid Big5 code.  */)
9091   (Lisp_Object code)
9092 {
9093   Lisp_Object spec, attrs, val;
9094   struct charset *charset_roman, *charset_big5, *charset;
9095   EMACS_INT ch;
9096   int c;
9097
9098   CHECK_NATNUM (code);
9099   ch = XFASTINT (code);
9100   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9101   attrs = AREF (spec, 0);
9102
       /* An ASCII byte decodes to itself when the coding system is
          ASCII compatible.  */
9103   if (ASCII_BYTE_P (ch)
9104       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9105     return code;
9106
       /* The charset list is read in the order roman, big5.  */
9107   val = CODING_ATTR_CHARSET_LIST (attrs);
9108   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9109   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9110
9111   if (ch <= 0x7F)
9112     {
9113       c = ch;
9114       charset = charset_roman;
9115     }
9116   else
9117     {
           /* Two-byte code: B1 is the lead byte, B2 the trail byte.  B2
              must keep all eight bits, because the valid trail ranges
              are 0x40..0x7E and 0xA1..0xFE.  */
9118       EMACS_INT b1 = ch >> 8;
9119       int b2 = ch & 0xFF;
9120       if (b1 < 0xA1 || b1 > 0xFE
9121           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9122         error ("Invalid code: %"pI"d", ch);
9123       c = ch;
9124       charset = charset_big5;
9125     }
9126   c = DECODE_CHAR (charset, c);
9127   if (c < 0)
9128     error ("Invalid code: %"pI"d", ch);
9129   return make_number (c);
9130 }
9131
9132 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9133        doc: /* Encode the Big5 character CH to BIG5 coding system.
9134 Return the corresponding character code in Big5.  */)
9135   (Lisp_Object ch)
9136 {
9137   Lisp_Object spec, attrs, charset_list;
9138   struct charset *charset;
9139   int c;
9140   unsigned code;
9141
9142   CHECK_CHARACTER (ch);
9143   c = XFASTINT (ch);
9144   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9145   attrs = AREF (spec, 0);
       /* An ASCII character encodes to itself when the coding system is
          ASCII compatible.  */
9146   if (ASCII_CHAR_P (c)
9147       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9148     return ch;
9149
       /* char_charset stores C's code in CODE.  NOTE(review): an invalid
          code presumably means that none of the listed charsets contains
          C -- confirm against char_charset.  */
9150   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9151   charset = char_charset (c, charset_list, &code);
9152   if (code == CHARSET_INVALID_CODE (charset))
9153     error ("Can't encode by Big5 encoding: %c", c);
9154
9155   return make_number (code);
9156 }
9157
9158 \f
9159 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9160        Sset_terminal_coding_system_internal, 1, 2, 0,
9161        doc: /* Internal use only.  */)
9162   (Lisp_Object coding_system, Lisp_Object terminal)
9163 {
9164   struct terminal *term = get_terminal (terminal, 1);
9165   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9166   CHECK_SYMBOL (coding_system);
       /* Set up the terminal's own coding object from CODING_SYSTEM.  */
9167   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9168   /* We had better not send unsafe characters to terminal.  */
9169   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9170   /* Character composition should be disabled.  */
9171   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
       /* Mark the source as multibyte and the destination as unibyte.  */
9172   terminal_coding->src_multibyte = 1;
9173   terminal_coding->dst_multibyte = 0;
       /* When the coding system requires encoding, record its charsets
          for the terminal; otherwise the terminal gets ASCII only.  */
9174   if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9175     term->charset_list = coding_charset_list (terminal_coding);
9176   else
9177     term->charset_list = Fcons (make_number (charset_ascii), Qnil);
9178   return Qnil;
9179 }
9180
9181 DEFUN ("set-safe-terminal-coding-system-internal",
9182        Fset_safe_terminal_coding_system_internal,
9183        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9184        doc: /* Internal use only.  */)
9185   (Lisp_Object coding_system)
9186 {
9187   CHECK_SYMBOL (coding_system);
       /* Set up safe_terminal_coding, which is defined outside this
          function, from CODING_SYSTEM.  */
9188   setup_coding_system (Fcheck_coding_system (coding_system),
9189                        &safe_terminal_coding);
9190   /* Character composition should be disabled.  */
9191   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
       /* Mark the source as multibyte and the destination as unibyte.  */
9192   safe_terminal_coding.src_multibyte = 1;
9193   safe_terminal_coding.dst_multibyte = 0;
9194   return Qnil;
9195 }
9196
9197 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9198        Sterminal_coding_system, 0, 1, 0,
9199        doc: /* Return coding system specified for terminal output on the given terminal.
9200 TERMINAL may be a terminal object, a frame, or nil for the selected
9201 frame's terminal device.  */)
9202   (Lisp_Object terminal)
9203 {
       /* Fetch the terminal's output coding object and take the name
          recorded for its coding system id.  */
9204   struct coding_system *terminal_coding
9205     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9206   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9207
9208   /* For backward compatibility, return nil if it is `undecided'. */
9209   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9210 }
9211
9212 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9213        Sset_keyboard_coding_system_internal, 1, 2, 0,
9214        doc: /* Internal use only.  */)
9215   (Lisp_Object coding_system, Lisp_Object terminal)
9216 {
9217   struct terminal *t = get_terminal (terminal, 1);
9218   CHECK_SYMBOL (coding_system);
       /* A nil CODING_SYSTEM is treated as `no-conversion'; any other
          symbol is checked by Fcheck_coding_system.  */
9219   if (NILP (coding_system))
9220     coding_system = Qno_conversion;
9221   else
9222     Fcheck_coding_system (coding_system);
9223   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9224   /* Character composition should be disabled.  */
9225   TERMINAL_KEYBOARD_CODING (t)->common_flags
9226     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9227   return Qnil;
9228 }
9229
9230 DEFUN ("keyboard-coding-system",
9231        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9232        doc: /* Return coding system specified for decoding keyboard input.
     TERMINAL may be a terminal object, a frame, or nil for the selected
     frame's terminal device.  */)
9233   (Lisp_Object terminal)
9234 {
       /* Return the name recorded for the id of the terminal's keyboard
          coding object.  */
9235   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9236                          (get_terminal (terminal, 1))->id);
9237 }
9238
9239 \f
9240 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9241        Sfind_operation_coding_system,  1, MANY, 0,
9242        doc: /* Choose a coding system for an operation based on the target name.
9243 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9244 DECODING-SYSTEM is the coding system to use for decoding
9245 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9246 for encoding (in case OPERATION does encoding).
9247
9248 The first argument OPERATION specifies an I/O primitive:
9249   For file I/O, `insert-file-contents' or `write-region'.
9250   For process I/O, `call-process', `call-process-region', or `start-process'.
9251   For network I/O, `open-network-stream'.
9252
9253 The remaining arguments should be the same arguments that were passed
9254 to the primitive.  Depending on which primitive, one of those arguments
9255 is selected as the TARGET.  For example, if OPERATION does file I/O,
9256 whichever argument specifies the file name is TARGET.
9257
9258 TARGET has a meaning which depends on OPERATION:
9259   For file I/O, TARGET is a file name (except for the special case below).
9260   For process I/O, TARGET is a process name.
9261   For network I/O, TARGET is a service name or a port number.
9262
9263 This function looks up what is specified for TARGET in
9264 `file-coding-system-alist', `process-coding-system-alist',
9265 or `network-coding-system-alist' depending on OPERATION.
9266 They may specify a coding system, a cons of coding systems,
9267 or a function symbol to call.
9268 In the last case, we call the function with one argument,
9269 which is a list of all the arguments given to this function.
9270 If the function can't decide a coding system, it can return
9271 `undecided' so that the normal code-detection is performed.
9272
9273 If OPERATION is `insert-file-contents', the argument corresponding to
9274 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9275 file name to look up, and BUFFER is a buffer that contains the file's
9276 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9277 function to call for FILENAME, that function should examine the
9278 contents of BUFFER instead of reading the file.
9279
9280 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9281   (ptrdiff_t nargs, Lisp_Object *args)
9282 {
9283   Lisp_Object operation, target_idx, target, val;
9284   register Lisp_Object chain;
9285
9286   if (nargs < 2)
9287     error ("Too few arguments");
9288   operation = args[0];
       /* The index of the target argument, counted among the arguments
          that follow OPERATION, is stored on OPERATION's property list
          under Qtarget_idx.  */
9289   if (!SYMBOLP (operation)
9290       || !NATNUMP (target_idx = Fget (operation, Qtarget_idx)))
9291     error ("Invalid first argument");
       /* TARGET is args[target_idx + 1], so at least target_idx + 2
          arguments must be present; anything less would read past the
          end of ARGS.  */
9292   if (nargs <= 1 + XFASTINT (target_idx))
9293     error ("Too few arguments for operation `%s'",
9294            SDATA (SYMBOL_NAME (operation)));
9295   target = args[XFASTINT (target_idx) + 1];
9296   if (!(STRINGP (target)
9297         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9298             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9299         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9300     error ("Invalid argument %"pI"d of operation `%s'",
9301            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
       /* For a (FILENAME . BUFFER) target only FILENAME is used for the
          alist lookup.  */
9302   if (CONSP (target))
9303     target = XCAR (target);
9304
       /* Choose the alist to search: file operations use the file alist,
          open-network-stream the network alist, and everything else the
          process alist.  */
9305   chain = ((EQ (operation, Qinsert_file_contents)
9306             || EQ (operation, Qwrite_region))
9307            ? Vfile_coding_system_alist
9308            : (EQ (operation, Qopen_network_stream)
9309               ? Vnetwork_coding_system_alist
9310               : Vprocess_coding_system_alist));
9311   if (NILP (chain))
9312     return Qnil;
9313
       /* The first alist element whose key matches TARGET decides the
          result: a string key is matched as a regexp against a string
          target, an integer key is compared with EQ.  */
9314   for (; CONSP (chain); chain = XCDR (chain))
9315     {
9316       Lisp_Object elt;
9317
9318       elt = XCAR (chain);
9319       if (CONSP (elt)
9320           && ((STRINGP (target)
9321                && STRINGP (XCAR (elt))
9322                && fast_string_match (XCAR (elt), target) >= 0)
9323               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9324         {
9325           val = XCDR (elt);
9326           /* Here, if VAL is both a valid coding system and a valid
9327              function symbol, we return VAL as a coding system.  */
9328           if (CONSP (val))
9329             return val;
9330           if (! SYMBOLP (val))
9331             return Qnil;
9332           if (! NILP (Fcoding_system_p (val)))
9333             return Fcons (val, val);
9334           if (! NILP (Ffboundp (val)))
9335             {
9336               /* We use call1 rather than safe_call1
9337                  so as to get bug reports about functions called here
9338                  which don't handle the current interface.  */
9339               val = call1 (val, Flist (nargs, args));
9340               if (CONSP (val))
9341                 return val;
9342               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9343                 return Fcons (val, val);
9344             }
9345           return Qnil;
9346         }
9347     }
9348   return Qnil;
9349 }
9350
9351 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9352        Sset_coding_system_priority, 0, MANY, 0,
9353        doc: /* Assign higher priority to the coding systems given as arguments.
9354 If multiple coding systems belong to the same category,
9355 all but the first one are ignored.
9356
9357 usage: (set-coding-system-priority &rest coding-systems)  */)
9358   (ptrdiff_t nargs, Lisp_Object *args)
9359 {
9360   ptrdiff_t i, j;
       /* changed[C] is nonzero once category C has been given a new
          priority by one of the arguments.  */
9361   int changed[coding_category_max];
       /* The new priority order, built up before it replaces
          coding_priorities.  */
9362   enum coding_category priorities[coding_category_max];
9363
9364   memset (changed, 0, sizeof changed);
9365
       /* I walks the arguments; J counts the categories placed so far
          at the front of PRIORITIES.  */
9366   for (i = j = 0; i < nargs; i++)
9367     {
9368       enum coding_category category;
9369       Lisp_Object spec, attrs;
9370
9371       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9372       attrs = AREF (spec, 0);
9373       category = XINT (CODING_ATTR_CATEGORY (attrs));
9374       if (changed[category])
9375         /* Ignore this coding system because a coding system of the
9376            same category already had a higher priority.  */
9377         continue;
9378       changed[category] = 1;
9379       priorities[j++] = category;
           /* If the category already has a coding system set up and it
              is not this one, set the category up again from this
              argument.  */
9380       if (coding_categories[category].id >= 0
9381           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9382         setup_coding_system (args[i], &coding_categories[category]);
           /* Set the symbol stored for this category in
              Vcoding_category_table to the coding system.  */
9383       Fset (AREF (Vcoding_category_table, category), args[i]);
9384     }
9385
9386   /* Now we have decided top J priorities.  Reflect the order of the
9387      original priorities to the remaining priorities.  */
9388
       /* From here on I indexes the next free slot of PRIORITIES, while
          J restarts at 0 and scans the old coding_priorities, skipping
          the categories that were placed above.  */
9389   for (i = j, j = 0; i < coding_category_max; i++, j++)
9390     {
9391       while (j < coding_category_max
9392              && changed[coding_priorities[j]])
9393         j++;
           /* Running out of old entries while slots remain is treated
              as a fatal inconsistency.  */
9394       if (j == coding_category_max)
9395         abort ();
9396       priorities[i] = coding_priorities[j];
9397     }
9398
9399   memcpy (coding_priorities, priorities, sizeof priorities);
9400
9401   /* Update `coding-category-list'.  */
9402   Vcoding_category_list = Qnil;
       /* Cons from the lowest priority upwards so that the finished
          list starts with the highest priority category.  */
9403   for (i = coding_category_max; i-- > 0; )
9404     Vcoding_category_list
9405       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9406                Vcoding_category_list);
9407
9408   return Qnil;
9409 }
9410
9411 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9412        Scoding_system_priority_list, 0, 1, 0,
9413        doc: /* Return a list of coding systems ordered by their priorities.
9414 The list contains a subset of coding systems; i.e. coding systems
9415 assigned to each coding category (see `coding-category-list').
9416
9417 HIGHESTP non-nil means just return the highest priority one.  */)
9418   (Lisp_Object highestp)
9419 {
9420   int i;
9421   Lisp_Object val;
9422
       /* Walk the categories in priority order.  */
9423   for (i = 0, val = Qnil; i < coding_category_max; i++)
9424     {
9425       enum coding_category category = coding_priorities[i];
9426       int id = coding_categories[category].id;
9427       Lisp_Object attrs;
9428
           /* A negative id means the category has no coding system set
              up; skip it.  */
9429       if (id < 0)
9430         continue;
9431       attrs = CODING_ID_ATTRS (id);
           /* With HIGHESTP, the first usable category is the answer and
              a single symbol is returned instead of a list.  */
9432       if (! NILP (highestp))
9433         return CODING_ATTR_BASE_NAME (attrs);
9434       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9435     }
       /* VAL was built by consing at the front, so reverse it to put
          the highest priority first.  */
9436   return Fnreverse (val);
9437 }
9438
     /* End-of-line suffixes appended to a base coding system name, in
        the order used for the elements of the vector returned by
        make_subsidiaries.  */
9439 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9440
     /* Return a vector of three symbols made by appending each element
        of `suffixes' to the name of the symbol BASE and interning the
        result.  */
9441 static Lisp_Object
9442 make_subsidiaries (Lisp_Object base)
9443 {
9444   Lisp_Object subsidiaries;
9445   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
       /* Six extra bytes hold the longest suffix ("-unix", five bytes)
          plus the terminating null byte.  */
9446   char *buf = (char *) alloca (base_name_len + 6);
9447   int i;
9448
       /* Copy the base name once; each iteration overwrites only the
          suffix part that follows it.  */
9449   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9450   subsidiaries = Fmake_vector (make_number (3), Qnil);
9451   for (i = 0; i < 3; i++)
9452     {
9453       strcpy (buf + base_name_len, suffixes[i]);
9454       ASET (subsidiaries, i, intern (buf));
9455     }
9456   return subsidiaries;
9457 }
9458
9459
9460 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9461        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9462        doc: /* For internal use only.
9463 usage: (define-coding-system-internal ...)  */)
9464   (ptrdiff_t nargs, Lisp_Object *args)
9465 {
9466   Lisp_Object name;
9467   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9468   Lisp_Object attrs;            /* Vector of attributes.  */
9469   Lisp_Object eol_type;
9470   Lisp_Object aliases;
9471   Lisp_Object coding_type, charset_list, safe_charsets;
9472   enum coding_category category;
9473   Lisp_Object tail, val;
9474   int max_charset_id = 0;
9475   int i;
9476
9477   if (nargs < coding_arg_max)
9478     goto short_args;
9479
9480   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9481
9482   name = args[coding_arg_name];
9483   CHECK_SYMBOL (name);
9484   CODING_ATTR_BASE_NAME (attrs) = name;
9485
9486   val = args[coding_arg_mnemonic];
9487   if (! STRINGP (val))
9488     CHECK_CHARACTER (val);
9489   CODING_ATTR_MNEMONIC (attrs) = val;
9490
9491   coding_type = args[coding_arg_coding_type];
9492   CHECK_SYMBOL (coding_type);
9493   CODING_ATTR_TYPE (attrs) = coding_type;
9494
9495   charset_list = args[coding_arg_charset_list];
9496   if (SYMBOLP (charset_list))
9497     {
9498       if (EQ (charset_list, Qiso_2022))
9499         {
9500           if (! EQ (coding_type, Qiso_2022))
9501             error ("Invalid charset-list");
9502           charset_list = Viso_2022_charset_list;
9503         }
9504       else if (EQ (charset_list, Qemacs_mule))
9505         {
9506           if (! EQ (coding_type, Qemacs_mule))
9507             error ("Invalid charset-list");
9508           charset_list = Vemacs_mule_charset_list;
9509         }
9510       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9511         if (max_charset_id < XFASTINT (XCAR (tail)))
9512           max_charset_id = XFASTINT (XCAR (tail));
9513     }
9514   else
9515     {
9516       charset_list = Fcopy_sequence (charset_list);
9517       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9518         {
9519           struct charset *charset;
9520
9521           val = XCAR (tail);
9522           CHECK_CHARSET_GET_CHARSET (val, charset);
9523           if (EQ (coding_type, Qiso_2022)
9524               ? CHARSET_ISO_FINAL (charset) < 0
9525               : EQ (coding_type, Qemacs_mule)
9526               ? CHARSET_EMACS_MULE_ID (charset) < 0
9527               : 0)
9528             error ("Can't handle charset `%s'",
9529                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9530
9531           XSETCAR (tail, make_number (charset->id));
9532           if (max_charset_id < charset->id)
9533             max_charset_id = charset->id;
9534         }
9535     }
9536   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9537
9538   safe_charsets = make_uninit_string (max_charset_id + 1);
9539   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9540   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9541     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9542   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9543
9544   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9545
9546   val = args[coding_arg_decode_translation_table];
9547   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9548     CHECK_SYMBOL (val);
9549   CODING_ATTR_DECODE_TBL (attrs) = val;
9550
9551   val = args[coding_arg_encode_translation_table];
9552   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9553     CHECK_SYMBOL (val);
9554   CODING_ATTR_ENCODE_TBL (attrs) = val;
9555
9556   val = args[coding_arg_post_read_conversion];
9557   CHECK_SYMBOL (val);
9558   CODING_ATTR_POST_READ (attrs) = val;
9559
9560   val = args[coding_arg_pre_write_conversion];
9561   CHECK_SYMBOL (val);
9562   CODING_ATTR_PRE_WRITE (attrs) = val;
9563
9564   val = args[coding_arg_default_char];
9565   if (NILP (val))
9566     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9567   else
9568     {
9569       CHECK_CHARACTER (val);
9570       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9571     }
9572
9573   val = args[coding_arg_for_unibyte];
9574   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9575
9576   val = args[coding_arg_plist];
9577   CHECK_LIST (val);
9578   CODING_ATTR_PLIST (attrs) = val;
9579
9580   if (EQ (coding_type, Qcharset))
9581     {
9582       /* Generate a lisp vector of 256 elements.  Each element is nil,
9583          integer, or a list of charset IDs.
9584
9585          If Nth element is nil, the byte code N is invalid in this
9586          coding system.
9587
9588          If Nth element is a number NUM, N is the first byte of a
9589          charset whose ID is NUM.
9590
9591          If Nth element is a list of charset IDs, N is the first byte
9592          of one of them.  The list is sorted by dimensions of the
9593          charsets.  A charset of smaller dimension comes first. */
9594       val = Fmake_vector (make_number (256), Qnil);
9595
9596       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9597         {
9598           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9599           int dim = CHARSET_DIMENSION (charset);
9600           int idx = (dim - 1) * 4;
9601
9602           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9603             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9604
9605           for (i = charset->code_space[idx];
9606                i <= charset->code_space[idx + 1]; i++)
9607             {
9608               Lisp_Object tmp, tmp2;
9609               int dim2;
9610
9611               tmp = AREF (val, i);
9612               if (NILP (tmp))
9613                 tmp = XCAR (tail);
9614               else if (NUMBERP (tmp))
9615                 {
9616                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9617                   if (dim < dim2)
9618                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9619                   else
9620                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9621                 }
9622               else
9623                 {
9624                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9625                     {
9626                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9627                       if (dim < dim2)
9628                         break;
9629                     }
9630                   if (NILP (tmp2))
9631                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9632                   else
9633                     {
9634                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9635                       XSETCAR (tmp2, XCAR (tail));
9636                     }
9637                 }
9638               ASET (val, i, tmp);
9639             }
9640         }
9641       ASET (attrs, coding_attr_charset_valids, val);
9642       category = coding_category_charset;
9643     }
9644   else if (EQ (coding_type, Qccl))
9645     {
9646       Lisp_Object valids;
9647
9648       if (nargs < coding_arg_ccl_max)
9649         goto short_args;
9650
9651       val = args[coding_arg_ccl_decoder];
9652       CHECK_CCL_PROGRAM (val);
9653       if (VECTORP (val))
9654         val = Fcopy_sequence (val);
9655       ASET (attrs, coding_attr_ccl_decoder, val);
9656
9657       val = args[coding_arg_ccl_encoder];
9658       CHECK_CCL_PROGRAM (val);
9659       if (VECTORP (val))
9660         val = Fcopy_sequence (val);
9661       ASET (attrs, coding_attr_ccl_encoder, val);
9662
9663       val = args[coding_arg_ccl_valids];
9664       valids = Fmake_string (make_number (256), make_number (0));
9665       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9666         {
9667           int from, to;
9668
9669           val = Fcar (tail);
9670           if (INTEGERP (val))
9671             {
9672               from = to = XINT (val);
9673               if (from < 0 || from > 255)
9674                 args_out_of_range_3 (val, make_number (0), make_number (255));
9675             }
9676           else
9677             {
9678               CHECK_CONS (val);
9679               CHECK_NATNUM_CAR (val);
9680               CHECK_NATNUM_CDR (val);
9681               from = XINT (XCAR (val));
9682               if (from > 255)
9683                 args_out_of_range_3 (XCAR (val),
9684                                      make_number (0), make_number (255));
9685               to = XINT (XCDR (val));
9686               if (to < from || to > 255)
9687                 args_out_of_range_3 (XCDR (val),
9688                                      XCAR (val), make_number (255));
9689             }
9690           for (i = from; i <= to; i++)
9691             SSET (valids, i, 1);
9692         }
9693       ASET (attrs, coding_attr_ccl_valids, valids);
9694
9695       category = coding_category_ccl;
9696     }
9697   else if (EQ (coding_type, Qutf_16))
9698     {
9699       Lisp_Object bom, endian;
9700
9701       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9702
9703       if (nargs < coding_arg_utf16_max)
9704         goto short_args;
9705
9706       bom = args[coding_arg_utf16_bom];
9707       if (! NILP (bom) && ! EQ (bom, Qt))
9708         {
9709           CHECK_CONS (bom);
9710           val = XCAR (bom);
9711           CHECK_CODING_SYSTEM (val);
9712           val = XCDR (bom);
9713           CHECK_CODING_SYSTEM (val);
9714         }
9715       ASET (attrs, coding_attr_utf_bom, bom);
9716
9717       endian = args[coding_arg_utf16_endian];
9718       CHECK_SYMBOL (endian);
9719       if (NILP (endian))
9720         endian = Qbig;
9721       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9722         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9723       ASET (attrs, coding_attr_utf_16_endian, endian);
9724
9725       category = (CONSP (bom)
9726                   ? coding_category_utf_16_auto
9727                   : NILP (bom)
9728                   ? (EQ (endian, Qbig)
9729                      ? coding_category_utf_16_be_nosig
9730                      : coding_category_utf_16_le_nosig)
9731                   : (EQ (endian, Qbig)
9732                      ? coding_category_utf_16_be
9733                      : coding_category_utf_16_le));
9734     }
9735   else if (EQ (coding_type, Qiso_2022))
9736     {
9737       Lisp_Object initial, reg_usage, request, flags;
9738
9739       if (nargs < coding_arg_iso2022_max)
9740         goto short_args;
9741
9742       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9743       CHECK_VECTOR (initial);
9744       for (i = 0; i < 4; i++)
9745         {
9746           val = Faref (initial, make_number (i));
9747           if (! NILP (val))
9748             {
9749               struct charset *charset;
9750
9751               CHECK_CHARSET_GET_CHARSET (val, charset);
9752               ASET (initial, i, make_number (CHARSET_ID (charset)));
9753               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9754                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9755             }
9756           else
9757             ASET (initial, i, make_number (-1));
9758         }
9759
9760       reg_usage = args[coding_arg_iso2022_reg_usage];
9761       CHECK_CONS (reg_usage);
9762       CHECK_NUMBER_CAR (reg_usage);
9763       CHECK_NUMBER_CDR (reg_usage);
9764
9765       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9766       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9767         {
9768           int id;
9769           Lisp_Object tmp1;
9770
9771           val = Fcar (tail);
9772           CHECK_CONS (val);
9773           tmp1 = XCAR (val);
9774           CHECK_CHARSET_GET_ID (tmp1, id);
9775           CHECK_NATNUM_CDR (val);
9776           if (XINT (XCDR (val)) >= 4)
9777             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9778           XSETCAR (val, make_number (id));
9779         }
9780
9781       flags = args[coding_arg_iso2022_flags];
9782       CHECK_NATNUM (flags);
9783       i = XINT (flags);
9784       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9785         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9786
9787       ASET (attrs, coding_attr_iso_initial, initial);
9788       ASET (attrs, coding_attr_iso_usage, reg_usage);
9789       ASET (attrs, coding_attr_iso_request, request);
9790       ASET (attrs, coding_attr_iso_flags, flags);
9791       setup_iso_safe_charsets (attrs);
9792
9793       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9794         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9795                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9796                     ? coding_category_iso_7_else
9797                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9798                     ? coding_category_iso_7
9799                     : coding_category_iso_7_tight);
9800       else
9801         {
9802           int id = XINT (AREF (initial, 1));
9803
9804           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9805                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9806                        || id < 0)
9807                       ? coding_category_iso_8_else
9808                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9809                       ? coding_category_iso_8_1
9810                       : coding_category_iso_8_2);
9811         }
9812       if (category != coding_category_iso_8_1
9813           && category != coding_category_iso_8_2)
9814         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9815     }
9816   else if (EQ (coding_type, Qemacs_mule))
9817     {
9818       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9819         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9820       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9821       category = coding_category_emacs_mule;
9822     }
9823   else if (EQ (coding_type, Qshift_jis))
9824     {
9825
9826       struct charset *charset;
9827
9828       if (XINT (Flength (charset_list)) != 3
9829           && XINT (Flength (charset_list)) != 4)
9830         error ("There should be three or four charsets");
9831
9832       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9833       if (CHARSET_DIMENSION (charset) != 1)
9834         error ("Dimension of charset %s is not one",
9835                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9836       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9837         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9838
9839       charset_list = XCDR (charset_list);
9840       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9841       if (CHARSET_DIMENSION (charset) != 1)
9842         error ("Dimension of charset %s is not one",
9843                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9844
9845       charset_list = XCDR (charset_list);
9846       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9847       if (CHARSET_DIMENSION (charset) != 2)
9848         error ("Dimension of charset %s is not two",
9849                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9850
9851       charset_list = XCDR (charset_list);
9852       if (! NILP (charset_list))
9853         {
9854           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9855           if (CHARSET_DIMENSION (charset) != 2)
9856             error ("Dimension of charset %s is not two",
9857                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9858         }
9859
9860       category = coding_category_sjis;
9861       Vsjis_coding_system = name;
9862     }
9863   else if (EQ (coding_type, Qbig5))
9864     {
9865       struct charset *charset;
9866
9867       if (XINT (Flength (charset_list)) != 2)
9868         error ("There should be just two charsets");
9869
9870       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9871       if (CHARSET_DIMENSION (charset) != 1)
9872         error ("Dimension of charset %s is not one",
9873                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9874       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9875         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9876
9877       charset_list = XCDR (charset_list);
9878       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9879       if (CHARSET_DIMENSION (charset) != 2)
9880         error ("Dimension of charset %s is not two",
9881                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9882
9883       category = coding_category_big5;
9884       Vbig5_coding_system = name;
9885     }
9886   else if (EQ (coding_type, Qraw_text))
9887     {
9888       category = coding_category_raw_text;
9889       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9890     }
9891   else if (EQ (coding_type, Qutf_8))
9892     {
9893       Lisp_Object bom;
9894
9895       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9896
9897       if (nargs < coding_arg_utf8_max)
9898         goto short_args;
9899
9900       bom = args[coding_arg_utf8_bom];
9901       if (! NILP (bom) && ! EQ (bom, Qt))
9902         {
9903           CHECK_CONS (bom);
9904           val = XCAR (bom);
9905           CHECK_CODING_SYSTEM (val);
9906           val = XCDR (bom);
9907           CHECK_CODING_SYSTEM (val);
9908         }
9909       ASET (attrs, coding_attr_utf_bom, bom);
9910
9911       category = (CONSP (bom) ? coding_category_utf_8_auto
9912                   : NILP (bom) ? coding_category_utf_8_nosig
9913                   : coding_category_utf_8_sig);
9914     }
9915   else if (EQ (coding_type, Qundecided))
9916     category = coding_category_undecided;
9917   else
9918     error ("Invalid coding system type: %s",
9919            SDATA (SYMBOL_NAME (coding_type)));
9920
9921   CODING_ATTR_CATEGORY (attrs) = make_number (category);
9922   CODING_ATTR_PLIST (attrs)
9923     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9924                                 CODING_ATTR_PLIST (attrs)));
9925   CODING_ATTR_PLIST (attrs)
9926     = Fcons (QCascii_compatible_p,
9927              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9928                     CODING_ATTR_PLIST (attrs)));
9929
9930   eol_type = args[coding_arg_eol_type];
9931   if (! NILP (eol_type)
9932       && ! EQ (eol_type, Qunix)
9933       && ! EQ (eol_type, Qdos)
9934       && ! EQ (eol_type, Qmac))
9935     error ("Invalid eol-type");
9936
9937   aliases = Fcons (name, Qnil);
9938
9939   if (NILP (eol_type))
9940     {
9941       eol_type = make_subsidiaries (name);
9942       for (i = 0; i < 3; i++)
9943         {
9944           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9945
9946           this_name = AREF (eol_type, i);
9947           this_aliases = Fcons (this_name, Qnil);
9948           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9949           this_spec = Fmake_vector (make_number (3), attrs);
9950           ASET (this_spec, 1, this_aliases);
9951           ASET (this_spec, 2, this_eol_type);
9952           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9953           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9954           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9955           if (NILP (val))
9956             Vcoding_system_alist
9957               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9958                        Vcoding_system_alist);
9959         }
9960     }
9961
9962   spec_vec = Fmake_vector (make_number (3), attrs);
9963   ASET (spec_vec, 1, aliases);
9964   ASET (spec_vec, 2, eol_type);
9965
9966   Fputhash (name, spec_vec, Vcoding_system_hash_table);
9967   Vcoding_system_list = Fcons (name, Vcoding_system_list);
9968   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9969   if (NILP (val))
9970     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9971                                   Vcoding_system_alist);
9972
9973   {
9974     int id = coding_categories[category].id;
9975
9976     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9977       setup_coding_system (name, &coding_categories[category]);
9978   }
9979
9980   return Qnil;
9981
9982  short_args:
9983   return Fsignal (Qwrong_number_of_arguments,
9984                   Fcons (intern ("define-coding-system-internal"),
9985                          make_number (nargs)));
9986 }
9987
9988
9989 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9990        3, 3, 0,
9991        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
9992   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
9993 {
9994   Lisp_Object spec, attrs;
9995
9996   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9997   attrs = AREF (spec, 0);
9998   if (EQ (prop, QCmnemonic))
9999     {
10000       if (! STRINGP (val))
10001         CHECK_CHARACTER (val);
10002       CODING_ATTR_MNEMONIC (attrs) = val;
10003     }
10004   else if (EQ (prop, QCdefault_char))
10005     {
10006       if (NILP (val))
10007         val = make_number (' ');
10008       else
10009         CHECK_CHARACTER (val);
10010       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10011     }
10012   else if (EQ (prop, QCdecode_translation_table))
10013     {
10014       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10015         CHECK_SYMBOL (val);
10016       CODING_ATTR_DECODE_TBL (attrs) = val;
10017     }
10018   else if (EQ (prop, QCencode_translation_table))
10019     {
10020       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10021         CHECK_SYMBOL (val);
10022       CODING_ATTR_ENCODE_TBL (attrs) = val;
10023     }
10024   else if (EQ (prop, QCpost_read_conversion))
10025     {
10026       CHECK_SYMBOL (val);
10027       CODING_ATTR_POST_READ (attrs) = val;
10028     }
10029   else if (EQ (prop, QCpre_write_conversion))
10030     {
10031       CHECK_SYMBOL (val);
10032       CODING_ATTR_PRE_WRITE (attrs) = val;
10033     }
10034   else if (EQ (prop, QCascii_compatible_p))
10035     {
10036       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10037     }
10038
10039   CODING_ATTR_PLIST (attrs)
10040     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10041   return val;
10042 }
10043
10044
10045 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10046        Sdefine_coding_system_alias, 2, 2, 0,
10047        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10048   (Lisp_Object alias, Lisp_Object coding_system)
10049 {
10050   Lisp_Object spec, aliases, eol_type, val;
10051
10052   CHECK_SYMBOL (alias);
10053   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10054   aliases = AREF (spec, 1);
10055   /* ALIASES should be a list of length more than zero, and the first
10056      element is a base coding system.  Append ALIAS at the tail of the
10057      list.  */
10058   while (!NILP (XCDR (aliases)))
10059     aliases = XCDR (aliases);
10060   XSETCDR (aliases, Fcons (alias, Qnil));
10061
10062   eol_type = AREF (spec, 2);
10063   if (VECTORP (eol_type))
10064     {
10065       Lisp_Object subsidiaries;
10066       int i;
10067
10068       subsidiaries = make_subsidiaries (alias);
10069       for (i = 0; i < 3; i++)
10070         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10071                                      AREF (eol_type, i));
10072     }
10073
10074   Fputhash (alias, spec, Vcoding_system_hash_table);
10075   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10076   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10077   if (NILP (val))
10078     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10079                                   Vcoding_system_alist);
10080
10081   return Qnil;
10082 }
10083
10084 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10085        1, 1, 0,
10086        doc: /* Return the base of CODING-SYSTEM.
10087 Any alias or subsidiary coding system is not a base coding system.  */)
10088   (Lisp_Object coding_system)
10089 {
10090   Lisp_Object spec, attrs;
10091
10092   if (NILP (coding_system))
10093     return (Qno_conversion);
10094   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10095   attrs = AREF (spec, 0);
10096   return CODING_ATTR_BASE_NAME (attrs);
10097 }
10098
10099 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10100        1, 1, 0,
10101        doc: "Return the property list of CODING-SYSTEM.")
10102   (Lisp_Object coding_system)
10103 {
10104   Lisp_Object spec, attrs;
10105
10106   if (NILP (coding_system))
10107     coding_system = Qno_conversion;
10108   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10109   attrs = AREF (spec, 0);
10110   return CODING_ATTR_PLIST (attrs);
10111 }
10112
10113
10114 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10115        1, 1, 0,
10116        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10117   (Lisp_Object coding_system)
10118 {
10119   Lisp_Object spec;
10120
10121   if (NILP (coding_system))
10122     coding_system = Qno_conversion;
10123   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10124   return AREF (spec, 1);
10125 }
10126
10127 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10128        Scoding_system_eol_type, 1, 1, 0,
10129        doc: /* Return eol-type of CODING-SYSTEM.
10130 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10131
10132 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10133 and CR respectively.
10134
10135 A vector value indicates that a format of end-of-line should be
10136 detected automatically.  Nth element of the vector is the subsidiary
10137 coding system whose eol-type is N.  */)
10138   (Lisp_Object coding_system)
10139 {
10140   Lisp_Object spec, eol_type;
10141   int n;
10142
10143   if (NILP (coding_system))
10144     coding_system = Qno_conversion;
10145   if (! CODING_SYSTEM_P (coding_system))
10146     return Qnil;
10147   spec = CODING_SYSTEM_SPEC (coding_system);
10148   eol_type = AREF (spec, 2);
10149   if (VECTORP (eol_type))
10150     return Fcopy_sequence (eol_type);
10151   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10152   return make_number (n);
10153 }
10154
10155 #endif /* emacs */
10156
10157 \f
/*** 12. Postamble ***/
10159
10160 void
10161 init_coding_once (void)
10162 {
10163   int i;
10164
10165   for (i = 0; i < coding_category_max; i++)
10166     {
10167       coding_categories[i].id = -1;
10168       coding_priorities[i] = i;
10169     }
10170
10171   /* ISO2022 specific initialize routine.  */
10172   for (i = 0; i < 0x20; i++)
10173     iso_code_class[i] = ISO_control_0;
10174   for (i = 0x21; i < 0x7F; i++)
10175     iso_code_class[i] = ISO_graphic_plane_0;
10176   for (i = 0x80; i < 0xA0; i++)
10177     iso_code_class[i] = ISO_control_1;
10178   for (i = 0xA1; i < 0xFF; i++)
10179     iso_code_class[i] = ISO_graphic_plane_1;
10180   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10181   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10182   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10183   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10184   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10185   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10186   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10187   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10188   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10189
10190   for (i = 0; i < 256; i++)
10191     {
10192       emacs_mule_bytes[i] = 1;
10193     }
10194   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10195   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10196   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10197   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10198 }
10199
10200 #ifdef emacs
10201
10202 void
10203 syms_of_coding (void)
10204 {
10205   staticpro (&Vcoding_system_hash_table);
10206   {
10207     Lisp_Object args[2];
10208     args[0] = QCtest;
10209     args[1] = Qeq;
10210     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10211   }
10212
10213   staticpro (&Vsjis_coding_system);
10214   Vsjis_coding_system = Qnil;
10215
10216   staticpro (&Vbig5_coding_system);
10217   Vbig5_coding_system = Qnil;
10218
10219   staticpro (&Vcode_conversion_reused_workbuf);
10220   Vcode_conversion_reused_workbuf = Qnil;
10221
10222   staticpro (&Vcode_conversion_workbuf_name);
10223   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10224
10225   reused_workbuf_in_use = 0;
10226
10227   DEFSYM (Qcharset, "charset");
10228   DEFSYM (Qtarget_idx, "target-idx");
10229   DEFSYM (Qcoding_system_history, "coding-system-history");
10230   Fset (Qcoding_system_history, Qnil);
10231
10232   /* Target FILENAME is the first argument.  */
10233   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10234   /* Target FILENAME is the third argument.  */
10235   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10236
10237   DEFSYM (Qcall_process, "call-process");
10238   /* Target PROGRAM is the first argument.  */
10239   Fput (Qcall_process, Qtarget_idx, make_number (0));
10240
10241   DEFSYM (Qcall_process_region, "call-process-region");
10242   /* Target PROGRAM is the third argument.  */
10243   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10244
10245   DEFSYM (Qstart_process, "start-process");
10246   /* Target PROGRAM is the third argument.  */
10247   Fput (Qstart_process, Qtarget_idx, make_number (2));
10248
10249   DEFSYM (Qopen_network_stream, "open-network-stream");
10250   /* Target SERVICE is the fourth argument.  */
10251   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10252
10253   DEFSYM (Qcoding_system, "coding-system");
10254   DEFSYM (Qcoding_aliases, "coding-aliases");
10255
10256   DEFSYM (Qeol_type, "eol-type");
10257   DEFSYM (Qunix, "unix");
10258   DEFSYM (Qdos, "dos");
10259
10260   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10261   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10262   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10263   DEFSYM (Qdefault_char, "default-char");
10264   DEFSYM (Qundecided, "undecided");
10265   DEFSYM (Qno_conversion, "no-conversion");
10266   DEFSYM (Qraw_text, "raw-text");
10267
10268   DEFSYM (Qiso_2022, "iso-2022");
10269
10270   DEFSYM (Qutf_8, "utf-8");
10271   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10272
10273   DEFSYM (Qutf_16, "utf-16");
10274   DEFSYM (Qbig, "big");
10275   DEFSYM (Qlittle, "little");
10276
10277   DEFSYM (Qshift_jis, "shift-jis");
10278   DEFSYM (Qbig5, "big5");
10279
10280   DEFSYM (Qcoding_system_p, "coding-system-p");
10281
10282   DEFSYM (Qcoding_system_error, "coding-system-error");
10283   Fput (Qcoding_system_error, Qerror_conditions,
10284         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10285   Fput (Qcoding_system_error, Qerror_message,
10286         make_pure_c_string ("Invalid coding system"));
10287
10288   /* Intern this now in case it isn't already done.
10289      Setting this variable twice is harmless.
10290      But don't staticpro it here--that is done in alloc.c.  */
10291   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10292
10293   DEFSYM (Qtranslation_table, "translation-table");
10294   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10295   DEFSYM (Qtranslation_table_id, "translation-table-id");
10296   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10297   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10298
10299   DEFSYM (Qvalid_codes, "valid-codes");
10300
10301   DEFSYM (Qemacs_mule, "emacs-mule");
10302
10303   DEFSYM (QCcategory, ":category");
10304   DEFSYM (QCmnemonic, ":mnemonic");
10305   DEFSYM (QCdefault_char, ":default-char");
10306   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10307   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10308   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10309   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10310   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10311
10312   Vcoding_category_table
10313     = Fmake_vector (make_number (coding_category_max), Qnil);
10314   staticpro (&Vcoding_category_table);
10315   /* Followings are target of code detection.  */
10316   ASET (Vcoding_category_table, coding_category_iso_7,
10317         intern_c_string ("coding-category-iso-7"));
10318   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10319         intern_c_string ("coding-category-iso-7-tight"));
10320   ASET (Vcoding_category_table, coding_category_iso_8_1,
10321         intern_c_string ("coding-category-iso-8-1"));
10322   ASET (Vcoding_category_table, coding_category_iso_8_2,
10323         intern_c_string ("coding-category-iso-8-2"));
10324   ASET (Vcoding_category_table, coding_category_iso_7_else,
10325         intern_c_string ("coding-category-iso-7-else"));
10326   ASET (Vcoding_category_table, coding_category_iso_8_else,
10327         intern_c_string ("coding-category-iso-8-else"));
10328   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10329         intern_c_string ("coding-category-utf-8-auto"));
10330   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10331         intern_c_string ("coding-category-utf-8"));
10332   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10333         intern_c_string ("coding-category-utf-8-sig"));
10334   ASET (Vcoding_category_table, coding_category_utf_16_be,
10335         intern_c_string ("coding-category-utf-16-be"));
10336   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10337         intern_c_string ("coding-category-utf-16-auto"));
10338   ASET (Vcoding_category_table, coding_category_utf_16_le,
10339         intern_c_string ("coding-category-utf-16-le"));
10340   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10341         intern_c_string ("coding-category-utf-16-be-nosig"));
10342   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10343         intern_c_string ("coding-category-utf-16-le-nosig"));
10344   ASET (Vcoding_category_table, coding_category_charset,
10345         intern_c_string ("coding-category-charset"));
10346   ASET (Vcoding_category_table, coding_category_sjis,
10347         intern_c_string ("coding-category-sjis"));
10348   ASET (Vcoding_category_table, coding_category_big5,
10349         intern_c_string ("coding-category-big5"));
10350   ASET (Vcoding_category_table, coding_category_ccl,
10351         intern_c_string ("coding-category-ccl"));
10352   ASET (Vcoding_category_table, coding_category_emacs_mule,
10353         intern_c_string ("coding-category-emacs-mule"));
10354   /* Followings are NOT target of code detection.  */
10355   ASET (Vcoding_category_table, coding_category_raw_text,
10356         intern_c_string ("coding-category-raw-text"));
10357   ASET (Vcoding_category_table, coding_category_undecided,
10358         intern_c_string ("coding-category-undecided"));
10359
10360   DEFSYM (Qinsufficient_source, "insufficient-source");
10361   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10362   DEFSYM (Qinvalid_source, "invalid-source");
10363   DEFSYM (Qinterrupted, "interrupted");
10364   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10365   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10366
10367   defsubr (&Scoding_system_p);
10368   defsubr (&Sread_coding_system);
10369   defsubr (&Sread_non_nil_coding_system);
10370   defsubr (&Scheck_coding_system);
10371   defsubr (&Sdetect_coding_region);
10372   defsubr (&Sdetect_coding_string);
10373   defsubr (&Sfind_coding_systems_region_internal);
10374   defsubr (&Sunencodable_char_position);
10375   defsubr (&Scheck_coding_systems_region);
10376   defsubr (&Sdecode_coding_region);
10377   defsubr (&Sencode_coding_region);
10378   defsubr (&Sdecode_coding_string);
10379   defsubr (&Sencode_coding_string);
10380   defsubr (&Sdecode_sjis_char);
10381   defsubr (&Sencode_sjis_char);
10382   defsubr (&Sdecode_big5_char);
10383   defsubr (&Sencode_big5_char);
10384   defsubr (&Sset_terminal_coding_system_internal);
10385   defsubr (&Sset_safe_terminal_coding_system_internal);
10386   defsubr (&Sterminal_coding_system);
10387   defsubr (&Sset_keyboard_coding_system_internal);
10388   defsubr (&Skeyboard_coding_system);
10389   defsubr (&Sfind_operation_coding_system);
10390   defsubr (&Sset_coding_system_priority);
10391   defsubr (&Sdefine_coding_system_internal);
10392   defsubr (&Sdefine_coding_system_alias);
10393   defsubr (&Scoding_system_put);
10394   defsubr (&Scoding_system_base);
10395   defsubr (&Scoding_system_plist);
10396   defsubr (&Scoding_system_aliases);
10397   defsubr (&Scoding_system_eol_type);
10398   defsubr (&Scoding_system_priority_list);
10399
10400   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10401                doc: /* List of coding systems.
10402
10403 Do not alter the value of this variable manually.  This variable should be
10404 updated by the functions `define-coding-system' and
10405 `define-coding-system-alias'.  */);
10406   Vcoding_system_list = Qnil;
10407
10408   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10409                doc: /* Alist of coding system names.
10410 Each element is one element list of coding system name.
10411 This variable is given to `completing-read' as COLLECTION argument.
10412
10413 Do not alter the value of this variable manually.  This variable should be
10414 updated by the functions `make-coding-system' and
10415 `define-coding-system-alias'.  */);
10416   Vcoding_system_alist = Qnil;
10417
10418   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10419                doc: /* List of coding-categories (symbols) ordered by priority.
10420
10421 On detecting a coding system, Emacs tries code detection algorithms
10422 associated with each coding-category one by one in this order.  When
10423 one algorithm agrees with a byte sequence of source text, the coding
10424 system bound to the corresponding coding-category is selected.
10425
10426 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10427   {
10428     int i;
10429
10430     Vcoding_category_list = Qnil;
10431     for (i = coding_category_max - 1; i >= 0; i--)
10432       Vcoding_category_list
10433         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10434                  Vcoding_category_list);
10435   }
10436
10437   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10438                doc: /* Specify the coding system for read operations.
10439 It is useful to bind this variable with `let', but do not set it globally.
10440 If the value is a coding system, it is used for decoding on read operation.
10441 If not, an appropriate element is used from one of the coding system alists.
10442 There are three such tables: `file-coding-system-alist',
10443 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10444   Vcoding_system_for_read = Qnil;
10445
10446   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10447                doc: /* Specify the coding system for write operations.
10448 Programs bind this variable with `let', but you should not set it globally.
10449 If the value is a coding system, it is used for encoding of output,
10450 when writing it to a file and when sending it to a file or subprocess.
10451
10452 If this does not specify a coding system, an appropriate element
10453 is used from one of the coding system alists.
10454 There are three such tables: `file-coding-system-alist',
10455 `process-coding-system-alist', and `network-coding-system-alist'.
10456 For output to files, if the above procedure does not specify a coding system,
10457 the value of `buffer-file-coding-system' is used.  */);
10458   Vcoding_system_for_write = Qnil;
10459
10460   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10461                doc: /*
10462 Coding system used in the latest file or process I/O.  */);
10463   Vlast_coding_system_used = Qnil;
10464
10465   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10466                doc: /*
10467 Error status of the last code conversion.
10468
10469 When an error was detected in the last code conversion, this variable
10470 is set to one of the following symbols.
10471   `insufficient-source'
10472   `inconsistent-eol'
10473   `invalid-source'
10474   `interrupted'
10475   `insufficient-memory'
10476 When no error was detected, the value doesn't change.  So, to check
10477 the error status of a code conversion by this variable, you must
10478 explicitly set this variable to nil before performing code
10479 conversion.  */);
10480   Vlast_code_conversion_error = Qnil;
10481
10482   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10483                doc: /*
10484 *Non-nil means always inhibit code conversion of end-of-line format.
10485 See info node `Coding Systems' and info node `Text and Binary' concerning
10486 such conversion.  */);
10487   inhibit_eol_conversion = 0;
10488
10489   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10490                doc: /*
10491 Non-nil means process buffer inherits coding system of process output.
10492 Bind it to t if the process output is to be treated as if it were a file
10493 read from some filesystem.  */);
10494   inherit_process_coding_system = 0;
10495
10496   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10497                doc: /*
10498 Alist to decide a coding system to use for a file I/O operation.
10499 The format is ((PATTERN . VAL) ...),
10500 where PATTERN is a regular expression matching a file name,
10501 VAL is a coding system, a cons of coding systems, or a function symbol.
10502 If VAL is a coding system, it is used for both decoding and encoding
10503 the file contents.
10504 If VAL is a cons of coding systems, the car part is used for decoding,
10505 and the cdr part is used for encoding.
10506 If VAL is a function symbol, the function must return a coding system
10507 or a cons of coding systems which are used as above.  The function is
10508 called with an argument that is a list of the arguments with which
10509 `find-operation-coding-system' was called.  If the function can't decide
10510 a coding system, it can return `undecided' so that the normal
10511 code-detection is performed.
10512
10513 See also the function `find-operation-coding-system'
10514 and the variable `auto-coding-alist'.  */);
10515   Vfile_coding_system_alist = Qnil;
10516
10517   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10518                doc: /*
10519 Alist to decide a coding system to use for a process I/O operation.
10520 The format is ((PATTERN . VAL) ...),
10521 where PATTERN is a regular expression matching a program name,
10522 VAL is a coding system, a cons of coding systems, or a function symbol.
10523 If VAL is a coding system, it is used for both decoding what received
10524 from the program and encoding what sent to the program.
10525 If VAL is a cons of coding systems, the car part is used for decoding,
10526 and the cdr part is used for encoding.
10527 If VAL is a function symbol, the function must return a coding system
10528 or a cons of coding systems which are used as above.
10529
10530 See also the function `find-operation-coding-system'.  */);
10531   Vprocess_coding_system_alist = Qnil;
10532
10533   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10534                doc: /*
10535 Alist to decide a coding system to use for a network I/O operation.
10536 The format is ((PATTERN . VAL) ...),
10537 where PATTERN is a regular expression matching a network service name
10538 or is a port number to connect to,
10539 VAL is a coding system, a cons of coding systems, or a function symbol.
10540 If VAL is a coding system, it is used for both decoding what received
10541 from the network stream and encoding what sent to the network stream.
10542 If VAL is a cons of coding systems, the car part is used for decoding,
10543 and the cdr part is used for encoding.
10544 If VAL is a function symbol, the function must return a coding system
10545 or a cons of coding systems which are used as above.
10546
10547 See also the function `find-operation-coding-system'.  */);
10548   Vnetwork_coding_system_alist = Qnil;
10549
10550   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10551                doc: /* Coding system to use with system messages.
10552 Also used for decoding keyboard input on X Window system.  */);
10553   Vlocale_coding_system = Qnil;
10554
10555   /* The eol mnemonics are reset in startup.el system-dependently.  */
10556   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10557                doc: /*
10558 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10559   eol_mnemonic_unix = make_pure_c_string (":");
10560
10561   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10562                doc: /*
10563 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10564   eol_mnemonic_dos = make_pure_c_string ("\\");
10565
10566   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10567                doc: /*
10568 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10569   eol_mnemonic_mac = make_pure_c_string ("/");
10570
10571   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10572                doc: /*
10573 *String displayed in mode line when end-of-line format is not yet determined.  */);
10574   eol_mnemonic_undecided = make_pure_c_string (":");
10575
10576   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10577                doc: /*
10578 *Non-nil enables character translation while encoding and decoding.  */);
10579   Venable_character_translation = Qt;
10580
10581   DEFVAR_LISP ("standard-translation-table-for-decode",
10582                Vstandard_translation_table_for_decode,
10583                doc: /* Table for translating characters while decoding.  */);
10584   Vstandard_translation_table_for_decode = Qnil;
10585
10586   DEFVAR_LISP ("standard-translation-table-for-encode",
10587                Vstandard_translation_table_for_encode,
10588                doc: /* Table for translating characters while encoding.  */);
10589   Vstandard_translation_table_for_encode = Qnil;
10590
10591   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10592                doc: /* Alist of charsets vs revision numbers.
10593 While encoding, if a charset (car part of an element) is found,
10594 designate it with the escape sequence identifying revision (cdr part
10595 of the element).  */);
10596   Vcharset_revision_table = Qnil;
10597
10598   DEFVAR_LISP ("default-process-coding-system",
10599                Vdefault_process_coding_system,
10600                doc: /* Cons of coding systems used for process I/O by default.
10601 The car part is used for decoding a process output,
10602 the cdr part is used for encoding a text to be sent to a process.  */);
10603   Vdefault_process_coding_system = Qnil;
10604
10605   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10606                doc: /*
10607 Table of extra Latin codes in the range 128..159 (inclusive).
10608 This is a vector of length 256.
10609 If Nth element is non-nil, the existence of code N in a file
10610 \(or output of subprocess) doesn't prevent it to be detected as
10611 a coding system of ISO 2022 variant which has a flag
10612 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10613 or reading output of a subprocess.
10614 Only 128th through 159th elements have a meaning.  */);
10615   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10616
10617   DEFVAR_LISP ("select-safe-coding-system-function",
10618                Vselect_safe_coding_system_function,
10619                doc: /*
10620 Function to call to select safe coding system for encoding a text.
10621
10622 If set, this function is called to force a user to select a proper
10623 coding system which can encode the text in the case that a default
10624 coding system used in each operation can't encode the text.  The
10625 function should take care that the buffer is not modified while
10626 the coding system is being selected.
10627
10628 The default value is `select-safe-coding-system' (which see).  */);
10629   Vselect_safe_coding_system_function = Qnil;
10630
10631   DEFVAR_BOOL ("coding-system-require-warning",
10632                coding_system_require_warning,
10633                doc: /* Internal use only.
10634 If non-nil, on writing a file, `select-safe-coding-system-function' is
10635 called even if `coding-system-for-write' is non-nil.  The command
10636 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10637   coding_system_require_warning = 0;
10638
10639
10640   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10641                inhibit_iso_escape_detection,
10642                doc: /*
10643 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10644
10645 When Emacs reads text, it tries to detect how the text is encoded.
10646 This code detection is sensitive to escape sequences.  If Emacs sees
10647 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10648 of the ISO2022 encodings, and decodes text by the corresponding coding
10649 system (e.g. `iso-2022-7bit').
10650
10651 However, there may be a case that you want to read escape sequences in
10652 a file as is.  In such a case, you can set this variable to non-nil.
10653 Then the code detection will ignore any escape sequences, and no text is
10654 detected as encoded in some ISO-2022 encoding.  The result is that all
10655 escape sequences become visible in a buffer.
10656
10657 The default value is nil, and it is strongly recommended not to change
10658 it.  That is because many Emacs Lisp source files that contain
10659 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10660 in Emacs's distribution, and they won't be decoded correctly on
10661 reading if you suppress escape sequence detection.
10662
10663 The other way to read escape sequences in a file without decoding is
10664 to explicitly specify some coding system that doesn't use ISO-2022
10665 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10666   inhibit_iso_escape_detection = 0;
10667
10668   DEFVAR_BOOL ("inhibit-null-byte-detection",
10669                inhibit_null_byte_detection,
10670                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10671 By default, Emacs treats it as binary data, and does not attempt to
10672 decode it.  The effect is as if you specified `no-conversion' for
10673 reading that text.
10674
10675 Set this to non-nil when a regular text happens to include null bytes.
10676 Examples are Index nodes of Info files and null-byte delimited output
10677 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10678 decode text as usual.  */);
10679   inhibit_null_byte_detection = 0;
10680
10681   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10682                doc: /* Char table for translating self-inserting characters.
10683 This is applied to the result of input methods, not their input.
10684 See also `keyboard-translate-table'.
10685
10686 Use of this variable for character code unification was rendered
10687 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10688 internal character representation.  */);
10689     Vtranslation_table_for_input = Qnil;
10690
10691   {
10692     Lisp_Object args[coding_arg_max];
10693     Lisp_Object plist[16];
10694     int i;
10695
10696     for (i = 0; i < coding_arg_max; i++)
10697       args[i] = Qnil;
10698
10699     plist[0] = intern_c_string (":name");
10700     plist[1] = args[coding_arg_name] = Qno_conversion;
10701     plist[2] = intern_c_string (":mnemonic");
10702     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10703     plist[4] = intern_c_string (":coding-type");
10704     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10705     plist[6] = intern_c_string (":ascii-compatible-p");
10706     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10707     plist[8] = intern_c_string (":default-char");
10708     plist[9] = args[coding_arg_default_char] = make_number (0);
10709     plist[10] = intern_c_string (":for-unibyte");
10710     plist[11] = args[coding_arg_for_unibyte] = Qt;
10711     plist[12] = intern_c_string (":docstring");
10712     plist[13] = make_pure_c_string ("Do no conversion.\n\
10713 \n\
10714 When you visit a file with this coding, the file is read into a\n\
10715 unibyte buffer as is, thus each byte of a file is treated as a\n\
10716 character.");
10717     plist[14] = intern_c_string (":eol-type");
10718     plist[15] = args[coding_arg_eol_type] = Qunix;
10719     args[coding_arg_plist] = Flist (16, plist);
10720     Fdefine_coding_system_internal (coding_arg_max, args);
10721
10722     plist[1] = args[coding_arg_name] = Qundecided;
10723     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10724     plist[5] = args[coding_arg_coding_type] = Qundecided;
10725     /* This is already set.
10726        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10727     plist[8] = intern_c_string (":charset-list");
10728     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10729     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10730     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10731     plist[15] = args[coding_arg_eol_type] = Qnil;
10732     args[coding_arg_plist] = Flist (16, plist);
10733     Fdefine_coding_system_internal (coding_arg_max, args);
10734   }
10735
10736   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10737
10738   {
10739     int i;
10740
10741     for (i = 0; i < coding_category_max; i++)
10742       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10743   }
10744 #if defined (DOS_NT)
10745   system_eol_type = Qdos;
10746 #else
10747   system_eol_type = Qunix;
10748 #endif
10749   staticpro (&system_eol_type);
10750 }
10751
10752 char *
10753 emacs_strerror (int error_number)
10754 {
10755   char *str;
10756
10757   synchronize_system_messages_locale ();
10758   str = strerror (error_number);
10759
10760   if (! NILP (Vlocale_coding_system))
10761     {
10762       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10763                                                       Vlocale_coding_system,
10764                                                       0);
10765       str = SSDATA (dec);
10766     }
10767
10768   return str;
10769 }
10770
10771 #endif /* emacs */