src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008, 2009, 2010, 2011
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software: you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation, either version 3 of the License, or
  17 (at your option) any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  60   C level, a coding system is represented by a vector of attributes
  61   stored in the hash table Vcoding_system_hash_table.  The conversion
  62   from coding system symbol to attributes vector is done by looking up
  63   Vcoding_system_hash_table by the symbol.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on the operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we setup a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (struct coding_system *coding,
 158                    struct coding_detection_info *detect_info)
 159 {
 160   const unsigned char *src = coding->source, *src_base;
 161   const unsigned char *src_end = coding->source + coding->src_bytes;
 162   int multibytep = coding->src_multibyte;
 163   int consumed_chars = 0;
 164   int found = 0;
 165   ...;
 166
 167   while (1)
 168     {
 169       /* Get one byte from the source.  If the source is exhausted, jump
 170          to no_more_source:.  ONE_MORE_BYTE also reads SRC_BASE.  */
 171       src_base = src;
 172       ONE_MORE_BYTE (c);
 173       if (! __C_conforms_to_XXX___ (c))
 174         break;
 175       if (__C_strongly_suggests_XXX__ (c))
 176         found = CATEGORY_MASK_XXX;
 177     }
 178   /* The byte sequence is invalid for XXX.  */
 179   detect_info->rejected |= CATEGORY_MASK_XXX;
 180   return 0;
 181
 182  no_more_source:
 183   /* The source exhausted successfully.  */
 184   detect_info->found |= found;
 185   return 1;
 186 }
 187 #endif
 188
 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 190
 191   These functions decode a byte sequence specified as a source by
 192   CODING.  The resulting multibyte text goes to a place pointed to by
 193   CODING->charbuf, the length of which should not exceed
 194   CODING->charbuf_size.
 195
 196   These functions set the information of original and decoded texts in
 197   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 198   They also set CODING->result to one of CODING_RESULT_XXX indicating
 199   how the decoding is finished.
 200
 201   Below is the template of these functions.  */
 202
 203 #if 0
 204 static void
 205 decode_coding_XXXX (struct coding_system *coding)
 206 {
 207   const unsigned char *src = coding->source + coding->consumed;
 208   const unsigned char *src_end = coding->source + coding->src_bytes;
 209   /* SRC_BASE remembers the start position in source in each loop.
 210      The loop will be exited when there's not enough source text, or
 211      when there's no room in CHARBUF for a decoded character.  */
 212   const unsigned char *src_base;
 213   /* A buffer to produce decoded characters.  */
 214   int *charbuf = coding->charbuf + coding->charbuf_used;
 215   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 216   int multibytep = coding->src_multibyte;
 217
 218   while (1)
 219     {
 220       src_base = src;
 221       if (charbuf >= charbuf_end)
 222         /* No more room to produce a decoded character.  */
 223         break;
 224       ONE_MORE_BYTE (c);
 225       /* Decode it. */
 226     }
 227
 228  no_more_source:
 229   if (src_base < src_end
 230       && coding->mode & CODING_MODE_LAST_BLOCK)
 231     /* If the source ends by partial bytes to construct a character,
 232        treat them as eight-bit raw data.  */
 233     while (src_base < src_end && charbuf < charbuf_end)
 234       *charbuf++ = *src_base++;
 235   /* Remember how many bytes and characters we consumed.  If the
 236      source is multibyte, the bytes and chars are not identical.  */
 237   coding->consumed = coding->consumed_char = src_base - coding->source;
 238   /* Remember how many characters we produced.  */
 239   coding->charbuf_used = charbuf - coding->charbuf;
 240 }
 241 #endif
 242
 243 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 244
 245   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 246   internal multibyte format by CODING.  The resulting byte sequence
 247   goes to a place pointed to by DESTINATION, the length of which
 248   should not exceed DST_BYTES.
 249
 250   These functions set the information of original and encoded texts in
 251   the members produced, produced_char, consumed, and consumed_char of
 252   the structure *CODING.  They also set the member result to one of
 253   CODING_RESULT_XXX indicating how the encoding finished.
 254
 255   DST_BYTES zero means that source area and destination area are
 256   overlapped, which means that we can produce encoded text until it
 257   reaches the head of the not-yet-encoded source text.
 258
 259   Below is a template of these functions.  */
 260 #if 0
 261 static void
 262 encode_coding_XXX (struct coding_system *coding)
 263 {
 264   int multibytep = coding->dst_multibyte;
 265   int *charbuf = coding->charbuf;
 266   int *charbuf_end = charbuf + coding->charbuf_used;
 267   unsigned char *dst = coding->destination + coding->produced;
 268   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 269   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 270   int produced_chars = 0;
 271
 272   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 273     {
 274       int c = *charbuf;
 275       /* Encode C into DST, and increment DST.  */
 276     }
 277  label_no_more_destination:
 278   /* How many chars and bytes we produced.  */
 279   coding->produced_char += produced_chars;
 280   coding->produced = dst - coding->destination;
 281 }
 282 #endif
 283
 284 \f
 285 /*** 1. Preamble ***/
 286
 287 #include <config.h>
 288 #include <stdio.h>
 289 #include <setjmp.h>
 290
 291 #include "lisp.h"
 292 #include "buffer.h"
 293 #include "character.h"
 294 #include "charset.h"
 295 #include "ccl.h"
 296 #include "composite.h"
 297 #include "coding.h"
 298 #include "window.h"
 299 #include "frame.h"
 300 #include "termhooks.h"
 301
 302 Lisp_Object Vcoding_system_hash_table;
 303
 304 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 305 Lisp_Object Qunix, Qdos;
 306 Lisp_Object Qbuffer_file_coding_system;
 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 311 Lisp_Object Qbig, Qlittle;
 312 Lisp_Object Qcoding_system_history;
 313 Lisp_Object Qvalid_codes;
 314 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 315 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 316 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 317 Lisp_Object QCascii_compatible_p;
 318
 319 Lisp_Object Qcall_process, Qcall_process_region;
 320 Lisp_Object Qstart_process, Qopen_network_stream;
 321 Lisp_Object Qtarget_idx;
 322
 323 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 324 Lisp_Object Qinterrupted, Qinsufficient_memory;
 325
 326 /* If a symbol has this property, evaluate the value to define the
 327    symbol as a coding system.  */
 328 static Lisp_Object Qcoding_system_define_form;
 329
 330 int coding_system_require_warning;
 331
 332 Lisp_Object Vselect_safe_coding_system_function;
 333
 334 /* Mnemonic string for each format of end-of-line.  */
 335 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 336 /* Mnemonic string to indicate format of end-of-line is not yet
 337    decided.  */
 338 Lisp_Object eol_mnemonic_undecided;
 339
 340 /* Format of end-of-line decided by system.  This is Qunix on
 341    Unix and Mac, Qdos on DOS/Windows.
 342    This has an effect only for external encoding (i.e. for output to
 343    file and process), not for in-buffer or Lisp string encoding.  */
 344 static Lisp_Object system_eol_type;
 345
 346 #ifdef emacs
 347
 348 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 349
 350 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 351
 352 /* Coding system emacs-mule and raw-text are for converting only
 353    end-of-line format.  */
 354 Lisp_Object Qemacs_mule, Qraw_text;
 355 Lisp_Object Qutf_8_emacs;
 356
 357 /* Coding-systems are handed between Emacs Lisp programs and C internal
 358    routines by the following three variables.  */
 359 /* Coding-system for reading files and receiving data from process.  */
 360 Lisp_Object Vcoding_system_for_read;
 361 /* Coding-system for writing files and sending data to process.  */
 362 Lisp_Object Vcoding_system_for_write;
 363 /* Coding-system actually used in the latest I/O.  */
 364 Lisp_Object Vlast_coding_system_used;
 365 /* Set to non-nil when an error is detected while code conversion.  */
 366 Lisp_Object Vlast_code_conversion_error;
 367 /* A vector of length 256 which contains information about special
 368    Latin codes (especially for dealing with Microsoft codes).  */
 369 Lisp_Object Vlatin_extra_code_table;
 370
 371 /* Flag to inhibit code conversion of end-of-line format.  */
 372 int inhibit_eol_conversion;
 373
 374 /* Flag to inhibit ISO2022 escape sequence detection.  */
 375 int inhibit_iso_escape_detection;
 376
 377 /* Flag to inhibit detection of binary files through null bytes.  */
 378 int inhibit_null_byte_detection;
 379
 380 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 381 int inherit_process_coding_system;
 382
 383 /* Coding system to be used to encode text for terminal display when
 384    terminal coding system is nil.  */
 385 struct coding_system safe_terminal_coding;
 386
 387 Lisp_Object Vfile_coding_system_alist;
 388 Lisp_Object Vprocess_coding_system_alist;
 389 Lisp_Object Vnetwork_coding_system_alist;
 390
 391 Lisp_Object Vlocale_coding_system;
 392
 393 #endif /* emacs */
 394
 395 /* Flag to tell if we look up translation table on character code
 396    conversion.  */
 397 Lisp_Object Venable_character_translation;
 398 /* Standard translation table to look up on decoding (reading).  */
 399 Lisp_Object Vstandard_translation_table_for_decode;
 400 /* Standard translation table to look up on encoding (writing).  */
 401 Lisp_Object Vstandard_translation_table_for_encode;
 402
 403 Lisp_Object Qtranslation_table;
 404 Lisp_Object Qtranslation_table_id;
 405 Lisp_Object Qtranslation_table_for_decode;
 406 Lisp_Object Qtranslation_table_for_encode;
 407
 408 /* Alist of charsets vs revision number.  */
 409 static Lisp_Object Vcharset_revision_table;
 410
 411 /* Default coding systems used for process I/O.  */
 412 Lisp_Object Vdefault_process_coding_system;
 413
 414 /* Char table for translating Quail and self-inserting input.  */
 415 Lisp_Object Vtranslation_table_for_input;
 416
 417 /* Two special coding systems.  */
 418 Lisp_Object Vsjis_coding_system;
 419 Lisp_Object Vbig5_coding_system;
 420
 421 /* ISO2022 section */
 422 /* Integer stored at index REG of CODING's iso-initial attribute.  */
 423 #define CODING_ISO_INITIAL(coding, reg)                 \
 424   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 425                      coding_attr_iso_initial),          \
 426                reg)))
 427 /* CODING's safe_charsets entry for CHARSET_ID, or -1 if CHARSET_ID
 428    exceeds max_charset_id or the entry is 255.  */
 429 #define CODING_ISO_REQUEST(coding, charset_id)          \
 430   (((charset_id) <= (coding)->max_charset_id            \
 431     ? ((coding)->safe_charsets[charset_id] != 255       \
 432        ? (coding)->safe_charsets[charset_id]            \
 433        : -1)                                            \
 434     : -1))
 435
 436
 437 #define CODING_ISO_FLAGS(coding)        \
 438   ((coding)->spec.iso_2022.flags)
 439 #define CODING_ISO_DESIGNATION(coding, reg)     \
 440   ((coding)->spec.iso_2022.current_designation[reg])
 441 #define CODING_ISO_INVOCATION(coding, plane)    \
 442   ((coding)->spec.iso_2022.current_invocation[plane])
 443 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 444   ((coding)->spec.iso_2022.single_shifting)
 445 #define CODING_ISO_BOL(coding)  \
 446   ((coding)->spec.iso_2022.bol)
 447 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 448   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 449 #define CODING_ISO_CMP_STATUS(coding)   \
 450   (&(coding)->spec.iso_2022.cmp_status)
 451 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 452   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 453 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 454   ((coding)->spec.iso_2022.embedded_utf_8)
 455
 456 /* Control characters of ISO2022.  */
 457                         /* code */      /* function */
 458 #define ISO_CODE_LF     0x0A            /* line-feed */
 459 #define ISO_CODE_CR     0x0D            /* carriage-return */
 460 #define ISO_CODE_SO     0x0E            /* shift-out */
 461 #define ISO_CODE_SI     0x0F            /* shift-in */
 462 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 463 #define ISO_CODE_ESC    0x1B            /* escape */
 464 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 465 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 466 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 467
 468 /* Every (1-byte) code of ISO2022 is classified into one of the
 469    following.  */
 470 enum iso_code_class_type
 471   {
 472     ISO_control_0,              /* Control codes in the range
 473                                    0x00..0x1F and 0x7F, except for the
 474                                    following 4 codes.  */
 475     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 476     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 477     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 478     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 479     ISO_control_1,              /* Control codes in the range
 480                                    0x80..0x9F, except for the
 481                                    following 3 codes.  */
 482     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 483     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 484     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 485     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 486     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 487     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 488     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 489   };
 490
 491 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 492     `iso-flags' attribute of an iso2022 coding system.  */
 493
 494 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 495    instead of the correct short-form sequence (e.g. ESC $ A).  */
 496 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 497
 498 /* If set, reset graphic planes and registers at end-of-line to the
 499    initial state.  */
 500 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 501
 502 /* If set, reset graphic planes and registers before any control
 503    characters to the initial state.  */
 504 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 505
 506 /* If set, encode by 7-bit environment.  */
 507 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 508
 509 /* If set, use locking-shift function.  */
 510 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 511
 512 /* If set, use single-shift function.  Overwrite
 513    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 514 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 515
 516 /* If set, use designation escape sequence.  */
 517 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 518
 519 /* If set, produce revision number sequence.  */
 520 #define CODING_ISO_FLAG_REVISION        0x0080
 521
 522 /* If set, produce ISO6429's direction specifying sequence.  */
 523 #define CODING_ISO_FLAG_DIRECTION       0x0100
 524
 525 /* If set, assume designation states are reset at beginning of line on
 526    output.  */
 527 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 528
 529 /* If set, designation sequence should be placed at beginning of line
 530    on output.  */
 531 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 532
 533 /* If set, do not encode unsafe characters on output.  */
 534 #define CODING_ISO_FLAG_SAFE            0x0800
 535
 536 /* If set, extra latin codes (128..159) are accepted as a valid code
 537    on input.  */
 538 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 539
 540 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 541
 542 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 543
 544 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 545
 546 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 547
 548 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 549
 550 /* A character to be produced on output if encoding of the original
 551    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 552 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 553
 554 /* UTF-8 section */
 555 #define CODING_UTF_8_BOM(coding)        \
 556   ((coding)->spec.utf_8_bom)
 557
 558 /* UTF-16 section */
 559 #define CODING_UTF_16_BOM(coding)       \
 560   ((coding)->spec.utf_16.bom)
 561
 562 #define CODING_UTF_16_ENDIAN(coding)    \
 563   ((coding)->spec.utf_16.endian)
 564
 565 #define CODING_UTF_16_SURROGATE(coding) \
 566   ((coding)->spec.utf_16.surrogate)
 567
 568
 569 /* CCL section */
 570 #define CODING_CCL_DECODER(coding)      \
 571   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 572 #define CODING_CCL_ENCODER(coding)      \
 573   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 574 #define CODING_CCL_VALIDS(coding)                                          \
 575   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 576
 577 /* Index for each coding category in `coding_categories'.  The order
 578    also fixes the bit position of each CATEGORY_MASK_XXX below.  */
 579 enum coding_category
 580   {
 581     coding_category_iso_7,
 582     coding_category_iso_7_tight,
 583     coding_category_iso_8_1,
 584     coding_category_iso_8_2,
 585     coding_category_iso_7_else,
 586     coding_category_iso_8_else,
 587     coding_category_utf_8_auto,
 588     coding_category_utf_8_nosig,
 589     coding_category_utf_8_sig,
 590     coding_category_utf_16_auto,
 591     coding_category_utf_16_be,
 592     coding_category_utf_16_le,
 593     coding_category_utf_16_be_nosig,
 594     coding_category_utf_16_le_nosig,
 595     coding_category_charset,
 596     coding_category_sjis,
 597     coding_category_big5,
 598     coding_category_ccl,
 599     coding_category_emacs_mule,
 600     /* All above are targets of code detection.  */
 601     coding_category_raw_text,
 602     coding_category_undecided,
 603     coding_category_max
 604   };
 605
 606 /* Definitions of flag bits used in detect_coding_XXXX.  */
 607 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 608 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 609 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 610 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 611 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 612 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 613 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 614 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 615 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 616 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 617 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 618 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 619 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 620 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 621 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 622 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 623 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 624 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 625 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 626 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 627
 628 /* This value is returned if detect_coding_mask () finds nothing other
 629    than ASCII characters.  */
 630 #define CATEGORY_MASK_ANY               \
 631   (CATEGORY_MASK_ISO_7                  \
 632    | CATEGORY_MASK_ISO_7_TIGHT          \
 633    | CATEGORY_MASK_ISO_8_1              \
 634    | CATEGORY_MASK_ISO_8_2              \
 635    | CATEGORY_MASK_ISO_7_ELSE           \
 636    | CATEGORY_MASK_ISO_8_ELSE           \
 637    | CATEGORY_MASK_UTF_8_AUTO           \
 638    | CATEGORY_MASK_UTF_8_NOSIG          \
 639    | CATEGORY_MASK_UTF_8_SIG            \
 640    | CATEGORY_MASK_UTF_16_AUTO          \
 641    | CATEGORY_MASK_UTF_16_BE            \
 642    | CATEGORY_MASK_UTF_16_LE            \
 643    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 644    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 645    | CATEGORY_MASK_CHARSET              \
 646    | CATEGORY_MASK_SJIS                 \
 647    | CATEGORY_MASK_BIG5                 \
 648    | CATEGORY_MASK_CCL                  \
 649    | CATEGORY_MASK_EMACS_MULE)
 650
 651
 652 #define CATEGORY_MASK_ISO_7BIT \
 653   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 654
 655 #define CATEGORY_MASK_ISO_8BIT \
 656   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 657
 658 #define CATEGORY_MASK_ISO_ELSE \
 659   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 660
 661 #define CATEGORY_MASK_ISO_ESCAPE        \
 662   (CATEGORY_MASK_ISO_7                  \
 663    | CATEGORY_MASK_ISO_7_TIGHT          \
 664    | CATEGORY_MASK_ISO_7_ELSE           \
 665    | CATEGORY_MASK_ISO_8_ELSE)
 666
 667 #define CATEGORY_MASK_ISO       \
 668   (  CATEGORY_MASK_ISO_7BIT     \
 669      | CATEGORY_MASK_ISO_8BIT   \
 670      | CATEGORY_MASK_ISO_ELSE)
 671
 672 #define CATEGORY_MASK_UTF_16            \
 673   (CATEGORY_MASK_UTF_16_AUTO            \
 674    | CATEGORY_MASK_UTF_16_BE            \
 675    | CATEGORY_MASK_UTF_16_LE            \
 676    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 677    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 678
 679 #define CATEGORY_MASK_UTF_8     \
 680   (CATEGORY_MASK_UTF_8_AUTO     \
 681    | CATEGORY_MASK_UTF_8_NOSIG  \
 682    | CATEGORY_MASK_UTF_8_SIG)
 683
 684 /* List of symbols `coding-category-xxx' ordered by priority.  This
 685    variable is exposed to Emacs Lisp.  */
 686 static Lisp_Object Vcoding_category_list;
 687
 688 /* Table of coding categories (Lisp symbols).  This variable is for
 689    internal use only.  */
 690 static Lisp_Object Vcoding_category_table;
 691
 692 /* Table of coding-categories ordered by priority.  */
 693 static enum coding_category coding_priorities[coding_category_max];
 694
 695 /* Nth element is a coding context for the coding system bound to the
 696    Nth coding category.  */
 697 static struct coding_system coding_categories[coding_category_max];
 698
 699 /*** Commonly used macros and functions ***/
 700
 701 #ifndef min
 702 #define min(a, b) ((a) < (b) ? (a) : (b))
 703 #endif
 704 #ifndef max
 705 #define max(a, b) ((a) > (b) ? (a) : (b))
 706 #endif
 707 /* Set ATTRS to CODING's attributes and CHARSET_LIST to their charset list.  */
 708 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 709   do {                                                  \
 710     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 711     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 712   } while (0)
 713 /* Safely get one byte from the source text pointed by SRC which ends
 714    at SRC_END, and set C to that byte.  If the source is exhausted, it
 715    jumps to `no_more_source', after recording
 716    CODING_RESULT_INSUFFICIENT_SRC when SRC_BASE < SRC.  If multibytep
 717    is nonzero and a multibyte character is found at SRC, a two-byte
 718    sequence led by 0xC0 or 0xC1 is folded into a single value; any other
 719    character sets C to the negative of its code and records
 720    CODING_RESULT_INVALID_SRC.  Each call increments consumed_chars.
 721    The caller should declare and set these variables in advance:
 722         coding, src, src_base, src_end, multibytep, consumed_chars */
 723 #define ONE_MORE_BYTE(c)                                \
 724   do {                                                  \
 725     if (src == src_end)                                 \
 726       {                                                 \
 727         if (src_base < src)                             \
 728           record_conversion_result                      \
 729             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 730         goto no_more_source;                            \
 731       }                                                 \
 732     c = *src++;                                         \
 733     if (multibytep && (c & 0x80))                       \
 734       {                                                 \
 735         if ((c & 0xFE) == 0xC0)                         \
 736           c = ((c & 1) << 6) | *src++;                  \
 737         else                                            \
 738           {                                             \
 739             src--;                                      \
 740             c = - string_char (src, &src, NULL);        \
 741             record_conversion_result                    \
 742               (coding, CODING_RESULT_INVALID_SRC);      \
 743           }                                             \
 744       }                                                 \
 745     consumed_chars++;                                   \
 746   } while (0)
 747
 748 /* Safely get two bytes from the source text pointed by SRC which ends
 749    at SRC_END, and set C1 and C2 to those bytes while skipping the
 750    heading multibyte characters.  If there are not enough bytes in the
 751    source, it jumps to `no_more_source'.  If multibytep is nonzero, a
 752    two-byte sequence led by 0xC0 or 0xC1 is folded into one value; any
 753    other multibyte character found for C2 sets C2 to -1 with only its
 754    first byte consumed.  The caller should declare and set these
 755    variables appropriately in advance:
 756         src, src_end, multibytep
 757    It is intended that this macro is used in detect_coding_utf_16.  */
 758 #define TWO_MORE_BYTES(c1, c2)                          \
 759   do {                                                  \
 760     do {                                                \
 761       if (src == src_end)                               \
 762         goto no_more_source;                            \
 763       c1 = *src++;                                      \
 764       if (multibytep && (c1 & 0x80))                    \
 765         {                                               \
 766           if ((c1 & 0xFE) == 0xC0)                      \
 767             c1 = ((c1 & 1) << 6) | *src++;              \
 768           else                                          \
 769             {                                           \
 770               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 771               c1 = -1;                                  \
 772             }                                           \
 773         }                                               \
 774     } while (c1 < 0);                                   \
 775     if (src == src_end)                                 \
 776       goto no_more_source;                              \
 777     c2 = *src++;                                        \
 778     if (multibytep && (c2 & 0x80))                      \
 779       {                                                 \
 780         if ((c2 & 0xFE) == 0xC0)                        \
 781           c2 = ((c2 & 1) << 6) | *src++;                \
 782         else                                            \
 783           c2 = -1;                                      \
 784       }                                                 \
 785   } while (0)
 786
 787 /* Get one byte from the source text pointed by SRC into C, advance
        SRC, and increment CONSUMED_CHARS.  No check against the end of
        the source is made.  If multibytep is nonzero and the byte has its
        high bit set: a lead byte 0xC0 or 0xC1 is merged with the byte
        following it into a single 8-bit value; for any other lead byte, C
        becomes the negated result of string_char applied to the sequence
        (string_char is handed &src, presumably to advance it past the
        sequence) and CODING_RESULT_INVALID_SRC is recorded.  The caller
        should declare and set these variables appropriately in advance:
             src, multibytep, consumed_chars, coding  */
 788 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 789   do {                                                  \
 790     c = *src++;                                         \
 791     if (multibytep && (c & 0x80))                       \
 792       {                                                 \
 793         if ((c & 0xFE) == 0xC0)                         \
 794           c = ((c & 1) << 6) | *src++;                  \
 795         else                                            \
 796           {                                             \
 797             src--;                                      \
 798             c = - string_char (src, &src, NULL);        \
 799             record_conversion_result                    \
 800               (coding, CODING_RESULT_INVALID_SRC);      \
 801           }                                             \
 802       }                                                 \
 803     consumed_chars++;                                   \
 804   } while (0)
 805
 806
 807 /* Store a byte C in the place pointed by DST and increment DST to the
 808    next free point, and increment PRODUCED_CHARS.  The caller should
 809    assure that C is 0..127, and declare and set the variables `dst'
 810    and `produced_chars' appropriately in advance.  The byte is stored
 811    as is; `multibytep' is not consulted.  */
 812
 813
 814 #define EMIT_ONE_ASCII_BYTE(c)  \
 815   do {                          \
 816     produced_chars++;           \
 817     *dst++ = (c);               \
 818   } while (0)
 819
 820
 821 /* Like EMIT_ONE_ASCII_BYTE but store two bytes, C1 then C2, and add
        two to PRODUCED_CHARS.  */
 822
 823 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 824   do {                                  \
 825     produced_chars += 2;                \
 826     *dst++ = (c1), *dst++ = (c2);       \
 827   } while (0)
 828
 829
 830 /* Store a byte C in the place pointed by DST and increment DST to the
 831    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 832    nonzero, store in an appropriate multibyte form: a value of 0x80
 833    or above is first mapped through BYTE8_TO_CHAR, and the result is
 834    written with CHAR_STRING_ADVANCE.  The caller should declare and
        set the variables `dst', `multibytep' and `produced_chars'
        appropriately in advance.  */
 835
 836 #define EMIT_ONE_BYTE(c)                \
 837   do {                                  \
 838     produced_chars++;                   \
 839     if (multibytep)                     \
 840       {                                 \
 841         int ch = (c);                   \
 842         if (ch >= 0x80)                 \
 843           ch = BYTE8_TO_CHAR (ch);      \
 844         CHAR_STRING_ADVANCE (ch, dst);  \
 845       }                                 \
 846     else                                \
 847       *dst++ = (c);                     \
 848   } while (0)
 849
 850
 851 /* Like EMIT_ONE_BYTE, but emit two bytes, C1 then C2, and add two to
        PRODUCED_CHARS.  With MULTIBYTEP nonzero each byte is converted
        separately (BYTE8_TO_CHAR for values of 0x80 or above, then
        CHAR_STRING_ADVANCE).  */
 852
 853 #define EMIT_TWO_BYTES(c1, c2)          \
 854   do {                                  \
 855     produced_chars += 2;                \
 856     if (multibytep)                     \
 857       {                                 \
 858         int ch;                         \
 859                                         \
 860         ch = (c1);                      \
 861         if (ch >= 0x80)                 \
 862           ch = BYTE8_TO_CHAR (ch);      \
 863         CHAR_STRING_ADVANCE (ch, dst);  \
 864         ch = (c2);                      \
 865         if (ch >= 0x80)                 \
 866           ch = BYTE8_TO_CHAR (ch);      \
 867         CHAR_STRING_ADVANCE (ch, dst);  \
 868       }                                 \
 869     else                                \
 870       {                                 \
 871         *dst++ = (c1);                  \
 872         *dst++ = (c2);                  \
 873       }                                 \
 874   } while (0)
 875
 876 /* Emit the three bytes C1, C2 and C3 in that order, by way of
        EMIT_ONE_BYTE followed by EMIT_TWO_BYTES.  */
 877 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 878   do {                                  \
 879     EMIT_ONE_BYTE (c1);                 \
 880     EMIT_TWO_BYTES (c2, c3);            \
 881   } while (0)
 882
 883 /* Emit the four bytes C1, C2, C3 and C4 in that order, by way of two
        uses of EMIT_TWO_BYTES.  */
 884 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 885   do {                                          \
 886     EMIT_TWO_BYTES (c1, c2);                    \
 887     EMIT_TWO_BYTES (c3, c4);                    \
 888   } while (0)
 889
 890
 891 /* Prototypes for static functions.  */
 892 static void record_conversion_result (struct coding_system *coding,
 893                                       enum coding_result_code result);
 894 static int detect_coding_utf_8 (struct coding_system *,
 895                                 struct coding_detection_info *info);
 896 static void decode_coding_utf_8 (struct coding_system *);
 897 static int encode_coding_utf_8 (struct coding_system *);
 898
 899 static int detect_coding_utf_16 (struct coding_system *,
 900                                  struct coding_detection_info *info);
 901 static void decode_coding_utf_16 (struct coding_system *);
 902 static int encode_coding_utf_16 (struct coding_system *);
 903
 904 static int detect_coding_iso_2022 (struct coding_system *,
 905                                    struct coding_detection_info *info);
 906 static void decode_coding_iso_2022 (struct coding_system *);
 907 static int encode_coding_iso_2022 (struct coding_system *);
 908
 909 static int detect_coding_emacs_mule (struct coding_system *,
 910                                      struct coding_detection_info *info);
 911 static void decode_coding_emacs_mule (struct coding_system *);
 912 static int encode_coding_emacs_mule (struct coding_system *);
 913
 914 static int detect_coding_sjis (struct coding_system *,
 915                                struct coding_detection_info *info);
 916 static void decode_coding_sjis (struct coding_system *);
 917 static int encode_coding_sjis (struct coding_system *);
 918
 919 static int detect_coding_big5 (struct coding_system *,
 920                                struct coding_detection_info *info);
 921 static void decode_coding_big5 (struct coding_system *);
 922 static int encode_coding_big5 (struct coding_system *);
 923
 924 static int detect_coding_ccl (struct coding_system *,
 925                               struct coding_detection_info *info);
 926 static void decode_coding_ccl (struct coding_system *);
 927 static int encode_coding_ccl (struct coding_system *);
 928
 929 static void decode_coding_raw_text (struct coding_system *);
 930 static int encode_coding_raw_text (struct coding_system *);
 931
 932 static void coding_set_source (struct coding_system *);
 933 static void coding_set_destination (struct coding_system *);
 934 static void coding_alloc_by_realloc (struct coding_system *, EMACS_INT);
 935 static void coding_alloc_by_making_gap (struct coding_system *,
 936                                         EMACS_INT, EMACS_INT);
 937 static unsigned char *alloc_destination (struct coding_system *,
 938                                          EMACS_INT, unsigned char *);
 939 static void setup_iso_safe_charsets (Lisp_Object);
 940 static unsigned char *encode_designation_at_bol (struct coding_system *,
 941                                                  int *, int *,
 942                                                  unsigned char *);
 943 static int detect_eol (const unsigned char *,
 944                        EMACS_INT, enum coding_category);
 945 static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
 946 static void decode_eol (struct coding_system *);
 947 static Lisp_Object get_translation_table (Lisp_Object, int, int *);
 948 static Lisp_Object get_translation (Lisp_Object, int *, int *);
 949 static int produce_chars (struct coding_system *, Lisp_Object, int);
 950 static INLINE void produce_charset (struct coding_system *, int *,
 951                                     EMACS_INT);
 952 static void produce_annotation (struct coding_system *, EMACS_INT);
 953 static int decode_coding (struct coding_system *);
 954 static INLINE int *handle_composition_annotation (EMACS_INT, EMACS_INT,
 955                                                   struct coding_system *,
 956                                                   int *, EMACS_INT *);
 957 static INLINE int *handle_charset_annotation (EMACS_INT, EMACS_INT,
 958                                               struct coding_system *,
 959                                               int *, EMACS_INT *);
 960 static void consume_chars (struct coding_system *, Lisp_Object, int);
 961 static int encode_coding (struct coding_system *);
 962 static Lisp_Object make_conversion_work_buffer (int);
 963 static Lisp_Object code_conversion_restore (Lisp_Object);
 964 static INLINE int char_encodable_p (int, Lisp_Object);
 965 static Lisp_Object make_subsidiaries (Lisp_Object);
 966 /* Record RESULT as the outcome of a conversion done with CODING.
        RESULT is always stored in coding->result.  For the error codes
        listed in the switch, Vlast_code_conversion_error is also set to
        the matching symbol.  CODING_RESULT_INSUFFICIENT_DST and
        CODING_RESULT_SUCCESS leave that variable untouched, and any other
        code sets it to the symbol interned from "Unknown error".  */
 967 static void
 968 record_conversion_result (struct coding_system *coding,
 969                           enum coding_result_code result)
 970 {
 971   coding->result = result;
 972   switch (result)
 973     {
 974     case CODING_RESULT_INSUFFICIENT_SRC:
 975       Vlast_code_conversion_error = Qinsufficient_source;
 976       break;
 977     case CODING_RESULT_INCONSISTENT_EOL:
 978       Vlast_code_conversion_error = Qinconsistent_eol;
 979       break;
 980     case CODING_RESULT_INVALID_SRC:
 981       Vlast_code_conversion_error = Qinvalid_source;
 982       break;
 983     case CODING_RESULT_INTERRUPT:
 984       Vlast_code_conversion_error = Qinterrupted;
 985       break;
 986     case CODING_RESULT_INSUFFICIENT_MEM:
 987       Vlast_code_conversion_error = Qinsufficient_memory;
 988       break;
 989     case CODING_RESULT_INSUFFICIENT_DST:
 990       /* Don't record this error in Vlast_code_conversion_error
 991          because it happens just temporarily and is resolved when the
 992          whole conversion is finished.  */
 993       break;
 994     case CODING_RESULT_SUCCESS:
 995       break;
 996     default:
 997       Vlast_code_conversion_error = intern ("Unknown error");
 998     }
 999 }
1000
1001 /* This wrapper macro is used to preserve validity of pointers into
1002    buffer text across calls to decode_char, which could cause
1003    relocation of buffers if it loads a charset map, because loading a
1004    charset map allocates large structures.

        C is set to DECODE_CHAR (CHARSET, CODE).  charset_map_loaded is
        cleared beforehand; if it is nonzero afterwards, coding_set_source
        recomputes CODING->source, and SRC, SRC_BASE and SRC_END are each
        shifted by the distance CODING->source moved.  */
1005 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
1006   do {                                                                       \
1007     charset_map_loaded = 0;                                                  \
1008     c = DECODE_CHAR (charset, code);                                         \
1009     if (charset_map_loaded)                                                  \
1010       {                                                                      \
1011         const unsigned char *orig = coding->source;                          \
1012         EMACS_INT offset;                                                    \
1013                                                                              \
1014         coding_set_source (coding);                                          \
1015         offset = coding->source - orig;                                      \
1016         src += offset;                                                       \
1017         src_base += offset;                                                  \
1018         src_end += offset;                                                   \
1019       }                                                                      \
1020   } while (0)
1021
1022
/* Unless there are more than BYTES bytes of room left at dst, allocate
   more memory for coding->destination and update dst and dst_end.
   The amount requested from alloc_destination is BYTES plus one byte
   for each element still pending between charbuf and charbuf_end.  We
   don't have to take care of coding->source which will be relocated.
   It is handled by calling coding_set_source in encode_coding.

   The caller must have `coding', `dst', `dst_end', `charbuf' and
   `charbuf_end' in scope.  MORE_BYTES is an EMACS_INT, the type of
   the NBYTES parameter of alloc_destination, so that a large pending
   count is not narrowed to int on the way.  */

#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst + (bytes) >= dst_end)                               \
      {                                                         \
        EMACS_INT more_bytes = charbuf_end - charbuf + (bytes); \
                                                                \
        dst = alloc_destination (coding, more_bytes, dst);      \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
1038
1039
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   The form is 1 to 5 bytes long depending on which MAX_n_BYTE_CHAR
   limit C falls under; anything above MAX_5_BYTE_CHAR is written with
   BYTE8_STRING after subtracting 0x3FFF80.  C and P are evaluated
   several times, so they must not have side effects.  Every use of C
   is parenthesized so that an expression argument such as `a | b'
   is shifted and masked as a whole.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);          \
  } while (0)
1069
1070
1071 /* Return the character code of character whose multibyte form is at
1072    P, and advance P to the end of the multibyte form.  This is like
1073    STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

        The length of the form (1 to 5 bytes) is chosen from the bits
        0x80, 0x20, 0x10 and 0x08 of the lead byte.  A 2-byte form whose
        lead byte is below 0xC2 gets 0x3FFF80 or'ed into the result.  For
        a 5-byte form the lead byte contributes no bits; only the four
        bytes after it are combined.  No validation of the trailing bytes
        is done.  P is evaluated several times, so it must not have side
        effects.  */
1074
1075 #define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
1076   (!((p)[0] & 0x80)                                             \
1077    ? *(p)++                                                     \
1078    : ! ((p)[0] & 0x20)                                          \
1079    ? ((p) += 2,                                                 \
1080       ((((p)[-2] & 0x1F) << 6)                                  \
1081        | ((p)[-1] & 0x3F)                                       \
1082        | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
1083    : ! ((p)[0] & 0x10)                                          \
1084    ? ((p) += 3,                                                 \
1085       ((((p)[-3] & 0x0F) << 12)                                 \
1086        | (((p)[-2] & 0x3F) << 6)                                \
1087        | ((p)[-1] & 0x3F)))                                     \
1088    : ! ((p)[0] & 0x08)                                          \
1089    ? ((p) += 4,                                                 \
1090       ((((p)[-4] & 0xF) << 18)                                  \
1091        | (((p)[-3] & 0x3F) << 12)                               \
1092        | (((p)[-2] & 0x3F) << 6)                                \
1093        | ((p)[-1] & 0x3F)))                                     \
1094    : ((p) += 5,                                                 \
1095       ((((p)[-4] & 0x3F) << 18)                                 \
1096        | (((p)[-3] & 0x3F) << 12)                               \
1097        | (((p)[-2] & 0x3F) << 6)                                \
1098        | ((p)[-1] & 0x3F))))
1099
1100 /* Recompute CODING->source from CODING->src_object.  For a buffer,
        the address is taken relative to the end of the gap when
        CODING->src_pos is negative, and otherwise is the address of byte
        position CODING->src_pos_byte.  For a string it is the string data
        plus CODING->src_pos_byte.  Any other source is left alone.  */
1101 static void
1102 coding_set_source (struct coding_system *coding)
1103 {
1104   if (BUFFERP (coding->src_object))
1105     {
1106       struct buffer *buf = XBUFFER (coding->src_object);
1107
1108       if (coding->src_pos < 0)
1109         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1110       else
1111         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1112     }
1113   else if (STRINGP (coding->src_object))
1114     {
1115       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1116     }
1117   else
1118     /* Otherwise, the source is C string and is never relocated
1119        automatically.  Thus we don't have to update anything.  */
1120     ;
1121 }
1122 /* Recompute CODING->destination and CODING->dst_bytes when the
        destination object is a buffer; do nothing otherwise.

        When CODING->src_pos is negative, the destination starts at
        CODING->dst_pos_byte and the room ends before the unconsumed
        source bytes (CODING->src_bytes - CODING->consumed) that sit just
        below GAP_END_ADDR.  Otherwise the room runs from
        CODING->dst_pos_byte to the end of the destination buffer's gap.

        NOTE(review): the first branch tests src_pos (not a dst field) and
        uses BEG_ADDR / GAP_END_ADDR, which take no buffer argument --
        presumably the destination is then the current buffer; confirm
        against the callers.  */
1123 static void
1124 coding_set_destination (struct coding_system *coding)
1125 {
1126   if (BUFFERP (coding->dst_object))
1127     {
1128       if (coding->src_pos < 0)
1129         {
1130           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1131           coding->dst_bytes = (GAP_END_ADDR
1132                                - (coding->src_bytes - coding->consumed)
1133                                - coding->destination);
1134         }
1135       else
1136         {
1137           /* We are sure that coding->dst_pos_byte is before the gap
1138              of the buffer. */
1139           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1140                                  + coding->dst_pos_byte - BEG_BYTE);
1141           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1142                                - coding->destination);
1143         }
1144     }
1145   else
1146     /* Otherwise, the destination is C string and is never relocated
1147        automatically.  Thus we don't have to update anything.  */
1148     ;
1149 }
1150
1151 /* Grow the memory block at CODING->destination by BYTES using
        xrealloc, and add BYTES to CODING->dst_bytes.  CODING->destination
        is replaced by the pointer xrealloc returns.  */
1152 static void
1153 coding_alloc_by_realloc (struct coding_system *coding, EMACS_INT bytes)
1154 {
1155   coding->destination = (unsigned char *) xrealloc (coding->destination,
1156                                                     coding->dst_bytes + bytes);
1157   coding->dst_bytes += bytes;
1158 }
1159 /* Enlarge the gap of the destination buffer by BYTES via make_gap.

        If the source and the destination are the same object,
        GAP_HEAD_USED is the number of bytes already produced at the head
        of the gap.  The gap is then temporarily accounted as empty, with
        GPT moved past the produced bytes and the buffer end positions
        grown by the old gap size, so that make_gap keeps both ends of the
        old gap intact; the bookkeeping is restored afterwards.

        Otherwise the destination buffer is made current around the call
        to make_gap and the previous current buffer is restored.  */
1160 static void
1161 coding_alloc_by_making_gap (struct coding_system *coding,
1162                             EMACS_INT gap_head_used, EMACS_INT bytes)
1163 {
1164   if (EQ (coding->src_object, coding->dst_object))
1165     {
1166       /* The gap may contain the produced data at the head and not-yet
1167          consumed data at the tail.  To preserve those data, we at
1168          first make the gap size to zero, then increase the gap
1169          size.  */
1170       EMACS_INT add = GAP_SIZE;
1171
1172       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1173       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1174       make_gap (bytes);
1175       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1176       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1177     }
1178   else
1179     {
1180       Lisp_Object this_buffer;
1181
1182       this_buffer = Fcurrent_buffer ();
1183       set_buffer_internal (XBUFFER (coding->dst_object));
1184       make_gap (bytes);
1185       set_buffer_internal (XBUFFER (this_buffer));
1186     }
1187 }
1188
1189 /* Make room for NBYTES more bytes in the destination of CODING and
        return the address that corresponds to DST afterwards.

        The offset of DST from CODING->destination is saved first, because
        growing may move the destination.  A buffer destination is grown
        with coding_alloc_by_making_gap (passing the distance of DST from
        the buffer's gap start as the used head of the gap), anything else
        with coding_alloc_by_realloc.  coding_set_destination is then
        called and the saved offset is re-applied.  */
1190 static unsigned char *
1191 alloc_destination (struct coding_system *coding, EMACS_INT nbytes,
1192                    unsigned char *dst)
1193 {
1194   EMACS_INT offset = dst - coding->destination;
1195
1196   if (BUFFERP (coding->dst_object))
1197     {
1198       struct buffer *buf = XBUFFER (coding->dst_object);
1199
1200       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1201     }
1202   else
1203     coding_alloc_by_realloc (coding, nbytes);
1204   coding_set_destination (coding);
1205   dst = coding->destination + offset;
1206   return dst;
1207 }
1208
1209 /** Macros for annotations.  */
1210
1211 /* An annotation data is stored in the array coding->charbuf in this
1212    format:
1213      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1214    LENGTH is the number of elements in the annotation.
1215    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1216    NCHARS is the number of characters in the text annotated.
1217
1218    The format of the following elements depend on ANNOTATION_MASK.
1219
1220    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1221    follow:
1222      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1223
1224    NBYTES is the number of bytes specified in the header part of
1225    old-style emacs-mule encoding, or 0 for the other kind of
1226    composition.
1227
1228    METHOD is one of enum composition_method.
1229
1230    Optional COMPOSITION-COMPONENTS are characters and composition
1231    rules.
1232
1233    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1234    follows.
1235
1236    If ANNOTATION_MASK is 0, this annotation is just a placeholder used
1237    to recover from an invalid annotation, and should be skipped by
1238    produce_annotation.  */
1239
1240 /* Maximum length of the header of annotation data.  The longest
        header written by the macros below is the five elements stored by
        ADD_COMPOSITION_DATA.  */
1241 #define MAX_ANNOTATION_LENGTH 5
1242
/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and set coding->annotated to 1.  LEN is
   stored negated.  The caller must have `coding' in scope.

   The expansion deliberately has no trailing semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the body of an `if' that is followed by `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1250 /* Store a composition annotation header at BUF and advance BUF past
        it: the common header with length 5, mask
        CODING_ANNOTATE_COMPOSITION_MASK and NCHARS, followed by NBYTES and
        METHOD.  BUF, NBYTES and METHOD are used unparenthesized, so they
        should be plain identifiers or simple expressions.  */
1251 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1252   do {                                                                      \
1253     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1254     *buf++ = nbytes;                                                        \
1255     *buf++ = method;                                                        \
1256   } while (0)
1257
1258 /* Store a charset annotation at BUF and advance BUF past it: the
        common header with length 4, mask CODING_ANNOTATE_CHARSET_MASK and
        NCHARS, followed by the charset ID.  BUF and ID are used
        unparenthesized, so they should be plain identifiers or simple
        expressions.  */
1259 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1260   do {                                                                  \
1261     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1262     *buf++ = id;                                                        \
1263   } while (0)
1264
1265 \f
1266 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1267
1268
1269
1270 \f
1271 /*** 3. UTF-8 ***/
1272
1273 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1274    Check if a text is encoded in UTF-8.  If it is, return 1, else
1275    return 0.  */
1276 /* Predicates classifying a byte C by its high bits: a one-octet
        character, a trailing (extra) octet, or the leading octet of a 2-,
        3-, 4- or 5-octet sequence.  */
1277 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1278 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1279 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1280 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1281 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1282 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1283 /* The byte order mark U+FEFF, and the three octets of its UTF-8 form.  */
1284 #define UTF_BOM 0xFEFF
1285 #define UTF_8_BOM_1 0xEF
1286 #define UTF_8_BOM_2 0xBB
1287 #define UTF_8_BOM_3 0xBF
1288 /* Detector for the UTF-8 category.  Scanning starts at
        CODING->source + CODING->head_ascii and only checks the shape of
        each sequence (a leading octet followed by the right number of
        trailing octets, up to five octets in all).

        Return 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
        as soon as a malformed sequence is seen, or when the source ends in
        the middle of a sequence and CODING_MODE_LAST_BLOCK is set.
        Otherwise return 1 once the source is exhausted; DETECT_INFO->found
        and ->rejected then tell whether a BOM started the text and whether
        any multi-octet sequence was seen at all.  */
1289 static int
1290 detect_coding_utf_8 (struct coding_system *coding,
1291                      struct coding_detection_info *detect_info)
1292 {
1293   const unsigned char *src = coding->source, *src_base;
1294   const unsigned char *src_end = coding->source + coding->src_bytes;
1295   int multibytep = coding->src_multibyte;
1296   int consumed_chars = 0;
1297   int bom_found = 0;
1298   int found = 0;
1299
1300   detect_info->checked |= CATEGORY_MASK_UTF_8;
1301   /* A coding system of this category is always ASCII compatible.  */
1302   src += coding->head_ascii;
1303
1304   while (1)
1305     {
1306       int c, c1, c2, c3, c4;
1307
       /* SRC_BASE marks the start of the sequence being examined; it is
          compared with SRC at `no_more_source' to tell a clean end from
          a truncated sequence.  */
1308       src_base = src;
1309       ONE_MORE_BYTE (c);
1310       if (c < 0 || UTF_8_1_OCTET_P (c))
1311         continue;
1312       ONE_MORE_BYTE (c1);
1313       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1314         break;
1315       if (UTF_8_2_OCTET_LEADING_P (c))
1316         {
1317           found = 1;
1318           continue;
1319         }
1320       ONE_MORE_BYTE (c2);
1321       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1322         break;
1323       if (UTF_8_3_OCTET_LEADING_P (c))
1324         {
1325           found = 1;
           /* Only a BOM at the very first byte of the source counts.  */
1326           if (src_base == coding->source
1327               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1328             bom_found = 1;
1329           continue;
1330         }
1331       ONE_MORE_BYTE (c3);
1332       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1333         break;
1334       if (UTF_8_4_OCTET_LEADING_P (c))
1335         {
1336           found = 1;
1337           continue;
1338         }
1339       ONE_MORE_BYTE (c4);
1340       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1341         break;
1342       if (UTF_8_5_OCTET_LEADING_P (c))
1343         {
1344           found = 1;
1345           continue;
1346         }
1347       break;
1348     }
     /* Reached only through `break': a malformed sequence was found.  */
1349   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1350   return 0;
1351
1352  no_more_source:
     /* SRC_BASE < SRC means the source ran out inside a sequence.  */
1353   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1354     {
1355       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1356       return 0;
1357     }
1358   if (bom_found)
1359     {
1360       /* The first character 0xFEFF doesn't necessarily mean a BOM.  */
1361       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1362     }
1363   else
1364     {
1365       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1366       if (found)
1367         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1368     }
1369   return 1;
1370 }
1371
1372 /* Decode the UTF-8 bytes of CODING's source, starting at offset
        CODING->consumed, into character codes appended to CODING->charbuf
        from index CODING->charbuf_used.

        Unless the BOM setting of CODING is utf_without_bom, a leading
        EF BB BF is skipped; in every case the setting is utf_without_bom
        when the main loop starts.  The loop ends when charbuf is full or
        when ONE_MORE_BYTE runs out of source.  A sequence that is
        malformed, overlong, a surrogate, or above MAX_CHAR is not
        decoded: its first byte alone is emitted (through BYTE8_TO_CHAR
        unless it is ASCII), CODING->errors is incremented, and decoding
        resumes at the next byte.

        On exit CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated; the consumed counts stop at the
        start of the last unit that was not completely decoded.  */
1373 static void
1374 decode_coding_utf_8 (struct coding_system *coding)
1375 {
1376   const unsigned char *src = coding->source + coding->consumed;
1377   const unsigned char *src_end = coding->source + coding->src_bytes;
1378   const unsigned char *src_base;
1379   int *charbuf = coding->charbuf + coding->charbuf_used;
1380   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1381   int consumed_chars = 0, consumed_chars_base = 0;
1382   int multibytep = coding->src_multibyte;
1383   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1384   Lisp_Object attr, charset_list;
1385   int eol_crlf =
1386     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
     /* Byte read ahead after a CR when EOL_CRLF is set, or -1 if none.  */
1387   int byte_after_cr = -1;
1388
1389   CODING_GET_INFO (coding, attr, charset_list);
1390
     /* Look for a BOM; on any mismatch rewind SRC to where it started.
        NOTE(review): consumed_chars is not rewound together with SRC on
        those paths -- confirm whether that is intended.  */
1391   if (bom != utf_without_bom)
1392     {
1393       int c1, c2, c3;
1394
1395       src_base = src;
1396       ONE_MORE_BYTE (c1);
1397       if (! UTF_8_3_OCTET_LEADING_P (c1))
1398         src = src_base;
1399       else
1400         {
1401           ONE_MORE_BYTE (c2);
1402           if (! UTF_8_EXTRA_OCTET_P (c2))
1403             src = src_base;
1404           else
1405             {
1406               ONE_MORE_BYTE (c3);
1407               if (! UTF_8_EXTRA_OCTET_P (c3))
1408                 src = src_base;
1409               else
1410                 {
1411                   if ((c1 != UTF_8_BOM_1)
1412                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1413                     src = src_base;
1414                   else
1415                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1416                 }
1417             }
1418         }
1419     }
1420   CODING_UTF_8_BOM (coding) = utf_without_bom;
1421
1422   while (1)
1423     {
1424       int c, c1, c2, c3, c4, c5;
1425
       /* Remember where this unit starts so that `invalid_code' and
          `no_more_source' can fall back to it.  */
1426       src_base = src;
1427       consumed_chars_base = consumed_chars;
1428
1429       if (charbuf >= charbuf_end)
1430         {
           /* The byte read ahead after a CR has not been decoded yet;
              leave it in the source.  */
1431           if (byte_after_cr >= 0)
1432             src_base--;
1433           break;
1434         }
1435
1436       if (byte_after_cr >= 0)
1437         c1 = byte_after_cr, byte_after_cr = -1;
1438       else
1439         ONE_MORE_BYTE (c1);
1440       if (c1 < 0)
1441         {
           /* ONE_MORE_BYTE gives a negative value for a multibyte
              character found in the source; store its code as is.  */
1442           c = - c1;
1443         }
1444       else if (UTF_8_1_OCTET_P (c1))
1445         {
           /* With CRLF end-of-line, a CR is emitted only once the byte
              after it could be read; if the source ends here the CR is
              left unconsumed.  */
1446           if (eol_crlf && c1 == '\r')
1447             ONE_MORE_BYTE (byte_after_cr);
1448           c = c1;
1449         }
1450       else
1451         {
1452           ONE_MORE_BYTE (c2);
1453           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1454             goto invalid_code;
1455           if (UTF_8_2_OCTET_LEADING_P (c1))
1456             {
1457               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1458               /* Reject overlong sequences here and below.  Encoders
1459                  producing them are incorrect, they can be misleading,
1460                  and they mess up read/write invariance.  */
1461               if (c < 128)
1462                 goto invalid_code;
1463             }
1464           else
1465             {
1466               ONE_MORE_BYTE (c3);
1467               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1468                 goto invalid_code;
1469               if (UTF_8_3_OCTET_LEADING_P (c1))
1470                 {
1471                   c = (((c1 & 0xF) << 12)
1472                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1473                   if (c < 0x800
1474                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1475                     goto invalid_code;
1476                 }
1477               else
1478                 {
1479                   ONE_MORE_BYTE (c4);
1480                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1481                     goto invalid_code;
1482                   if (UTF_8_4_OCTET_LEADING_P (c1))
1483                     {
1484                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1485                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1486                     if (c < 0x10000)
1487                       goto invalid_code;
1488                     }
1489                   else
1490                     {
1491                       ONE_MORE_BYTE (c5);
1492                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1493                         goto invalid_code;
1494                       if (UTF_8_5_OCTET_LEADING_P (c1))
1495                         {
1496                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1497                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1498                                | (c5 & 0x3F));
1499                           if ((c > MAX_CHAR) || (c < 0x200000))
1500                             goto invalid_code;
1501                         }
1502                       else
1503                         goto invalid_code;
1504                     }
1505                 }
1506             }
1507         }
1508
1509       *charbuf++ = c;
1510       continue;
1511
1512     invalid_code:
       /* Go back to the start of the bad sequence and emit just its
          first byte; the bytes after it are decoded afresh.  */
1513       src = src_base;
1514       consumed_chars = consumed_chars_base;
1515       ONE_MORE_BYTE (c);
1516       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1517       coding->errors++;
1518     }
1519
1520  no_more_source:
     /* Report progress up to the start of the last incomplete unit.  */
1521   coding->consumed_char += consumed_chars_base;
1522   coding->consumed = src_base - coding->source;
1523   coding->charbuf_used = charbuf - coding->charbuf;
1524 }
1525
1526
1527 static int
1528 encode_coding_utf_8 (struct coding_system *coding)
1529 {
1530   int multibytep = coding->dst_multibyte;
1531   int *charbuf = coding->charbuf;
1532   int *charbuf_end = charbuf + coding->charbuf_used;
1533   unsigned char *dst = coding->destination + coding->produced;
1534   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1535   int produced_chars = 0;
1536   int c;
1537
1538   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1539     {
1540       ASSURE_DESTINATION (3);
1541       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1542       CODING_UTF_8_BOM (coding) = utf_without_bom;
1543     }
1544
1545   if (multibytep)
1546     {
1547       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1548
1549       while (charbuf < charbuf_end)
1550         {
1551           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1552
1553           ASSURE_DESTINATION (safe_room);
1554           c = *charbuf++;
1555           if (CHAR_BYTE8_P (c))
1556             {
1557               c = CHAR_TO_BYTE8 (c);
1558               EMIT_ONE_BYTE (c);
1559             }
1560           else
1561             {
1562               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1563               for (p = str; p < pend; p++)
1564                 EMIT_ONE_BYTE (*p);
1565             }
1566         }
1567     }
1568   else
1569     {
1570       int safe_room = MAX_MULTIBYTE_LENGTH;
1571
1572       while (charbuf < charbuf_end)
1573         {
1574           ASSURE_DESTINATION (safe_room);
1575           c = *charbuf++;
1576           if (CHAR_BYTE8_P (c))
1577             *dst++ = CHAR_TO_BYTE8 (c);
1578           else
1579             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1580           produced_chars++;
1581         }
1582     }
1583   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1584   coding->produced_char += produced_chars;
1585   coding->produced = dst - coding->destination;
1586   return 0;
1587 }
1588
1589
1590 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1591    Check if a text is encoded in one of UTF-16 based coding systems.
1592    If it is, return 1, else return 0.  */
1593
/* The bits of VAL selected by the mask 0xFC00.  Both surrogate tests
   below compare only these bits, so any bits of VAL above the low 16
   are ignored.  */
#define UTF_16_SURROGATE_KIND(val) ((val) & 0xFC00)

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (UTF_16_SURROGATE_KIND (val) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (UTF_16_SURROGATE_KIND (val) == 0xDC00)

/* Nonzero if VAL is a low surrogate or one of 0xFFFE and 0xFFFF.
   VAL is evaluated more than once.  */
#define UTF_16_INVALID_P(val)           \
  (UTF_16_LOW_SURROGATE_P (val)         \
   || ((val) == 0xFFFE)                 \
   || ((val) == 0xFFFF))
1604
1605
1606 static int
1607 detect_coding_utf_16 (struct coding_system *coding,
1608                       struct coding_detection_info *detect_info)
1609 {
1610   const unsigned char *src = coding->source;
1611   const unsigned char *src_end = coding->source + coding->src_bytes;
1612   int multibytep = coding->src_multibyte;
1613   int c1, c2;
1614
1615   detect_info->checked |= CATEGORY_MASK_UTF_16;
1616   if (coding->mode & CODING_MODE_LAST_BLOCK
1617       && (coding->src_chars & 1))
1618     {
1619       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1620       return 0;
1621     }
1622
1623   TWO_MORE_BYTES (c1, c2);
1624   if ((c1 == 0xFF) && (c2 == 0xFE))
1625     {
1626       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1627                              | CATEGORY_MASK_UTF_16_AUTO);
1628       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1629                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1630                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1631     }
1632   else if ((c1 == 0xFE) && (c2 == 0xFF))
1633     {
1634       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1635                              | CATEGORY_MASK_UTF_16_AUTO);
1636       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1637                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1638                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1639     }
1640   else if (c2 < 0)
1641     {
1642       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1643       return 0;
1644     }
1645   else
1646     {
1647       /* We check the dispersion of Eth and Oth bytes where E is even and
1648          O is odd.  If both are high, we assume binary data.*/
1649       unsigned char e[256], o[256];
1650       unsigned e_num = 1, o_num = 1;
1651
1652       memset (e, 0, 256);
1653       memset (o, 0, 256);
1654       e[c1] = 1;
1655       o[c2] = 1;
1656
1657       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1658                                 |CATEGORY_MASK_UTF_16_BE
1659                                 | CATEGORY_MASK_UTF_16_LE);
1660
1661       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1662              != CATEGORY_MASK_UTF_16)
1663         {
1664           TWO_MORE_BYTES (c1, c2);
1665           if (c2 < 0)
1666             break;
1667           if (! e[c1])
1668             {
1669               e[c1] = 1;
1670               e_num++;
1671               if (e_num >= 128)
1672                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1673             }
1674           if (! o[c2])
1675             {
1676               o[c2] = 1;
1677               o_num++;
1678               if (o_num >= 128)
1679                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1680             }
1681         }
1682       return 0;
1683     }
1684
1685  no_more_source:
1686   return 1;
1687 }
1688
1689 static void
1690 decode_coding_utf_16 (struct coding_system *coding)
1691 {
1692   const unsigned char *src = coding->source + coding->consumed;
1693   const unsigned char *src_end = coding->source + coding->src_bytes;
1694   const unsigned char *src_base;
1695   int *charbuf = coding->charbuf + coding->charbuf_used;
1696   /* We may produces at most 3 chars in one loop.  */
1697   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1698   int consumed_chars = 0, consumed_chars_base = 0;
1699   int multibytep = coding->src_multibyte;
1700   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1701   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1702   int surrogate = CODING_UTF_16_SURROGATE (coding);
1703   Lisp_Object attr, charset_list;
1704   int eol_crlf =
1705     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1706   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1707
1708   CODING_GET_INFO (coding, attr, charset_list);
1709
1710   if (bom == utf_with_bom)
1711     {
1712       int c, c1, c2;
1713
1714       src_base = src;
1715       ONE_MORE_BYTE (c1);
1716       ONE_MORE_BYTE (c2);
1717       c = (c1 << 8) | c2;
1718
1719       if (endian == utf_16_big_endian
1720           ? c != 0xFEFF : c != 0xFFFE)
1721         {
1722           /* The first two bytes are not BOM.  Treat them as bytes
1723              for a normal character.  */
1724           src = src_base;
1725           coding->errors++;
1726         }
1727       CODING_UTF_16_BOM (coding) = utf_without_bom;
1728     }
1729   else if (bom == utf_detect_bom)
1730     {
1731       /* We have already tried to detect BOM and failed in
1732          detect_coding.  */
1733       CODING_UTF_16_BOM (coding) = utf_without_bom;
1734     }
1735
1736   while (1)
1737     {
1738       int c, c1, c2;
1739
1740       src_base = src;
1741       consumed_chars_base = consumed_chars;
1742
1743       if (charbuf >= charbuf_end)
1744         {
1745           if (byte_after_cr1 >= 0)
1746             src_base -= 2;
1747           break;
1748         }
1749
1750       if (byte_after_cr1 >= 0)
1751         c1 = byte_after_cr1, byte_after_cr1 = -1;
1752       else
1753         ONE_MORE_BYTE (c1);
1754       if (c1 < 0)
1755         {
1756           *charbuf++ = -c1;
1757           continue;
1758         }
1759       if (byte_after_cr2 >= 0)
1760         c2 = byte_after_cr2, byte_after_cr2 = -1;
1761       else
1762         ONE_MORE_BYTE (c2);
1763       if (c2 < 0)
1764         {
1765           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1766           *charbuf++ = -c2;
1767           continue;
1768         }
1769       c = (endian == utf_16_big_endian
1770            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1771
1772       if (surrogate)
1773         {
1774           if (! UTF_16_LOW_SURROGATE_P (c))
1775             {
1776               if (endian == utf_16_big_endian)
1777                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1778               else
1779                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1780               *charbuf++ = c1;
1781               *charbuf++ = c2;
1782               coding->errors++;
1783               if (UTF_16_HIGH_SURROGATE_P (c))
1784                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1785               else
1786                 *charbuf++ = c;
1787             }
1788           else
1789             {
1790               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1791               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1792               *charbuf++ = 0x10000 + c;
1793             }
1794         }
1795       else
1796         {
1797           if (UTF_16_HIGH_SURROGATE_P (c))
1798             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1799           else
1800             {
1801               if (eol_crlf && c == '\r')
1802                 {
1803                   ONE_MORE_BYTE (byte_after_cr1);
1804                   ONE_MORE_BYTE (byte_after_cr2);
1805                 }
1806               *charbuf++ = c;
1807             }
1808         }
1809     }
1810
1811  no_more_source:
1812   coding->consumed_char += consumed_chars_base;
1813   coding->consumed = src_base - coding->source;
1814   coding->charbuf_used = charbuf - coding->charbuf;
1815 }
1816
1817 static int
1818 encode_coding_utf_16 (struct coding_system *coding)
1819 {
1820   int multibytep = coding->dst_multibyte;
1821   int *charbuf = coding->charbuf;
1822   int *charbuf_end = charbuf + coding->charbuf_used;
1823   unsigned char *dst = coding->destination + coding->produced;
1824   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1825   int safe_room = 8;
1826   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1827   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1828   int produced_chars = 0;
1829   Lisp_Object attrs, charset_list;
1830   int c;
1831
1832   CODING_GET_INFO (coding, attrs, charset_list);
1833
1834   if (bom != utf_without_bom)
1835     {
1836       ASSURE_DESTINATION (safe_room);
1837       if (big_endian)
1838         EMIT_TWO_BYTES (0xFE, 0xFF);
1839       else
1840         EMIT_TWO_BYTES (0xFF, 0xFE);
1841       CODING_UTF_16_BOM (coding) = utf_without_bom;
1842     }
1843
1844   while (charbuf < charbuf_end)
1845     {
1846       ASSURE_DESTINATION (safe_room);
1847       c = *charbuf++;
1848       if (c > MAX_UNICODE_CHAR)
1849         c = coding->default_char;
1850
1851       if (c < 0x10000)
1852         {
1853           if (big_endian)
1854             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1855           else
1856             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1857         }
1858       else
1859         {
1860           int c1, c2;
1861
1862           c -= 0x10000;
1863           c1 = (c >> 10) + 0xD800;
1864           c2 = (c & 0x3FF) + 0xDC00;
1865           if (big_endian)
1866             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1867           else
1868             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1869         }
1870     }
1871   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1872   coding->produced = dst - coding->destination;
1873   coding->produced_char += produced_chars;
1874   return 0;
1875 }
1876
1877 \f
1878 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1879
1880 /* Emacs' internal format for representation of multiple character
1881    sets is a kind of multi-byte encoding, i.e. characters are
1882    represented by variable-length sequences of one-byte codes.
1883
1884    ASCII characters and control characters (e.g. `tab', `newline') are
1885    represented by one-byte sequences which are their ASCII codes, in
1886    the range 0x00 through 0x7F.
1887
1888    8-bit characters of the range 0x80..0x9F are represented by
1889    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1890    code + 0x20).
1891
1892    8-bit characters of the range 0xA0..0xFF are represented by
1893    one-byte sequences which are their 8-bit code.
1894
1895    The other characters are represented by a sequence of `base
1896    leading-code', optional `extended leading-code', and one or two
1897    `position-code's.  The length of the sequence is determined by the
1898    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1899    whereas extended leading-code and position-code take the range 0xA0
1900    through 0xFF.  See `charset.h' for more details about leading-code
1901    and position-code.
1902
1903    --- CODE RANGE of Emacs' internal format ---
1904    character set        range
1905    -------------        -----
1906    ascii                0x00..0x7F
1907    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1908    eight-bit-graphic    0xA0..0xBF
1909    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1910    ---------------------------------------------
1911
1912    As this is the internal character representation, the format is
1913    usually not used externally (i.e. in a file or in a data sent to a
1914    process).  But, it is possible to have a text externally in this
1915    format (i.e. by encoding by the coding system `emacs-mule').
1916
1917    In that case, a sequence of one-byte codes has a slightly different
1918    form.
1919
1920    At first, all characters in eight-bit-control are represented by
1921    one-byte sequences which are their 8-bit code.
1922
1923    Next, character composition data are represented by the byte
1924    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1925    where,
1926         METHOD is 0xF2 plus one of composition method (enum
1927         composition_method),
1928
1929         BYTES is 0xA0 plus a byte length of this composition data,
1930
1931         CHARS is 0xA0 plus a number of characters composed by this
1932         data,
1933
        COMPONENTs are characters in multibyte form or composition
        rules encoded as two bytes of ASCII codes.
1936
1937    In addition, for backward compatibility, the following formats are
1938    also recognized as composition data on decoding.
1939
1940    0x80 MSEQ ...
1941    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1942
1943    Here,
        MSEQ is a multibyte form, but in one of these special formats:
1945           ASCII: 0xA0 ASCII_CODE+0x80,
1946           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1947         RULE is a one byte code of the range 0xA0..0xF0 that
1948         represents a composition rule.
1949   */
1950
/* Table indexed by a byte value.  detect_coding_emacs_mule and
   emacs_mule_char read emacs_mule_bytes[C] as the total number of
   bytes of the emacs-mule sequence that starts with byte C;
   emacs_mule_char handles the values 1 through 4 and aborts on any
   other value.  The table is not filled in at this point of the
   file.  */
char emacs_mule_bytes[256];
1952
1953
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in `emacs-mule'.  If it is, return 1,
   else return 0.

   The category is rejected (return 0) when a run that starts with
   0x80 is not longer than 4 bytes, when ESC, SI or SO is seen, when a
   multibyte sequence is cut short by a byte below 0xA0, or when the
   last block ends in the middle of a sequence.  Otherwise the source
   was scanned to its end and 1 is returned; the category is reported
   as found only if at least one composition run or multibyte sequence
   was seen.

   ONE_MORE_BYTE is defined elsewhere; it is assumed to jump to
   no_more_source when the source is exhausted.  */

static int
detect_coding_emacs_mule (struct coding_system *coding,
                          struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int c;
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      /* Remember where the current sequence starts, so that the code
         at no_more_source can tell whether the source ended in the
         middle of it.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0)
        continue;
      if (c == 0x80)
        {
          /* Perhaps the start of a composite character.  We simply
             skip it because analyzing it is too heavy for detecting.
             But, at least, we check that the composite character
             consists of more than 4 bytes.  */
          /* NOTE(review): this declaration shadows the function-level
             src_base.  If ONE_MORE_BYTE refers to src_base by name it
             sees this inner variable inside this block -- confirm
             before renaming either one.  */
          const unsigned char *src_base;

        repeat:
          src_base = src;
          /* Skip every byte that is 0xA0 or above.  */
          do
            {
              ONE_MORE_BYTE (c);
            }
          while (c >= 0xA0);

          /* Too short for a composition: leave the loop and reject.  */
          if (src - src_base <= 4)
            break;
          found = CATEGORY_MASK_EMACS_MULE;
          /* Another composition follows immediately.  */
          if (c == 0x80)
            goto repeat;
        }

      if (c < 0x80)
        {
          /* ESC, SI and SO make the text be rejected.  */
          if (c < 0x20
              && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
            break;
        }
      else
        {
          /* Number of bytes that must still follow this one.  */
          int more_bytes = emacs_mule_bytes[c] - 1;

          while (more_bytes > 0)
            {
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                {
                  src--;        /* Unread the last byte.  */
                  break;
                }
              more_bytes--;
            }
          /* The sequence was cut short: reject.  */
          if (more_bytes != 0)
            break;
          found = CATEGORY_MASK_EMACS_MULE;
        }
    }
  /* Reached only by a `break' out of the loop above.  */
  detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
  return 0;

 no_more_source:
  /* The source ended in the middle of a sequence.  That is an error
     only if no further block will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
      return 0;
    }
  detect_info->found |= found;
  return 1;
}
2039
2040
/* Parse emacs-mule multibyte sequence at SRC and return the decoded
   character.  If CMP_STATUS indicates that we must expect MSEQ or
   RULE described above, decode it and return the negative value of
   the decoded character or rule.  If an invalid byte is found, return
   -1.  If SRC is too short, return -2.

   On success *NBYTES is set to the number of bytes read from SRC and
   *NCHARS to the value of the local counter consumed_chars (which
   ONE_MORE_BYTE is assumed to maintain).  If ID is non-null, *ID is
   set to the charset id of the decoded character; it is not set when
   a rule byte is returned.  Nothing is stored on the -1 and -2
   returns.

   ONE_MORE_BYTE and CODING_DECODE_CHAR are defined elsewhere;
   ONE_MORE_BYTE is assumed to jump to no_more_source when the source
   is exhausted.  */

int
emacs_mule_char (struct coding_system *coding, const unsigned char *src,
                 int *nbytes, int *nchars, int *id,
                 struct composition_status *cmp_status)
{
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base = src;
  int multibytep = coding->src_multibyte;
  int charset_id;
  unsigned code;
  int c;
  int consumed_chars = 0;
  int mseq_found = 0;

  ONE_MORE_BYTE (c);
  if (c < 0)
    {
      /* A negative value is taken as an already decoded character
         with its sign flipped.  */
      c = -c;
      charset_id = emacs_mule_charset[0];
    }
  else
    {
      if (c >= 0xA0)
        {
          /* A first byte of 0xA0 or above is accepted only inside an
             old-form composition.  */
          if (cmp_status->state != COMPOSING_NO
              && cmp_status->old_form)
            {
              if (cmp_status->state == COMPOSING_CHAR)
                {
                  /* MSEQ: turn the byte back into an ordinary first
                     byte, which is then decoded by the switch below.  */
                  if (c == 0xA0)
                    {
                      ONE_MORE_BYTE (c);
                      c -= 0x80;
                      if (c < 0)
                        goto invalid_code;
                    }
                  else
                    c -= 0x20;
                  mseq_found = 1;
                }
              else
                {
                  /* Not expecting a character: hand the byte back
                     negated, without decoding it.  */
                  *nbytes = src - src_base;
                  *nchars = consumed_chars;
                  return -c;
                }
            }
          else
            goto invalid_code;
        }

      /* Dispatch on the total length of the sequence that starts
         with C.  */
      switch (emacs_mule_bytes[c])
        {
        case 2:
          /* First byte selects the charset, one code byte follows.  */
          if ((charset_id = emacs_mule_charset[c]) < 0)
            goto invalid_code;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code = c & 0x7F;
          break;

        case 3:
          if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
              || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
            {
              /* The second byte selects the charset, one code byte
                 follows.  */
              ONE_MORE_BYTE (c);
              if (c < 0xA0 || (charset_id = emacs_mule_charset[c]) < 0)
                goto invalid_code;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code = c & 0x7F;
            }
          else
            {
              /* First byte selects the charset, two code bytes
                 follow.  */
              if ((charset_id = emacs_mule_charset[c]) < 0)
                goto invalid_code;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code = (c & 0x7F) << 8;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code |= c & 0x7F;
            }
          break;

        case 4:
          /* The second byte selects the charset, two code bytes
             follow.  */
          ONE_MORE_BYTE (c);
          if (c < 0 || (charset_id = emacs_mule_charset[c]) < 0)
            goto invalid_code;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code = (c & 0x7F) << 8;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code |= c & 0x7F;
          break;

        case 1:
          /* The byte stands for itself.  */
          code = c;
          charset_id = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
          break;

        default:
          abort ();
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end,
                          CHARSET_FROM_ID (charset_id), code, c);
      if (c < 0)
        goto invalid_code;
    }
  *nbytes = src - src_base;
  *nchars = consumed_chars;
  if (id)
    *id = charset_id;
  /* A character read as MSEQ is returned negated.  */
  return (mseq_found ? -c : c);

 no_more_source:
  return -2;

 invalid_code:
  return -1;
}
2175
2176
2177 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2178
/* Handle these composition sequences ('|': the end of header elements,
2180    BYTES and CHARS >= 0xA0):
2181
2182    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2183    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2184    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2185
2186    and these old form:
2187
2188    (4) relative composition: 0x80 | MSEQ ... MSEQ
2189    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2190
2191    When the starter 0x80 and the following header elements are found,
2192    this annotation header is produced.
2193
2194         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2195
2196    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2197    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2198
2199    Then, upon reading the following elements, these codes are produced
2200    until the composition end is found:
2201
2202    (1) CHAR ... CHAR
2203    (2) ALT ... ALT CHAR ... CHAR
2204    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2205    (4) CHAR ... CHAR
2206    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2207
2208    When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:
2210
2211    (1) LENGTH: unchanged, NCHARS: unchanged
2212    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2213    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2214    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2215    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2216
2217    If an error is found while composing, the annotation header is
2218    changed to the original composition header (plus filler -1s) as
2219    below:
2220
2221    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
   (5)          [ 0x80 0xFF -1 -1 -1 ]
2223
2224    and the sequence [ -2 DECODED-RULE ] is changed to the original
2225    byte sequence as below:
2226         o the original byte sequence is B: [ B -1 ]
2227         o the original byte sequence is B1 B2: [ B1 B2 ]
2228
2229    Most of the routines are implemented by macros because many
2230    variables and labels in the caller decode_coding_emacs_mule must be
2231    accessible, and they are usually called just once (thus doesn't
2232    increase the size of compiled object).  */
2233
/* Decode a composition rule represented by C as a component of
   composition sequence of Emacs 20 style.  Set RULE to the decoded
   rule.

   C is modified: 0xA0 is subtracted from it, and the result must be
   in the range 0..80.  It is split into GREF = C / 9 and NREF = C % 9,
   and a value of 4 in either is replaced by 10 before RULE is built
   with COMPOSITION_ENCODE_RULE.  On a value out of range, control
   jumps to the label invalid_code of the function using the macro.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (c < 0 || c >= 81)                               \
      goto invalid_code;                                \
    gref = c / 9, nref = c % 9;                         \
    if (gref == 4) gref = 10;                           \
    if (nref == 4) nref = 10;                           \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2250
2251
/* Decode a composition rule represented by C and the following byte
   at SRC as a component of composition sequence of Emacs 21 style.
   Set RULE to the decoded rule.

   GREF is C - 0x20.  The next byte is then read into C with
   ONE_MORE_BYTE (so C is modified), and NREF is that byte - 0x20.
   Both must be in the range 0..80; otherwise control jumps to the
   label invalid_code of the function using the macro.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    gref = c - 0x20;                                    \
    if (gref < 0 || gref >= 81)                         \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (nref < 0 || nref >= 81)                         \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2269
2270
/* Start of Emacs 21 style format.  On entry the variable `c' of the
   function using the macro holds the method byte, i.e. 0xF2 plus the
   composition method.  The next two bytes are read from SRC: BYTES,
   which is 0xA0 plus the byte length of this composition information,
   and CHARS, which is 0xA0 plus the number of characters composed by
   this composition.

   The header is invalid (jump to invalid_code) if the first byte read
   is negative, if the byte length is less than 3, if the method is
   COMPOSITION_RELATIVE and the byte length is not 4, or if the
   character count is not in 1..MAX_COMPOSITION_COMPONENTS - 1.

   Otherwise `cmp_status' is set up for the new composition and the
   annotation header is added with ADD_COMPOSITION_DATA.  The macro
   uses `c', `cmp_status' and `charbuf' of its caller by name.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    if (method == COMPOSITION_RELATIVE)                                 \
      cmp_status->state = COMPOSING_CHAR;                               \
    else                                                                \
      cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2302
2303
/* Start of Emacs 20 style format for relative composition.

   Marks `cmp_status' as an old-form COMPOSITION_RELATIVE composition
   that expects a character next, with zero characters and components
   so far, and adds an annotation header whose NCHARS and NBYTES are
   both 0 with ADD_COMPOSITION_DATA.  Uses `cmp_status' and `charbuf'
   of its caller by name.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->nchars = cmp_status->ncomps = 0;                \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2315
2316
/* Start of Emacs 20 style format for rule-base composition.

   Same as DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION except that the
   method is COMPOSITION_WITH_RULE: `cmp_status' is marked as an
   old-form composition that expects a character next, and an
   annotation header whose NCHARS and NBYTES are both 0 is added with
   ADD_COMPOSITION_DATA.  Uses `cmp_status' and `charbuf' of its
   caller by name.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->nchars = cmp_status->ncomps = 0;                \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2328
2329
/* Read the byte that follows a composition starter and begin the
   matching kind of composition:

     - a byte B with B - 0xF2 between COMPOSITION_RELATIVE and
       COMPOSITION_WITH_RULE_ALTCHARS starts the Emacs 21 style
       format;
     - another byte in 0xA0..0xBF starts an Emacs 20 style relative
       composition; SRC is moved back so that the byte is read again
       as the first component;
     - 0xFF starts an Emacs 20 style rule-base composition.

   A negative value and every other byte jump to the label
   invalid_code.  Uses `c' and `src' of its caller by name.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *current_src = src;             \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c < 0xA0)                                  \
      goto invalid_code;                                \
    else if (c < 0xC0)                                  \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* Re-read C as a composition component.  */    \
        src = current_src;                              \
      }                                                 \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2353
/* Finish the current composition.  The annotation header is located
   at charbuf[-cmp_status->length].

   For an old-form composition the header element at offset 2 is set
   to cmp_status->nchars.  Otherwise, if the method is greater than
   COMPOSITION_RELATIVE, the first header element is set to the
   element at offset 2 minus cmp_status->length.  In every case the
   state becomes COMPOSING_NO.  Uses `cmp_status' and `charbuf' of its
   caller by name.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int idx = - cmp_status->length;                             \
                                                                \
    if (cmp_status->old_form)                                   \
      charbuf[idx + 2] = cmp_status->nchars;                    \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2364 /* Finish the composition described by CMP_STATUS, whose data ends just
2365    before CHARBUF.  Return the amount to add to the character count.  */
2366 static int
2367 emacs_mule_finish_composition (int *charbuf,
2368                                struct composition_status *cmp_status)
2369 {
2370   int *head = charbuf - cmp_status->length;  /* Start of its data.  */
2371   int added = 0;
2372
2373   if (! cmp_status->old_form || cmp_status->nchars <= 0)
2374     {
2375       /* Overwrite the head of the data with BYTE8 characters, the
2376          first one being for the byte 0x80.  */
2377       head[0] = BYTE8_TO_CHAR (0x80);
2378       if (cmp_status->method == COMPOSITION_WITH_RULE)
2379         {
2380           head[1] = BYTE8_TO_CHAR (0xFF);
2381           head[2] = -3;
2382           head[3] = 0;
2383           added = 1;
2384         }
2385       else
2386         {
2387           /* Read both counts before their slots are overwritten.  */
2388           int nchars = head[2] + 0xA0;
2389           int nbytes = head[3] + 0xA0;
2390
2391           head[1] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2392           head[2] = BYTE8_TO_CHAR (nbytes);
2393           head[3] = BYTE8_TO_CHAR (nchars);
2394           head[4] = -1;
2395           added = 4;
2396         }
2397     }
2398   else
2399     {
2400       /* Old form with at least one character: store the final count.  */
2401       head[2] = cmp_status->nchars;
2402       if (cmp_status->method == COMPOSITION_WITH_RULE
2403           && cmp_status->state == COMPOSING_CHAR)
2404         {
2405           /* The last rule was invalid.  */
2406           charbuf[-2] = BYTE8_TO_CHAR (charbuf[-1] + 0xA0);
2407           charbuf[-1] = -1;
2408           added = 1;
2409         }
2410     }
2411   cmp_status->state = COMPOSING_NO;
2412   return added;
2413 }
2414 /* Finish the composition being decoded, if any, and count what it adds.  */
2415 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2416   do {                                                                    \
2417     if (cmp_status->state == COMPOSING_NO)                                \
2418       break;                                                              \
2419     char_offset += emacs_mule_finish_composition (charbuf, cmp_status);   \
2420   } while (0)
2421 /* Decode emacs-mule source bytes of CODING into its charbuf.  */
2422 static void
2423 decode_coding_emacs_mule (struct coding_system *coding)
2424 {
2425   const unsigned char *src = coding->source + coding->consumed;
2426   const unsigned char *src_end = coding->source + coding->src_bytes;
2427   const unsigned char *src_base;
2428   int *charbuf = coding->charbuf + coding->charbuf_used;
2429   /* We may produce two annotations (charset and composition) in one
2430      loop and one more charset annotation at the end.  */
2431   int *charbuf_end
2432     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2433   int consumed_chars = 0, consumed_chars_base;
2434   int multibytep = coding->src_multibyte;
2435   Lisp_Object attrs, charset_list;
2436   int char_offset = coding->produced_char;
2437   int last_offset = char_offset;
2438   int last_id = charset_ascii;
2439   int eol_crlf =
2440     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2441   int byte_after_cr = -1;
2442   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2443
2444   CODING_GET_INFO (coding, attrs, charset_list);
2445   /* Put back the composition data saved by an earlier call, if any.  */
2446   if (cmp_status->state != COMPOSING_NO)
2447     {
2448       int i;
2449
2450       for (i = 0; i < cmp_status->length; i++)
2451         *charbuf++ = cmp_status->carryover[i];
2452       coding->annotated = 1;
2453     }
2454
2455   while (1)
2456     {
2457       int c, id;
2458       /* Errors re-read from SRC_BASE; it also gives the consumed count.  */
2459       src_base = src;
2460       consumed_chars_base = consumed_chars;
2461       /* Stop when CHARBUF is nearly full, giving back a byte read ahead.  */
2462       if (charbuf >= charbuf_end)
2463         {
2464           if (byte_after_cr >= 0)
2465             src_base--;
2466           break;
2467         }
2468       /* Use the byte that was read after a CR, if one is pending.  */
2469       if (byte_after_cr >= 0)
2470         c = byte_after_cr, byte_after_cr = -1;
2471       else
2472         ONE_MORE_BYTE (c);
2473       /* A negative C or the byte 0x80 ends any composition in progress.  */
2474       if (c < 0 || c == 0x80)
2475         {
2476           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2477           if (c < 0)
2478             {
2479               *charbuf++ = -c;
2480               char_offset++;
2481             }
2482           else
2483             DECODE_EMACS_MULE_COMPOSITION_START ();
2484           continue;
2485         }
2486       /* Bytes below 0x80 are ASCII; others go through emacs_mule_char.  */
2487       if (c < 0x80)
2488         {
2489           if (eol_crlf && c == '\r')
2490             ONE_MORE_BYTE (byte_after_cr);
2491           id = charset_ascii;
2492           if (cmp_status->state != COMPOSING_NO)
2493             {
2494               if (cmp_status->old_form)
2495                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2496               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2497                 cmp_status->ncomps--;
2498             }
2499         }
2500       else
2501         {
2502           int nchars, nbytes;
2503           /* emacs_mule_char can load a charset map from a file, which
2504              allocates a large structure and might cause buffer text
2505              to be relocated as result.  Thus, we need to remember the
2506              original pointer to buffer text, and fix up all related
2507              pointers after the call.  */
2508           const unsigned char *orig = coding->source;
2509           EMACS_INT offset;
2510
2511           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2512                                cmp_status);
2513           offset = coding->source - orig;
2514           if (offset)
2515             {
2516               src += offset;
2517               src_base += offset;
2518               src_end += offset;
2519             }
2520           if (c < 0)
2521             {
2522               if (c == -1)
2523                 goto invalid_code;
2524               if (c == -2)
2525                 break;
2526             }
2527           src = src_base + nbytes;
2528           consumed_chars = consumed_chars_base + nchars;
2529           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2530             cmp_status->ncomps -= nchars;
2531         }
2532
2533       /* Now if C >= 0, we found a normally encoded character, if C <
2534          0, we found an old-style composition component character or
2535          rule.  */
2536
2537       if (cmp_status->state == COMPOSING_NO)
2538         {
2539           if (last_id != id)
2540             {
2541               if (last_id != charset_ascii)
2542                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2543                                   last_id);
2544               last_id = id;
2545               last_offset = char_offset;
2546             }
2547           *charbuf++ = c;
2548           char_offset++;
2549         }
2550       else if (cmp_status->state == COMPOSING_CHAR)
2551         {
2552           if (cmp_status->old_form)
2553             {
2554               if (c >= 0)
2555                 {
2556                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2557                   *charbuf++ = c;
2558                   char_offset++;
2559                 }
2560               else
2561                 {
2562                   *charbuf++ = -c;
2563                   cmp_status->nchars++;
2564                   cmp_status->length++;
2565                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2566                     EMACS_MULE_COMPOSITION_END ();
2567                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2568                     cmp_status->state = COMPOSING_RULE;
2569                 }
2570             }
2571           else
2572             {
2573               *charbuf++ = c;
2574               cmp_status->length++;
2575               cmp_status->nchars--;
2576               if (cmp_status->nchars == 0)
2577                 EMACS_MULE_COMPOSITION_END ();
2578             }
2579         }
2580       else if (cmp_status->state == COMPOSING_RULE)
2581         {
2582           int rule;
2583
2584           if (c >= 0)
2585             {
2586               EMACS_MULE_COMPOSITION_END ();
2587               *charbuf++ = c;
2588               char_offset++;
2589             }
2590           else
2591             {
2592               c = -c;
2593               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2594               if (rule < 0)
2595                 goto invalid_code;
2596               *charbuf++ = -2;
2597               *charbuf++ = rule;
2598               cmp_status->length += 2;
2599               cmp_status->state = COMPOSING_CHAR;
2600             }
2601         }
2602       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2603         {
2604           *charbuf++ = c;
2605           cmp_status->length++;
2606           if (cmp_status->ncomps == 0)
2607             cmp_status->state = COMPOSING_CHAR;
2608           else if (cmp_status->ncomps > 0)
2609             {
2610               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2611                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2612             }
2613           else
2614             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2615         }
2616       else                      /* COMPOSING_COMPONENT_RULE */
2617         {
2618           int rule;
2619
2620           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2621           if (rule < 0)
2622             goto invalid_code;
2623           *charbuf++ = -2;
2624           *charbuf++ = rule;
2625           cmp_status->length += 2;
2626           cmp_status->ncomps--;
2627           if (cmp_status->ncomps > 0)
2628             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2629           else
2630             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2631         }
2632       continue;
2633       /* Re-read only the first byte of the bad sequence and emit it raw.  */
2634     invalid_code:
2635       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2636       src = src_base;
2637       consumed_chars = consumed_chars_base;
2638       ONE_MORE_BYTE (c);
2639       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2640       char_offset++;
2641       coding->errors++;
2642     }
2643   /* NOTE(review): presumably ONE_MORE_BYTE jumps here at end of input.  */
2644  no_more_source:
2645   if (cmp_status->state != COMPOSING_NO)
2646     {
2647       if (coding->mode & CODING_MODE_LAST_BLOCK)
2648         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2649       else
2650         {
2651           int i;
2652           /* Save the unfinished composition for the next call.  */
2653           charbuf -= cmp_status->length;
2654           for (i = 0; i < cmp_status->length; i++)
2655             cmp_status->carryover[i] = charbuf[i];
2656         }
2657     }
2658   if (last_id != charset_ascii)
2659     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2660   coding->consumed_char += consumed_chars_base;
2661   coding->consumed = src_base - coding->source;
2662   coding->charbuf_used = charbuf - coding->charbuf;
2663 }
2664 /* Store in CODES the leading code(s) for emacs-mule charset ID: ID and a
2665    0 when ID < 0xA0, otherwise a 0x9A..0x9D prefix followed by ID.  */
2666 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2667   do {                                          \
2668     if ((id) < 0xA0)                            \
2669       (codes)[0] = (id), (codes)[1] = 0;        \
2670     else if ((id) < 0xE0)                       \
2671       (codes)[0] = 0x9A, (codes)[1] = (id);     \
2672     else if ((id) < 0xF0)                       \
2673       (codes)[0] = 0x9B, (codes)[1] = (id);     \
2674     else if ((id) < 0xF5)                       \
2675       (codes)[0] = 0x9C, (codes)[1] = (id);     \
2676     else                                        \
2677       (codes)[0] = 0x9D, (codes)[1] = (id);     \
2678   } while (0)  /* No `;' here: the caller supplies it.  */
2679 /* Encode the characters in CODING's charbuf as emacs-mule bytes in its
2680    destination.  Return 0 once the whole charbuf has been processed.  */
2681 static int
2682 encode_coding_emacs_mule (struct coding_system *coding)
2683 {
2684   int multibytep = coding->dst_multibyte;
2685   int *charbuf = coding->charbuf;
2686   int *charbuf_end = charbuf + coding->charbuf_used;
2687   unsigned char *dst = coding->destination + coding->produced;
2688   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2689   int safe_room = 8;
2690   int produced_chars = 0;
2691   Lisp_Object attrs, charset_list;
2692   int c;
2693   int preferred_charset_id = -1;
2694
2695   CODING_GET_INFO (coding, attrs, charset_list);
2696   if (! EQ (charset_list, Vemacs_mule_charset_list))
2697     {
2698       CODING_ATTR_CHARSET_LIST (attrs)
2699         = charset_list = Vemacs_mule_charset_list;
2700     }
2701
2702   while (charbuf < charbuf_end)
2703     {
2704       ASSURE_DESTINATION (safe_room);
2705       c = *charbuf++;
2706       /* A negative element is the -LENGTH that heads an annotation.  */
2707       if (c < 0)
2708         {
2709           /* Handle an annotation; CHARBUF now points at its mask.  */
2710           switch (*charbuf)
2711             {
2712             case CODING_ANNOTATE_COMPOSITION_MASK:
2713               /* Not yet implemented.  */
2714               break;
2715             case CODING_ANNOTATE_CHARSET_MASK:
2716               preferred_charset_id = charbuf[2];  /* Mask, nchars, ID.  */
2717               if (preferred_charset_id >= 0
2718                   && NILP (Fmemq (make_number (preferred_charset_id),
2719                                   charset_list)))
2720                 preferred_charset_id = -1;
2721               break;
2722             default:
2723               abort ();
2724             }
2725           charbuf += -c - 1;
2726           continue;
2727         }
2728
2729       if (ASCII_CHAR_P (c))
2730         EMIT_ONE_ASCII_BYTE (c);
2731       else if (CHAR_BYTE8_P (c))
2732         {
2733           c = CHAR_TO_BYTE8 (c);
2734           EMIT_ONE_BYTE (c);
2735         }
2736       else
2737         {
2738           struct charset *charset;
2739           unsigned code;
2740           int dimension;
2741           int emacs_mule_id;
2742           unsigned char leading_codes[2];
2743           /* Prefer the annotated charset when it can encode C.  */
2744           if (preferred_charset_id >= 0)
2745             {
2746               charset = CHARSET_FROM_ID (preferred_charset_id);
2747               if (CHAR_CHARSET_P (c, charset))
2748                 code = ENCODE_CHAR (charset, c);
2749               else
2750                 charset = char_charset (c, charset_list, &code);
2751             }
2752           else
2753             charset = char_charset (c, charset_list, &code);
2754           if (! charset)  /* No charset found: use the default char.  */
2755             {
2756               c = coding->default_char;
2757               if (ASCII_CHAR_P (c))
2758                 {
2759                   EMIT_ONE_ASCII_BYTE (c);
2760                   continue;
2761                 }
2762               charset = char_charset (c, charset_list, &code);
2763             }  /* NOTE(review): CHARSET is not re-checked for NULL here.  */
2764           dimension = CHARSET_DIMENSION (charset);
2765           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2766           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2767           EMIT_ONE_BYTE (leading_codes[0]);
2768           if (leading_codes[1])
2769             EMIT_ONE_BYTE (leading_codes[1]);
2770           if (dimension == 1)
2771             EMIT_ONE_BYTE (code | 0x80);
2772           else
2773             {
2774               code |= 0x8080;
2775               EMIT_ONE_BYTE (code >> 8);
2776               EMIT_ONE_BYTE (code & 0xFF);
2777             }
2778         }
2779     }
2780   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2781   coding->produced_char += produced_chars;
2782   coding->produced = dst - coding->destination;
2783   return 0;
2784 }
2785
2786 \f
2787 /*** 7. ISO2022 handlers ***/
2788
2789 /* The following note describes the coding system ISO2022 briefly.
2790    Since the intention of this note is to help understand the
2791    functions in this file, some parts are NOT ACCURATE or are OVERLY
2792    SIMPLIFIED.  For thorough understanding, please refer to the
2793    original document of ISO2022.  This is equivalent to the standard
2794    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2795
2796    ISO2022 provides many mechanisms to encode several character sets
2797    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2798    is encoded using bytes less than 128.  This may make the encoded
2799    text a little bit longer, but the text passes more easily through
2800    several types of gateway, some of which strip off the MSB (Most
2801    Significant Bit).
2802
2803    There are two kinds of character sets: control character sets and
2804    graphic character sets.  The former contain control characters such
2805    as `newline' and `escape' to provide control functions (control
2806    functions are also provided by escape sequences).  The latter
2807    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2808    two control character sets and many graphic character sets.
2809
2810    Graphic character sets are classified into one of the following
2811    four classes, according to the number of bytes (DIMENSION) and
2812    number of characters in one dimension (CHARS) of the set:
2813    - DIMENSION1_CHARS94
2814    - DIMENSION1_CHARS96
2815    - DIMENSION2_CHARS94
2816    - DIMENSION2_CHARS96
2817
2818    In addition, each character set is assigned an identification tag,
2819    unique for each set, called the "final character" (denoted as <F>
2820    hereafter).  The <F> of each character set is decided by ECMA(*)
2821    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2822    (0x30..0x3F are for private use only).
2823
2824    Note (*): ECMA = European Computer Manufacturers Association
2825
2826    Here are examples of graphic character sets [NAME(<F>)]:
2827         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2828         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2829         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2830         o DIMENSION2_CHARS96 -- none for the moment
2831
2832    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2833         C0 [0x00..0x1F] -- control character plane 0
2834         GL [0x20..0x7F] -- graphic character plane 0
2835         C1 [0x80..0x9F] -- control character plane 1
2836         GR [0xA0..0xFF] -- graphic character plane 1
2837
2838    A control character set is directly designated and invoked to C0 or
2839    C1 by an escape sequence.  The most common case is that:
2840    - ISO646's  control character set is designated/invoked to C0, and
2841    - ISO6429's control character set is designated/invoked to C1,
2842    and usually these designations/invocations are omitted in encoded
2843    text.  In a 7-bit environment, only C0 can be used, and a control
2844    character for C1 is encoded by an appropriate escape sequence to
2845    fit into the environment.  All control characters for C1 are
2846    defined to have corresponding escape sequences.
2847
2848    A graphic character set is at first designated to one of four
2849    graphic registers (G0 through G3), then these graphic registers are
2850    invoked to GL or GR.  These designations and invocations can be
2851    done independently.  The most common case is that G0 is invoked to
2852    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2853    these invocations and designations are omitted in encoded text.
2854    In a 7-bit environment, only GL can be used.
2855
2856    When a graphic character set of CHARS94 is invoked to GL, codes
2857    0x20 and 0x7F of the GL area work as control characters SPACE and
2858    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2859    be used.
2860
2861    There are two ways of invocation: locking-shift and single-shift.
2862    With locking-shift, the invocation lasts until the next different
2863    invocation, whereas with single-shift, the invocation affects the
2864    following character only and doesn't affect the locking-shift
2865    state.  Invocations are done by the following control characters or
2866    escape sequences:
2867
2868    ----------------------------------------------------------------------
2869    abbrev  function                  cntrl escape seq   description
2870    ----------------------------------------------------------------------
2871    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2872    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2873    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2874    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2875    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2876    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2877    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2878    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2879    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2880    ----------------------------------------------------------------------
2881    (*) These are not used by any known coding system.
2882
2883    Control characters for these functions are defined by macros
2884    ISO_CODE_XXX in `coding.h'.
2885
2886    Designations are done by the following escape sequences:
2887    ----------------------------------------------------------------------
2888    escape sequence      description
2889    ----------------------------------------------------------------------
2890    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2891    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2892    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2893    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2894    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2895    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2896    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2897    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2898    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2899    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2900    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2901    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2902    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2903    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2904    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2905    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2906    ----------------------------------------------------------------------
2907
2908    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2909    of dimension 1, chars 94, and final character <F>, etc...
2910
2911    Note (*): Although these designations are not allowed in ISO2022,
2912    Emacs accepts them on decoding, and produces them on encoding
2913    CHARS96 character sets in a coding system which is characterized as
2914    7-bit environment, non-locking-shift, and non-single-shift.
2915
2916    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2917    '(' must be omitted.  We refer to this as "short-form" hereafter.
2918
2919    Now you may notice that there are a lot of ways of encoding the
2920    same multilingual text in ISO2022.  Actually, there exist many
2921    coding systems such as Compound Text (used in X11's inter client
2922    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2923    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2924    localized platforms), and all of these are variants of ISO2022.
2925
2926    In addition to the above, Emacs handles two more kinds of escape
2927    sequences: ISO6429's direction specification and Emacs' private
2928    sequence for specifying character composition.
2929
2930    ISO6429's direction specification takes the following form:
2931         o CSI ']'      -- end of the current direction
2932         o CSI '0' ']'  -- end of the current direction
2933         o CSI '1' ']'  -- start of left-to-right text
2934         o CSI '2' ']'  -- start of right-to-left text
2935    The control character CSI (0x9B: control sequence introducer) is
2936    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2937
2938    Character composition specification takes the following form:
2939         o ESC '0' -- start relative composition
2940         o ESC '1' -- end composition
2941         o ESC '2' -- start rule-base composition (*)
2942         o ESC '3' -- start relative composition with alternate chars  (**)
2943         o ESC '4' -- start rule-base composition with alternate chars  (**)
2944   Since these are not standard escape sequences of any ISO standard,
2945   the use of them with these meanings is restricted to Emacs only.
2946
2947   (*) This form is used only in Emacs 20.7 and older versions,
2948   but newer versions can safely decode it.
2949   (**) This form is used only in Emacs 21.1 and newer versions,
2950   and older versions can't decode it.
2951
2952   Here's a list of example usages of these composition escape
2953   sequences (categorized by `enum composition_method').
2954
2955   COMPOSITION_RELATIVE:
2956         ESC 0 CHAR [ CHAR ] ESC 1
2957   COMPOSITION_WITH_RULE:
2958         ESC 2 CHAR [ RULE CHAR ] ESC 1
2959   COMPOSITION_WITH_ALTCHARS:
2960         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2961   COMPOSITION_WITH_RULE_ALTCHARS:
2962         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2963 /* Table of 256 code classes, presumably indexed by byte value.  */
2964 enum iso_code_class_type iso_code_class[256];
2965 /* Nonzero if ID is within CODING's safe_charsets and its entry is not 255.  */
2966 #define SAFE_CHARSET_P(coding, id)              \
2967   (! ((id) > (coding)->max_charset_id           \
2968       || (coding)->safe_charsets[id] == 255))
2969 /* Nonzero if CODING_ISO_INITIAL for the coding system of CATEGORY and
2970    register 1 (presumably G1) is not negative.  */
2971 #define SHIFT_OUT_OK(category)  \
2972   (0 <= CODING_ISO_INITIAL (&coding_categories[category], 1))
2973 /* Build the safe-charsets table of ATTRS unless it already has one.  */
2974 static void
2975 setup_iso_safe_charsets (Lisp_Object attrs)
2976 {
2977   Lisp_Object charsets, table, requested, usage, rest;
2978   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2979   int g94, g96, top_id = 0;
2980
2981   /* With full support the charset list is always the complete ISO-2022
2982      list; switching to it discards any table built earlier.  */
2983   charsets = CODING_ATTR_CHARSET_LIST (attrs);
2984   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2985       && ! EQ (charsets, Viso_2022_charset_list))
2986     {
2987       charsets = Viso_2022_charset_list;
2988       CODING_ATTR_CHARSET_LIST (attrs) = charsets;
2989       ASET (attrs, coding_attr_safe_charsets, Qnil);
2990     }
2991
2992   /* A string here means the table was already built.  */
2993   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2994     return;
2995
2996   /* The table has one byte per charset ID up to the largest in the list.  */
2997   for (rest = charsets; CONSP (rest); rest = XCDR (rest))
2998     {
2999       int id = XINT (XCAR (rest));
3000
3001       top_id = top_id < id ? id : top_id;
3002     }
3003
3004   /* Start with every entry 255, the value SAFE_CHARSET_P rejects.  */
3005   table = make_uninit_string (top_id + 1);
3006   memset (SDATA (table), 255, top_id + 1);
3007
3008   requested = AREF (attrs, coding_attr_iso_request);
3009   usage = AREF (attrs, coding_attr_iso_usage);
3010   g94 = XINT (XCAR (usage));
3011   g96 = XINT (XCDR (usage));
3012
3013   for (rest = charsets; CONSP (rest); rest = XCDR (rest))
3014     {
3015       Lisp_Object id = XCAR (rest);
3016       struct charset *charset = CHARSET_FROM_ID (XINT (id));
3017       Lisp_Object reg = Fcdr (Fassq (id, requested));
3018
3019       if (! NILP (reg))
3020         {
3021           /* A register requested for this charset is used as is.  */
3022           SSET (table, XINT (id), XINT (reg));
3023         }
3024       else
3025         {
3026           /* Otherwise take the register from the usage pair, chosen by
3027              the charset's iso_chars_96 flag, unless it is 4 or more.  */
3028           int fallback = charset->iso_chars_96 ? g96 : g94;
3029
3030           if (fallback < 4)
3031             SSET (table, XINT (id), fallback);
3032         }
3033     }
3034   ASET (attrs, coding_attr_safe_charsets, table);
3035 }
3036
3037
3038 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3039    Check if a text is encoded in one of ISO-2022 based coding systems.
3040    If it is, return 1, else return 0.  */
3041
3042 static int
3043 detect_coding_iso_2022 (struct coding_system *coding,
3044                         struct coding_detection_info *detect_info)
3045 {
3046   const unsigned char *src = coding->source, *src_base = src;
3047   const unsigned char *src_end = coding->source + coding->src_bytes;
3048   int multibytep = coding->src_multibyte;
3049   int single_shifting = 0;
3050   int id;
3051   int c, c1;
3052   int consumed_chars = 0;
3053   int i;
3054   int rejected = 0;
3055   int found = 0;
3056   int composition_count = -1;
3057
3058   detect_info->checked |= CATEGORY_MASK_ISO;
3059
3060   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3061     {
3062       struct coding_system *this = &(coding_categories[i]);
3063       Lisp_Object attrs, val;
3064
3065       if (this->id < 0)
3066         continue;
3067       attrs = CODING_ID_ATTRS (this->id);
3068       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3069           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3070         setup_iso_safe_charsets (attrs);
3071       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3072       this->max_charset_id = SCHARS (val) - 1;
3073       this->safe_charsets = SDATA (val);
3074     }
3075
3076   /* A coding system of this category is always ASCII compatible.  */
3077   src += coding->head_ascii;
3078
3079   while (rejected != CATEGORY_MASK_ISO)
3080     {
3081       src_base = src;
3082       ONE_MORE_BYTE (c);
3083       switch (c)
3084         {
3085         case ISO_CODE_ESC:
3086           if (inhibit_iso_escape_detection)
3087             break;
3088           single_shifting = 0;
3089           ONE_MORE_BYTE (c);
3090           if (c >= '(' && c <= '/')
3091             {
3092               /* Designation sequence for a charset of dimension 1.  */
3093               ONE_MORE_BYTE (c1);
3094               if (c1 < ' ' || c1 >= 0x80
3095                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3096                 /* Invalid designation sequence.  Just ignore.  */
3097                 break;
3098             }
3099           else if (c == '$')
3100             {
3101               /* Designation sequence for a charset of dimension 2.  */
3102               ONE_MORE_BYTE (c);
3103               if (c >= '@' && c <= 'B')
3104                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3105                 id = iso_charset_table[1][0][c];
3106               else if (c >= '(' && c <= '/')
3107                 {
3108                   ONE_MORE_BYTE (c1);
3109                   if (c1 < ' ' || c1 >= 0x80
3110                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3111                     /* Invalid designation sequence.  Just ignore.  */
3112                     break;
3113                 }
3114               else
3115                 /* Invalid designation sequence.  Just ignore it.  */
3116                 break;
3117             }
3118           else if (c == 'N' || c == 'O')
3119             {
3120               /* ESC <Fe> for SS2 or SS3.  */
3121               single_shifting = 1;
3122               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3123               break;
3124             }
3125           else if (c == '1')
3126             {
3127               /* End of composition.  */
3128               if (composition_count < 0
3129                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3130                 /* Invalid */
3131                 break;
3132               composition_count = -1;
3133               found |= CATEGORY_MASK_ISO;
3134             }
3135           else if (c >= '0' && c <= '4')
3136             {
3137               /* ESC <Fp> for start/end composition.  */
3138               composition_count = 0;
3139               break;
3140             }
3141           else
3142             {
3143               /* Invalid escape sequence.  Just ignore it.  */
3144               break;
3145             }
3146
3147           /* We found a valid designation sequence for CHARSET.  */
3148           rejected |= CATEGORY_MASK_ISO_8BIT;
3149           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3150                               id))
3151             found |= CATEGORY_MASK_ISO_7;
3152           else
3153             rejected |= CATEGORY_MASK_ISO_7;
3154           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3155                               id))
3156             found |= CATEGORY_MASK_ISO_7_TIGHT;
3157           else
3158             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3159           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3160                               id))
3161             found |= CATEGORY_MASK_ISO_7_ELSE;
3162           else
3163             rejected |= CATEGORY_MASK_ISO_7_ELSE;
3164           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3165                               id))
3166             found |= CATEGORY_MASK_ISO_8_ELSE;
3167           else
3168             rejected |= CATEGORY_MASK_ISO_8_ELSE;
3169           break;
3170
3171         case ISO_CODE_SO:
3172         case ISO_CODE_SI:
3173           /* Locking shift out/in.  */
3174           if (inhibit_iso_escape_detection)
3175             break;
3176           single_shifting = 0;
3177           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3178           break;
3179
3180         case ISO_CODE_CSI:
3181           /* Control sequence introducer.  */
3182           single_shifting = 0;
3183           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3184           found |= CATEGORY_MASK_ISO_8_ELSE;
3185           goto check_extra_latin;
3186
3187         case ISO_CODE_SS2:
3188         case ISO_CODE_SS3:
3189           /* Single shift.   */
3190           if (inhibit_iso_escape_detection)
3191             break;
3192           single_shifting = 0;
3193           rejected |= CATEGORY_MASK_ISO_7BIT;
3194           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3195               & CODING_ISO_FLAG_SINGLE_SHIFT)
3196             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3197           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3198               & CODING_ISO_FLAG_SINGLE_SHIFT)
3199             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3200           if (single_shifting)
3201             break;
3202           goto check_extra_latin;
3203
3204         default:
3205           if (c < 0)
3206             continue;
3207           if (c < 0x80)
3208             {
3209               if (composition_count >= 0)
3210                 composition_count++;
3211               single_shifting = 0;
3212               break;
3213             }
3214           if (c >= 0xA0)
3215             {
3216               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3217               found |= CATEGORY_MASK_ISO_8_1;
3218               /* Check the length of succeeding codes of the range
3219                  0xA0..0FF.  If the byte length is even, we include
3220                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3221                  only when we are not single shifting.  */
3222               if (! single_shifting
3223                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3224                 {
3225                   int i = 1;
3226                   while (src < src_end)
3227                     {
3228                       src_base = src;
3229                       ONE_MORE_BYTE (c);
3230                       if (c < 0xA0)
3231                         {
3232                           src = src_base;
3233                           break;
3234                         }
3235                       i++;
3236                     }
3237
3238                   if (i & 1 && src < src_end)
3239                     {
3240                       rejected |= CATEGORY_MASK_ISO_8_2;
3241                       if (composition_count >= 0)
3242                         composition_count += i;
3243                     }
3244                   else
3245                     {
3246                       found |= CATEGORY_MASK_ISO_8_2;
3247                       if (composition_count >= 0)
3248                         composition_count += i / 2;
3249                     }
3250                 }
3251               break;
3252             }
3253         check_extra_latin:
3254           single_shifting = 0;
3255           if (! VECTORP (Vlatin_extra_code_table)
3256               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3257             {
3258               rejected = CATEGORY_MASK_ISO;
3259               break;
3260             }
3261           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3262               & CODING_ISO_FLAG_LATIN_EXTRA)
3263             found |= CATEGORY_MASK_ISO_8_1;
3264           else
3265             rejected |= CATEGORY_MASK_ISO_8_1;
3266           rejected |= CATEGORY_MASK_ISO_8_2;
3267         }
3268     }
3269   detect_info->rejected |= CATEGORY_MASK_ISO;
3270   return 0;
3271
3272  no_more_source:
3273   detect_info->rejected |= rejected;
3274   detect_info->found |= (found & ~rejected);
3275   return 1;
3276 }
3277
3278
3279 /* Set designation state into `coding' (taken from the expansion site) for
3280    register REG.  Set CHARS_96 to -1 if the escape sequence should be kept.  */
3281 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3282   do {                                                                  \
3283     int id, prev;       /* ID: new charset, PREV: old one of REG.  */   \
3284     /* FINAL must be in '0'..127, map to a charset, and pass SAFE_CHARSET_P.  */ \
3285     if (final < '0' || final >= 128                                     \
3286         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3287         || !SAFE_CHARSET_P (coding, id))                                \
3288       {                                                                 \
3289         CODING_ISO_DESIGNATION (coding, reg) = -2; /* mark invalid */   \
3290         chars_96 = -1;         /* CHARS_96 must be an lvalue.  */       \
3291         break;                 /* Leaves only this do-while.  */        \
3292       }                                                                 \
3293     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3294     if (id == charset_jisx0201_roman)                                   \
3295       {         /* Use ASCII instead when USE_ROMAN is set.  */         \
3296         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3297           id = charset_ascii;                                           \
3298       }                                                                 \
3299     else if (id == charset_jisx0208_1978)                               \
3300       {         /* Use JISX0208 instead when USE_OLDJIS is set.  */     \
3301         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3302           id = charset_jisx0208;                                        \
3303       }                                                                 \
3304     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3305     /* If there was an invalid designation to REG previously, and this  \
3306        designation is ASCII to REG, we should keep this designation     \
3307        sequence.  */                                                    \
3308     if (prev == -2 && id == charset_ascii)                              \
3309       chars_96 = -1;                                                    \
3310   } while (0)
3311
3312
3313 /* Handle these composition sequences (ALT: alternate char):
3314
3315    (1) relative composition: ESC 0 CHAR ... ESC 1
3316    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3317    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3318    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3319
3320    When the start sequence (ESC 0/2/3/4) is found, this annotation
3321    header is produced.
3322
3323         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3324
3325    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3326    produced until the end sequence (ESC 1) is found:
3327
3328    (1) CHAR ... CHAR
3329    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3330    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3331    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3332
3333    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3334    annotation header are updated as below:
3335
3336    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3337    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3338    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3339    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3340
3341    If an error is found while composing, the five slots of the
3342    annotation header are changed to:
3343
3344         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3345
3346    and the sequence [ -2 DECODED-RULE ] is changed to the original
3347    byte sequence as below:
3348         o the original byte sequence is B: [ B -1 ]
3349         o the original byte sequence is B1 B2: [ B1 B2 ]
3350    and the sequence [ -1 -1 ] is changed to the original byte
3351    sequence:
3352         [ ESC '0' ]
3353 */
3354
3355 /* Decode a composition rule from C1 (a variable of the expansion site) and
3356    maybe one more byte from the source.  Set RULE to the encoded rule and
3357    NBYTES to its length in bytes.  If the rule is invalid, RULE is left
3358    negative; NBYTES is not assigned when C1 is below 32.  */
3359
3360 #define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
3361   do {                                                                  \
3362     rule = c1 - 32;                                                     \
3363     if (rule < 0)               /* C1 is below 32: invalid */           \
3364       break;                                                            \
3365     if (rule < 81)              /* old format (before ver.21) */        \
3366       {                         /* One byte holds both refs, base 9.  */ \
3367         int gref = (rule) / 9;                                          \
3368         int nref = (rule) % 9;                                          \
3369         if (gref == 4) gref = 10;                                       \
3370         if (nref == 4) nref = 10;                                       \
3371         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3372         nbytes = 1;                                                     \
3373       }                                                                 \
3374     else                        /* new format (after ver.21) */         \
3375       {                                                                 \
3376         int c;                                                          \
3377         /* Two-byte rule: the second byte comes from the source.  */    \
3378         ONE_MORE_BYTE (c);                                              \
3379         rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32);             \
3380         if (rule >= 0)                                                  \
3381           rule += 0x100;   /* to distinguish it from the old format */  \
3382         nbytes = 2;                                                     \
3383       }                                                                 \
3384   } while (0)
3385
3386 #define ENCODE_COMPOSITION_RULE(rule)                           \
3387   do { /* Put back the source byte(s) of RULE at charbuf[idx].  */ \
3388     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3389     /* charbuf, idx, new_chars come from the expansion site.  */ \
3390     if (rule < 0x100)           /* old format: one byte */      \
3391       {                                                         \
3392         if (gref == 10) gref = 4;                               \
3393         if (nref == 10) nref = 4;                               \
3394         charbuf[idx] = 32 + gref * 9 + nref;                    \
3395         charbuf[idx + 1] = -1;  /* Second slot is left as -1.  */ \
3396         new_chars++;                                            \
3397       }                                                         \
3398     else                        /* new format: two bytes */     \
3399       {                                                         \
3400         charbuf[idx] = 32 + 81 + gref;                          \
3401         charbuf[idx + 1] = 32 + nref;                           \
3402         new_chars += 2;                                         \
3403       }                                                         \
3404   } while (0)
3405
3406 /* Finish the current composition as invalid: turn its data, stored in the
3407    CMP_STATUS->length slots just before CHARBUF, back into source bytes.  */
3408 static int finish_composition (int *, struct composition_status *);
3409
3410 static int
3411 finish_composition (int *charbuf, struct composition_status *cmp_status)
3412 {
3413   int idx = - cmp_status->length;   /* negative offset of the header */
3414   int new_chars;   /* return value: nchars + bytes restored for rules and ESC 0 */
3415
3416   /* Recover the original ESC sequence; five header slots are rewritten.  */
3417   charbuf[idx++] = ISO_CODE_ESC;
3418   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3419                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3420                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3421                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3422                     : '4');
3423   charbuf[idx++] = -2;   /* NOTE(review): -2 0 -1 look like filler that later */
3424   charbuf[idx++] = 0;    /* stages skip; not visible from here -- confirm.  */
3425   charbuf[idx++] = -1;
3426   new_chars = cmp_status->nchars;
3427   if (cmp_status->method >= COMPOSITION_WITH_RULE)  /* NOTE(review): presumably every method but relative -- confirm enum order */
3428     for (; idx < 0; idx++)
3429       {
3430         int elt = charbuf[idx];
3431
3432         if (elt == -2)          /* [ -2 DECODED-RULE ] pair */
3433           {
3434             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3435             idx++;              /* Skip the second slot of the pair.  */
3436           }
3437         else if (elt == -1)     /* [ -1 -1 ] pair: put back `ESC 0' */
3438           {
3439             charbuf[idx++] = ISO_CODE_ESC;
3440             charbuf[idx] = '0';
3441             new_chars += 2;
3442           }
3443       }
3444   cmp_status->state = COMPOSING_NO;
3445   return new_chars;
3446 }
3447
3448 /* If a composition is in progress, finish it as invalid (see finish_composition).  */
3449 #define MAYBE_FINISH_COMPOSITION()                              \
3450   do { /* Uses cmp_status, charbuf, char_offset of the caller.  */ \
3451     if (cmp_status->state != COMPOSING_NO)                      \
3452       char_offset += finish_composition (charbuf, cmp_status);  \
3453   } while (0)
3454
3455 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3456    C1 is the byte that followed ESC.
3457    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3458    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3459    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3460    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3461    ESC 0 inside the component part of ESC 3/ESC 4 only stores [ -1 -1 ].
3462    Any other start sequence produces this annotation sequence now:
3463
3464    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3465 */
3466
3467 #define DECODE_COMPOSITION_START(c1)                                       \
3468   do {                                                                     \
3469     if (c1 == '0'                                                          \
3470         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3471              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3472             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3473                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3474       {    /* ESC 0 ends the component part; composed CHARs follow.  */    \
3475         *charbuf++ = -1;                                                   \
3476         *charbuf++= -1;                                                    \
3477         cmp_status->state = COMPOSING_CHAR;                                \
3478         cmp_status->length += 2;                                           \
3479       }                                                                    \
3480     else                                                                   \
3481       {    /* A new composition; an unfinished one becomes invalid.  */    \
3482         MAYBE_FINISH_COMPOSITION ();                                       \
3483         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3484                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3485                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3486                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3487         cmp_status->state      /* ESC 3 and ESC 4 begin with components */ \
3488           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3489         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3490         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3491         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3492         coding->annotated = 1;                                             \
3493       }                                                                    \
3494   } while (0)
3495
3496
3497 /* Handle composition end sequence ESC 1.  A bad end invalidates the
3498    composition and jumps to the `invalid_code' label of the expansion site.  */
3499 #define DECODE_COMPOSITION_END()                                        \
3500   do {                                                                  \
3501     if (cmp_status->nchars == 0                                         \
3502         || ((cmp_status->state == COMPOSING_CHAR)                       \
3503             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3504       {      /* No CHAR read, or the state does not fit the method.  */ \
3505         MAYBE_FINISH_COMPOSITION ();                                    \
3506         goto invalid_code;                                              \
3507       }                                                                 \
3508     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) /* grow LENGTH */ \
3509       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3510     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3511       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3512     charbuf[- cmp_status->length + 2] = cmp_status->nchars; /* NCHARS */ \
3513     char_offset += cmp_status->nchars;                                  \
3514     cmp_status->state = COMPOSING_NO;                                   \
3515   } while (0)
3516
3517 /* Store a composition rule RULE in charbuf as the pair [ -2 RULE ], and
3518    update cmp_status.  charbuf and cmp_status are the caller's variables.  */
3519 #define STORE_COMPOSITION_RULE(rule)    \
3520   do {                                  \
3521     *charbuf++ = -2;                    \
3522     *charbuf++ = rule;                  \
3523     cmp_status->length += 2;            \
3524     cmp_status->state--;   /* Undo the state++ of STORE_COMPOSITION_CHAR.  */ \
3525   } while (0)
3526
3527 /* Store a composed char or a component char C in charbuf, and update
3528    cmp_status: its length, nchars or ncomps, and (where the method uses
3529    rules) its state.  charbuf and cmp_status are the caller's variables.  */
3530 #define STORE_COMPOSITION_CHAR(c)                                       \
3531   do {                                                                  \
3532     *charbuf++ = (c);                                                   \
3533     cmp_status->length++;                                               \
3534     if (cmp_status->state == COMPOSING_CHAR)                            \
3535       cmp_status->nchars++;     /* C is a composed char.  */            \
3536     else                                                                \
3537       cmp_status->ncomps++;     /* C is a component char.  */           \
3538     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3539         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3540             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3541       cmp_status->state++;  /* presumably to the matching RULE state -- confirm */ \
3542   } while (0)
3543
3544
3545 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3546
3547 static void
3548 decode_coding_iso_2022 (struct coding_system *coding)
3549 {
3550   const unsigned char *src = coding->source + coding->consumed;
3551   const unsigned char *src_end = coding->source + coding->src_bytes;
3552   const unsigned char *src_base;
3553   int *charbuf = coding->charbuf + coding->charbuf_used;
3554   /* We may produce two annotations (charset and composition) in one
3555      loop and one more charset annotation at the end.  */
3556   int *charbuf_end
3557     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3558   int consumed_chars = 0, consumed_chars_base;
3559   int multibytep = coding->src_multibyte;
3560   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3561   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3562   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3563   int charset_id_2, charset_id_3;
3564   struct charset *charset;
3565   int c;
3566   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3567   Lisp_Object attrs, charset_list;
3568   int char_offset = coding->produced_char;
3569   int last_offset = char_offset;
3570   int last_id = charset_ascii;
3571   int eol_crlf =
3572     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3573   int byte_after_cr = -1;
3574   int i;
3575
3576   CODING_GET_INFO (coding, attrs, charset_list);
3577   setup_iso_safe_charsets (attrs);
3578   /* Charset list may have been changed.  */
3579   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3580   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3581
3582   if (cmp_status->state != COMPOSING_NO)
3583     {
3584       for (i = 0; i < cmp_status->length; i++)
3585         *charbuf++ = cmp_status->carryover[i];
3586       coding->annotated = 1;
3587     }
3588
3589   while (1)
3590     {
3591       int c1, c2, c3;
3592
3593       src_base = src;
3594       consumed_chars_base = consumed_chars;
3595
3596       if (charbuf >= charbuf_end)
3597         {
3598           if (byte_after_cr >= 0)
3599             src_base--;
3600           break;
3601         }
3602
3603       if (byte_after_cr >= 0)
3604         c1 = byte_after_cr, byte_after_cr = -1;
3605       else
3606         ONE_MORE_BYTE (c1);
3607       if (c1 < 0)
3608         goto invalid_code;
3609
3610       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3611         {
3612           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3613           char_offset++;
3614           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3615           continue;
3616         }
3617
3618       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3619         {
3620           if (c1 == ISO_CODE_ESC)
3621             {
3622               if (src + 1 >= src_end)
3623                 goto no_more_source;
3624               *charbuf++ = ISO_CODE_ESC;
3625               char_offset++;
3626               if (src[0] == '%' && src[1] == '@')
3627                 {
3628                   src += 2;
3629                   consumed_chars += 2;
3630                   char_offset += 2;
3631                   /* We are sure charbuf can contain two more chars. */
3632                   *charbuf++ = '%';
3633                   *charbuf++ = '@';
3634                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3635                 }
3636             }
3637           else
3638             {
3639               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3640               char_offset++;
3641             }
3642           continue;
3643         }
3644
3645       if ((cmp_status->state == COMPOSING_RULE
3646            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3647           && c1 != ISO_CODE_ESC)
3648         {
3649           int rule, nbytes;
3650
3651           DECODE_COMPOSITION_RULE (rule, nbytes);
3652           if (rule < 0)
3653             goto invalid_code;
3654           STORE_COMPOSITION_RULE (rule);
3655           continue;
3656         }
3657
3658       /* We produce at most one character.  */
3659       switch (iso_code_class [c1])
3660         {
3661         case ISO_0x20_or_0x7F:
3662           if (charset_id_0 < 0
3663               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3664             /* This is SPACE or DEL.  */
3665             charset = CHARSET_FROM_ID (charset_ascii);
3666           else
3667             charset = CHARSET_FROM_ID (charset_id_0);
3668           break;
3669
3670         case ISO_graphic_plane_0:
3671           if (charset_id_0 < 0)
3672             charset = CHARSET_FROM_ID (charset_ascii);
3673           else
3674             charset = CHARSET_FROM_ID (charset_id_0);
3675           break;
3676
3677         case ISO_0xA0_or_0xFF:
3678           if (charset_id_1 < 0
3679               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3680               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3681             goto invalid_code;
3682           /* This is a graphic character, we fall down ... */
3683
3684         case ISO_graphic_plane_1:
3685           if (charset_id_1 < 0)
3686             goto invalid_code;
3687           charset = CHARSET_FROM_ID (charset_id_1);
3688           break;
3689
3690         case ISO_control_0:
3691           if (eol_crlf && c1 == '\r')
3692             ONE_MORE_BYTE (byte_after_cr);
3693           MAYBE_FINISH_COMPOSITION ();
3694           charset = CHARSET_FROM_ID (charset_ascii);
3695           break;
3696
3697         case ISO_control_1:
3698           goto invalid_code;
3699
3700         case ISO_shift_out:
3701           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3702               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3703             goto invalid_code;
3704           CODING_ISO_INVOCATION (coding, 0) = 1;
3705           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3706           continue;
3707
3708         case ISO_shift_in:
3709           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3710             goto invalid_code;
3711           CODING_ISO_INVOCATION (coding, 0) = 0;
3712           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3713           continue;
3714
3715         case ISO_single_shift_2_7:
3716           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3717             goto invalid_code;
3718         case ISO_single_shift_2:
3719           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3720             goto invalid_code;
3721           /* SS2 is handled as an escape sequence of ESC 'N' */
3722           c1 = 'N';
3723           goto label_escape_sequence;
3724
3725         case ISO_single_shift_3:
3726           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3727             goto invalid_code;
3728           /* SS2 is handled as an escape sequence of ESC 'O' */
3729           c1 = 'O';
3730           goto label_escape_sequence;
3731
3732         case ISO_control_sequence_introducer:
3733           /* CSI is handled as an escape sequence of ESC '[' ...  */
3734           c1 = '[';
3735           goto label_escape_sequence;
3736
3737         case ISO_escape:
3738           ONE_MORE_BYTE (c1);
3739         label_escape_sequence:
3740           /* Escape sequences handled here are invocation,
3741              designation, direction specification, and character
3742              composition specification.  */
3743           switch (c1)
3744             {
3745             case '&':           /* revision of following character set */
3746               ONE_MORE_BYTE (c1);
3747               if (!(c1 >= '@' && c1 <= '~'))
3748                 goto invalid_code;
3749               ONE_MORE_BYTE (c1);
3750               if (c1 != ISO_CODE_ESC)
3751                 goto invalid_code;
3752               ONE_MORE_BYTE (c1);
3753               goto label_escape_sequence;
3754
3755             case '$':           /* designation of 2-byte character set */
3756               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3757                 goto invalid_code;
3758               {
3759                 int reg, chars96;
3760
3761                 ONE_MORE_BYTE (c1);
3762                 if (c1 >= '@' && c1 <= 'B')
3763                   {     /* designation of JISX0208.1978, GB2312.1980,
3764                            or JISX0208.1980 */
3765                     reg = 0, chars96 = 0;
3766                   }
3767                 else if (c1 >= 0x28 && c1 <= 0x2B)
3768                   { /* designation of DIMENSION2_CHARS94 character set */
3769                     reg = c1 - 0x28, chars96 = 0;
3770                     ONE_MORE_BYTE (c1);
3771                   }
3772                 else if (c1 >= 0x2C && c1 <= 0x2F)
3773                   { /* designation of DIMENSION2_CHARS96 character set */
3774                     reg = c1 - 0x2C, chars96 = 1;
3775                     ONE_MORE_BYTE (c1);
3776                   }
3777                 else
3778                   goto invalid_code;
3779                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3780                 /* We must update these variables now.  */
3781                 if (reg == 0)
3782                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3783                 else if (reg == 1)
3784                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3785                 if (chars96 < 0)
3786                   goto invalid_code;
3787               }
3788               continue;
3789
3790             case 'n':           /* invocation of locking-shift-2 */
3791               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3792                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3793                 goto invalid_code;
3794               CODING_ISO_INVOCATION (coding, 0) = 2;
3795               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3796               continue;
3797
3798             case 'o':           /* invocation of locking-shift-3 */
3799               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3800                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3801                 goto invalid_code;
3802               CODING_ISO_INVOCATION (coding, 0) = 3;
3803               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3804               continue;
3805
3806             case 'N':           /* invocation of single-shift-2 */
3807               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3808                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3809                 goto invalid_code;
3810               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3811               if (charset_id_2 < 0)
3812                 charset = CHARSET_FROM_ID (charset_ascii);
3813               else
3814                 charset = CHARSET_FROM_ID (charset_id_2);
3815               ONE_MORE_BYTE (c1);
3816               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3817                 goto invalid_code;
3818               break;
3819
3820             case 'O':           /* invocation of single-shift-3 */
3821               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3822                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3823                 goto invalid_code;
3824               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3825               if (charset_id_3 < 0)
3826                 charset = CHARSET_FROM_ID (charset_ascii);
3827               else
3828                 charset = CHARSET_FROM_ID (charset_id_3);
3829               ONE_MORE_BYTE (c1);
3830               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3831                 goto invalid_code;
3832               break;
3833
3834             case '0': case '2': case '3': case '4': /* start composition */
3835               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3836                 goto invalid_code;
3837               if (last_id != charset_ascii)
3838                 {
3839                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3840                   last_id = charset_ascii;
3841                   last_offset = char_offset;
3842                 }
3843               DECODE_COMPOSITION_START (c1);
3844               continue;
3845
3846             case '1':           /* end composition */
3847               if (cmp_status->state == COMPOSING_NO)
3848                 goto invalid_code;
3849               DECODE_COMPOSITION_END ();
3850               continue;
3851
3852             case '[':           /* specification of direction */
3853               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3854                 goto invalid_code;
3855               /* For the moment, nested direction is not supported.
3856                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3857                  left-to-right, and nonzero means right-to-left.  */
3858               ONE_MORE_BYTE (c1);
3859               switch (c1)
3860                 {
3861                 case ']':       /* end of the current direction */
3862                   coding->mode &= ~CODING_MODE_DIRECTION;
3863
3864                 case '0':       /* end of the current direction */
3865                 case '1':       /* start of left-to-right direction */
3866                   ONE_MORE_BYTE (c1);
3867                   if (c1 == ']')
3868                     coding->mode &= ~CODING_MODE_DIRECTION;
3869                   else
3870                     goto invalid_code;
3871                   break;
3872
3873                 case '2':       /* start of right-to-left direction */
3874                   ONE_MORE_BYTE (c1);
3875                   if (c1 == ']')
3876                     coding->mode |= CODING_MODE_DIRECTION;
3877                   else
3878                     goto invalid_code;
3879                   break;
3880
3881                 default:
3882                   goto invalid_code;
3883                 }
3884               continue;
3885
3886             case '%':
3887               ONE_MORE_BYTE (c1);
3888               if (c1 == '/')
3889                 {
3890                   /* CTEXT extended segment:
3891                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3892                      We keep these bytes as is for the moment.
3893                      They may be decoded by post-read-conversion.  */
3894                   int dim, M, L;
3895                   int size;
3896
3897                   ONE_MORE_BYTE (dim);
3898                   if (dim < '0' || dim > '4')
3899                     goto invalid_code;
3900                   ONE_MORE_BYTE (M);
3901                   if (M < 128)
3902                     goto invalid_code;
3903                   ONE_MORE_BYTE (L);
3904                   if (L < 128)
3905                     goto invalid_code;
3906                   size = ((M - 128) * 128) + (L - 128);
3907                   if (charbuf + 6 > charbuf_end)
3908                     goto break_loop;
3909                   *charbuf++ = ISO_CODE_ESC;
3910                   *charbuf++ = '%';
3911                   *charbuf++ = '/';
3912                   *charbuf++ = dim;
3913                   *charbuf++ = BYTE8_TO_CHAR (M);
3914                   *charbuf++ = BYTE8_TO_CHAR (L);
3915                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3916                 }
3917               else if (c1 == 'G')
3918                 {
3919                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3920                      ESC % G --UTF-8-BYTES-- ESC % @
3921                      We keep these bytes as is for the moment.
3922                      They may be decoded by post-read-conversion.  */
3923                   if (charbuf + 3 > charbuf_end)
3924                     goto break_loop;
3925                   *charbuf++ = ISO_CODE_ESC;
3926                   *charbuf++ = '%';
3927                   *charbuf++ = 'G';
3928                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3929                 }
3930               else
3931                 goto invalid_code;
3932               continue;
3933               break;
3934
3935             default:
3936               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3937                 goto invalid_code;
3938               {
3939                 int reg, chars96;
3940
3941                 if (c1 >= 0x28 && c1 <= 0x2B)
3942                   { /* designation of DIMENSION1_CHARS94 character set */
3943                     reg = c1 - 0x28, chars96 = 0;
3944                     ONE_MORE_BYTE (c1);
3945                   }
3946                 else if (c1 >= 0x2C && c1 <= 0x2F)
3947                   { /* designation of DIMENSION1_CHARS96 character set */
3948                     reg = c1 - 0x2C, chars96 = 1;
3949                     ONE_MORE_BYTE (c1);
3950                   }
3951                 else
3952                   goto invalid_code;
3953                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3954                 /* We must update these variables now.  */
3955                 if (reg == 0)
3956                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3957                 else if (reg == 1)
3958                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3959                 if (chars96 < 0)
3960                   goto invalid_code;
3961               }
3962               continue;
3963             }
3964         }
3965
3966       if (cmp_status->state == COMPOSING_NO
3967           && charset->id != charset_ascii
3968           && last_id != charset->id)
3969         {
3970           if (last_id != charset_ascii)
3971             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3972           last_id = charset->id;
3973           last_offset = char_offset;
3974         }
3975
3976       /* Now we know CHARSET and 1st position code C1 of a character.
3977          Produce a decoded character while getting 2nd and 3rd
3978          position codes C2, C3 if necessary.  */
3979       if (CHARSET_DIMENSION (charset) > 1)
3980         {
3981           ONE_MORE_BYTE (c2);
3982           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3983               || ((c1 & 0x80) != (c2 & 0x80)))
3984             /* C2 is not in a valid range.  */
3985             goto invalid_code;
3986           if (CHARSET_DIMENSION (charset) == 2)
3987             c1 = (c1 << 8) | c2;
3988           else
3989             {
3990               ONE_MORE_BYTE (c3);
3991               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3992                   || ((c1 & 0x80) != (c3 & 0x80)))
3993                 /* C3 is not in a valid range.  */
3994                 goto invalid_code;
3995               c1 = (c1 << 16) | (c2 << 8) | c2;
3996             }
3997         }
3998       c1 &= 0x7F7F7F;
3999       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4000       if (c < 0)
4001         {
4002           MAYBE_FINISH_COMPOSITION ();
4003           for (; src_base < src; src_base++, char_offset++)
4004             {
4005               if (ASCII_BYTE_P (*src_base))
4006                 *charbuf++ = *src_base;
4007               else
4008                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4009             }
4010         }
4011       else if (cmp_status->state == COMPOSING_NO)
4012         {
4013           *charbuf++ = c;
4014           char_offset++;
4015         }
4016       else if ((cmp_status->state == COMPOSING_CHAR
4017                 ? cmp_status->nchars
4018                 : cmp_status->ncomps)
4019                >= MAX_COMPOSITION_COMPONENTS)
4020         {
4021           /* Too long composition.  */
4022           MAYBE_FINISH_COMPOSITION ();
4023           *charbuf++ = c;
4024           char_offset++;
4025         }
4026       else
4027         STORE_COMPOSITION_CHAR (c);
4028       continue;
4029
4030     invalid_code:
4031       MAYBE_FINISH_COMPOSITION ();
4032       src = src_base;
4033       consumed_chars = consumed_chars_base;
4034       ONE_MORE_BYTE (c);
4035       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4036       char_offset++;
4037       coding->errors++;
4038       continue;
4039
4040     break_loop:
4041       break;
4042     }
4043
4044  no_more_source:
4045   if (cmp_status->state != COMPOSING_NO)
4046     {
4047       if (coding->mode & CODING_MODE_LAST_BLOCK)
4048         MAYBE_FINISH_COMPOSITION ();
4049       else
4050         {
4051           charbuf -= cmp_status->length;
4052           for (i = 0; i < cmp_status->length; i++)
4053             cmp_status->carryover[i] = charbuf[i];
4054         }
4055     }
4056   else if (last_id != charset_ascii)
4057     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4058   coding->consumed_char += consumed_chars_base;
4059   coding->consumed = src_base - coding->source;
4060   coding->charbuf_used = charbuf - coding->charbuf;
4061 }
4062
4063
4064 /* ISO2022 encoding stuff.  */
4065
4066 /*
4067    It is not enough to say just "ISO2022" on encoding, we have to
4068    specify more details.  In Emacs, each coding system of ISO2022
4069    variant has the following specifications:
4070         1. Initial designation to G0 thru G3.
4071         2. Allows short-form designation?
4072         3. ASCII should be designated to G0 before control characters?
4073         4. ASCII should be designated to G0 at end of line?
4074         5. 7-bit environment or 8-bit environment?
4075         6. Use locking-shift?
4076         7. Use Single-shift?
4077    And the following two are only for Japanese:
4078         8. Use ASCII in place of JIS0201-1976-Roman?
4079         9. Use JISX0208-1983 in place of JISX0208-1978?
4080    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4081    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4082    details.
4083 */
4084
4085 /* Produce codes (escape sequence) for designating CHARSET to graphic
4086    register REG at DST, and increment DST.  If <final-char> of CHARSET is
4087    '@', 'A', or 'B' and the coding system CODING allows, produce
4088    designation sequence of short-form.  */
4089
#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate bytes, indexed by REG, for 94-char and 96-char      \
       character sets respectively.  */                                 \
    const char *intermediate_char_94 = "()*+";                          \
    const char *intermediate_char_96 = ",-./";                          \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    /* The revision is announced only when the coding system asks       \
       for it; otherwise it stays negative.  */                         \
    int revision                                                        \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset)                               \
         : -1);                                                         \
                                                                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        /* Multibyte charset: '$' and then an intermediate byte.  The   \
           intermediate byte of a 94-char set is omitted (short form)   \
           only for register 0 with a final byte in '@'..'B' when the   \
           long form is not requested.  */                              \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
        else if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM) \
                 || reg != 0                                            \
                 || final_char < '@' || final_char > 'B')               \
          EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);              \
      }                                                                 \
    else if (CHARSET_ISO_CHARS_96 (charset))                            \
      EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);                  \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);                  \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    /* Remember that CHARSET now occupies graphic register REG.  */     \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4132
4133
4134 /* The following two macros produce codes (control character or escape
4135    sequence) for ISO2022 single-shift functions (single-shift-2 and
4136    single-shift-3).  */
4137
/* Produce single-shift-2: the single byte ISO_CODE_SS2, or ESC 'N'
   when the coding system is restricted to seven bits.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do                                                                    \
    {                                                                   \
      if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))   \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      else                                                              \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      /* Record that a single shift is pending for the next             \
         character.  */                                                 \
      CODING_ISO_SINGLE_SHIFTING (coding) = 1;                          \
    }                                                                   \
  while (0)
4146
4147
/* Produce single-shift-3: the single byte ISO_CODE_SS3, or ESC 'O'
   when the coding system is restricted to seven bits.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do                                                                    \
    {                                                                   \
      if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))   \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      else                                                              \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      /* Record that a single shift is pending for the next             \
         character.  */                                                 \
      CODING_ISO_SINGLE_SHIFTING (coding) = 1;                          \
    }                                                                   \
  while (0)
4156
4157
4158 /* The following four macros produce codes (control character or
4159    escape sequence) for ISO2022 locking-shift functions (shift-in,
4160    shift-out, locking-shift-2, and locking-shift-3).  */
4161
/* Produce shift-in (ISO_CODE_SI) and record that graphic register 0
   is now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                \
      CODING_ISO_INVOCATION (coding, 0) = 0;            \
    }                                                   \
  while (0)
4167
4168
/* Produce shift-out (ISO_CODE_SO) and record that graphic register 1
   is now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                \
      CODING_ISO_INVOCATION (coding, 0) = 1;            \
    }                                                   \
  while (0)
4174
4175
/* Produce locking-shift-2 (ESC 'n') and record that graphic register
   2 is now invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do                                                    \
    {                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');         \
      CODING_ISO_INVOCATION (coding, 0) = 2;            \
    }                                                   \
  while (0)
4181
4182
/* Produce locking-shift-3 and record that graphic register 3 is now
   invoked to graphic plane 0.  The escape sequence is ESC 'o'; ESC 'n'
   is locking-shift-2, and emitting it here would make a decoder
   invoke graphic register 2 instead of 3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4188
4189
4190 /* Produce codes for a DIMENSION1 character whose character set is
4191    CHARSET and whose position-code is C1.  Designation and invocation
4192    sequences are also produced in advance if necessary.  */
4193
/* CHARSET must be a modifiable lvalue: it is reassigned when the
   USE_ROMAN substitution applies.  C1 may be any integer expression;
   it is parenthesized at every use so that operators inside the
   argument cannot interact with the `& 0x7F' / `| 0x80' applied
   here.  The macro also relies on `coding', `dst' and
   `produced_chars' being in scope at the point of use.  */
#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    /* With the USE_ROMAN flag, ASCII is encoded as JIS X 0201 Roman.  */ \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift was just produced; it covers this one         \
           character only.  */                                          \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* Invoked to graphic plane 0: high bit cleared.  */            \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* Invoked to graphic plane 1: high bit set.  */                \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4232
4233
4234 /* Produce codes for a DIMENSION2 character whose character set is
4235    CHARSET and whose position-codes are C1 and C2.  Designation and
4236    invocation codes are also produced in advance if necessary.  */
4237
/* CHARSET must be a modifiable lvalue: it is reassigned when the
   USE_OLDJIS substitution applies.  The macro relies on `coding',
   `dst' and `produced_chars' being in scope at the point of use.  */
#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    /* Each pass either emits the two bytes and leaves the loop, or     \
       produces the missing designation/invocation and tries again.  */ \
    for (;;)                                                            \
      {                                                                 \
        int id = CHARSET_ID (charset);                                  \
                                                                        \
        /* With the USE_OLDJIS flag, JISX0208 is encoded as             \
           JISX0208-1978.  */                                           \
        if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)    \
            && id == charset_jisx0208)                                  \
          {                                                             \
            id = charset_jisx0208_1978;                                 \
            charset = CHARSET_FROM_ID (id);                             \
          }                                                             \
                                                                        \
        if (CODING_ISO_SINGLE_SHIFTING (coding))                        \
          {                                                             \
            /* A pending single shift covers this character only.  */   \
            if (! (CODING_ISO_FLAGS (coding)                            \
                   & CODING_ISO_FLAG_SEVEN_BITS))                       \
              EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                \
            else                                                        \
              EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);          \
            CODING_ISO_SINGLE_SHIFTING (coding) = 0;                    \
            break;                                                      \
          }                                                             \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))               \
          {                                                             \
            EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);            \
            break;                                                      \
          }                                                             \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))               \
          {                                                             \
            EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                  \
            break;                                                      \
          }                                                             \
        /* CHARSET is not invoked to either graphic plane yet:          \
           designate and/or invoke it, then go around again to          \
           produce the character itself.  */                            \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
      }                                                                 \
  } while (0)
4276
4277
/* Produce codes for character C of CHARSET.  C is first converted to
   its code point in CHARSET by ENCODE_CHAR; a charset of dimension 1
   is emitted from that code as is, any other charset from the code's
   second-lowest and lowest bytes.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset), (c));                               \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset),                        \
                                         code >> 8, code & 0xFF);          \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                 \
      }                                                                    \
  } while (0)
4287
4288
4289 /* Produce designation and invocation codes at a place pointed by DST
4290    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4291    Return new DST.  */
4292
4293 unsigned char *
4294 encode_invocation_designation (struct charset *charset,
4295                                struct coding_system *coding,
4296                                unsigned char *dst, int *p_nchars)
4297 {
4298   int multibytep = coding->dst_multibyte;
4299   int produced_chars = *p_nchars;
4300   int reg;                      /* graphic register number */
4301   int id = CHARSET_ID (charset);
4302
4303   /* At first, check designations.  */
4304   for (reg = 0; reg < 4; reg++)
4305     if (id == CODING_ISO_DESIGNATION (coding, reg))
4306       break;
4307
4308   if (reg >= 4)
4309     {
4310       /* CHARSET is not yet designated to any graphic registers.  */
4311       /* At first check the requested designation.  */
4312       reg = CODING_ISO_REQUEST (coding, id);
4313       if (reg < 0)
4314         /* Since CHARSET requests no special designation, designate it
4315            to graphic register 0.  */
4316         reg = 0;
4317
4318       ENCODE_DESIGNATION (charset, reg, coding);
4319     }
4320
4321   if (CODING_ISO_INVOCATION (coding, 0) != reg
4322       && CODING_ISO_INVOCATION (coding, 1) != reg)
4323     {
4324       /* Since the graphic register REG is not invoked to any graphic
4325          planes, invoke it to graphic plane 0.  */
4326       switch (reg)
4327         {
4328         case 0:                 /* graphic register 0 */
4329           ENCODE_SHIFT_IN;
4330           break;
4331
4332         case 1:                 /* graphic register 1 */
4333           ENCODE_SHIFT_OUT;
4334           break;
4335
4336         case 2:                 /* graphic register 2 */
4337           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4338             ENCODE_SINGLE_SHIFT_2;
4339           else
4340             ENCODE_LOCKING_SHIFT_2;
4341           break;
4342
4343         case 3:                 /* graphic register 3 */
4344           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4345             ENCODE_SINGLE_SHIFT_3;
4346           else
4347             ENCODE_LOCKING_SHIFT_3;
4348           break;
4349         }
4350     }
4351
4352   *p_nchars = produced_chars;
4353   return dst;
4354 }
4355
4356 /* The following three macros produce codes for indicating direction
4357    of text.  */
/* Produce a control sequence introducer: ESC '[' when the coding
   system is restricted to seven bits, the single byte ISO_CODE_CSI
   otherwise.  CODING_ISO_FLAGS is a set of flag bits, so the
   seven-bit flag must be tested with `&'; comparing the whole word
   with `==' would select the 7-bit form only when no other flag
   happens to be set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
4365
4366
/* Produce the sequence that starts right-to-left text: a control
   sequence introducer followed by '2' ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; a trailing `(dst)' would be left
   in the expansion after `while (0)' and not compile.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
4372
4373
/* Produce the sequence that goes back to left-to-right text: a
   control sequence introducer followed by '0' ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; a trailing `(dst)' would be left
   in the expansion after `while (0)' and not compile.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
4379
4380
4381 /* Produce codes for designation and invocation to reset the graphic
4382    planes and registers to initial state.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg = 0;                                                        \
    struct charset *charset;                                            \
                                                                        \
    /* Bring graphic register 0 back to graphic plane 0.  */            \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    /* Re-designate every register whose current charset differs from   \
       its initial one.  Registers that have no initial charset         \
       (negative id) are left untouched.  */                            \
    while (reg < 4)                                                     \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, reg);              \
                                                                        \
        if (initial_id >= 0                                             \
            && initial_id != CODING_ISO_DESIGNATION (coding, reg))      \
          {                                                             \
            charset = CHARSET_FROM_ID (initial_id);                     \
            ENCODE_DESIGNATION (charset, reg, coding);                  \
          }                                                             \
        reg++;                                                          \
      }                                                                 \
  } while (0)
4399
4400
4401 /* Produce designation sequences of charsets in the line started from
4402    SRC to a place pointed by DST, and return updated DST.
4403
4404    If the current block ends before any end-of-line, we may fail to
4405    find all the necessary designations.  */
4406
4407 static unsigned char *
4408 encode_designation_at_bol (struct coding_system *coding, int *charbuf,
4409                            int *charbuf_end, unsigned char *dst)
4410 {
4411   struct charset *charset;
4412   /* Table of charsets to be designated to each graphic register.  */
4413   int r[4];
4414   int c, found = 0, reg;
4415   int produced_chars = 0;
4416   int multibytep = coding->dst_multibyte;
4417   Lisp_Object attrs;
4418   Lisp_Object charset_list;
4419
4420   attrs = CODING_ID_ATTRS (coding->id);
4421   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4422   if (EQ (charset_list, Qiso_2022))
4423     charset_list = Viso_2022_charset_list;
4424
4425   for (reg = 0; reg < 4; reg++)
4426     r[reg] = -1;
4427
4428   while (found < 4)
4429     {
4430       int id;
4431
4432       c = *charbuf++;
4433       if (c == '\n')
4434         break;
4435       charset = char_charset (c, charset_list, NULL);
4436       id = CHARSET_ID (charset);
4437       reg = CODING_ISO_REQUEST (coding, id);
4438       if (reg >= 0 && r[reg] < 0)
4439         {
4440           found++;
4441           r[reg] = id;
4442         }
4443     }
4444
4445   if (found)
4446     {
4447       for (reg = 0; reg < 4; reg++)
4448         if (r[reg] >= 0
4449             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4450           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4451     }
4452
4453   return dst;
4454 }
4455
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the characters stored in CODING->charbuf in ISO 2022 form,
   writing the bytes after the CODING->produced bytes already present
   in CODING->destination.  Negative entries in the charbuf start
   annotations: a charset annotation selects the charset preferred for
   the characters that follow, and a composition annotation is
   skipped.  Before returning, the beginning-of-line designation flag
   is saved back into CODING, and CODING->produced and
   CODING->produced_char are updated.  Always returns 0.  */

static int
encode_coding_iso_2022 (struct coding_system *coding)
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Room requested from ASSURE_DESTINATION before each character.  */
  int safe_room = 16;
  /* Nonzero if designation sequences must be produced before the next
     character, i.e. we are at the beginning of a line and the coding
     system designates at BOL.  */
  int bol_designation
    = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
       && CODING_ISO_BOL (coding));
  int produced_chars = 0;
  Lisp_Object attrs, eol_type, charset_list;
  int ascii_compatible;
  int c;
  /* Charset requested by the latest charset annotation, or -1.  */
  int preferred_charset_id = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  /* A vector-valued EOL type is handled as Qunix here.  */
  if (VECTORP (eol_type))
    eol_type = Qunix;

  setup_iso_safe_charsets (attrs);
  /* Charset list may have been changed.  */
  charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));

  /* ASCII can be emitted directly only when the coding system is
     marked ASCII compatible and uses neither designation nor locking
     shift.  */
  ascii_compatible
    = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
       && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
                                          | CODING_ISO_FLAG_LOCKING_SHIFT)));

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);

      if (bol_designation)
        {
          unsigned char *dst_prev = dst;

          /* We have to produce designation sequences if any now.  */
          dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
          bol_designation = 0;
          /* We are sure that designation sequences are all ASCII bytes.  */
          produced_chars += dst - dst_prev;
        }

      c = *charbuf++;

      if (c < 0)
        {
          /* Handle an annotation.  C is negative, *CHARBUF is the
             annotation kind, and the remaining -C - 1 entries are
             skipped below.  */
          switch (*charbuf)
            {
            case CODING_ANNOTATE_COMPOSITION_MASK:
              /* Not yet implemented.  */
              break;
            case CODING_ANNOTATE_CHARSET_MASK:
              /* Honor the annotated charset only if the coding system
                 lists it; otherwise fall back to no preference.  */
              preferred_charset_id = charbuf[2];
              if (preferred_charset_id >= 0
                  && NILP (Fmemq (make_number (preferred_charset_id),
                                  charset_list)))
                preferred_charset_id = -1;
              break;
            default:
              abort ();
            }
          charbuf += -c - 1;
          continue;
        }

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
        {
          /* Control character.  An end of line (LF, or CR when the EOL
             type is Qmac) may reset the ISO state and re-arm the
             designation at the next beginning of line.  */
          if (c == '\n'
              || (c == '\r' && EQ (eol_type, Qmac)))
            {
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
                ENCODE_RESET_PLANE_AND_REGISTER ();
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
                {
                  int i;

                  for (i = 0; i < 4; i++)
                    CODING_ISO_DESIGNATION (coding, i)
                      = CODING_ISO_INITIAL (coding, i);
                }
              bol_designation
                = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
            }
          else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
            ENCODE_RESET_PLANE_AND_REGISTER ();
          EMIT_ONE_ASCII_BYTE (c);
        }
      else if (ASCII_CHAR_P (c))
        {
          if (ascii_compatible)
            EMIT_ONE_ASCII_BYTE (c);
          else
            {
              struct charset *charset = CHARSET_FROM_ID (charset_ascii);
              ENCODE_ISO_CHARACTER (charset, c);
            }
        }
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw 8-bit byte is written out as that byte.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          struct charset *charset;

          /* Use the preferred charset if it contains C; otherwise
             search the charset list of the coding system.  */
          if (preferred_charset_id >= 0)
            {
              charset = CHARSET_FROM_ID (preferred_charset_id);
              if (! CHAR_CHARSET_P (c, charset))
                charset = char_charset (c, charset_list, NULL);
            }
          else
            charset = char_charset (c, charset_list, NULL);
          if (!charset)
            {
              /* C is not encodable.  Substitute either the inhibit
                 marker (safe-encoding mode) or the default character
                 of the coding system.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                {
                  c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
                  charset = CHARSET_FROM_ID (charset_ascii);
                }
              else
                {
                  c = coding->default_char;
                  charset = char_charset (c, charset_list, NULL);
                }
            }
          ENCODE_ISO_CHARACTER (charset, c);
        }
    }

  /* At the end of the last block, reset the ISO state if the coding
     system resets at end of line.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
    {
      ASSURE_DESTINATION (safe_room);
      ENCODE_RESET_PLANE_AND_REGISTER ();
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  CODING_ISO_BOL (coding) = bol_designation;
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
4609
4610 \f
/*** 8. SJIS and BIG5 handlers ***/
4612
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them in the bare
   C code.  But, in the future, they may be supported only by CCL.  */
4616
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, with its two position-codes divided and
   shifted so that they fit in the ranges below.
4623
4624    --- CODE RANGE of SJIS ---
4625    (character set)      (range)
4626    ASCII                0x00 .. 0x7F
4627    KATAKANA-JISX0201    0xA0 .. 0xDF
4628    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4629             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4630    -------------------------------
4631
4632 */
4633
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set, and each of its characters is encoded in two bytes.
4637
4638    --- CODE RANGE of BIG5 ---
4639    (character set)      (range)
4640    ASCII                0x00 .. 0x7F
4641    Big5 (1st byte)      0xA1 .. 0xFE
4642         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4643    --------------------------
4644
4645   */
4646
4647 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4648    Check if a text is encoded in SJIS.  If it is, return
4649    CATEGORY_MASK_SJIS, else return 0.  */
4650
4651 static int
4652 detect_coding_sjis (struct coding_system *coding,
4653                     struct coding_detection_info *detect_info)
4654 {
4655   const unsigned char *src = coding->source, *src_base;
4656   const unsigned char *src_end = coding->source + coding->src_bytes;
4657   int multibytep = coding->src_multibyte;
4658   int consumed_chars = 0;
4659   int found = 0;
4660   int c;
4661   Lisp_Object attrs, charset_list;
4662   int max_first_byte_of_2_byte_code;
4663
4664   CODING_GET_INFO (coding, attrs, charset_list);
4665   max_first_byte_of_2_byte_code
4666     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4667
4668   detect_info->checked |= CATEGORY_MASK_SJIS;
4669   /* A coding system of this category is always ASCII compatible.  */
4670   src += coding->head_ascii;
4671
4672   while (1)
4673     {
4674       src_base = src;
4675       ONE_MORE_BYTE (c);
4676       if (c < 0x80)
4677         continue;
4678       if ((c >= 0x81 && c <= 0x9F)
4679           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4680         {
4681           ONE_MORE_BYTE (c);
4682           if (c < 0x40 || c == 0x7F || c > 0xFC)
4683             break;
4684           found = CATEGORY_MASK_SJIS;
4685         }
4686       else if (c >= 0xA0 && c < 0xE0)
4687         found = CATEGORY_MASK_SJIS;
4688       else
4689         break;
4690     }
4691   detect_info->rejected |= CATEGORY_MASK_SJIS;
4692   return 0;
4693
4694  no_more_source:
4695   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4696     {
4697       detect_info->rejected |= CATEGORY_MASK_SJIS;
4698       return 0;
4699     }
4700   detect_info->found |= found;
4701   return 1;
4702 }
4703
4704 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4705    Check if a text is encoded in BIG5.  If it is, return
4706    CATEGORY_MASK_BIG5, else return 0.  */
4707
4708 static int
4709 detect_coding_big5 (struct coding_system *coding,
4710                     struct coding_detection_info *detect_info)
4711 {
4712   const unsigned char *src = coding->source, *src_base;
4713   const unsigned char *src_end = coding->source + coding->src_bytes;
4714   int multibytep = coding->src_multibyte;
4715   int consumed_chars = 0;
4716   int found = 0;
4717   int c;
4718
4719   detect_info->checked |= CATEGORY_MASK_BIG5;
4720   /* A coding system of this category is always ASCII compatible.  */
4721   src += coding->head_ascii;
4722
4723   while (1)
4724     {
4725       src_base = src;
4726       ONE_MORE_BYTE (c);
4727       if (c < 0x80)
4728         continue;
4729       if (c >= 0xA1)
4730         {
4731           ONE_MORE_BYTE (c);
4732           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4733             return 0;
4734           found = CATEGORY_MASK_BIG5;
4735         }
4736       else
4737         break;
4738     }
4739   detect_info->rejected |= CATEGORY_MASK_BIG5;
4740   return 0;
4741
4742  no_more_source:
4743   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4744     {
4745       detect_info->rejected |= CATEGORY_MASK_BIG5;
4746       return 0;
4747     }
4748   detect_info->found |= found;
4749   return 1;
4750 }
4751
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode the SJIS bytes of CODING->source, starting at
   CODING->consumed, into character codes appended to CODING->charbuf.
   The first three elements of the charset list of the coding system
   are used respectively for bytes below 0x80, for single bytes
   0xA1..0xDF, and for two-byte codes whose first byte is at most
   0xEF; an optional fourth element is used for two-byte codes whose
   first byte is 0xF0..0xFC.  A byte that does not start a valid code
   is stored through BYTE8_TO_CHAR and counted in CODING->errors.
   A charset annotation is added to the charbuf for each run of
   characters decoded with the same non-ASCII charset.  */

static void
decode_coding_sjis (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the code being decoded; everything before it has been
     fully consumed.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_kanji, *charset_kana;
  struct charset *charset_kanji2;
  Lisp_Object attrs, charset_list, val;
  /* Position of the next character, and position and charset of the
     current run, used to build charset annotations.  */
  int char_offset = coding->produced_char;
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte read ahead after a CR when EOL_CRLF is nonzero, or -1.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);

  /* Pick up the charsets in the order they appear in the list.  The
     fourth one is optional.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The charbuf is full.  A byte read ahead after CR has not
             been decoded yet, so do not report it as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* Take the pending read-ahead byte first, if any.  */
      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* With CRLF end-of-line, fetch the byte following a CR right
             away; it is decoded by the next iteration.  */
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else if (c == 0x80 || c == 0xA0)
        goto invalid_code;
      else if (c >= 0xA1 && c <= 0xDF)
        {
          /* SJIS -> JISX0201-Kana */
          c &= 0x7F;
          charset = charset_kana;
        }
      else if (c <= 0xEF)
        {
          /* SJIS -> JISX0208 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS (c);
          charset = charset_kanji;
        }
      else if (c <= 0xFC && charset_kanji2)
        {
          /* SJIS -> JISX0213-2 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS2 (c);
          charset = charset_kanji2;
        }
      else
        goto invalid_code;
      /* When a different non-ASCII charset starts, close the run of
         the previous one with an annotation and start a new run.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of the offending code and store just its
         first byte (or, if ONE_MORE_BYTE yields a negative value, the
         negation of that value).  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Close the last run, and report only what was completely decoded
     as consumed.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4872
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode the BIG5 bytes of CODING->source, starting at
   CODING->consumed, into character codes appended to CODING->charbuf.
   The first element of the charset list of the coding system is used
   for bytes below 0x80 and the second one for two-byte codes.  A byte
   that does not start a valid code is stored through BYTE8_TO_CHAR
   and counted in CODING->errors.  A charset annotation is added to
   the charbuf for each run of characters decoded with the same
   non-ASCII charset.  */

static void
decode_coding_big5 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the code being decoded; everything before it has been
     fully consumed.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_big5;
  Lisp_Object attrs, charset_list, val;
  /* Position of the next character, and position and charset of the
     current run, used to build charset annotations.  */
  int char_offset = coding->produced_char;
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte read ahead after a CR when EOL_CRLF is nonzero, or -1.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The charbuf is full.  A byte read ahead after CR has not
             been decoded yet, so do not report it as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* Take the pending read-ahead byte first, if any.  */
      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);

      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* With CRLF end-of-line, fetch the byte following a CR right
             away; it is decoded by the next iteration.  */
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else
        {
          /* BIG5 -> Big5 */
          if (c < 0xA1 || c > 0xFE)
            goto invalid_code;
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
            goto invalid_code;
          c = c << 8 | c1;
          charset = charset_big5;
        }
      /* When a different non-ASCII charset starts, close the run of
         the previous one with an annotation and start a new run.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of the offending code and store just its
         first byte (or, if ONE_MORE_BYTE yields a negative value, the
         negation of that value).  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Close the last run, and report only what was completely decoded
     as consumed.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4968
4969 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4970    This function can encode charsets `ascii', `katakana-jisx0201',
4971    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4972    are sure that all these charsets are registered as official charset
4973    (i.e. do not have extended leading-codes).  Characters of other
4974    charsets are produced without any encoding.  If SJIS_P is 1, encode
4975    SJIS text, else encode BIG5 text.  */
4976
4977 static int
4978 encode_coding_sjis (struct coding_system *coding)
4979 {
4980   int multibytep = coding->dst_multibyte;
4981   int *charbuf = coding->charbuf;
4982   int *charbuf_end = charbuf + coding->charbuf_used;
4983   unsigned char *dst = coding->destination + coding->produced;
4984   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4985   int safe_room = 4;
4986   int produced_chars = 0;
4987   Lisp_Object attrs, charset_list, val;
4988   int ascii_compatible;
4989   struct charset *charset_roman, *charset_kanji, *charset_kana;
4990   struct charset *charset_kanji2;
4991   int c;
4992
4993   CODING_GET_INFO (coding, attrs, charset_list);
4994   val = charset_list;
4995   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4996   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4997   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4998   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4999
5000   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5001
5002   while (charbuf < charbuf_end)
5003     {
5004       ASSURE_DESTINATION (safe_room);
5005       c = *charbuf++;
5006       /* Now encode the character C.  */
5007       if (ASCII_CHAR_P (c) && ascii_compatible)
5008         EMIT_ONE_ASCII_BYTE (c);
5009       else if (CHAR_BYTE8_P (c))
5010         {
5011           c = CHAR_TO_BYTE8 (c);
5012           EMIT_ONE_BYTE (c);
5013         }
5014       else
5015         {
5016           unsigned code;
5017           struct charset *charset = char_charset (c, charset_list, &code);
5018
5019           if (!charset)
5020             {
5021               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5022                 {
5023                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5024                   charset = CHARSET_FROM_ID (charset_ascii);
5025                 }
5026               else
5027                 {
5028                   c = coding->default_char;
5029                   charset = char_charset (c, charset_list, &code);
5030                 }
5031             }
5032           if (code == CHARSET_INVALID_CODE (charset))
5033             abort ();
5034           if (charset == charset_kanji)
5035             {
5036               int c1, c2;
5037               JIS_TO_SJIS (code);
5038               c1 = code >> 8, c2 = code & 0xFF;
5039               EMIT_TWO_BYTES (c1, c2);
5040             }
5041           else if (charset == charset_kana)
5042             EMIT_ONE_BYTE (code | 0x80);
5043           else if (charset_kanji2 && charset == charset_kanji2)
5044             {
5045               int c1, c2;
5046
5047               c1 = code >> 8;
5048               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5049                   || c1 == 0x28
5050                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5051                 {
5052                   JIS_TO_SJIS2 (code);
5053                   c1 = code >> 8, c2 = code & 0xFF;
5054                   EMIT_TWO_BYTES (c1, c2);
5055                 }
5056               else
5057                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5058             }
5059           else
5060             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5061         }
5062     }
5063   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5064   coding->produced_char += produced_chars;
5065   coding->produced = dst - coding->destination;
5066   return 0;
5067 }
5068
5069 static int
5070 encode_coding_big5 (struct coding_system *coding)
5071 {
5072   int multibytep = coding->dst_multibyte;
5073   int *charbuf = coding->charbuf;
5074   int *charbuf_end = charbuf + coding->charbuf_used;
5075   unsigned char *dst = coding->destination + coding->produced;
5076   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5077   int safe_room = 4;
5078   int produced_chars = 0;
5079   Lisp_Object attrs, charset_list, val;
5080   int ascii_compatible;
5081   struct charset *charset_roman, *charset_big5;
5082   int c;
5083
5084   CODING_GET_INFO (coding, attrs, charset_list);
5085   val = charset_list;
5086   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5087   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5088   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5089
5090   while (charbuf < charbuf_end)
5091     {
5092       ASSURE_DESTINATION (safe_room);
5093       c = *charbuf++;
5094       /* Now encode the character C.  */
5095       if (ASCII_CHAR_P (c) && ascii_compatible)
5096         EMIT_ONE_ASCII_BYTE (c);
5097       else if (CHAR_BYTE8_P (c))
5098         {
5099           c = CHAR_TO_BYTE8 (c);
5100           EMIT_ONE_BYTE (c);
5101         }
5102       else
5103         {
5104           unsigned code;
5105           struct charset *charset = char_charset (c, charset_list, &code);
5106
5107           if (! charset)
5108             {
5109               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5110                 {
5111                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5112                   charset = CHARSET_FROM_ID (charset_ascii);
5113                 }
5114               else
5115                 {
5116                   c = coding->default_char;
5117                   charset = char_charset (c, charset_list, &code);
5118                 }
5119             }
5120           if (code == CHARSET_INVALID_CODE (charset))
5121             abort ();
5122           if (charset == charset_big5)
5123             {
5124               int c1, c2;
5125
5126               c1 = code >> 8, c2 = code & 0xFF;
5127               EMIT_TWO_BYTES (c1, c2);
5128             }
5129           else
5130             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5131         }
5132     }
5133   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5134   coding->produced_char += produced_chars;
5135   coding->produced = dst - coding->destination;
5136   return 0;
5137 }
5138
5139 \f
/*** 9. CCL handlers ***/
5141
5142 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5143    Check if a text is encoded in a coding system of which
5144    encoder/decoder are written in CCL program.  If it is, return
5145    CATEGORY_MASK_CCL, else return 0.  */
5146
5147 static int
5148 detect_coding_ccl (struct coding_system *coding,
5149                    struct coding_detection_info *detect_info)
5150 {
5151   const unsigned char *src = coding->source, *src_base;
5152   const unsigned char *src_end = coding->source + coding->src_bytes;
5153   int multibytep = coding->src_multibyte;
5154   int consumed_chars = 0;
5155   int found = 0;
5156   unsigned char *valids;
5157   int head_ascii = coding->head_ascii;
5158   Lisp_Object attrs;
5159
5160   detect_info->checked |= CATEGORY_MASK_CCL;
5161
5162   coding = &coding_categories[coding_category_ccl];
5163   valids = CODING_CCL_VALIDS (coding);
5164   attrs = CODING_ID_ATTRS (coding->id);
5165   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5166     src += head_ascii;
5167
5168   while (1)
5169     {
5170       int c;
5171
5172       src_base = src;
5173       ONE_MORE_BYTE (c);
5174       if (c < 0 || ! valids[c])
5175         break;
5176       if ((valids[c] > 1))
5177         found = CATEGORY_MASK_CCL;
5178     }
5179   detect_info->rejected |= CATEGORY_MASK_CCL;
5180   return 0;
5181
5182  no_more_source:
5183   detect_info->found |= found;
5184   return 1;
5185 }
5186
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode CODING->source, starting at CODING->consumed, by running the
   CCL program of the coding system.  The source is handed to
   ccl_driver in chunks of at most 1024 elements, and the output is
   appended to CODING->charbuf.  The final status of the CCL program
   is translated into a conversion result, and CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used are updated.  */

static void
decode_coding_ccl (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  int consumed_chars = 0;
  int multibytep = coding->src_multibyte;
  struct ccl_program *ccl = &coding->spec.ccl->ccl;
  /* One chunk of input for ccl_driver.  */
  int source_charbuf[1024];
  /* For multibyte source, byte offset from SRC of each element of
     SOURCE_CHARBUF, plus one entry for the end of the chunk.  */
  int source_byteidx[1025];
  Lisp_Object attrs, charset_list;

  CODING_GET_INFO (coding, attrs, charset_list);

  while (1)
    {
      const unsigned char *p = src;
      int i = 0;

      /* Fill the chunk: one character per element for multibyte
         source, one byte per element otherwise.  */
      if (multibytep)
        {
          while (i < 1024 && p < src_end)
            {
              source_byteidx[i] = p - src;
              source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
            }
          source_byteidx[i] = p - src;
        }
      else
        while (i < 1024 && p < src_end)
          source_charbuf[i++] = *p++;

      /* Tell the CCL program when this chunk ends the last block.  */
      if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
        ccl->last_block = 1;
      ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
                  charset_list);
      charbuf += ccl->produced;
      /* Advance SRC by the bytes that correspond to the elements the
         driver consumed.  */
      if (multibytep)
        src += source_byteidx[ccl->consumed];
      else
        src += ccl->consumed;
      consumed_chars += ccl->consumed;
      /* Go on only if the driver stopped for lack of input and more
         source remains beyond this chunk.  */
      if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
        break;
    }

  switch (ccl->status)
    {
    case CCL_STAT_SUSPEND_BY_SRC:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
      break;
    case CCL_STAT_SUSPEND_BY_DST:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
      break;
    case CCL_STAT_QUIT:
    case CCL_STAT_INVALID_CMD:
      record_conversion_result (coding, CODING_RESULT_INTERRUPT);
      break;
    default:
      record_conversion_result (coding, CODING_RESULT_SUCCESS);
      break;
    }
  coding->consumed_char += consumed_chars;
  coding->consumed = src - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5255
5256 static int
5257 encode_coding_ccl (struct coding_system *coding)
5258 {
5259   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5260   int multibytep = coding->dst_multibyte;
5261   int *charbuf = coding->charbuf;
5262   int *charbuf_end = charbuf + coding->charbuf_used;
5263   unsigned char *dst = coding->destination + coding->produced;
5264   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5265   int destination_charbuf[1024];
5266   int i, produced_chars = 0;
5267   Lisp_Object attrs, charset_list;
5268
5269   CODING_GET_INFO (coding, attrs, charset_list);
5270   if (coding->consumed_char == coding->src_chars
5271       && coding->mode & CODING_MODE_LAST_BLOCK)
5272     ccl->last_block = 1;
5273
5274   while (charbuf < charbuf_end)
5275     {
5276       ccl_driver (ccl, charbuf, destination_charbuf,
5277                   charbuf_end - charbuf, 1024, charset_list);
5278       if (multibytep)
5279         {
5280           ASSURE_DESTINATION (ccl->produced * 2);
5281           for (i = 0; i < ccl->produced; i++)
5282             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5283         }
5284       else
5285         {
5286           ASSURE_DESTINATION (ccl->produced);
5287           for (i = 0; i < ccl->produced; i++)
5288             *dst++ = destination_charbuf[i] & 0xFF;
5289           produced_chars += ccl->produced;
5290         }
5291       charbuf += ccl->consumed;
5292       if (ccl->status == CCL_STAT_QUIT
5293           || ccl->status == CCL_STAT_INVALID_CMD)
5294         break;
5295     }
5296
5297   switch (ccl->status)
5298     {
5299     case CCL_STAT_SUSPEND_BY_SRC:
5300       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5301       break;
5302     case CCL_STAT_SUSPEND_BY_DST:
5303       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5304       break;
5305     case CCL_STAT_QUIT:
5306     case CCL_STAT_INVALID_CMD:
5307       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5308       break;
5309     default:
5310       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5311       break;
5312     }
5313
5314   coding->produced_char += produced_chars;
5315   coding->produced = dst - coding->destination;
5316   return 0;
5317 }
5318
5319
5320 \f
5321 /*** 10, 11. no-conversion handlers ***/
5322
5323 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5324
5325 static void
5326 decode_coding_raw_text (struct coding_system *coding)
5327 {
5328   int eol_crlf =
5329     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5330
5331   coding->chars_at_source = 1;
5332   coding->consumed_char = coding->src_chars;
5333   coding->consumed = coding->src_bytes;
5334   if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
5335     {
5336       coding->consumed_char--;
5337       coding->consumed--;
5338       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5339     }
5340   else
5341     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5342 }
5343
/* Encode the characters held in CODING->charbuf (CODING->charbuf_used
   of them) as raw text, appending the bytes to CODING->destination
   after the CODING->produced bytes already there.

   Four cases are handled, selected by CODING->dst_multibyte and
   CODING->src_multibyte.  On exit CODING->produced and
   CODING->produced_char are updated and CODING_RESULT_SUCCESS is
   recorded.  Always returns 0.

   NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
   elsewhere; they presumably use the locals dst, dst_end, charbuf,
   charbuf_end, multibytep and produced_chars by name -- confirm
   before renaming any of them.  */
5344 static int
5345 encode_coding_raw_text (struct coding_system *coding)
5346 {
5347   int multibytep = coding->dst_multibyte;
5348   int *charbuf = coding->charbuf;
5349   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5350   unsigned char *dst = coding->destination + coding->produced;
5351   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5352   int produced_chars = 0;
5353   int c;
5354
5355   if (multibytep)
5356     {
5357       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5358
            /* Multibyte destination, multibyte source.  */
5359       if (coding->src_multibyte)
5360         while (charbuf < charbuf_end)
5361           {
5362             ASSURE_DESTINATION (safe_room);
5363             c = *charbuf++;
5364             if (ASCII_CHAR_P (c))
5365               EMIT_ONE_ASCII_BYTE (c);
5366             else if (CHAR_BYTE8_P (c))
5367               {
5368                 c = CHAR_TO_BYTE8 (c);
5369                 EMIT_ONE_BYTE (c);
5370               }
5371             else
5372               {
                      /* CHAR_STRING_ADVANCE writes into STR and advances
                         P1; emit the bytes it produced one at a time.  */
5373                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5374
5375                 CHAR_STRING_ADVANCE (c, p1);
5376                 while (p0 < p1)
5377                   {
5378                     EMIT_ONE_BYTE (*p0);
5379                     p0++;
5380                   }
5381               }
5382           }
            /* Multibyte destination, unibyte source: every element of
               charbuf is emitted with EMIT_ONE_BYTE.  */
5383       else
5384         while (charbuf < charbuf_end)
5385           {
5386             ASSURE_DESTINATION (safe_room);
5387             c = *charbuf++;
5388             EMIT_ONE_BYTE (c);
5389           }
5390     }
5391   else
5392     {
            /* Unibyte destination, multibyte source: bytes are stored
               through DST directly instead of through the EMIT_*
               macros.  */
5393       if (coding->src_multibyte)
5394         {
5395           int safe_room = MAX_MULTIBYTE_LENGTH;
5396
5397           while (charbuf < charbuf_end)
5398             {
5399               ASSURE_DESTINATION (safe_room);
5400               c = *charbuf++;
5401               if (ASCII_CHAR_P (c))
5402                 *dst++ = c;
5403               else if (CHAR_BYTE8_P (c))
5404                 *dst++ = CHAR_TO_BYTE8 (c);
5405               else
5406                 CHAR_STRING_ADVANCE (c, dst);
5407             }
5408         }
            /* Unibyte destination, unibyte source: room for the whole
               of charbuf is requested once, then the elements are
               copied one by one, stopping early if DST reaches
               DST_END.  */
5409       else
5410         {
5411           ASSURE_DESTINATION (charbuf_end - charbuf);
5412           while (charbuf < charbuf_end && dst < dst_end)
5413             *dst++ = *charbuf++;
5414         }
            /* The loops of this branch do not update produced_chars
               themselves, so it is derived from the number of bytes
               written.  */
5415       produced_chars = dst - (coding->destination + coding->produced);
5416     }
5417   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5418   coding->produced_char += produced_chars;
5419   coding->produced = dst - coding->destination;
5420   return 0;
5421 }
5422
5423 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5424    Check if a text is encoded in a charset-based coding system.  If it
5425    is, return 1, else return 0.  */
5426
5427 static int
5428 detect_coding_charset (struct coding_system *coding,
5429                        struct coding_detection_info *detect_info)
5430 {
5431   const unsigned char *src = coding->source, *src_base;
5432   const unsigned char *src_end = coding->source + coding->src_bytes;
5433   int multibytep = coding->src_multibyte;
5434   int consumed_chars = 0;
5435   Lisp_Object attrs, valids, name;
5436   int found = 0;
5437   int head_ascii = coding->head_ascii;
5438   int check_latin_extra = 0;
5439
5440   detect_info->checked |= CATEGORY_MASK_CHARSET;
5441
5442   coding = &coding_categories[coding_category_charset];
5443   attrs = CODING_ID_ATTRS (coding->id);
5444   valids = AREF (attrs, coding_attr_charset_valids);
5445   name = CODING_ID_NAME (coding->id);
5446   if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5447                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5448       || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5449                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5450     check_latin_extra = 1;
5451
5452   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5453     src += head_ascii;
5454
5455   while (1)
5456     {
5457       int c;
5458       Lisp_Object val;
5459       struct charset *charset;
5460       int dim, idx;
5461
5462       src_base = src;
5463       ONE_MORE_BYTE (c);
5464       if (c < 0)
5465         continue;
5466       val = AREF (valids, c);
5467       if (NILP (val))
5468         break;
5469       if (c >= 0x80)
5470         {
5471           if (c < 0xA0
5472               && check_latin_extra
5473               && (!VECTORP (Vlatin_extra_code_table)
5474                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5475             break;
5476           found = CATEGORY_MASK_CHARSET;
5477         }
5478       if (INTEGERP (val))
5479         {
5480           charset = CHARSET_FROM_ID (XFASTINT (val));
5481           dim = CHARSET_DIMENSION (charset);
5482           for (idx = 1; idx < dim; idx++)
5483             {
5484               if (src == src_end)
5485                 goto too_short;
5486               ONE_MORE_BYTE (c);
5487               if (c < charset->code_space[(dim - 1 - idx) * 2]
5488                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5489                 break;
5490             }
5491           if (idx < dim)
5492             break;
5493         }
5494       else
5495         {
5496           idx = 1;
5497           for (; CONSP (val); val = XCDR (val))
5498             {
5499               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5500               dim = CHARSET_DIMENSION (charset);
5501               while (idx < dim)
5502                 {
5503                   if (src == src_end)
5504                     goto too_short;
5505                   ONE_MORE_BYTE (c);
5506                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5507                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5508                     break;
5509                   idx++;
5510                 }
5511               if (idx == dim)
5512                 {
5513                   val = Qnil;
5514                   break;
5515                 }
5516             }
5517           if (CONSP (val))
5518             break;
5519         }
5520     }
5521  too_short:
5522   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5523   return 0;
5524
5525  no_more_source:
5526   detect_info->found |= found;
5527   return 1;
5528 }
5529
/* Decode a text in a charset-based coding system.  Bytes are read from
   CODING->source, starting at offset CODING->consumed, and the decoded
   characters are appended to CODING->charbuf.

   The first byte of each character is looked up in the coding system's
   charset_valids table, which gives either one charset ID or a list of
   candidate charset IDs.  When a sequence cannot be decoded, only its
   first byte is consumed; it is stored in charbuf (through
   BYTE8_TO_CHAR when it is not ASCII) and CODING->errors is
   incremented.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used tell how far decoding got.

   NOTE(review): ONE_MORE_BYTE, CODING_DECODE_CHAR and ADD_CHARSET_DATA
   are defined elsewhere; ONE_MORE_BYTE presumably uses src, src_end,
   src_base, multibytep and consumed_chars by name -- confirm before
   renaming any of them.  */
5530 static void
5531 decode_coding_charset (struct coding_system *coding)
5532 {
5533   const unsigned char *src = coding->source + coding->consumed;
5534   const unsigned char *src_end = coding->source + coding->src_bytes;
5535   const unsigned char *src_base;
5536   int *charbuf = coding->charbuf + coding->charbuf_used;
5537   /* We may produce one charset annotation in one loop and one more at
5538      the end.  */
5539   int *charbuf_end
5540     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5541   int consumed_chars = 0, consumed_chars_base;
5542   int multibytep = coding->src_multibyte;
5543   Lisp_Object attrs, charset_list, valids;
5544   int char_offset = coding->produced_char;
5545   int last_offset = char_offset;
5546   int last_id = charset_ascii;
5547   int eol_crlf =
5548     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte already read after a CR, or -1 when there is none.  */
5549   int byte_after_cr = -1;
5550
5551   CODING_GET_INFO (coding, attrs, charset_list);
5552   valids = AREF (attrs, coding_attr_charset_valids);
5553
5554   while (1)
5555     {
5556       int c;
5557       Lisp_Object val;
5558       struct charset *charset;
5559       int dim;
5560       int len = 1;
5561       unsigned code;
5562
           /* Remember where this character starts so that the
              invalid_code path and the final bookkeeping can fall
              back to it.  */
5563       src_base = src;
5564       consumed_chars_base = consumed_chars;
5565
           /* Output buffer is full.  If a byte was already read ahead
              after a CR, step SRC_BASE back by one so that byte is not
              reported as consumed.  */
5566       if (charbuf >= charbuf_end)
5567         {
5568           if (byte_after_cr >= 0)
5569             src_base--;
5570           break;
5571         }
5572
           /* Use the byte read ahead on the previous iteration, if
              any; otherwise read a fresh one.  */
5573       if (byte_after_cr >= 0)
5574         {
5575           c = byte_after_cr;
5576           byte_after_cr = -1;
5577         }
5578       else
5579         {
5580           ONE_MORE_BYTE (c);
               /* With DOS-style end-of-line, read the byte following a
                  CR right away and keep it for the next iteration.  */
5581           if (eol_crlf && c == '\r')
5582             ONE_MORE_BYTE (byte_after_cr);
5583         }
5584       if (c < 0)
5585         goto invalid_code;
5586       code = c;
5587
5588       val = AREF (valids, c);
5589       if (! INTEGERP (val) && ! CONSP (val))
5590         goto invalid_code;
5591       if (INTEGERP (val))
5592         {
               /* VAL is a single charset ID: collect the remaining
                  bytes of the code point, most significant first.  */
5593           charset = CHARSET_FROM_ID (XFASTINT (val));
5594           dim = CHARSET_DIMENSION (charset);
5595           while (len < dim)
5596             {
5597               ONE_MORE_BYTE (c);
5598               code = (code << 8) | c;
5599               len++;
5600             }
5601           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5602                               charset, code, c);
5603         }
5604       else
5605         {
5606           /* VAL is a list of charset IDs.  It is assured that the
5607              list is sorted by charset dimensions (smaller one
5608              comes first).  */
               /* LEN and CODE persist across candidates, so a later
                  (larger) charset only reads the extra bytes it
                  needs.  The loop stops at the first candidate for
                  which C comes back non-negative.  */
5609           while (CONSP (val))
5610             {
5611               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5612               dim = CHARSET_DIMENSION (charset);
5613               while (len < dim)
5614                 {
5615                   ONE_MORE_BYTE (c);
5616                   code = (code << 8) | c;
5617                   len++;
5618                 }
5619               CODING_DECODE_CHAR (coding, src, src_base,
5620                                   src_end, charset, code, c);
5621               if (c >= 0)
5622                 break;
5623               val = XCDR (val);
5624             }
5625         }
5626       if (c < 0)
5627         goto invalid_code;
           /* The charset changed to a non-ASCII one: close the
              annotation for the previous non-ASCII run, if any, and
              start tracking a new run from the current offset.  */
5628       if (charset->id != charset_ascii
5629           && last_id != charset->id)
5630         {
5631           if (last_id != charset_ascii)
5632             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5633           last_id = charset->id;
5634           last_offset = char_offset;
5635         }
5636
5637       *charbuf++ = c;
5638       char_offset++;
5639       continue;
5640
5641     invalid_code:
           /* Rewind to the start of this character and pass a single
              byte through instead.  */
5642       src = src_base;
5643       consumed_chars = consumed_chars_base;
5644       ONE_MORE_BYTE (c);
5645       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5646       char_offset++;
5647       coding->errors++;
5648     }
5649
       /* Reached by the `break' above.  NOTE(review): no explicit
          `goto no_more_source' appears in this function; the label is
          presumably the jump target used inside ONE_MORE_BYTE --
          confirm against the macro.  */
5650  no_more_source:
5651   if (last_id != charset_ascii)
5652     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5653   coding->consumed_char += consumed_chars_base;
5654   coding->consumed = src_base - coding->source;
5655   coding->charbuf_used = charbuf - coding->charbuf;
5656 }
5657
5658 static int
5659 encode_coding_charset (struct coding_system *coding)
5660 {
5661   int multibytep = coding->dst_multibyte;
5662   int *charbuf = coding->charbuf;
5663   int *charbuf_end = charbuf + coding->charbuf_used;
5664   unsigned char *dst = coding->destination + coding->produced;
5665   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5666   int safe_room = MAX_MULTIBYTE_LENGTH;
5667   int produced_chars = 0;
5668   Lisp_Object attrs, charset_list;
5669   int ascii_compatible;
5670   int c;
5671
5672   CODING_GET_INFO (coding, attrs, charset_list);
5673   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5674
5675   while (charbuf < charbuf_end)
5676     {
5677       struct charset *charset;
5678       unsigned code;
5679
5680       ASSURE_DESTINATION (safe_room);
5681       c = *charbuf++;
5682       if (ascii_compatible && ASCII_CHAR_P (c))
5683         EMIT_ONE_ASCII_BYTE (c);
5684       else if (CHAR_BYTE8_P (c))
5685         {
5686           c = CHAR_TO_BYTE8 (c);
5687           EMIT_ONE_BYTE (c);
5688         }
5689       else
5690         {
5691           charset = char_charset (c, charset_list, &code);
5692           if (charset)
5693             {
5694               if (CHARSET_DIMENSION (charset) == 1)
5695                 EMIT_ONE_BYTE (code);
5696               else if (CHARSET_DIMENSION (charset) == 2)
5697                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5698               else if (CHARSET_DIMENSION (charset) == 3)
5699                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5700               else
5701                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5702                                  (code >> 8) & 0xFF, code & 0xFF);
5703             }
5704           else
5705             {
5706               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5707                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5708               else
5709                 c = coding->default_char;
5710               EMIT_ONE_BYTE (c);
5711             }
5712         }
5713     }
5714
5715   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5716   coding->produced_char += produced_chars;
5717   coding->produced = dst - coding->destination;
5718   return 0;
5719 }
5720
5721 \f
5722 /*** 10. C library functions ***/
5723
5724 /* Setup coding context CODING from information about CODING_SYSTEM.
5725    If CODING_SYSTEM is nil, Qundecided is used in its place.  If
5726    CODING_SYSTEM is invalid, signal an error.

        The fields common to every coding system (mode, head_ascii,
        common_flags, safe charsets, default_char, carryover_bytes) are
        set first.  Then, depending on the coding system's type, the
        detector, decoder and encoder functions are installed and the
        type-specific state is initialized.  */
5727
5728 void
5729 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5730 {
5731   Lisp_Object attrs;
5732   Lisp_Object eol_type;
5733   Lisp_Object coding_type;
5734   Lisp_Object val;
5735
5736   if (NILP (coding_system))
5737     coding_system = Qundecided;
5738
5739   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5740
5741   attrs = CODING_ID_ATTRS (coding->id);
       /* When EOL conversion is inhibited, behave as if the coding
          system had Unix-style end-of-line.  */
5742   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5743
5744   coding->mode = 0;
5745   coding->head_ascii = -1;
       /* A vector EOL type means the end-of-line format is not decided
          yet, so detection is required.  Any decided format other than
          Qunix requires conversion in both directions.  */
5746   if (VECTORP (eol_type))
5747     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5748                             | CODING_REQUIRE_DETECTION_MASK);
5749   else if (! EQ (eol_type, Qunix))
5750     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5751                             | CODING_REQUIRE_ENCODING_MASK);
5752   else
5753     coding->common_flags = 0;
5754   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5755     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5756   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5757     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5758   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5759     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5760
5761   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5762   coding->max_charset_id = SCHARS (val) - 1;
5763   coding->safe_charsets = SDATA (val);
5764   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5765   coding->carryover_bytes = 0;
5766
5767   coding_type = CODING_ATTR_TYPE (attrs);
5768   if (EQ (coding_type, Qundecided))
5769     {
           /* Undecided: no detector of its own; the raw-text decoder
              and encoder are installed and detection is requested.  */
5770       coding->detector = NULL;
5771       coding->decoder = decode_coding_raw_text;
5772       coding->encoder = encode_coding_raw_text;
5773       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5774     }
5775   else if (EQ (coding_type, Qiso_2022))
5776     {
5777       int i;
5778       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5779
5780       /* Invoke graphic register 0 to plane 0.  */
5781       CODING_ISO_INVOCATION (coding, 0) = 0;
5782       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5783       CODING_ISO_INVOCATION (coding, 1)
5784         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5785       /* Setup the initial status of designation.  */
5786       for (i = 0; i < 4; i++)
5787         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5788       /* Not single shifting initially.  */
5789       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5790       /* Beginning of buffer should also be regarded as bol. */
5791       CODING_ISO_BOL (coding) = 1;
5792       coding->detector = detect_coding_iso_2022;
5793       coding->decoder = decode_coding_iso_2022;
5794       coding->encoder = encode_coding_iso_2022;
5795       if (flags & CODING_ISO_FLAG_SAFE)
5796         coding->mode |= CODING_MODE_SAFE_ENCODING;
5797       coding->common_flags
5798         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5799             | CODING_REQUIRE_FLUSHING_MASK);
5800       if (flags & CODING_ISO_FLAG_COMPOSITION)
5801         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5802       if (flags & CODING_ISO_FLAG_DESIGNATION)
5803         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
           /* With full support the safe-charsets attribute is rebuilt
              by setup_iso_safe_charsets, so the two fields derived
              from it above are refreshed.  */
5804       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5805         {
5806           setup_iso_safe_charsets (attrs);
5807           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5808           coding->max_charset_id = SCHARS (val) - 1;
5809           coding->safe_charsets = SDATA (val);
5810         }
5811       CODING_ISO_FLAGS (coding) = flags;
5812       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5813       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5814       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5815       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5816     }
5817   else if (EQ (coding_type, Qcharset))
5818     {
5819       coding->detector = detect_coding_charset;
5820       coding->decoder = decode_coding_charset;
5821       coding->encoder = encode_coding_charset;
5822       coding->common_flags
5823         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5824     }
5825   else if (EQ (coding_type, Qutf_8))
5826     {
           /* The BOM attribute is a cons (detect the BOM), t (always
              with BOM), or anything else (without BOM).  */
5827       val = AREF (attrs, coding_attr_utf_bom);
5828       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5829                                    : EQ (val, Qt) ? utf_with_bom
5830                                    : utf_without_bom);
5831       coding->detector = detect_coding_utf_8;
5832       coding->decoder = decode_coding_utf_8;
5833       coding->encoder = encode_coding_utf_8;
5834       coding->common_flags
5835         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5836       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5837         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5838     }
5839   else if (EQ (coding_type, Qutf_16))
5840     {
5841       val = AREF (attrs, coding_attr_utf_bom);
5842       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5843                                     : EQ (val, Qt) ? utf_with_bom
5844                                     : utf_without_bom);
           /* Anything other than Qbig selects little endian.  */
5845       val = AREF (attrs, coding_attr_utf_16_endian);
5846       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5847                                        : utf_16_little_endian);
5848       CODING_UTF_16_SURROGATE (coding) = 0;
5849       coding->detector = detect_coding_utf_16;
5850       coding->decoder = decode_coding_utf_16;
5851       coding->encoder = encode_coding_utf_16;
5852       coding->common_flags
5853         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5854       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5855         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5856     }
5857   else if (EQ (coding_type, Qccl))
5858     {
5859       coding->detector = detect_coding_ccl;
5860       coding->decoder = decode_coding_ccl;
5861       coding->encoder = encode_coding_ccl;
5862       coding->common_flags
5863         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5864             | CODING_REQUIRE_FLUSHING_MASK);
5865     }
5866   else if (EQ (coding_type, Qemacs_mule))
5867     {
5868       coding->detector = detect_coding_emacs_mule;
5869       coding->decoder = decode_coding_emacs_mule;
5870       coding->encoder = encode_coding_emacs_mule;
5871       coding->common_flags
5872         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
           /* NOTE(review): full_support is set to 1 here unconditionally
              and once more inside the conditional below, which makes
              the inner assignment redundant -- confirm which of the two
              is intended.  */
5873       coding->spec.emacs_mule.full_support = 1;
5874       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5875           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5876         {
5877           Lisp_Object tail, safe_charsets;
5878           int max_charset_id = 0;
5879
               /* Build a safe-charsets table covering every charset in
                  Vemacs_mule_charset_list: all entries start as 255 and
                  the entry of each listed charset is set to 0.  */
5880           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5881                tail = XCDR (tail))
5882             if (max_charset_id < XFASTINT (XCAR (tail)))
5883               max_charset_id = XFASTINT (XCAR (tail));
5884           safe_charsets = make_uninit_string (max_charset_id + 1);
5885           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5886           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5887                tail = XCDR (tail))
5888             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5889           coding->max_charset_id = max_charset_id;
               /* NOTE(review): this keeps a raw pointer into a Lisp
                  string that is referenced only by the local
                  SAFE_CHARSETS -- confirm the string stays alive while
                  CODING is in use.  */
5890           coding->safe_charsets = SDATA (safe_charsets);
5891           coding->spec.emacs_mule.full_support = 1;
5892         }
5893       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5894       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5895     }
5896   else if (EQ (coding_type, Qshift_jis))
5897     {
5898       coding->detector = detect_coding_sjis;
5899       coding->decoder = decode_coding_sjis;
5900       coding->encoder = encode_coding_sjis;
5901       coding->common_flags
5902         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5903     }
5904   else if (EQ (coding_type, Qbig5))
5905     {
5906       coding->detector = detect_coding_big5;
5907       coding->decoder = decode_coding_big5;
5908       coding->encoder = encode_coding_big5;
5909       coding->common_flags
5910         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5911     }
5912   else                          /* EQ (coding_type, Qraw_text) */
5913     {
5914       coding->detector = NULL;
5915       coding->decoder = decode_coding_raw_text;
5916       coding->encoder = encode_coding_raw_text;
           /* Raw text needs conversion only for end-of-line handling:
              decoding whenever the EOL type is not Qunix, encoding only
              when the EOL type is also decided (not a vector).  */
5917       if (! EQ (eol_type, Qunix))
5918         {
5919           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5920           if (! VECTORP (eol_type))
5921             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5922         }
5923
5924     }
5925
5926   return;
5927 }
5928
5929 /* Return a list of charsets supported by CODING.  */
5930
5931 Lisp_Object
5932 coding_charset_list (struct coding_system *coding)
5933 {
5934   Lisp_Object attrs, charset_list;
5935
5936   CODING_GET_INFO (coding, attrs, charset_list);
5937   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5938     {
5939       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5940
5941       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5942         charset_list = Viso_2022_charset_list;
5943     }
5944   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5945     {
5946       charset_list = Vemacs_mule_charset_list;
5947     }
5948   return charset_list;
5949 }
5950
5951
5952 /* Return a list of charsets supported by CODING-SYSTEM.  */
5953
5954 Lisp_Object
5955 coding_system_charset_list (Lisp_Object coding_system)
5956 {
5957   int id;
5958   Lisp_Object attrs, charset_list;
5959
5960   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5961   attrs = CODING_ID_ATTRS (id);
5962
5963   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5964     {
5965       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5966
5967       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5968         charset_list = Viso_2022_charset_list;
5969       else
5970         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5971     }
5972   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5973     {
5974       charset_list = Vemacs_mule_charset_list;
5975     }
5976   else
5977     {
5978       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5979     }
5980   return charset_list;
5981 }
5982
5983
5984 /* Return raw-text or one of its subsidiaries that has the same
5985    eol_type as CODING-SYSTEM.  */
5986
5987 Lisp_Object
5988 raw_text_coding_system (Lisp_Object coding_system)
5989 {
5990   Lisp_Object spec, attrs;
5991   Lisp_Object eol_type, raw_text_eol_type;
5992
5993   if (NILP (coding_system))
5994     return Qraw_text;
5995   spec = CODING_SYSTEM_SPEC (coding_system);
5996   attrs = AREF (spec, 0);
5997
5998   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5999     return coding_system;
6000
6001   eol_type = AREF (spec, 2);
6002   if (VECTORP (eol_type))
6003     return Qraw_text;
6004   spec = CODING_SYSTEM_SPEC (Qraw_text);
6005   raw_text_eol_type = AREF (spec, 2);
6006   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6007           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6008           : AREF (raw_text_eol_type, 2));
6009 }
6010
6011
6012 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6013    the subsidiary that has the same eol-spec as PARENT (if it is not
6014    nil and specifies end-of-line format) or the system's setting
6015    (system_eol_type).  */
6016
6017 Lisp_Object
6018 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6019 {
6020   Lisp_Object spec, eol_type;
6021
6022   if (NILP (coding_system))
6023     coding_system = Qraw_text;
6024   spec = CODING_SYSTEM_SPEC (coding_system);
6025   eol_type = AREF (spec, 2);
6026   if (VECTORP (eol_type))
6027     {
6028       Lisp_Object parent_eol_type;
6029
6030       if (! NILP (parent))
6031         {
6032           Lisp_Object parent_spec;
6033
6034           parent_spec = CODING_SYSTEM_SPEC (parent);
6035           parent_eol_type = AREF (parent_spec, 2);
6036           if (VECTORP (parent_eol_type))
6037             parent_eol_type = system_eol_type;
6038         }
6039       else
6040         parent_eol_type = system_eol_type;
6041       if (EQ (parent_eol_type, Qunix))
6042         coding_system = AREF (eol_type, 0);
6043       else if (EQ (parent_eol_type, Qdos))
6044         coding_system = AREF (eol_type, 1);
6045       else if (EQ (parent_eol_type, Qmac))
6046         coding_system = AREF (eol_type, 2);
6047     }
6048   return coding_system;
6049 }
6050
6051
6052 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6053    decided for writing to a process.  If not, complement them, and
6054    return a new coding system.  */
6055
6056 Lisp_Object
6057 complement_process_encoding_system (Lisp_Object coding_system)
6058 {
6059   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6060   Lisp_Object spec, attrs;
6061   int i;
6062
6063   for (i = 0; i < 3; i++)
6064     {
6065       if (i == 1)
6066         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6067       else if (i == 2)
6068         coding_system = preferred_coding_system ();
6069       spec = CODING_SYSTEM_SPEC (coding_system);
6070       if (NILP (spec))
6071         continue;
6072       attrs = AREF (spec, 0);
6073       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6074         coding_base = CODING_ATTR_BASE_NAME (attrs);
6075       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6076         eol_base = coding_system;
6077       if (! NILP (coding_base) && ! NILP (eol_base))
6078         break;
6079     }
6080
6081   if (i > 0)
6082     /* The original CODING_SYSTEM didn't specify text-conversion or
6083        eol-conversion.  Be sure that we return a fully complemented
6084        coding system.  */
6085     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6086   return coding_system;
6087 }
6088
6089
6090 /* Emacs has a mechanism to automatically detect a coding system if it
6091    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But
6092    it is impossible to distinguish some coding systems accurately
6093    because they use the same range of codes.  So, at first, coding
6094    systems are grouped into the categories listed below:
6095
6096    o coding-category-emacs-mule
6097
6098         The category for a coding system which has the same code range
6099         as Emacs' internal format.  Assigned the coding-system (Lisp
6100         symbol) `emacs-mule' by default.
6101
6102    o coding-category-sjis
6103
6104         The category for a coding system which has the same code range
6105         as SJIS.  Assigned the coding-system (Lisp
6106         symbol) `japanese-shift-jis' by default.
6107
6108    o coding-category-iso-7
6109
6110         The category for a coding system which has the same code range
6111         as ISO2022 of 7-bit environment.  This doesn't use any locking
6112         shift and single shift functions.  This can encode/decode all
6113         charsets.  Assigned the coding-system (Lisp symbol)
6114         `iso-2022-7bit' by default.
6115
6116    o coding-category-iso-7-tight
6117
6118         Same as coding-category-iso-7 except that this can
6119         encode/decode only the specified charsets.
6120
6121    o coding-category-iso-8-1
6122
6123         The category for a coding system which has the same code range
6124         as ISO2022 of 8-bit environment and graphic plane 1 used only
6125         for DIMENSION1 charset.  This doesn't use any locking shift
6126         and single shift functions.  Assigned the coding-system (Lisp
6127         symbol) `iso-latin-1' by default.
6128
6129    o coding-category-iso-8-2
6130
6131         The category for a coding system which has the same code range
6132         as ISO2022 of 8-bit environment and graphic plane 1 used only
6133         for DIMENSION2 charset.  This doesn't use any locking shift
6134         and single shift functions.  Assigned the coding-system (Lisp
6135         symbol) `japanese-iso-8bit' by default.
6136
6137    o coding-category-iso-7-else
6138
6139         The category for a coding system which has the same code range
6140         as ISO2022 of 7-bit environment but uses locking shift or
6141         single shift functions.  Assigned the coding-system (Lisp
6142         symbol) `iso-2022-7bit-lock' by default.
6143
6144    o coding-category-iso-8-else
6145
6146         The category for a coding system which has the same code range
6147         as ISO2022 of 8-bit environment but uses locking shift or
6148         single shift functions.  Assigned the coding-system (Lisp
6149         symbol) `iso-2022-8bit-ss2' by default.
6150
6151    o coding-category-big5
6152
6153         The category for a coding system which has the same code range
6154         as BIG5.  Assigned the coding-system (Lisp symbol)
6155         `cn-big5' by default.
6156
6157    o coding-category-utf-8
6158
6159         The category for a coding system which has the same code range
6160         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6161         symbol) `utf-8' by default.
6162
6163    o coding-category-utf-16-be
6164
6165         The category for a coding system in which a text has an
6166         Unicode signature (cf. Unicode Standard) in the order of BIG
6167         endian at the head.  Assigned the coding-system (Lisp symbol)
6168         `utf-16-be' by default.
6169
6170    o coding-category-utf-16-le
6171
6172         The category for a coding system in which a text has an
6173         Unicode signature (cf. Unicode Standard) in the order of
6174         LITTLE endian at the head.  Assigned the coding-system (Lisp
6175         symbol) `utf-16-le' by default.
6176
6177    o coding-category-ccl
6178
6179         The category for a coding system of which encoder/decoder is
6180         written in CCL programs.  The default value is nil, i.e., no
6181         coding system is assigned.
6182
6183    o coding-category-binary
6184
6185         The category for a coding system not categorized in any of the
6186         above.  Assigned the coding-system (Lisp symbol)
6187         `no-conversion' by default.
6188
6189    Each category is a Lisp symbol whose value is the actual
6190    `coding-system' (also a Lisp symbol) that a user assigned to it.
6191    What Emacs actually detects is the category of a coding system;
6192    it then uses the `coding-system' assigned to that category.  If
6193    more than one category is possible, Emacs selects the one with the
6194    highest priority.  The priorities are also specified by the user,
6195    in the Lisp variable `coding-category-list'.
6196
6197 */
6198
/* Kinds of end-of-line sequence found in a text.  detect_eol returns
   one of these values.  */
6199 #define EOL_SEEN_NONE   0	/* No end-of-line seen so far.  */
6200 #define EOL_SEEN_LF     1	/* LF ('\n').  */
6201 #define EOL_SEEN_CR     2	/* CR ('\r') not followed by LF.  */
6202 #define EOL_SEEN_CRLF   4	/* CR immediately followed by LF.  */
6203
6204 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6205    SOURCE is encoded.  If CATEGORY is one of
6206    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6207    two-byte, else they are encoded by one-byte.
6208
6209    Return one of EOL_SEEN_XXX.  */
6210
/* detect_eol stops scanning once it has seen this many end-of-line
   sequences.  */
6211 #define MAX_EOL_CHECK_COUNT 3
6212
6213 static int
6214 detect_eol (const unsigned char *source, EMACS_INT src_bytes,
6215             enum coding_category category)
6216 {
6217   const unsigned char *src = source, *src_end = src + src_bytes;
6218   unsigned char c;
6219   int total  = 0;
6220   int eol_seen = EOL_SEEN_NONE;
6221
6222   if ((1 << category) & CATEGORY_MASK_UTF_16)
6223     {
6224       int msb, lsb;
6225
6226       msb = category == (coding_category_utf_16_le
6227                          | coding_category_utf_16_le_nosig);
6228       lsb = 1 - msb;
6229
6230       while (src + 1 < src_end)
6231         {
6232           c = src[lsb];
6233           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6234             {
6235               int this_eol;
6236
6237               if (c == '\n')
6238                 this_eol = EOL_SEEN_LF;
6239               else if (src + 3 >= src_end
6240                        || src[msb + 2] != 0
6241                        || src[lsb + 2] != '\n')
6242                 this_eol = EOL_SEEN_CR;
6243               else
6244                 {
6245                   this_eol = EOL_SEEN_CRLF;
6246                   src += 2;
6247                 }
6248
6249               if (eol_seen == EOL_SEEN_NONE)
6250                 /* This is the first end-of-line.  */
6251                 eol_seen = this_eol;
6252               else if (eol_seen != this_eol)
6253                 {
6254                   /* The found type is different from what found before.
6255                      Allow for stray ^M characters in DOS EOL files.  */
6256                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6257                       || (eol_seen == EOL_SEEN_CRLF
6258                           && this_eol == EOL_SEEN_CR))
6259                     eol_seen = EOL_SEEN_CRLF;
6260                   else
6261                     {
6262                       eol_seen = EOL_SEEN_LF;
6263                       break;
6264                     }
6265                 }
6266               if (++total == MAX_EOL_CHECK_COUNT)
6267                 break;
6268             }
6269           src += 2;
6270         }
6271     }
6272   else
6273     while (src < src_end)
6274       {
6275         c = *src++;
6276         if (c == '\n' || c == '\r')
6277           {
6278             int this_eol;
6279
6280             if (c == '\n')
6281               this_eol = EOL_SEEN_LF;
6282             else if (src >= src_end || *src != '\n')
6283               this_eol = EOL_SEEN_CR;
6284             else
6285               this_eol = EOL_SEEN_CRLF, src++;
6286
6287             if (eol_seen == EOL_SEEN_NONE)
6288               /* This is the first end-of-line.  */
6289               eol_seen = this_eol;
6290             else if (eol_seen != this_eol)
6291               {
6292                 /* The found type is different from what found before.
6293                    Allow for stray ^M characters in DOS EOL files.  */
6294                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6295                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6296                   eol_seen = EOL_SEEN_CRLF;
6297                 else
6298                   {
6299                     eol_seen = EOL_SEEN_LF;
6300                     break;
6301                   }
6302               }
6303             if (++total == MAX_EOL_CHECK_COUNT)
6304               break;
6305           }
6306       }
6307   return eol_seen;
6308 }
6309
6310
6311 static Lisp_Object
6312 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6313 {
6314   Lisp_Object eol_type;
6315
6316   eol_type = CODING_ID_EOL_TYPE (coding->id);
6317   if (eol_seen & EOL_SEEN_LF)
6318     {
6319       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6320       eol_type = Qunix;
6321     }
6322   else if (eol_seen & EOL_SEEN_CRLF)
6323     {
6324       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6325       eol_type = Qdos;
6326     }
6327   else if (eol_seen & EOL_SEEN_CR)
6328     {
6329       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6330       eol_type = Qmac;
6331     }
6332   return eol_type;
6333 }
6334
6335 /* Detect how a text specified in CODING is encoded.  If a coding
6336    system is detected, update fields of CODING by the detected coding
6337    system.  */
6338
6339 void
6340 detect_coding (struct coding_system *coding)
6341 {
6342   const unsigned char *src, *src_end;
6343   int saved_mode = coding->mode;
6344
6345   coding->consumed = coding->consumed_char = 0;
6346   coding->produced = coding->produced_char = 0;
6347   coding_set_source (coding);
6348
6349   src_end = coding->source + coding->src_bytes;
6350   coding->head_ascii = 0;
6351
6352   /* If we have not yet decided the text encoding type, detect it
6353      now.  */
6354   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6355     {
6356       int c, i;
6357       struct coding_detection_info detect_info;
6358       int null_byte_found = 0, eight_bit_found = 0;
6359
6360       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6361       for (src = coding->source; src < src_end; src++)
6362         {
6363           c = *src;
6364           if (c & 0x80)
6365             {
6366               eight_bit_found = 1;
6367               if (null_byte_found)
6368                 break;
6369             }
6370           else if (c < 0x20)
6371             {
6372               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6373                   && ! inhibit_iso_escape_detection
6374                   && ! detect_info.checked)
6375                 {
6376                   if (detect_coding_iso_2022 (coding, &detect_info))
6377                     {
6378                       /* We have scanned the whole data.  */
6379                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6380                         {
6381                           /* We didn't find an 8-bit code.  We may
6382                              have found a null-byte, but it's very
6383                              rare that a binary file conforms to
6384                              ISO-2022.  */
6385                           src = src_end;
6386                           coding->head_ascii = src - coding->source;
6387                         }
6388                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6389                       break;
6390                     }
6391                 }
6392               else if (! c && !inhibit_null_byte_detection)
6393                 {
6394                   null_byte_found = 1;
6395                   if (eight_bit_found)
6396                     break;
6397                 }
6398               if (! eight_bit_found)
6399                 coding->head_ascii++;
6400             }
6401           else if (! eight_bit_found)
6402             coding->head_ascii++;
6403         }
6404
6405       if (null_byte_found || eight_bit_found
6406           || coding->head_ascii < coding->src_bytes
6407           || detect_info.found)
6408         {
6409           enum coding_category category;
6410           struct coding_system *this;
6411
6412           if (coding->head_ascii == coding->src_bytes)
6413             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6414             for (i = 0; i < coding_category_raw_text; i++)
6415               {
6416                 category = coding_priorities[i];
6417                 this = coding_categories + category;
6418                 if (detect_info.found & (1 << category))
6419                   break;
6420               }
6421           else
6422             {
6423               if (null_byte_found)
6424                 {
6425                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6426                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6427                 }
6428               for (i = 0; i < coding_category_raw_text; i++)
6429                 {
6430                   category = coding_priorities[i];
6431                   this = coding_categories + category;
6432                   if (this->id < 0)
6433                     {
6434                       /* No coding system of this category is defined.  */
6435                       detect_info.rejected |= (1 << category);
6436                     }
6437                   else if (category >= coding_category_raw_text)
6438                     continue;
6439                   else if (detect_info.checked & (1 << category))
6440                     {
6441                       if (detect_info.found & (1 << category))
6442                         break;
6443                     }
6444                   else if ((*(this->detector)) (coding, &detect_info)
6445                            && detect_info.found & (1 << category))
6446                     {
6447                       if (category == coding_category_utf_16_auto)
6448                         {
6449                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6450                             category = coding_category_utf_16_le;
6451                           else
6452                             category = coding_category_utf_16_be;
6453                         }
6454                       break;
6455                     }
6456                 }
6457             }
6458
6459           if (i < coding_category_raw_text)
6460             setup_coding_system (CODING_ID_NAME (this->id), coding);
6461           else if (null_byte_found)
6462             setup_coding_system (Qno_conversion, coding);
6463           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6464                    == CATEGORY_MASK_ANY)
6465             setup_coding_system (Qraw_text, coding);
6466           else if (detect_info.rejected)
6467             for (i = 0; i < coding_category_raw_text; i++)
6468               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6469                 {
6470                   this = coding_categories + coding_priorities[i];
6471                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6472                   break;
6473                 }
6474         }
6475     }
6476   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6477            == coding_category_utf_8_auto)
6478     {
6479       Lisp_Object coding_systems;
6480       struct coding_detection_info detect_info;
6481
6482       coding_systems
6483         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6484       detect_info.found = detect_info.rejected = 0;
6485       coding->head_ascii = 0;
6486       if (CONSP (coding_systems)
6487           && detect_coding_utf_8 (coding, &detect_info))
6488         {
6489           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6490             setup_coding_system (XCAR (coding_systems), coding);
6491           else
6492             setup_coding_system (XCDR (coding_systems), coding);
6493         }
6494     }
6495   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6496            == coding_category_utf_16_auto)
6497     {
6498       Lisp_Object coding_systems;
6499       struct coding_detection_info detect_info;
6500
6501       coding_systems
6502         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6503       detect_info.found = detect_info.rejected = 0;
6504       coding->head_ascii = 0;
6505       if (CONSP (coding_systems)
6506           && detect_coding_utf_16 (coding, &detect_info))
6507         {
6508           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6509             setup_coding_system (XCAR (coding_systems), coding);
6510           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6511             setup_coding_system (XCDR (coding_systems), coding);
6512         }
6513     }
6514   coding->mode = saved_mode;
6515 }
6516
6517
6518 static void
6519 decode_eol (struct coding_system *coding)
6520 {
6521   Lisp_Object eol_type;
6522   unsigned char *p, *pbeg, *pend;
6523
6524   eol_type = CODING_ID_EOL_TYPE (coding->id);
6525   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6526     return;
6527
6528   if (NILP (coding->dst_object))
6529     pbeg = coding->destination;
6530   else
6531     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6532   pend = pbeg + coding->produced;
6533
6534   if (VECTORP (eol_type))
6535     {
6536       int eol_seen = EOL_SEEN_NONE;
6537
6538       for (p = pbeg; p < pend; p++)
6539         {
6540           if (*p == '\n')
6541             eol_seen |= EOL_SEEN_LF;
6542           else if (*p == '\r')
6543             {
6544               if (p + 1 < pend && *(p + 1) == '\n')
6545                 {
6546                   eol_seen |= EOL_SEEN_CRLF;
6547                   p++;
6548                 }
6549               else
6550                 eol_seen |= EOL_SEEN_CR;
6551             }
6552         }
6553       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6554       if ((eol_seen & EOL_SEEN_CRLF) != 0
6555           && (eol_seen & EOL_SEEN_CR) != 0
6556           && (eol_seen & EOL_SEEN_LF) == 0)
6557         eol_seen = EOL_SEEN_CRLF;
6558       else if (eol_seen != EOL_SEEN_NONE
6559           && eol_seen != EOL_SEEN_LF
6560           && eol_seen != EOL_SEEN_CRLF
6561           && eol_seen != EOL_SEEN_CR)
6562         eol_seen = EOL_SEEN_LF;
6563       if (eol_seen != EOL_SEEN_NONE)
6564         eol_type = adjust_coding_eol_type (coding, eol_seen);
6565     }
6566
6567   if (EQ (eol_type, Qmac))
6568     {
6569       for (p = pbeg; p < pend; p++)
6570         if (*p == '\r')
6571           *p = '\n';
6572     }
6573   else if (EQ (eol_type, Qdos))
6574     {
6575       int n = 0;
6576
6577       if (NILP (coding->dst_object))
6578         {
6579           /* Start deleting '\r' from the tail to minimize the memory
6580              movement.  */
6581           for (p = pend - 2; p >= pbeg; p--)
6582             if (*p == '\r')
6583               {
6584                 memmove (p, p + 1, pend-- - p - 1);
6585                 n++;
6586               }
6587         }
6588       else
6589         {
6590           int pos_byte = coding->dst_pos_byte;
6591           int pos = coding->dst_pos;
6592           int pos_end = pos + coding->produced_char - 1;
6593
6594           while (pos < pos_end)
6595             {
6596               p = BYTE_POS_ADDR (pos_byte);
6597               if (*p == '\r' && p[1] == '\n')
6598                 {
6599                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6600                   n++;
6601                   pos_end--;
6602                 }
6603               pos++;
6604               if (coding->dst_multibyte)
6605                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6606               else
6607                 pos_byte++;
6608             }
6609         }
6610       coding->produced -= n;
6611       coding->produced_char -= n;
6612     }
6613 }
6614
6615
6616 /* Return a translation table (or list of them) from coding system
6617    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6618    decoding (ENCODEP is zero). */
6619
6620 static Lisp_Object
6621 get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
6622 {
6623   Lisp_Object standard, translation_table;
6624   Lisp_Object val;
6625
6626   if (NILP (Venable_character_translation))
6627     {
6628       if (max_lookup)
6629         *max_lookup = 0;
6630       return Qnil;
6631     }
6632   if (encodep)
6633     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6634       standard = Vstandard_translation_table_for_encode;
6635   else
6636     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6637       standard = Vstandard_translation_table_for_decode;
6638   if (NILP (translation_table))
6639     translation_table = standard;
6640   else
6641     {
6642       if (SYMBOLP (translation_table))
6643         translation_table = Fget (translation_table, Qtranslation_table);
6644       else if (CONSP (translation_table))
6645         {
6646           translation_table = Fcopy_sequence (translation_table);
6647           for (val = translation_table; CONSP (val); val = XCDR (val))
6648             if (SYMBOLP (XCAR (val)))
6649               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6650         }
6651       if (CHAR_TABLE_P (standard))
6652         {
6653           if (CONSP (translation_table))
6654             translation_table = nconc2 (translation_table,
6655                                         Fcons (standard, Qnil));
6656           else
6657             translation_table = Fcons (translation_table,
6658                                        Fcons (standard, Qnil));
6659         }
6660     }
6661
6662   if (max_lookup)
6663     {
6664       *max_lookup = 1;
6665       if (CHAR_TABLE_P (translation_table)
6666           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6667         {
6668           val = XCHAR_TABLE (translation_table)->extras[1];
6669           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6670             *max_lookup = XFASTINT (val);
6671         }
6672       else if (CONSP (translation_table))
6673         {
6674           Lisp_Object tail, val;
6675
6676           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6677             if (CHAR_TABLE_P (XCAR (tail))
6678                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6679               {
6680                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6681                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6682                   *max_lookup = XFASTINT (val);
6683               }
6684         }
6685     }
6686   return translation_table;
6687 }
6688
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables; anything else leaves C alone and TRANS as Qnil.

   TRANS is first set to Qnil.  When an entry found for C is a
   character, C itself is replaced by that character and TRANS is reset
   to Qnil.  Otherwise TRANS is left holding the entry (nil if there is
   none).

   For a list, the tables are consulted in order.  A character result
   is applied to C and the search goes on with the next table using
   the new C; the search stops at the first entry that is neither nil
   nor a character.  Non-char-table list elements are ignored.

   C and TRANS must be lvalues; all three arguments are evaluated more
   than once.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6713
6714
6715 /* Return a translation of character(s) at BUF according to TRANS.
6716    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6717    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6718    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6719    translation is found, and Qnil if not found..
6720    If BUF is too short to lookup characters in FROM, return Qt.  */
6721
6722 static Lisp_Object
6723 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6724 {
6725
6726   if (INTEGERP (trans))
6727     return trans;
6728   for (; CONSP (trans); trans = XCDR (trans))
6729     {
6730       Lisp_Object val = XCAR (trans);
6731       Lisp_Object from = XCAR (val);
6732       int len = ASIZE (from);
6733       int i;
6734
6735       for (i = 0; i < len; i++)
6736         {
6737           if (buf + i == buf_end)
6738             return Qt;
6739           if (XINT (AREF (from, i)) != buf[i])
6740             break;
6741         }
6742       if (i == len)
6743         return val;
6744     }
6745   return Qnil;
6746 }
6747
6748
/* Write decoded characters to the destination of CODING, starting at
   CODING->destination + CODING->produced.

   If CODING->chars_at_source is zero, the characters are taken from
   CODING->charbuf (CODING->charbuf_used elements).  Each non-negative
   element is a character, translated through TRANSLATION_TABLE before
   being stored; a negative element -N starts an annotation datum of N
   elements, which is skipped here.  Otherwise the CODING->consumed
   bytes at CODING->source are copied, converting byte by byte when
   CODING->src_multibyte and CODING->dst_multibyte differ.

   LAST_BLOCK nonzero means that a translation needing more characters
   than remain in the charbuf (get_translation returned Qt) is not
   deferred; the character is then emitted as is.

   CODING->produced and CODING->produced_char are advanced, and when
   the destination is a buffer and at least one character was produced
   insert_from_gap is called.

   Return the number of charbuf elements left unprocessed (always 0 in
   the chars_at_source case).  */

static int
produce_chars (struct coding_system *coding, Lisp_Object translation_table,
               int last_block)
{
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  EMACS_INT produced;
  EMACS_INT produced_chars = 0;
  int carryover = 0;

  if (! coding->chars_at_source)
    {
      /* Source characters are in coding->charbuf.  */
      int *buf = coding->charbuf;
      int *buf_end = buf + coding->charbuf_used;

      if (EQ (coding->src_object, coding->dst_object))
        {
          /* Source and destination share an object: the writable
             area ends where the already consumed source ends.  */
          coding_set_source (coding);
          dst_end = ((unsigned char *) coding->source) + coding->consumed;
        }

      while (buf < buf_end)
        {
          int c = *buf, i;

          if (c >= 0)
            {
              /* Number of charbuf elements this translation consumes
                 and number of characters it yields.  */
              int from_nchars = 1, to_nchars = 1;
              Lisp_Object trans = Qnil;

              LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
              if (! NILP (trans))
                {
                  trans = get_translation (trans, buf, buf_end);
                  if (INTEGERP (trans))
                    c = XINT (trans);
                  else if (CONSP (trans))
                    {
                      /* TRANS is ([FROM-CHAR ...] . TO).  */
                      from_nchars = ASIZE (XCAR (trans));
                      trans = XCDR (trans);
                      if (INTEGERP (trans))
                        c = XINT (trans);
                      else
                        {
                          to_nchars = ASIZE (trans);
                          c = XINT (AREF (trans, 0));
                        }
                    }
                  else if (EQ (trans, Qt) && ! last_block)
                    /* Not enough characters to decide; leave the rest
                       as carryover.  */
                    break;
                }

              if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
                {
                  dst = alloc_destination (coding,
                                           buf_end - buf
                                           + MAX_MULTIBYTE_LENGTH * to_nchars,
                                           dst);
                  if (EQ (coding->src_object, coding->dst_object))
                    {
                      coding_set_source (coding);
                      dst_end = (((unsigned char *) coding->source)
                                 + coding->consumed);
                    }
                  else
                    dst_end = coding->destination + coding->dst_bytes;
                }

              for (i = 0; i < to_nchars; i++)
                {
                  /* The first character is already in C.  */
                  if (i > 0)
                    c = XINT (AREF (trans, i));
                  if (coding->dst_multibyte
                      || ! CHAR_BYTE8_P (c))
                    CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
                  else
                    *dst++ = CHAR_TO_BYTE8 (c);
                }
              produced_chars += to_nchars;
              buf += from_nchars;
            }
          else
            /* This is an annotation datum.  (-C) is the length.  */
            buf += -c;
        }
      carryover = buf_end - buf;
    }
  else
    {
      /* Source characters are at coding->source.  */
      const unsigned char *src = coding->source;
      const unsigned char *src_end = src + coding->consumed;

      if (EQ (coding->dst_object, coding->src_object))
        dst_end = (unsigned char *) src;
      if (coding->src_multibyte != coding->dst_multibyte)
        {
          if (coding->src_multibyte)
            {
              /* NOTE(review): multibytep, consumed_chars, src_base and
                 the no_more_source label are not referenced directly
                 below; presumably ONE_MORE_BYTE uses them -- confirm
                 against the macro before renaming or removing.  */
              int multibytep = 1;
              EMACS_INT consumed_chars = 0;

              while (1)
                {
                  const unsigned char *src_base = src;
                  int c;

                  ONE_MORE_BYTE (c);
                  if (dst == dst_end)
                    {
                      /* In the shared-object case the source just read
                         may have freed room; recheck before growing.  */
                      if (EQ (coding->src_object, coding->dst_object))
                        dst_end = (unsigned char *) src;
                      if (dst == dst_end)
                        {
                          EMACS_INT offset = src - coding->source;

                          dst = alloc_destination (coding, src_end - src + 1,
                                                   dst);
                          dst_end = coding->destination + coding->dst_bytes;
                          /* Recompute the source pointers from the
                             saved offset after the reallocation.  */
                          coding_set_source (coding);
                          src = coding->source + offset;
                          src_end = coding->source + coding->src_bytes;
                          if (EQ (coding->src_object, coding->dst_object))
                            dst_end = (unsigned char *) src;
                        }
                    }
                  *dst++ = c;
                  produced_chars++;
                }
            no_more_source:
              ;
            }
          else
            while (src < src_end)
              {
                /* NOTE(review): multibytep is presumably read by
                   EMIT_ONE_BYTE -- confirm against the macro.  */
                int multibytep = 1;
                int c = *src++;

                /* Keep room for two bytes of output.  */
                if (dst >= dst_end - 1)
                  {
                    if (EQ (coding->src_object, coding->dst_object))
                      dst_end = (unsigned char *) src;
                    if (dst >= dst_end - 1)
                      {
                        EMACS_INT offset = src - coding->source;
                        EMACS_INT more_bytes;

                        if (EQ (coding->src_object, coding->dst_object))
                          more_bytes = ((src_end - src) / 2) + 2;
                        else
                          more_bytes = src_end - src + 2;
                        dst = alloc_destination (coding, more_bytes, dst);
                        dst_end = coding->destination + coding->dst_bytes;
                        coding_set_source (coding);
                        src = coding->source + offset;
                        src_end = coding->source + coding->src_bytes;
                        if (EQ (coding->src_object, coding->dst_object))
                          dst_end = (unsigned char *) src;
                      }
                  }
                EMIT_ONE_BYTE (c);
              }
        }
      else
        {
          /* Same representation on both sides: a plain byte copy.  */
          if (!EQ (coding->src_object, coding->dst_object))
            {
              EMACS_INT require = coding->src_bytes - coding->dst_bytes;

              if (require > 0)
                {
                  EMACS_INT offset = src - coding->source;

                  dst = alloc_destination (coding, require, dst);
                  coding_set_source (coding);
                  src = coding->source + offset;
                  src_end = coding->source + coding->src_bytes;
                }
            }
          produced_chars = coding->consumed_char;
          while (src < src_end)
            *dst++ = *src++;
        }
    }

  /* Bytes written by this call.  */
  produced = dst - (coding->destination + coding->produced);
  if (BUFFERP (coding->dst_object) && produced_chars > 0)
    insert_from_gap (produced_chars, produced);
  coding->produced += produced;
  coding->produced_char += produced_chars;
  return carryover;
}
6942
6943 /* Compose text in CODING->object according to the annotation data at
6944    CHARBUF.  CHARBUF is an array:
6945      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6946  */
6947
6948 static INLINE void
6949 produce_composition (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6950 {
6951   int len;
6952   EMACS_INT to;
6953   enum composition_method method;
6954   Lisp_Object components;
6955
6956   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6957   to = pos + charbuf[2];
6958   method = (enum composition_method) (charbuf[4]);
6959
6960   if (method == COMPOSITION_RELATIVE)
6961     components = Qnil;
6962   else
6963     {
6964       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6965       int i, j;
6966
6967       if (method == COMPOSITION_WITH_RULE)
6968         len = charbuf[2] * 3 - 2;
6969       charbuf += MAX_ANNOTATION_LENGTH;
6970       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6971       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6972         {
6973           if (charbuf[i] >= 0)
6974             args[j] = make_number (charbuf[i]);
6975           else
6976             {
6977               i++;
6978               args[j] = make_number (charbuf[i] % 0x100);
6979             }
6980         }
6981       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6982     }
6983   compose_text (pos, to, components, Qnil, coding->dst_object);
6984 }
6985
6986
6987 /* Put `charset' property on text in CODING->object according to
6988    the annotation data at CHARBUF.  CHARBUF is an array:
6989      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6990  */
6991
6992 static INLINE void
6993 produce_charset (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6994 {
6995   EMACS_INT from = pos - charbuf[2];
6996   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6997
6998   Fput_text_property (make_number (from), make_number (pos),
6999                       Qcharset, CHARSET_NAME (charset),
7000                       coding->dst_object);
7001 }
7002
7003
/* Number of `int' elements first tried for the conversion work area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the character work area CODING->charbuf with alloca and
   record its element count in CODING->charbuf_size.

   CHARBUF_SIZE elements are tried first; while alloca yields NULL the
   size is halved, as long as it stays above 1024.  If no area could
   be obtained, CODING_RESULT_INSUFFICIENT_MEM is recorded and the
   macro executes `return coding->result', i.e. it returns from the
   FUNCTION THAT USES THE MACRO, which therefore must return a value
   of that type.

   Because the area comes from alloca, the macro must be expanded
   directly in the function that uses the buffer.
   NOTE(review): on most platforms alloca does not report failure by
   returning NULL, so the retry loop and the failure branch are
   presumably never taken -- confirm for the supported targets.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    coding->charbuf = NULL;                                             \
    while (size > 1024)                                                 \
      {                                                                 \
        coding->charbuf = (int *) alloca (sizeof (int) * size);         \
        if (coding->charbuf)                                            \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result;                                          \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
7025
7026
7027 static void
7028 produce_annotation (struct coding_system *coding, EMACS_INT pos)
7029 {
7030   int *charbuf = coding->charbuf;
7031   int *charbuf_end = charbuf + coding->charbuf_used;
7032
7033   if (NILP (coding->dst_object))
7034     return;
7035
7036   while (charbuf < charbuf_end)
7037     {
7038       if (*charbuf >= 0)
7039         pos++, charbuf++;
7040       else
7041         {
7042           int len = -*charbuf;
7043
7044           if (len > 2)
7045             switch (charbuf[1])
7046               {
7047               case CODING_ANNOTATE_COMPOSITION_MASK:
7048                 produce_composition (coding, charbuf, pos);
7049                 break;
7050               case CODING_ANNOTATE_CHARSET_MASK:
7051                 produce_charset (coding, charbuf, pos);
7052                 break;
7053               }
7054           charbuf += len;
7055         }
7056     }
7057 }
7058
7059 /* Decode the data at CODING->src_object into CODING->dst_object.
7060    CODING->src_object is a buffer, a string, or nil.
7061    CODING->dst_object is a buffer.
7062
7063    If CODING->src_object is a buffer, it must be the current buffer.
7064    In this case, if CODING->src_pos is positive, it is a position of
7065    the source text in the buffer, otherwise, the source text is in the
7066    gap area of the buffer, and CODING->src_pos specifies the offset of
7067    the text from GPT (which must be the same as PT).  If this is the
7068    same buffer as CODING->dst_object, CODING->src_pos must be
7069    negative.
7070
7071    If CODING->src_object is a string, CODING->src_pos is an index to
7072    that string.
7073
7074    If CODING->src_object is nil, CODING->source must already point to
7075    the non-relocatable memory area.  In this case, CODING->src_pos is
7076    an offset from CODING->source.
7077
7078    The decoded data is inserted at the current point of the buffer
7079    CODING->dst_object.
7080 */
7081
7082 static int
7083 decode_coding (struct coding_system *coding)
7084 {
7085   Lisp_Object attrs;
7086   Lisp_Object undo_list;
7087   Lisp_Object translation_table;
7088   struct ccl_spec cclspec;
7089   int carryover;
7090   int i;
7091
7092   if (BUFFERP (coding->src_object)
7093       && coding->src_pos > 0
7094       && coding->src_pos < GPT
7095       && coding->src_pos + coding->src_chars > GPT)
7096     move_gap_both (coding->src_pos, coding->src_pos_byte);
7097
7098   undo_list = Qt;
7099   if (BUFFERP (coding->dst_object))
7100     {
7101       if (current_buffer != XBUFFER (coding->dst_object))
7102         set_buffer_internal (XBUFFER (coding->dst_object));
7103       if (GPT != PT)
7104         move_gap_both (PT, PT_BYTE);
7105       undo_list = current_buffer->undo_list;
7106       current_buffer->undo_list = Qt;
7107     }
7108
7109   coding->consumed = coding->consumed_char = 0;
7110   coding->produced = coding->produced_char = 0;
7111   coding->chars_at_source = 0;
7112   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7113   coding->errors = 0;
7114
7115   ALLOC_CONVERSION_WORK_AREA (coding);
7116
7117   attrs = CODING_ID_ATTRS (coding->id);
7118   translation_table = get_translation_table (attrs, 0, NULL);
7119
7120   carryover = 0;
7121   if (coding->decoder == decode_coding_ccl)
7122     {
7123       coding->spec.ccl = &cclspec;
7124       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7125     }
7126   do
7127     {
7128       EMACS_INT pos = coding->dst_pos + coding->produced_char;
7129
7130       coding_set_source (coding);
7131       coding->annotated = 0;
7132       coding->charbuf_used = carryover;
7133       (*(coding->decoder)) (coding);
7134       coding_set_destination (coding);
7135       carryover = produce_chars (coding, translation_table, 0);
7136       if (coding->annotated)
7137         produce_annotation (coding, pos);
7138       for (i = 0; i < carryover; i++)
7139         coding->charbuf[i]
7140           = coding->charbuf[coding->charbuf_used - carryover + i];
7141     }
7142   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7143          || (coding->consumed < coding->src_bytes
7144              && (coding->result == CODING_RESULT_SUCCESS
7145                  || coding->result == CODING_RESULT_INVALID_SRC)));
7146
7147   if (carryover > 0)
7148     {
7149       coding_set_destination (coding);
7150       coding->charbuf_used = carryover;
7151       produce_chars (coding, translation_table, 1);
7152     }
7153
7154   coding->carryover_bytes = 0;
7155   if (coding->consumed < coding->src_bytes)
7156     {
7157       int nbytes = coding->src_bytes - coding->consumed;
7158       const unsigned char *src;
7159
7160       coding_set_source (coding);
7161       coding_set_destination (coding);
7162       src = coding->source + coding->consumed;
7163
7164       if (coding->mode & CODING_MODE_LAST_BLOCK)
7165         {
7166           /* Flush out unprocessed data as binary chars.  We are sure
7167              that the number of data is less than the size of
7168              coding->charbuf.  */
7169           coding->charbuf_used = 0;
7170           coding->chars_at_source = 0;
7171
7172           while (nbytes-- > 0)
7173             {
7174               int c = *src++;
7175
7176               if (c & 0x80)
7177                 c = BYTE8_TO_CHAR (c);
7178               coding->charbuf[coding->charbuf_used++] = c;
7179             }
7180           produce_chars (coding, Qnil, 1);
7181         }
7182       else
7183         {
7184           /* Record unprocessed bytes in coding->carryover.  We are
7185              sure that the number of data is less than the size of
7186              coding->carryover.  */
7187           unsigned char *p = coding->carryover;
7188
7189           if (nbytes > sizeof coding->carryover)
7190             nbytes = sizeof coding->carryover;
7191           coding->carryover_bytes = nbytes;
7192           while (nbytes-- > 0)
7193             *p++ = *src++;
7194         }
7195       coding->consumed = coding->src_bytes;
7196     }
7197
7198   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7199       && !inhibit_eol_conversion)
7200     decode_eol (coding);
7201   if (BUFFERP (coding->dst_object))
7202     {
7203       current_buffer->undo_list = undo_list;
7204       record_insert (coding->dst_pos, coding->produced_char);
7205     }
7206   return coding->result;
7207 }
7208
7209
7210 /* Extract an annotation datum from a composition starting at POS and
7211    ending before LIMIT of CODING->src_object (buffer or string), store
7212    the data in BUF, set *STOP to a starting position of the next
7213    composition (if any) or to LIMIT, and return the address of the
7214    next element of BUF.
7215
7216    If such an annotation is not found, set *STOP to a starting
7217    position of a composition after POS (if any) or to LIMIT, and
7218    return BUF.  */
7219
7220 static INLINE int *
7221 handle_composition_annotation (EMACS_INT pos, EMACS_INT limit,
7222                                struct coding_system *coding, int *buf,
7223                                EMACS_INT *stop)
7224 {
7225   EMACS_INT start, end;
7226   Lisp_Object prop;
7227
7228   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7229       || end > limit)
7230     *stop = limit;
7231   else if (start > pos)
7232     *stop = start;
7233   else
7234     {
7235       if (start == pos)
7236         {
7237           /* We found a composition.  Store the corresponding
7238              annotation data in BUF.  */
7239           int *head = buf;
7240           enum composition_method method = COMPOSITION_METHOD (prop);
7241           int nchars = COMPOSITION_LENGTH (prop);
7242
7243           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7244           if (method != COMPOSITION_RELATIVE)
7245             {
7246               Lisp_Object components;
7247               int len, i, i_byte;
7248
7249               components = COMPOSITION_COMPONENTS (prop);
7250               if (VECTORP (components))
7251                 {
7252                   len = XVECTOR (components)->size;
7253                   for (i = 0; i < len; i++)
7254                     *buf++ = XINT (AREF (components, i));
7255                 }
7256               else if (STRINGP (components))
7257                 {
7258                   len = SCHARS (components);
7259                   i = i_byte = 0;
7260                   while (i < len)
7261                     {
7262                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7263                       buf++;
7264                     }
7265                 }
7266               else if (INTEGERP (components))
7267                 {
7268                   len = 1;
7269                   *buf++ = XINT (components);
7270                 }
7271               else if (CONSP (components))
7272                 {
7273                   for (len = 0; CONSP (components);
7274                        len++, components = XCDR (components))
7275                     *buf++ = XINT (XCAR (components));
7276                 }
7277               else
7278                 abort ();
7279               *head -= len;
7280             }
7281         }
7282
7283       if (find_composition (end, limit, &start, &end, &prop,
7284                             coding->src_object)
7285           && end <= limit)
7286         *stop = start;
7287       else
7288         *stop = limit;
7289     }
7290   return buf;
7291 }
7292
7293
7294 /* Extract an annotation datum from a text property `charset' at POS of
7295    CODING->src_object (buffer of string), store the data in BUF, set
7296    *STOP to the position where the value of `charset' property changes
7297    (limiting by LIMIT), and return the address of the next element of
7298    BUF.
7299
7300    If the property value is nil, set *STOP to the position where the
7301    property value is non-nil (limiting by LIMIT), and return BUF.  */
7302
7303 static INLINE int *
7304 handle_charset_annotation (EMACS_INT pos, EMACS_INT limit,
7305                            struct coding_system *coding, int *buf,
7306                            EMACS_INT *stop)
7307 {
7308   Lisp_Object val, next;
7309   int id;
7310
7311   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7312   if (! NILP (val) && CHARSETP (val))
7313     id = XINT (CHARSET_SYMBOL_ID (val));
7314   else
7315     id = -1;
7316   ADD_CHARSET_DATA (buf, 0, id);
7317   next = Fnext_single_property_change (make_number (pos), Qcharset,
7318                                        coding->src_object,
7319                                        make_number (limit));
7320   *stop = XINT (next);
7321   return buf;
7322 }
7323
7324
7325 static void
7326 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7327                int max_lookup)
7328 {
7329   int *buf = coding->charbuf;
7330   int *buf_end = coding->charbuf + coding->charbuf_size;
7331   const unsigned char *src = coding->source + coding->consumed;
7332   const unsigned char *src_end = coding->source + coding->src_bytes;
7333   EMACS_INT pos = coding->src_pos + coding->consumed_char;
7334   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7335   int multibytep = coding->src_multibyte;
7336   Lisp_Object eol_type;
7337   int c;
7338   EMACS_INT stop, stop_composition, stop_charset;
7339   int *lookup_buf = NULL;
7340
7341   if (! NILP (translation_table))
7342     lookup_buf = alloca (sizeof (int) * max_lookup);
7343
7344   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7345   if (VECTORP (eol_type))
7346     eol_type = Qunix;
7347
7348   /* Note: composition handling is not yet implemented.  */
7349   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7350
7351   if (NILP (coding->src_object))
7352     stop = stop_composition = stop_charset = end_pos;
7353   else
7354     {
7355       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7356         stop = stop_composition = pos;
7357       else
7358         stop = stop_composition = end_pos;
7359       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7360         stop = stop_charset = pos;
7361       else
7362         stop_charset = end_pos;
7363     }
7364
7365   /* Compensate for CRLF and conversion.  */
7366   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7367   while (buf < buf_end)
7368     {
7369       Lisp_Object trans;
7370
7371       if (pos == stop)
7372         {
7373           if (pos == end_pos)
7374             break;
7375           if (pos == stop_composition)
7376             buf = handle_composition_annotation (pos, end_pos, coding,
7377                                                  buf, &stop_composition);
7378           if (pos == stop_charset)
7379             buf = handle_charset_annotation (pos, end_pos, coding,
7380                                              buf, &stop_charset);
7381           stop = (stop_composition < stop_charset
7382                   ? stop_composition : stop_charset);
7383         }
7384
7385       if (! multibytep)
7386         {
7387           EMACS_INT bytes;
7388
7389           if (coding->encoder == encode_coding_raw_text
7390               || coding->encoder == encode_coding_ccl)
7391             c = *src++, pos++;
7392           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7393             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7394           else
7395             c = BYTE8_TO_CHAR (*src), src++, pos++;
7396         }
7397       else
7398         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7399       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7400         c = '\n';
7401       if (! EQ (eol_type, Qunix))
7402         {
7403           if (c == '\n')
7404             {
7405               if (EQ (eol_type, Qdos))
7406                 *buf++ = '\r';
7407               else
7408                 c = '\r';
7409             }
7410         }
7411
7412       trans = Qnil;
7413       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7414       if (NILP (trans))
7415         *buf++ = c;
7416       else
7417         {
7418           int from_nchars = 1, to_nchars = 1;
7419           int *lookup_buf_end;
7420           const unsigned char *p = src;
7421           int i;
7422
7423           lookup_buf[0] = c;
7424           for (i = 1; i < max_lookup && p < src_end; i++)
7425             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7426           lookup_buf_end = lookup_buf + i;
7427           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7428           if (INTEGERP (trans))
7429             c = XINT (trans);
7430           else if (CONSP (trans))
7431             {
7432               from_nchars = ASIZE (XCAR (trans));
7433               trans = XCDR (trans);
7434               if (INTEGERP (trans))
7435                 c = XINT (trans);
7436               else
7437                 {
7438                   to_nchars = ASIZE (trans);
7439                   if (buf + to_nchars > buf_end)
7440                     break;
7441                   c = XINT (AREF (trans, 0));
7442                 }
7443             }
7444           else
7445             break;
7446           *buf++ = c;
7447           for (i = 1; i < to_nchars; i++)
7448             *buf++ = XINT (AREF (trans, i));
7449           for (i = 1; i < from_nchars; i++, pos++)
7450             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7451         }
7452     }
7453
7454   coding->consumed = src - coding->source;
7455   coding->consumed_char = pos - coding->src_pos;
7456   coding->charbuf_used = buf - coding->charbuf;
7457   coding->chars_at_source = 0;
7458 }
7459
7460
7461 /* Encode the text at CODING->src_object into CODING->dst_object.
7462    CODING->src_object is a buffer or a string.
7463    CODING->dst_object is a buffer or nil.
7464
7465    If CODING->src_object is a buffer, it must be the current buffer.
7466    In this case, if CODING->src_pos is positive, it is a position of
7467    the source text in the buffer, otherwise. the source text is in the
7468    gap area of the buffer, and coding->src_pos specifies the offset of
7469    the text from GPT (which must be the same as PT).  If this is the
7470    same buffer as CODING->dst_object, CODING->src_pos must be
7471    negative and CODING should not have `pre-write-conversion'.
7472
7473    If CODING->src_object is a string, CODING should not have
7474    `pre-write-conversion'.
7475
7476    If CODING->dst_object is a buffer, the encoded data is inserted at
7477    the current point of that buffer.
7478
7479    If CODING->dst_object is nil, the encoded data is placed at the
7480    memory area specified by CODING->destination.  */
7481
7482 static int
7483 encode_coding (struct coding_system *coding)
7484 {
7485   Lisp_Object attrs;
7486   Lisp_Object translation_table;
7487   int max_lookup;
7488   struct ccl_spec cclspec;
7489
7490   attrs = CODING_ID_ATTRS (coding->id);
7491   if (coding->encoder == encode_coding_raw_text)
7492     translation_table = Qnil, max_lookup = 0;
7493   else
7494     translation_table = get_translation_table (attrs, 1, &max_lookup);
7495
7496   if (BUFFERP (coding->dst_object))
7497     {
7498       set_buffer_internal (XBUFFER (coding->dst_object));
7499       coding->dst_multibyte
7500         = ! NILP (current_buffer->enable_multibyte_characters);
7501     }
7502
7503   coding->consumed = coding->consumed_char = 0;
7504   coding->produced = coding->produced_char = 0;
7505   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7506   coding->errors = 0;
7507
7508   ALLOC_CONVERSION_WORK_AREA (coding);
7509
7510   if (coding->encoder == encode_coding_ccl)
7511     {
7512       coding->spec.ccl = &cclspec;
7513       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7514     }
7515   do {
7516     coding_set_source (coding);
7517     consume_chars (coding, translation_table, max_lookup);
7518     coding_set_destination (coding);
7519     (*(coding->encoder)) (coding);
7520   } while (coding->consumed_char < coding->src_chars);
7521
7522   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7523     insert_from_gap (coding->produced_char, coding->produced);
7524
7525   return (coding->result);
7526 }
7527
7528
/* Name (or base name) of work buffer for code conversion.  Names of
   additional work buffers are generated from it.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.  It
   is incremented each time a work buffer is requested and reset to
   zero when the reused buffer is released.  */
static int reused_workbuf_in_use;
7541
7542
7543 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7544    multibyteness of returning buffer.  */
7545
7546 static Lisp_Object
7547 make_conversion_work_buffer (int multibyte)
7548 {
7549   Lisp_Object name, workbuf;
7550   struct buffer *current;
7551
7552   if (reused_workbuf_in_use++)
7553     {
7554       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7555       workbuf = Fget_buffer_create (name);
7556     }
7557   else
7558     {
7559       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7560         Vcode_conversion_reused_workbuf
7561           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7562       workbuf = Vcode_conversion_reused_workbuf;
7563     }
7564   current = current_buffer;
7565   set_buffer_internal (XBUFFER (workbuf));
7566   /* We can't allow modification hooks to run in the work buffer.  For
7567      instance, directory_files_internal assumes that file decoding
7568      doesn't compile new regexps.  */
7569   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7570   Ferase_buffer ();
7571   current_buffer->undo_list = Qt;
7572   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7573   set_buffer_internal (current);
7574   return workbuf;
7575 }
7576
7577
7578 static Lisp_Object
7579 code_conversion_restore (Lisp_Object arg)
7580 {
7581   Lisp_Object current, workbuf;
7582   struct gcpro gcpro1;
7583
7584   GCPRO1 (arg);
7585   current = XCAR (arg);
7586   workbuf = XCDR (arg);
7587   if (! NILP (workbuf))
7588     {
7589       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7590         reused_workbuf_in_use = 0;
7591       else if (! NILP (Fbuffer_live_p (workbuf)))
7592         Fkill_buffer (workbuf);
7593     }
7594   set_buffer_internal (XBUFFER (current));
7595   UNGCPRO;
7596   return Qnil;
7597 }
7598
7599 Lisp_Object
7600 code_conversion_save (int with_work_buf, int multibyte)
7601 {
7602   Lisp_Object workbuf = Qnil;
7603
7604   if (with_work_buf)
7605     workbuf = make_conversion_work_buffer (multibyte);
7606   record_unwind_protect (code_conversion_restore,
7607                          Fcons (Fcurrent_buffer (), workbuf));
7608   return workbuf;
7609 }
7610
7611 int
7612 decode_coding_gap (struct coding_system *coding,
7613                    EMACS_INT chars, EMACS_INT bytes)
7614 {
7615   int count = SPECPDL_INDEX ();
7616   Lisp_Object attrs;
7617
7618   code_conversion_save (0, 0);
7619
7620   coding->src_object = Fcurrent_buffer ();
7621   coding->src_chars = chars;
7622   coding->src_bytes = bytes;
7623   coding->src_pos = -chars;
7624   coding->src_pos_byte = -bytes;
7625   coding->src_multibyte = chars < bytes;
7626   coding->dst_object = coding->src_object;
7627   coding->dst_pos = PT;
7628   coding->dst_pos_byte = PT_BYTE;
7629   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7630
7631   if (CODING_REQUIRE_DETECTION (coding))
7632     detect_coding (coding);
7633
7634   coding->mode |= CODING_MODE_LAST_BLOCK;
7635   current_buffer->text->inhibit_shrinking = 1;
7636   decode_coding (coding);
7637   current_buffer->text->inhibit_shrinking = 0;
7638
7639   attrs = CODING_ID_ATTRS (coding->id);
7640   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7641     {
7642       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7643       Lisp_Object val;
7644
7645       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7646       val = call1 (CODING_ATTR_POST_READ (attrs),
7647                    make_number (coding->produced_char));
7648       CHECK_NATNUM (val);
7649       coding->produced_char += Z - prev_Z;
7650       coding->produced += Z_BYTE - prev_Z_BYTE;
7651     }
7652
7653   unbind_to (count, Qnil);
7654   return coding->result;
7655 }
7656
7657 int
7658 encode_coding_gap (struct coding_system *coding,
7659                    EMACS_INT chars, EMACS_INT bytes)
7660 {
7661   int count = SPECPDL_INDEX ();
7662
7663   code_conversion_save (0, 0);
7664
7665   coding->src_object = Fcurrent_buffer ();
7666   coding->src_chars = chars;
7667   coding->src_bytes = bytes;
7668   coding->src_pos = -chars;
7669   coding->src_pos_byte = -bytes;
7670   coding->src_multibyte = chars < bytes;
7671   coding->dst_object = coding->src_object;
7672   coding->dst_pos = PT;
7673   coding->dst_pos_byte = PT_BYTE;
7674
7675   encode_coding (coding);
7676
7677   unbind_to (count, Qnil);
7678   return coding->result;
7679 }
7680
7681
7682 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7683    SRC_OBJECT into DST_OBJECT by coding context CODING.
7684
7685    SRC_OBJECT is a buffer, a string, or Qnil.
7686
7687    If it is a buffer, the text is at point of the buffer.  FROM and TO
7688    are positions in the buffer.
7689
7690    If it is a string, the text is at the beginning of the string.
7691    FROM and TO are indices to the string.
7692
7693    If it is nil, the text is at coding->source.  FROM and TO are
7694    indices to coding->source.
7695
7696    DST_OBJECT is a buffer, Qt, or Qnil.
7697
7698    If it is a buffer, the decoded text is inserted at point of the
7699    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7700    is deleted.
7701
7702    If it is Qt, a string is made from the decoded text, and
7703    set in CODING->dst_object.
7704
7705    If it is Qnil, the decoded text is stored at CODING->destination.
7706    The caller must allocate CODING->dst_bytes bytes at
7707    CODING->destination by xmalloc.  If the decoded text is longer than
7708    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7709  */
7710
7711 void
7712 decode_coding_object (struct coding_system *coding,
7713                       Lisp_Object src_object,
7714                       EMACS_INT from, EMACS_INT from_byte,
7715                       EMACS_INT to, EMACS_INT to_byte,
7716                       Lisp_Object dst_object)
7717 {
7718   int count = SPECPDL_INDEX ();
7719   unsigned char *destination;
7720   EMACS_INT dst_bytes;
7721   EMACS_INT chars = to - from;
7722   EMACS_INT bytes = to_byte - from_byte;
7723   Lisp_Object attrs;
7724   int saved_pt = -1, saved_pt_byte;
7725   int need_marker_adjustment = 0;
7726   Lisp_Object old_deactivate_mark;
7727
7728   old_deactivate_mark = Vdeactivate_mark;
7729
7730   if (NILP (dst_object))
7731     {
7732       destination = coding->destination;
7733       dst_bytes = coding->dst_bytes;
7734     }
7735
7736   coding->src_object = src_object;
7737   coding->src_chars = chars;
7738   coding->src_bytes = bytes;
7739   coding->src_multibyte = chars < bytes;
7740
7741   if (STRINGP (src_object))
7742     {
7743       coding->src_pos = from;
7744       coding->src_pos_byte = from_byte;
7745     }
7746   else if (BUFFERP (src_object))
7747     {
7748       set_buffer_internal (XBUFFER (src_object));
7749       if (from != GPT)
7750         move_gap_both (from, from_byte);
7751       if (EQ (src_object, dst_object))
7752         {
7753           struct Lisp_Marker *tail;
7754
7755           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7756             {
7757               tail->need_adjustment
7758                 = tail->charpos == (tail->insertion_type ? from : to);
7759               need_marker_adjustment |= tail->need_adjustment;
7760             }
7761           saved_pt = PT, saved_pt_byte = PT_BYTE;
7762           TEMP_SET_PT_BOTH (from, from_byte);
7763           current_buffer->text->inhibit_shrinking = 1;
7764           del_range_both (from, from_byte, to, to_byte, 1);
7765           coding->src_pos = -chars;
7766           coding->src_pos_byte = -bytes;
7767         }
7768       else
7769         {
7770           coding->src_pos = from;
7771           coding->src_pos_byte = from_byte;
7772         }
7773     }
7774
7775   if (CODING_REQUIRE_DETECTION (coding))
7776     detect_coding (coding);
7777   attrs = CODING_ID_ATTRS (coding->id);
7778
7779   if (EQ (dst_object, Qt)
7780       || (! NILP (CODING_ATTR_POST_READ (attrs))
7781           && NILP (dst_object)))
7782     {
7783       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7784       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7785       coding->dst_pos = BEG;
7786       coding->dst_pos_byte = BEG_BYTE;
7787     }
7788   else if (BUFFERP (dst_object))
7789     {
7790       code_conversion_save (0, 0);
7791       coding->dst_object = dst_object;
7792       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7793       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7794       coding->dst_multibyte
7795         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7796     }
7797   else
7798     {
7799       code_conversion_save (0, 0);
7800       coding->dst_object = Qnil;
7801       /* Most callers presume this will return a multibyte result, and they
7802          won't use `binary' or `raw-text' anyway, so let's not worry about
7803          CODING_FOR_UNIBYTE.  */
7804       coding->dst_multibyte = 1;
7805     }
7806
7807   decode_coding (coding);
7808
7809   if (BUFFERP (coding->dst_object))
7810     set_buffer_internal (XBUFFER (coding->dst_object));
7811
7812   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7813     {
7814       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7815       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7816       Lisp_Object val;
7817
7818       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7819       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7820               old_deactivate_mark);
7821       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7822                         make_number (coding->produced_char));
7823       UNGCPRO;
7824       CHECK_NATNUM (val);
7825       coding->produced_char += Z - prev_Z;
7826       coding->produced += Z_BYTE - prev_Z_BYTE;
7827     }
7828
7829   if (EQ (dst_object, Qt))
7830     {
7831       coding->dst_object = Fbuffer_string ();
7832     }
7833   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7834     {
7835       set_buffer_internal (XBUFFER (coding->dst_object));
7836       if (dst_bytes < coding->produced)
7837         {
7838           destination = xrealloc (destination, coding->produced);
7839           if (! destination)
7840             {
7841               record_conversion_result (coding,
7842                                         CODING_RESULT_INSUFFICIENT_MEM);
7843               unbind_to (count, Qnil);
7844               return;
7845             }
7846           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7847             move_gap_both (BEGV, BEGV_BYTE);
7848           memcpy (destination, BEGV_ADDR, coding->produced);
7849           coding->destination = destination;
7850         }
7851     }
7852
7853   if (saved_pt >= 0)
7854     {
7855       /* This is the case of:
7856          (BUFFERP (src_object) && EQ (src_object, dst_object))
7857          As we have moved PT while replacing the original buffer
7858          contents, we must recover it now.  */
7859       set_buffer_internal (XBUFFER (src_object));
7860       current_buffer->text->inhibit_shrinking = 0;
7861       if (saved_pt < from)
7862         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7863       else if (saved_pt < from + chars)
7864         TEMP_SET_PT_BOTH (from, from_byte);
7865       else if (! NILP (current_buffer->enable_multibyte_characters))
7866         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7867                           saved_pt_byte + (coding->produced - bytes));
7868       else
7869         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7870                           saved_pt_byte + (coding->produced - bytes));
7871
7872       if (need_marker_adjustment)
7873         {
7874           struct Lisp_Marker *tail;
7875
7876           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7877             if (tail->need_adjustment)
7878               {
7879                 tail->need_adjustment = 0;
7880                 if (tail->insertion_type)
7881                   {
7882                     tail->bytepos = from_byte;
7883                     tail->charpos = from;
7884                   }
7885                 else
7886                   {
7887                     tail->bytepos = from_byte + coding->produced;
7888                     tail->charpos
7889                       = (NILP (current_buffer->enable_multibyte_characters)
7890                          ? tail->bytepos : from + coding->produced_char);
7891                   }
7892               }
7893         }
7894     }
7895
7896   Vdeactivate_mark = old_deactivate_mark;
7897   unbind_to (count, coding->dst_object);
7898 }
7899
7900 /* Encode the text of SRC_OBJECT between FROM and TO with CODING.
        FROM and TO are character positions; FROM_BYTE and TO_BYTE are the
        matching byte positions.
        If the coding system has a pre-write function, the text is first
        copied into a work buffer and that function is called on it; the
        encoder then reads from whatever buffer is current afterwards.
        DST_OBJECT selects the destination.  If it is a buffer, the result
        goes there: at FROM when it is SRC_OBJECT itself (the original text
        is deleted first), otherwise at point of that buffer.  If it is t,
        the result is stored as a string in CODING->dst_object.  Otherwise
        CODING->dst_object is set to nil before encoding.
        When a buffer is encoded in place, point and the markers that sat
        on the edges of the region are repositioned afterwards.
        Vdeactivate_mark is always restored.  */
7901 void
7902 encode_coding_object (struct coding_system *coding,
7903                       Lisp_Object src_object,
7904                       EMACS_INT from, EMACS_INT from_byte,
7905                       EMACS_INT to, EMACS_INT to_byte,
7906                       Lisp_Object dst_object)
7907 {
7908   int count = SPECPDL_INDEX ();
7909   EMACS_INT chars = to - from;
7910   EMACS_INT bytes = to_byte - from_byte;
7911   Lisp_Object attrs;
       /* Buffer positions are EMACS_INT (see FROM and TO); holding the
          saved point in an `int' would truncate it wherever EMACS_INT is
          wider than int.  A negative SAVED_PT means point was not saved.  */
7912   EMACS_INT saved_pt = -1, saved_pt_byte = -1;
7913   int need_marker_adjustment = 0;
7914   int kill_src_buffer = 0;
7915   Lisp_Object old_deactivate_mark;
7916
7917   old_deactivate_mark = Vdeactivate_mark;
7918
7919   coding->src_object = src_object;
7920   coding->src_chars = chars;
7921   coding->src_bytes = bytes;
       /* More bytes than characters means multibyte sequences are present.  */
7922   coding->src_multibyte = chars < bytes;
7923
7924   attrs = CODING_ID_ATTRS (coding->id);
7925
7926   if (EQ (src_object, dst_object))
7927     {
7928       struct Lisp_Marker *tail;
7929
           /* Flag the markers sitting exactly on the edge of the region
              (FROM for insertion-type markers, TO for the others) so that
              they can be put back once the text has been replaced.  */
7930       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7931         {
7932           tail->need_adjustment
7933             = tail->charpos == (tail->insertion_type ? from : to);
7934           need_marker_adjustment |= tail->need_adjustment;
7935         }
7936     }
7937
7938   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7939     {
           /* Copy the source text into a work buffer and run the pre-write
              function on it.  */
7940       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7941       set_buffer_internal (XBUFFER (coding->src_object));
7942       if (STRINGP (src_object))
7943         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7944       else if (BUFFERP (src_object))
7945         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7946       else
7947         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7948
7949       if (EQ (src_object, dst_object))
7950         {
               /* In-place encoding: remember point and remove the original
                  text, then return to the work buffer.  */
7951           set_buffer_internal (XBUFFER (src_object));
7952           saved_pt = PT, saved_pt_byte = PT_BYTE;
7953           del_range_both (from, from_byte, to, to_byte, 1);
7954           set_buffer_internal (XBUFFER (coding->src_object));
7955         }
7956
7957       {
7958         Lisp_Object args[3];
7959         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7960
7961         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7962                 old_deactivate_mark);
7963         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7964         args[1] = make_number (BEG);
7965         args[2] = make_number (Z);
7966         safe_call (3, args);
7967         UNGCPRO;
7968       }
           /* If the pre-write function left a different buffer current,
              that buffer becomes the source and is killed at the end.  */
7969       if (XBUFFER (coding->src_object) != current_buffer)
7970         kill_src_buffer = 1;
7971       coding->src_object = Fcurrent_buffer ();
7972       if (BEG != GPT)
7973         move_gap_both (BEG, BEG_BYTE);
7974       coding->src_chars = Z - BEG;
7975       coding->src_bytes = Z_BYTE - BEG_BYTE;
7976       coding->src_pos = BEG;
7977       coding->src_pos_byte = BEG_BYTE;
7978       coding->src_multibyte = Z < Z_BYTE;
7979     }
7980   else if (STRINGP (src_object))
7981     {
7982       code_conversion_save (0, 0);
7983       coding->src_pos = from;
7984       coding->src_pos_byte = from_byte;
7985     }
7986   else if (BUFFERP (src_object))
7987     {
7988       code_conversion_save (0, 0);
7989       set_buffer_internal (XBUFFER (src_object));
7990       if (EQ (src_object, dst_object))
7991         {
               /* In-place encoding: the value returned by del_range_1
                  (presumably the deleted text) becomes the source, read
                  from position 0.  */
7992           saved_pt = PT, saved_pt_byte = PT_BYTE;
7993           coding->src_object = del_range_1 (from, to, 1, 1);
7994           coding->src_pos = 0;
7995           coding->src_pos_byte = 0;
7996         }
7997       else
7998         {
               /* Move the gap out of the region if it lies inside it.  */
7999           if (from < GPT && to >= GPT)
8000             move_gap_both (from, from_byte);
8001           coding->src_pos = from;
8002           coding->src_pos_byte = from_byte;
8003         }
8004     }
8005   else
8006     code_conversion_save (0, 0);
8007
8008   if (BUFFERP (dst_object))
8009     {
8010       coding->dst_object = dst_object;
8011       if (EQ (src_object, dst_object))
8012         {
8013           coding->dst_pos = from;
8014           coding->dst_pos_byte = from_byte;
8015         }
8016       else
8017         {
8018           struct buffer *current = current_buffer;
8019
               /* Produce at point of the destination buffer, with the gap
                  moved there.  */
8020           set_buffer_temp (XBUFFER (dst_object));
8021           coding->dst_pos = PT;
8022           coding->dst_pos_byte = PT_BYTE;
8023           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8024           set_buffer_temp (current);
8025         }
8026       coding->dst_multibyte
8027         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
8028     }
8029   else if (EQ (dst_object, Qt))
8030     {
           /* Start with one byte per source character (at least one byte);
              presumably the encoder enlarges the area when needed.  */
8031       coding->dst_object = Qnil;
8032       coding->dst_bytes = coding->src_chars;
8033       if (coding->dst_bytes == 0)
8034         coding->dst_bytes = 1;
8035       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
8036       coding->dst_multibyte = 0;
8037     }
8038   else
8039     {
8040       coding->dst_object = Qnil;
8041       coding->dst_multibyte = 0;
8042     }
8043
8044   encode_coding (coding);
8045
8046   if (EQ (dst_object, Qt))
8047     {
           /* Turn the produced bytes into a string and release the
              temporary destination area.  */
8048       if (BUFFERP (coding->dst_object))
8049         coding->dst_object = Fbuffer_string ();
8050       else
8051         {
8052           coding->dst_object
8053             = make_unibyte_string ((char *) coding->destination,
8054                                    coding->produced);
8055           xfree (coding->destination);
8056         }
8057     }
8058
8059   if (saved_pt >= 0)
8060     {
8061       /* This is the case of:
8062          (BUFFERP (src_object) && EQ (src_object, dst_object))
8063          As we have moved PT while replacing the original buffer
8064          contents, we must recover it now.  */
8065       set_buffer_internal (XBUFFER (src_object));
           /* Point before the region stays put; point inside it goes to
              FROM; point after it is shifted by the change in length.  */
8066       if (saved_pt < from)
8067         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8068       else if (saved_pt < from + chars)
8069         TEMP_SET_PT_BOTH (from, from_byte);
8070       else if (! NILP (current_buffer->enable_multibyte_characters))
8071         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8072                           saved_pt_byte + (coding->produced - bytes));
8073       else
8074         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8075                           saved_pt_byte + (coding->produced - bytes));
8076
8077       if (need_marker_adjustment)
8078         {
8079           struct Lisp_Marker *tail;
8080
               /* Put the flagged markers back: insertion-type markers at
                  the start of the new text, the others at its end.  */
8081           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8082             if (tail->need_adjustment)
8083               {
8084                 tail->need_adjustment = 0;
8085                 if (tail->insertion_type)
8086                   {
8087                     tail->bytepos = from_byte;
8088                     tail->charpos = from;
8089                   }
8090                 else
8091                   {
8092                     tail->bytepos = from_byte + coding->produced;
8093                     tail->charpos
8094                       = (NILP (current_buffer->enable_multibyte_characters)
8095                          ? tail->bytepos : from + coding->produced_char);
8096                   }
8097               }
8098         }
8099     }
8100
8101   if (kill_src_buffer)
8102     Fkill_buffer (coding->src_object);
8103
8104   Vdeactivate_mark = old_deactivate_mark;
8105   unbind_to (count, Qnil);
8106 }
8107
8108 /* Return the name of the coding system assigned to the coding
        category that comes first in coding_priorities.  */
8109 Lisp_Object
8110 preferred_coding_system (void)
8111 {
8112   enum coding_category top = coding_priorities[0];
8113
8114   return CODING_ID_NAME (coding_categories[top].id);
8115 }
8116
8117 \f
8118 #ifdef emacs
8119 /*** 8. Emacs Lisp library functions ***/
8120
8121 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8122        doc: /* Return t if OBJECT is nil or a coding-system.
8123 See the documentation of `define-coding-system' for information
8124 about coding-system objects.  */)
8125   (Lisp_Object object)
8126 {
       /* nil always qualifies.  */
8127   if (NILP (object))
8128     return Qt;
       /* So does anything that already has a coding-system id.  */
8129   if (CODING_SYSTEM_ID (object) >= 0)
8130     return Qt;
       /* Otherwise accept only a symbol that carries a non-nil
          coding-system-define-form property.  */
8131   if (SYMBOLP (object)
8132       && ! NILP (Fget (object, Qcoding_system_define_form)))
8133     return Qt;
       return Qnil;
8134 }
8135
8136 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8137        Sread_non_nil_coding_system, 1, 1, 0,
8138        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8139   (Lisp_Object prompt)
8140 {
8141   Lisp_Object name;
8142
       /* Ask again for as long as the answer is the empty string.  */
8143   for (;;)
8144     {
8145       name = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8146                                Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8147       if (SCHARS (name) != 0)
             break;
8148     }
       return Fintern (name, Qnil);
8149 }
8150
8151 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8152        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8153 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8154 Ignores case when completing coding systems (all Emacs coding systems
8155 are lower-case).  */)
8156   (Lisp_Object prompt, Lisp_Object default_coding_system)
8157 {
8158   int specpdl_mark = SPECPDL_INDEX ();
8159   Lisp_Object input;
8160
       /* The default is handed to Fcompleting_read as a name, so a symbol
          is replaced by its name first.  */
8161   if (SYMBOLP (default_coding_system))
8162     default_coding_system = SYMBOL_NAME (default_coding_system);

       /* completion-ignore-case is bound to t only around the read.  */
8163   specbind (Qcompletion_ignore_case, Qt);
8164   input = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8165                             Qt, Qnil, Qcoding_system_history,
8166                             default_coding_system, Qnil);
8167   unbind_to (specpdl_mark, Qnil);

       /* An empty answer yields nil; anything else is interned.  */
8168   if (SCHARS (input) == 0)
         return Qnil;
       return Fintern (input, Qnil);
8169 }
8170
8171 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8172        1, 1, 0,
8173        doc: /* Check validity of CODING-SYSTEM.
8174 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8175 It is valid if it is nil or a symbol defined as a coding system by the
8176 function `define-coding-system'.  */)
8177   (Lisp_Object coding_system)
8178 {
8179   Lisp_Object pending_form
8180     = Fget (coding_system, Qcoding_system_define_form);
8181
       /* A definition form is attached to the symbol: clear the property,
          then evaluate the form, before testing validity.  */
8182   if (! NILP (pending_form))
8183     {
8184       Fput (coding_system, Qcoding_system_define_form, Qnil);
8185       safe_eval (pending_form);
8186     }
8187   if (NILP (Fcoding_system_p (coding_system)))
8188     xsignal1 (Qcoding_system_error, coding_system);
8189   return coding_system;
8190 }
8191
8192 \f
8193 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8194    HIGHEST is nonzero, return the coding system of the highest
8195    priority among the detected coding systems.  Otherwise return a
8196    list of detected coding systems sorted by their priorities.  If
8197    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8198    multibyte form but contain only ASCII and eight-bit chars.
8199    Otherwise, the bytes are raw bytes.
8200
8201    CODING-SYSTEM controls the detection as below:
8202
8203    If it is nil, detect both text-format and eol-format.  If the
8204    text-format part of CODING-SYSTEM is already specified
8205    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8206    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8207    detect only text-format.  */
8208
8209 Lisp_Object
8210 detect_coding_system (const unsigned char *src,
8211                       EMACS_INT src_chars, EMACS_INT src_bytes,
8212                       int highest, int multibytep,
8213                       Lisp_Object coding_system)
8214 {
8215   const unsigned char *src_end = src + src_bytes;
8216   Lisp_Object attrs, eol_type;
8217   Lisp_Object val = Qnil;
8218   struct coding_system coding;
8219   int id;
8220   struct coding_detection_info detect_info;
8221   enum coding_category base_category;
8222   int null_byte_found = 0, eight_bit_found = 0;
8223
       /* nil means `undecided'.  After setup, CODING_SYSTEM is replaced by
          the base name taken from the attributes.  */
8224   if (NILP (coding_system))
8225     coding_system = Qundecided;
8226   setup_coding_system (coding_system, &coding);
8227   attrs = CODING_ID_ATTRS (coding.id);
8228   eol_type = CODING_ID_EOL_TYPE (coding.id);
8229   coding_system = CODING_ATTR_BASE_NAME (attrs);
8230
       /* Describe the input to the detectors.  */
8231   coding.source = src;
8232   coding.src_chars = src_chars;
8233   coding.src_bytes = src_bytes;
8234   coding.src_multibyte = multibytep;
8235   coding.consumed = 0;
8236   coding.mode |= CODING_MODE_LAST_BLOCK;
8237   coding.head_ascii = 0;
8238
8239   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8240
8241   /* At first, detect text-format if necessary.  */
8242   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8243   if (base_category == coding_category_undecided)
8244     {
8245       enum coding_category category;
8246       struct coding_system *this;
8247       int c, i;
8248
8249       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
           /* coding.head_ascii counts the bytes scanned before the first
              eight-bit byte.  The scan stops early once both a null byte
              and an eight-bit byte have been seen.  */
8250       for (; src < src_end; src++)
8251         {
8252           c = *src;
8253           if (c & 0x80)
8254             {
8255               eight_bit_found = 1;
8256               if (null_byte_found)
8257                 break;
8258             }
8259           else if (c < 0x20)
8260             {
8261               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8262                   && ! inhibit_iso_escape_detection
8263                   && ! detect_info.checked)
8264                 {
8265                   if (detect_coding_iso_2022 (&coding, &detect_info))
8266                     {
8267                       /* We have scanned the whole data.  */
8268                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8269                         {
8270                           /* We didn't find an 8-bit code.  We may
8271                              have found a null-byte, but it's very
8272                              rare that a binary file conforms to
8273                              ISO-2022.  */
8274                           src = src_end;
8275                           coding.head_ascii = src - coding.source;
8276                         }
8277                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8278                       break;
8279                     }
8280                 }
8281               else if (! c && !inhibit_null_byte_detection)
8282                 {
8283                   null_byte_found = 1;
8284                   if (eight_bit_found)
8285                     break;
8286                 }
8287               if (! eight_bit_found)
8288                 coding.head_ascii++;
8289             }
8290           else if (! eight_bit_found)
8291             coding.head_ascii++;
8292         }
8293
           /* Run the per-category detectors only if the scan saw something
              other than plain ASCII, or the ISO-2022 detector already
              found a category.  */
8294       if (null_byte_found || eight_bit_found
8295           || coding.head_ascii < coding.src_bytes
8296           || detect_info.found)
8297         {
8298           if (coding.head_ascii == coding.src_bytes)
8299             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8300             for (i = 0; i < coding_category_raw_text; i++)
8301               {
8302                 category = coding_priorities[i];
8303                 this = coding_categories + category;
8304                 if (detect_info.found & (1 << category))
8305                   break;
8306               }
8307           else
8308             {
                   /* With a null byte, every category except the UTF-16
                      ones is marked both checked and rejected.  */
8309               if (null_byte_found)
8310                 {
8311                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8312                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8313                 }
                   /* Try the categories in priority order.  When HIGHEST
                      is nonzero, stop at the first one that is found.  */
8314               for (i = 0; i < coding_category_raw_text; i++)
8315                 {
8316                   category = coding_priorities[i];
8317                   this = coding_categories + category;
8318
8319                   if (this->id < 0)
8320                     {
8321                       /* No coding system of this category is defined.  */
8322                       detect_info.rejected |= (1 << category);
8323                     }
8324                   else if (category >= coding_category_raw_text)
8325                     continue;
8326                   else if (detect_info.checked & (1 << category))
8327                     {
8328                       if (highest
8329                           && (detect_info.found & (1 << category)))
8330                         break;
8331                     }
8332                   else if ((*(this->detector)) (&coding, &detect_info)
8333                            && highest
8334                            && (detect_info.found & (1 << category)))
8335                     {
                           /* For the auto-detecting UTF-16 category, report
                              the concrete byte order that was found.  */
8336                       if (category == coding_category_utf_16_auto)
8337                         {
8338                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8339                             category = coding_category_utf_16_le;
8340                           else
8341                             category = coding_category_utf_16_be;
8342                         }
8343                       break;
8344                     }
8345                 }
8346             }
8347         }
8348
           /* Build VAL, a list of coding-system ids, from the detection
              result.  Everything rejected, or a null byte seen: report
              `no-conversion'.  */
8349       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8350           || null_byte_found)
8351         {
8352           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8353           id = CODING_SYSTEM_ID (Qno_conversion);
8354           val = Fcons (make_number (id), Qnil);
8355         }
           /* Nothing rejected and nothing found: report the coding system
              of the undecided category.  */
8356       else if (! detect_info.rejected && ! detect_info.found)
8357         {
8358           detect_info.found = CATEGORY_MASK_ANY;
8359           id = coding_categories[coding_category_undecided].id;
8360           val = Fcons (make_number (id), Qnil);
8361         }
8362       else if (highest)
8363         {
               /* Use the category the loops above stopped at, or else the
                  first category in priority order that was not rejected.  */
8364           if (detect_info.found)
8365             {
8366               detect_info.found = 1 << category;
8367               val = Fcons (make_number (this->id), Qnil);
8368             }
8369           else
8370             for (i = 0; i < coding_category_raw_text; i++)
8371               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8372                 {
8373                   detect_info.found = 1 << coding_priorities[i];
8374                   id = coding_categories[coding_priorities[i]].id;
8375                   val = Fcons (make_number (id), Qnil);
8376                   break;
8377                 }
8378         }
8379       else
8380         {
8381           int mask = detect_info.rejected | detect_info.found;
8382           int found = 0;
8383
               /* Both loops walk from lowest to highest priority and push
                  onto the front of VAL, so VAL ends up in priority order.
                  The first loop adds the categories that were neither
                  rejected nor found; the second puts the found ones ahead
                  of them.  */
8384           for (i = coding_category_raw_text - 1; i >= 0; i--)
8385             {
8386               category = coding_priorities[i];
8387               if (! (mask & (1 << category)))
8388                 {
8389                   found |= 1 << category;
8390                   id = coding_categories[category].id;
8391                   if (id >= 0)
8392                     val = Fcons (make_number (id), val);
8393                 }
8394             }
8395           for (i = coding_category_raw_text - 1; i >= 0; i--)
8396             {
8397               category = coding_priorities[i];
8398               if (detect_info.found & (1 << category))
8399                 {
8400                   id = coding_categories[category].id;
8401                   val = Fcons (make_number (id), val);
8402                 }
8403             }
8404           detect_info.found |= found;
8405         }
8406     }
8407   else if (base_category == coding_category_utf_8_auto)
8408     {
           /* Only decide between UTF-8 with and without signature.  */
8409       if (detect_coding_utf_8 (&coding, &detect_info))
8410         {
8411           struct coding_system *this;
8412
8413           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8414             this = coding_categories + coding_category_utf_8_sig;
8415           else
8416             this = coding_categories + coding_category_utf_8_nosig;
8417           val = Fcons (make_number (this->id), Qnil);
8418         }
8419     }
8420   else if (base_category == coding_category_utf_16_auto)
8421     {
           /* Only decide which UTF-16 variant applies.  */
8422       if (detect_coding_utf_16 (&coding, &detect_info))
8423         {
8424           struct coding_system *this;
8425
8426           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8427             this = coding_categories + coding_category_utf_16_le;
8428           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8429             this = coding_categories + coding_category_utf_16_be;
8430           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8431             this = coding_categories + coding_category_utf_16_be_nosig;
8432           else
8433             this = coding_categories + coding_category_utf_16_le_nosig;
8434           val = Fcons (make_number (this->id), Qnil);
8435         }
8436     }
8437   else
8438     {
           /* The text-format is already specified: report it as is.  */
8439       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8440       val = Fcons (make_number (coding.id), Qnil);
8441     }
8442
8443   /* Then, detect eol-format if necessary.  */
8444   {
8445     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8446     Lisp_Object tail;
8447
         /* A vector eol-type means the eol-format is still to be detected;
            it is detected separately for UTF-16 BE, UTF-16 LE and all
            other categories.  */
8448     if (VECTORP (eol_type))
8449       {
8450         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8451           {
8452             if (null_byte_found)
8453               normal_eol = EOL_SEEN_LF;
8454             else
8455               normal_eol = detect_eol (coding.source, src_bytes,
8456                                        coding_category_raw_text);
8457           }
8458         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8459                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8460           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8461                                       coding_category_utf_16_be);
8462         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8463                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8464           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8465                                       coding_category_utf_16_le);
8466       }
8467     else
8468       {
             /* The eol-format is fixed by CODING_SYSTEM.  */
8469         if (EQ (eol_type, Qunix))
8470           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8471         else if (EQ (eol_type, Qdos))
8472           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8473         else
8474           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8475       }
8476
         /* Replace each id in VAL: by element 0, 1 or 2 of the coding
            system's eol-type vector for LF, CRLF or CR respectively, or by
            the coding system's name when no eol-format applies.  */
8477     for (tail = val; CONSP (tail); tail = XCDR (tail))
8478       {
8479         enum coding_category category;
8480         int this_eol;
8481
8482         id = XINT (XCAR (tail));
8483         attrs = CODING_ID_ATTRS (id);
8484         category = XINT (CODING_ATTR_CATEGORY (attrs));
8485         eol_type = CODING_ID_EOL_TYPE (id);
8486         if (VECTORP (eol_type))
8487           {
8488             if (category == coding_category_utf_16_be
8489                 || category == coding_category_utf_16_be_nosig)
8490               this_eol = utf_16_be_eol;
8491             else if (category == coding_category_utf_16_le
8492                      || category == coding_category_utf_16_le_nosig)
8493               this_eol = utf_16_le_eol;
8494             else
8495               this_eol = normal_eol;
8496
8497             if (this_eol == EOL_SEEN_LF)
8498               XSETCAR (tail, AREF (eol_type, 0));
8499             else if (this_eol == EOL_SEEN_CRLF)
8500               XSETCAR (tail, AREF (eol_type, 1));
8501             else if (this_eol == EOL_SEEN_CR)
8502               XSETCAR (tail, AREF (eol_type, 2));
8503             else
8504               XSETCAR (tail, CODING_ID_NAME (id));
8505           }
8506         else
8507           XSETCAR (tail, CODING_ID_NAME (id));
8508       }
8509   }
8510
       /* With HIGHEST, return just the first element (or nil).  */
8511   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8512 }
8513
8514
8515 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8516        2, 3, 0,
8517        doc: /* Detect coding system of the text in the region between START and END.
8518 Return a list of possible coding systems ordered by priority.
8519 The coding systems to try and their priorities follows what
8520 the function `coding-system-priority-list' (which see) returns.
8521
8522 If only ASCII characters are found (except for such ISO-2022 control
8523 characters as ESC), it returns a list of single element `undecided'
8524 or its subsidiary coding system according to a detected end-of-line
8525 format.
8526
8527 If optional argument HIGHEST is non-nil, return the coding system of
8528 highest priority.  */)
8529   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8530 {
       /* Positions are kept as EMACS_INT: they are handed on to
          detect_coding_system as EMACS_INT counts, and an `int' would
          truncate them wherever EMACS_INT is wider than int.  */
8531   EMACS_INT from, to;
8532   EMACS_INT from_byte, to_byte;
8533
8534   CHECK_NUMBER_COERCE_MARKER (start);
8535   CHECK_NUMBER_COERCE_MARKER (end);
8536
8537   validate_region (&start, &end);
8538   from = XINT (start), to = XINT (end);
8539   from_byte = CHAR_TO_BYTE (from);
8540   to_byte = CHAR_TO_BYTE (to);
8541
       /* If the gap lies inside the region, move it to the end of the
          region; presumably so that the bytes starting at FROM_BYTE can
          be scanned as one contiguous run.  */
8542   if (from < GPT && to >= GPT)
8543     move_gap_both (to, to_byte);
8544
       /* Detect both text-format and eol-format (CODING-SYSTEM is nil).  */
8545   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8546                                to - from, to_byte - from_byte,
8547                                !NILP (highest),
8548                                !NILP (current_buffer
8549                                       ->enable_multibyte_characters),
8550                                Qnil);
8551 }
8552
8553 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8554        1, 2, 0,
8555        doc: /* Detect coding system of the text in STRING.
8556 Return a list of possible coding systems ordered by priority.
8557 The coding systems to try and their priorities follows what
8558 the function `coding-system-priority-list' (which see) returns.
8559
8560 If only ASCII characters are found (except for such ISO-2022 control
8561 characters as ESC), it returns a list of single element `undecided'
8562 or its subsidiary coding system according to a detected end-of-line
8563 format.
8564
8565 If optional argument HIGHEST is non-nil, return the coding system of
8566 highest priority.  */)
8567   (Lisp_Object string, Lisp_Object highest)
8568 {
8569   int want_highest = ! NILP (highest);
8570   int multibyte;
8571
       /* STRING must be validated before its contents are looked at.  */
8572   CHECK_STRING (string);
8573   multibyte = STRING_MULTIBYTE (string);
8574
       /* Detect both text-format and eol-format (CODING-SYSTEM is nil).  */
       return detect_coding_system (SDATA (string),
                                    SCHARS (string), SBYTES (string),
                                    want_highest, multibyte, Qnil);
8575 }
8576
8577 /* Return nonzero if C belongs to one of the charsets listed in ATTRS,
        the attribute vector of a coding system.  C is first mapped through
        the translation table stored in ATTRS, if there is one.  */
8578 static INLINE int
8579 char_encodable_p (int c, Lisp_Object attrs)
8580 {
8581   Lisp_Object table = CODING_ATTR_TRANS_TBL (attrs);
8582   Lisp_Object rest;
8583
8584   if (! NILP (table))
8585     c = translate_char (table, c);
8586
8587   rest = CODING_ATTR_CHARSET_LIST (attrs);
8588   while (CONSP (rest))
8589     {
8590       struct charset *candidate = CHARSET_FROM_ID (XINT (XCAR (rest)));
8591
8592       if (CHAR_CHARSET_P (c, candidate))
8593         return 1;
8594       rest = XCDR (rest);
8595     }
       /* No charset matched.  The walk ends on nil for a proper list; any
          other non-cons tail still yields nonzero.  */
       return ! NILP (rest);
8596 }
8597
8598
8599 /* Return a list of coding systems that safely encode the text between
8600    START and END.  If EXCLUDE is non-nil, it is a list of coding
8601    systems not to check.  The returned list doesn't contain any such
8602    coding systems.  In any case, if the text contains only ASCII or is
8603    unibyte, return t.  */
8604
8605 DEFUN ("find-coding-systems-region-internal",
8606        Ffind_coding_systems_region_internal,
8607        Sfind_coding_systems_region_internal, 2, 3, 0,
8608        doc: /* Internal use only.  */)
8609   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8610 {
8611   Lisp_Object coding_attrs_list, safe_codings;
8612   EMACS_INT start_byte, end_byte;
8613   const unsigned char *p, *pbeg, *pend;
8614   int c;
8615   Lisp_Object tail, elt, work_table;
8616
       /* START may be a string, in which case the whole string is checked
          and END is not looked at.  */
8617   if (STRINGP (start))
8618     {
           /* A unibyte string, or a multibyte one with as many bytes as
              characters, needs no check.  */
8619       if (!STRING_MULTIBYTE (start)
8620           || SCHARS (start) == SBYTES (start))
8621         return Qt;
8622       start_byte = 0;
8623       end_byte = SBYTES (start);
8624     }
8625   else
8626     {
8627       CHECK_NUMBER_COERCE_MARKER (start);
8628       CHECK_NUMBER_COERCE_MARKER (end);
8629       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8630         args_out_of_range (start, end);
           /* Likewise for a unibyte buffer or a region with as many bytes
              as characters.  */
8631       if (NILP (current_buffer->enable_multibyte_characters))
8632         return Qt;
8633       start_byte = CHAR_TO_BYTE (XINT (start));
8634       end_byte = CHAR_TO_BYTE (XINT (end));
8635       if (XINT (end) - XINT (start) == end_byte - start_byte)
8636         return Qt;
8637
           /* If the gap is strictly inside the region, move it to the
              nearer end of the region.  */
8638       if (XINT (start) < GPT && XINT (end) > GPT)
8639         {
8640           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8641             move_gap_both (XINT (start), start_byte);
8642           else
8643             move_gap_both (XINT (end), end_byte);
8644         }
8645     }
8646
       /* Collect the attribute vectors of the candidate coding systems:
          those not listed in EXCLUDE whose name is their own base name and
          whose type is not `undecided'.  The result of
          get_translation_table is stored into each vector, where
          char_encodable_p reads it back.  */
8647   coding_attrs_list = Qnil;
8648   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8649     if (NILP (exclude)
8650         || NILP (Fmemq (XCAR (tail), exclude)))
8651       {
8652         Lisp_Object attrs;
8653
8654         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8655         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8656             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8657           {
8658             ASET (attrs, coding_attr_trans_tbl,
8659                   get_translation_table (attrs, 1, NULL));
8660             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8661           }
8662       }
8663
8664   if (STRINGP (start))
8665     p = pbeg = SDATA (start);
8666   else
8667     p = pbeg = BYTE_POS_ADDR (start_byte);
8668   pend = p + (end_byte - start_byte);
8669
       /* Trim the ASCII bytes at both ends of the text.  */
8670   while (p < pend && ASCII_BYTE_P (*p)) p++;
8671   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8672
       /* WORK_TABLE records the characters that were already checked.  */
8673   work_table = Fmake_char_table (Qnil, Qnil);
8674   while (p < pend)
8675     {
8676       if (ASCII_BYTE_P (*p))
8677         p++;
8678       else
8679         {
8680           c = STRING_CHAR_ADVANCE (p);
8681           if (!NILP (char_table_ref (work_table, c)))
8682             /* This character was already checked.  Ignore it.  */
8683             continue;
8684
               /* Drop every candidate that cannot encode C.  A cell is
                  removed in place by copying the next cell into it; the
                  last cell cannot be removed that way, so its car is set
                  to nil instead (nil entries are skipped).  */
8685           charset_map_loaded = 0;
8686           for (tail = coding_attrs_list; CONSP (tail);)
8687             {
8688               elt = XCAR (tail);
8689               if (NILP (elt))
8690                 tail = XCDR (tail);
8691               else if (char_encodable_p (c, elt))
8692                 tail = XCDR (tail);
8693               else if (CONSP (XCDR (tail)))
8694                 {
8695                   XSETCAR (tail, XCAR (XCDR (tail)));
8696                   XSETCDR (tail, XCDR (XCDR (tail)));
8697                 }
8698               else
8699                 {
8700                   XSETCAR (tail, Qnil);
8701                   tail = XCDR (tail);
8702                 }
8703             }
               /* charset_map_loaded was set during the checks above:
                  recompute P and PEND from fresh base addresses,
                  presumably because loading a charset map can relocate
                  the text.  */
8704           if (charset_map_loaded)
8705             {
8706               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8707
8708               if (STRINGP (start))
8709                 pbeg = SDATA (start);
8710               else
8711                 pbeg = BYTE_POS_ADDR (start_byte);
8712               p = pbeg + p_offset;
8713               pend = pbeg + pend_offset;
8714             }
8715           char_table_set (work_table, c, Qt);
8716         }
8717     }
8718
       /* `raw-text' and `no-conversion' are always part of the result; the
          surviving candidates are pushed in front of them by base name.  */
8719   safe_codings = list2 (Qraw_text, Qno_conversion);
8720   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8721     if (! NILP (XCAR (tail)))
8722       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8723
8724   return safe_codings;
8725 }
8726
8727
8728 DEFUN ("unencodable-char-position", Funencodable_char_position,
8729        Sunencodable_char_position, 3, 5, 0,
8730        doc: /*
8731 Return position of first un-encodable character in a region.
8732 START and END specify the region and CODING-SYSTEM specifies the
8733 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8734
8735 If optional 4th argument COUNT is non-nil, it specifies at most how
8736 many un-encodable characters to search.  In this case, the value is a
8737 list of positions.
8738
8739 If optional 5th argument STRING is non-nil, it is a string to search
8740 for un-encodable characters.  In that case, START and END are indexes
8741 to the string.  */)
8742   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8743 {
8744   int n;
8745   struct coding_system coding;
8746   Lisp_Object attrs, charset_list, translation_table;
8747   Lisp_Object positions;
8748   int from, to;
8749   const unsigned char *p, *stop, *pend;
8750   int ascii_compatible;
8751
8752   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8753   attrs = CODING_ID_ATTRS (coding.id);
8754   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8755     return Qnil;
8756   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8757   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8758   translation_table = get_translation_table (attrs, 1, NULL);
8759
8760   if (NILP (string))
8761     {
8762       validate_region (&start, &end);
8763       from = XINT (start);
8764       to = XINT (end);
8765       if (NILP (current_buffer->enable_multibyte_characters)
8766           || (ascii_compatible
8767               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8768         return Qnil;
8769       p = CHAR_POS_ADDR (from);
8770       pend = CHAR_POS_ADDR (to);
8771       if (from < GPT && to >= GPT)
8772         stop = GPT_ADDR;
8773       else
8774         stop = pend;
8775     }
8776   else
8777     {
8778       CHECK_STRING (string);
8779       CHECK_NATNUM (start);
8780       CHECK_NATNUM (end);
8781       from = XINT (start);
8782       to = XINT (end);
8783       if (from > to
8784           || to > SCHARS (string))
8785         args_out_of_range_3 (string, start, end);
8786       if (! STRING_MULTIBYTE (string))
8787         return Qnil;
8788       p = SDATA (string) + string_char_to_byte (string, from);
8789       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8790       if (ascii_compatible && (to - from) == (pend - p))
8791         return Qnil;
8792     }
8793
8794   if (NILP (count))
8795     n = 1;
8796   else
8797     {
8798       CHECK_NATNUM (count);
8799       n = XINT (count);
8800     }
8801
8802   positions = Qnil;
8803   while (1)
8804     {
8805       int c;
8806
8807       if (ascii_compatible)
8808         while (p < stop && ASCII_BYTE_P (*p))
8809           p++, from++;
8810       if (p >= stop)
8811         {
8812           if (p >= pend)
8813             break;
8814           stop = pend;
8815           p = GAP_END_ADDR;
8816         }
8817
8818       c = STRING_CHAR_ADVANCE (p);
8819       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8820           && ! char_charset (translate_char (translation_table, c),
8821                              charset_list, NULL))
8822         {
8823           positions = Fcons (make_number (from), positions);
8824           n--;
8825           if (n == 0)
8826             break;
8827         }
8828
8829       from++;
8830     }
8831
8832   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8833 }
8834
8835
8836 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8837        Scheck_coding_systems_region, 3, 3, 0,
8838        doc: /* Check if the region is encodable by coding systems.
8839
8840 START and END are buffer positions specifying the region.
8841 CODING-SYSTEM-LIST is a list of coding systems to check.
8842
8843 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8844 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8845 whole region, POS0, POS1, ... are buffer positions where non-encodable
8846 characters are found.
8847
8848 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8849 value is nil.
8850
8851 START may be a string.  In that case, check if the string is
8852 encodable, and the value contains indices to the string instead of
8853 buffer positions.  END is ignored.
8854
8855 If the current buffer (or START if it is a string) is unibyte, the value
8856 is nil.  */)
8857   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8858 {
8859   Lisp_Object list;
8860   EMACS_INT start_byte, end_byte;
8861   int pos;
8862   const unsigned char *p, *pbeg, *pend;
8863   int c;
8864   Lisp_Object tail, elt, attrs;
8865
8866   if (STRINGP (start))
8867     {
8868       if (!STRING_MULTIBYTE (start)
8869           || SCHARS (start) == SBYTES (start))
8870         return Qnil;
8871       start_byte = 0;
8872       end_byte = SBYTES (start);
8873       pos = 0;
8874     }
8875   else
8876     {
8877       CHECK_NUMBER_COERCE_MARKER (start);
8878       CHECK_NUMBER_COERCE_MARKER (end);
8879       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8880         args_out_of_range (start, end);
8881       if (NILP (current_buffer->enable_multibyte_characters))
8882         return Qnil;
8883       start_byte = CHAR_TO_BYTE (XINT (start));
8884       end_byte = CHAR_TO_BYTE (XINT (end));
8885       if (XINT (end) - XINT (start) == end_byte - start_byte)
8886         return Qnil;
8887
8888       if (XINT (start) < GPT && XINT (end) > GPT)
8889         {
8890           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8891             move_gap_both (XINT (start), start_byte);
8892           else
8893             move_gap_both (XINT (end), end_byte);
8894         }
8895       pos = XINT (start);
8896     }
8897
8898   list = Qnil;
8899   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8900     {
8901       elt = XCAR (tail);
8902       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8903       ASET (attrs, coding_attr_trans_tbl,
8904             get_translation_table (attrs, 1, NULL));
8905       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8906     }
8907
8908   if (STRINGP (start))
8909     p = pbeg = SDATA (start);
8910   else
8911     p = pbeg = BYTE_POS_ADDR (start_byte);
8912   pend = p + (end_byte - start_byte);
8913
8914   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8915   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8916
8917   while (p < pend)
8918     {
8919       if (ASCII_BYTE_P (*p))
8920         p++;
8921       else
8922         {
8923           c = STRING_CHAR_ADVANCE (p);
8924
8925           charset_map_loaded = 0;
8926           for (tail = list; CONSP (tail); tail = XCDR (tail))
8927             {
8928               elt = XCDR (XCAR (tail));
8929               if (! char_encodable_p (c, XCAR (elt)))
8930                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8931             }
8932           if (charset_map_loaded)
8933             {
8934               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8935
8936               if (STRINGP (start))
8937                 pbeg = SDATA (start);
8938               else
8939                 pbeg = BYTE_POS_ADDR (start_byte);
8940               p = pbeg + p_offset;
8941               pend = pbeg + pend_offset;
8942             }
8943         }
8944       pos++;
8945     }
8946
8947   tail = list;
8948   list = Qnil;
8949   for (; CONSP (tail); tail = XCDR (tail))
8950     {
8951       elt = XCAR (tail);
8952       if (CONSP (XCDR (XCDR (elt))))
8953         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8954                       list);
8955     }
8956
8957   return list;
8958 }
8959
8960
8961 Lisp_Object
8962 code_convert_region (Lisp_Object start, Lisp_Object end,
8963                      Lisp_Object coding_system, Lisp_Object dst_object,
8964                      int encodep, int norecord)
8965 {
8966   struct coding_system coding;
8967   EMACS_INT from, from_byte, to, to_byte;
8968   Lisp_Object src_object;
8969
8970   CHECK_NUMBER_COERCE_MARKER (start);
8971   CHECK_NUMBER_COERCE_MARKER (end);
8972   if (NILP (coding_system))
8973     coding_system = Qno_conversion;
8974   else
8975     CHECK_CODING_SYSTEM (coding_system);
8976   src_object = Fcurrent_buffer ();
8977   if (NILP (dst_object))
8978     dst_object = src_object;
8979   else if (! EQ (dst_object, Qt))
8980     CHECK_BUFFER (dst_object);
8981
8982   validate_region (&start, &end);
8983   from = XFASTINT (start);
8984   from_byte = CHAR_TO_BYTE (from);
8985   to = XFASTINT (end);
8986   to_byte = CHAR_TO_BYTE (to);
8987
8988   setup_coding_system (coding_system, &coding);
8989   coding.mode |= CODING_MODE_LAST_BLOCK;
8990
8991   if (encodep)
8992     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8993                           dst_object);
8994   else
8995     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8996                           dst_object);
8997   if (! norecord)
8998     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8999
9000   return (BUFFERP (dst_object)
9001           ? make_number (coding.produced_char)
9002           : coding.dst_object);
9003 }
9004
9005
9006 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9007        3, 4, "r\nzCoding system: ",
9008        doc: /* Decode the current region from the specified coding system.
9009 When called from a program, takes four arguments:
9010         START, END, CODING-SYSTEM, and DESTINATION.
9011 START and END are buffer positions.
9012
9013 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9014 If nil, the region between START and END is replaced by the decoded text.
9015 If buffer, the decoded text is inserted in that buffer after point (point
9016 does not move).
9017 In those cases, the length of the decoded text is returned.
9018 If DESTINATION is t, the decoded text is returned.
9019
9020 This function sets `last-coding-system-used' to the precise coding system
9021 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9022 not fully specified.)  */)
9023   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9024 {
9025   return code_convert_region (start, end, coding_system, destination, 0, 0);
9026 }
9027
9028 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9029        3, 4, "r\nzCoding system: ",
9030        doc: /* Encode the current region by specified coding system.
9031 When called from a program, takes four arguments:
9032         START, END, CODING-SYSTEM and DESTINATION.
9033 START and END are buffer positions.
9034
9035 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9036 If nil, the region between START and END is replace by the encoded text.
9037 If buffer, the encoded text is inserted in that buffer after point (point
9038 does not move).
9039 In those cases, the length of the encoded text is returned.
9040 If DESTINATION is t, the encoded text is returned.
9041
9042 This function sets `last-coding-system-used' to the precise coding system
9043 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9044 not fully specified.)  */)
9045   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9046 {
9047   return code_convert_region (start, end, coding_system, destination, 1, 0);
9048 }
9049
9050 Lisp_Object
9051 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9052                      Lisp_Object dst_object, int encodep, int nocopy, int norecord)
9053 {
9054   struct coding_system coding;
9055   EMACS_INT chars, bytes;
9056
9057   CHECK_STRING (string);
9058   if (NILP (coding_system))
9059     {
9060       if (! norecord)
9061         Vlast_coding_system_used = Qno_conversion;
9062       if (NILP (dst_object))
9063         return (nocopy ? Fcopy_sequence (string) : string);
9064     }
9065
9066   if (NILP (coding_system))
9067     coding_system = Qno_conversion;
9068   else
9069     CHECK_CODING_SYSTEM (coding_system);
9070   if (NILP (dst_object))
9071     dst_object = Qt;
9072   else if (! EQ (dst_object, Qt))
9073     CHECK_BUFFER (dst_object);
9074
9075   setup_coding_system (coding_system, &coding);
9076   coding.mode |= CODING_MODE_LAST_BLOCK;
9077   chars = SCHARS (string);
9078   bytes = SBYTES (string);
9079   if (encodep)
9080     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9081   else
9082     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9083   if (! norecord)
9084     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9085
9086   return (BUFFERP (dst_object)
9087           ? make_number (coding.produced_char)
9088           : coding.dst_object);
9089 }
9090
9091
9092 /* Encode or decode STRING according to CODING_SYSTEM.
9093    Do not set Vlast_coding_system_used.
9094
9095    This function is called only from macros DECODE_FILE and
9096    ENCODE_FILE, thus we ignore character composition.  */
9097
9098 Lisp_Object
9099 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9100                               int encodep)
9101 {
9102   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9103 }
9104
9105
9106 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9107        2, 4, 0,
9108        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9109
9110 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9111 if the decoding operation is trivial.
9112
9113 Optional fourth arg BUFFER non-nil means that the decoded text is
9114 inserted in that buffer after point (point does not move).  In this
9115 case, the return value is the length of the decoded text.
9116
9117 This function sets `last-coding-system-used' to the precise coding system
9118 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9119 not fully specified.)  */)
9120   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9121 {
9122   return code_convert_string (string, coding_system, buffer,
9123                               0, ! NILP (nocopy), 0);
9124 }
9125
9126 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9127        2, 4, 0,
9128        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9129
9130 Optional third arg NOCOPY non-nil means it is OK to return STRING
9131 itself if the encoding operation is trivial.
9132
9133 Optional fourth arg BUFFER non-nil means that the encoded text is
9134 inserted in that buffer after point (point does not move).  In this
9135 case, the return value is the length of the encoded text.
9136
9137 This function sets `last-coding-system-used' to the precise coding system
9138 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9139 not fully specified.)  */)
9140   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9141 {
9142   return code_convert_string (string, coding_system, buffer,
9143                               1, ! NILP (nocopy), 1);
9144 }
9145
9146 \f
9147 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9148        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9149 Return the corresponding character.  */)
9150   (Lisp_Object code)
9151 {
9152   Lisp_Object spec, attrs, val;
9153   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9154   int c;
9155
9156   CHECK_NATNUM (code);
9157   c = XFASTINT (code);
9158   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9159   attrs = AREF (spec, 0);
9160
9161   if (ASCII_BYTE_P (c)
9162       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9163     return code;
9164
9165   val = CODING_ATTR_CHARSET_LIST (attrs);
9166   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9167   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9168   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9169
9170   if (c <= 0x7F)
9171     charset = charset_roman;
9172   else if (c >= 0xA0 && c < 0xDF)
9173     {
9174       charset = charset_kana;
9175       c -= 0x80;
9176     }
9177   else
9178     {
9179       int s1 = c >> 8, s2 = c & 0xFF;
9180
9181       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9182           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9183         error ("Invalid code: %d", code);
9184       SJIS_TO_JIS (c);
9185       charset = charset_kanji;
9186     }
9187   c = DECODE_CHAR (charset, c);
9188   if (c < 0)
9189     error ("Invalid code: %d", code);
9190   return make_number (c);
9191 }
9192
9193
9194 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9195        doc: /* Encode a Japanese character CH to shift_jis encoding.
9196 Return the corresponding code in SJIS.  */)
9197   (Lisp_Object ch)
9198 {
9199   Lisp_Object spec, attrs, charset_list;
9200   int c;
9201   struct charset *charset;
9202   unsigned code;
9203
9204   CHECK_CHARACTER (ch);
9205   c = XFASTINT (ch);
9206   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9207   attrs = AREF (spec, 0);
9208
9209   if (ASCII_CHAR_P (c)
9210       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9211     return ch;
9212
9213   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9214   charset = char_charset (c, charset_list, &code);
9215   if (code == CHARSET_INVALID_CODE (charset))
9216     error ("Can't encode by shift_jis encoding: %d", c);
9217   JIS_TO_SJIS (code);
9218
9219   return make_number (code);
9220 }
9221
9222 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9223        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9224 Return the corresponding character.  */)
9225   (Lisp_Object code)
9226 {
9227   Lisp_Object spec, attrs, val;
9228   struct charset *charset_roman, *charset_big5, *charset;
9229   int c;
9230
9231   CHECK_NATNUM (code);
9232   c = XFASTINT (code);
9233   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9234   attrs = AREF (spec, 0);
9235
9236   if (ASCII_BYTE_P (c)
9237       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9238     return code;
9239
9240   val = CODING_ATTR_CHARSET_LIST (attrs);
9241   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9242   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9243
9244   if (c <= 0x7F)
9245     charset = charset_roman;
9246   else
9247     {
9248       int b1 = c >> 8, b2 = c & 0x7F;
9249       if (b1 < 0xA1 || b1 > 0xFE
9250           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9251         error ("Invalid code: %d", code);
9252       charset = charset_big5;
9253     }
9254   c = DECODE_CHAR (charset, (unsigned )c);
9255   if (c < 0)
9256     error ("Invalid code: %d", code);
9257   return make_number (c);
9258 }
9259
9260 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9261        doc: /* Encode the Big5 character CH to BIG5 coding system.
9262 Return the corresponding character code in Big5.  */)
9263   (Lisp_Object ch)
9264 {
9265   Lisp_Object spec, attrs, charset_list;
9266   struct charset *charset;
9267   int c;
9268   unsigned code;
9269
9270   CHECK_CHARACTER (ch);
9271   c = XFASTINT (ch);
9272   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9273   attrs = AREF (spec, 0);
9274   if (ASCII_CHAR_P (c)
9275       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9276     return ch;
9277
9278   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9279   charset = char_charset (c, charset_list, &code);
9280   if (code == CHARSET_INVALID_CODE (charset))
9281     error ("Can't encode by Big5 encoding: %d", c);
9282
9283   return make_number (code);
9284 }
9285
9286 \f
9287 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9288        Sset_terminal_coding_system_internal, 1, 2, 0,
9289        doc: /* Internal use only.  */)
9290   (Lisp_Object coding_system, Lisp_Object terminal)
9291 {
9292   struct terminal *term = get_terminal (terminal, 1);
9293   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9294   CHECK_SYMBOL (coding_system);
9295   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9296   /* We had better not send unsafe characters to terminal.  */
9297   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9298   /* Character composition should be disabled.  */
9299   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9300   terminal_coding->src_multibyte = 1;
9301   terminal_coding->dst_multibyte = 0;
9302   if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9303     term->charset_list = coding_charset_list (terminal_coding);
9304   else
9305     term->charset_list = Fcons (make_number (charset_ascii), Qnil);
9306   return Qnil;
9307 }
9308
9309 DEFUN ("set-safe-terminal-coding-system-internal",
9310        Fset_safe_terminal_coding_system_internal,
9311        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9312        doc: /* Internal use only.  */)
9313   (Lisp_Object coding_system)
9314 {
9315   CHECK_SYMBOL (coding_system);
9316   setup_coding_system (Fcheck_coding_system (coding_system),
9317                        &safe_terminal_coding);
9318   /* Character composition should be disabled.  */
9319   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9320   safe_terminal_coding.src_multibyte = 1;
9321   safe_terminal_coding.dst_multibyte = 0;
9322   return Qnil;
9323 }
9324
9325 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9326        Sterminal_coding_system, 0, 1, 0,
9327        doc: /* Return coding system specified for terminal output on the given terminal.
9328 TERMINAL may be a terminal object, a frame, or nil for the selected
9329 frame's terminal device.  */)
9330   (Lisp_Object terminal)
9331 {
9332   struct coding_system *terminal_coding
9333     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9334   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9335
9336   /* For backward compatibility, return nil if it is `undecided'. */
9337   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9338 }
9339
9340 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9341        Sset_keyboard_coding_system_internal, 1, 2, 0,
9342        doc: /* Internal use only.  */)
9343   (Lisp_Object coding_system, Lisp_Object terminal)
9344 {
9345   struct terminal *t = get_terminal (terminal, 1);
9346   CHECK_SYMBOL (coding_system);
9347   if (NILP (coding_system))
9348     coding_system = Qno_conversion;
9349   else
9350     Fcheck_coding_system (coding_system);
9351   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9352   /* Character composition should be disabled.  */
9353   TERMINAL_KEYBOARD_CODING (t)->common_flags
9354     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9355   return Qnil;
9356 }
9357
9358 DEFUN ("keyboard-coding-system",
9359        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9360        doc: /* Return coding system specified for decoding keyboard input.  */)
9361   (Lisp_Object terminal)
9362 {
9363   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9364                          (get_terminal (terminal, 1))->id);
9365 }
9366
9367 \f
9368 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9369        Sfind_operation_coding_system,  1, MANY, 0,
9370        doc: /* Choose a coding system for an operation based on the target name.
9371 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9372 DECODING-SYSTEM is the coding system to use for decoding
9373 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9374 for encoding (in case OPERATION does encoding).
9375
9376 The first argument OPERATION specifies an I/O primitive:
9377   For file I/O, `insert-file-contents' or `write-region'.
9378   For process I/O, `call-process', `call-process-region', or `start-process'.
9379   For network I/O, `open-network-stream'.
9380
9381 The remaining arguments should be the same arguments that were passed
9382 to the primitive.  Depending on which primitive, one of those arguments
9383 is selected as the TARGET.  For example, if OPERATION does file I/O,
9384 whichever argument specifies the file name is TARGET.
9385
9386 TARGET has a meaning which depends on OPERATION:
9387   For file I/O, TARGET is a file name (except for the special case below).
9388   For process I/O, TARGET is a process name.
9389   For network I/O, TARGET is a service name or a port number.
9390
9391 This function looks up what is specified for TARGET in
9392 `file-coding-system-alist', `process-coding-system-alist',
9393 or `network-coding-system-alist' depending on OPERATION.
9394 They may specify a coding system, a cons of coding systems,
9395 or a function symbol to call.
9396 In the last case, we call the function with one argument,
9397 which is a list of all the arguments given to this function.
9398 If the function can't decide a coding system, it can return
9399 `undecided' so that the normal code-detection is performed.
9400
9401 If OPERATION is `insert-file-contents', the argument corresponding to
9402 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9403 file name to look up, and BUFFER is a buffer that contains the file's
9404 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9405 function to call for FILENAME, that function should examine the
9406 contents of BUFFER instead of reading the file.
9407
9408 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9409   (int nargs, Lisp_Object *args)
9410 {
9411   Lisp_Object operation, target_idx, target, val;
9412   register Lisp_Object chain;
9413
9414   if (nargs < 2)
9415     error ("Too few arguments");
9416   operation = args[0];
9417   if (!SYMBOLP (operation)
9418       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9419     error ("Invalid first argument");
9420   if (nargs < 1 + XINT (target_idx))
9421     error ("Too few arguments for operation: %s",
9422            SDATA (SYMBOL_NAME (operation)));
9423   target = args[XINT (target_idx) + 1];
9424   if (!(STRINGP (target)
9425         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9426             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9427         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9428     error ("Invalid %dth argument", XINT (target_idx) + 1);
9429   if (CONSP (target))
9430     target = XCAR (target);
9431
9432   chain = ((EQ (operation, Qinsert_file_contents)
9433             || EQ (operation, Qwrite_region))
9434            ? Vfile_coding_system_alist
9435            : (EQ (operation, Qopen_network_stream)
9436               ? Vnetwork_coding_system_alist
9437               : Vprocess_coding_system_alist));
9438   if (NILP (chain))
9439     return Qnil;
9440
9441   for (; CONSP (chain); chain = XCDR (chain))
9442     {
9443       Lisp_Object elt;
9444
9445       elt = XCAR (chain);
9446       if (CONSP (elt)
9447           && ((STRINGP (target)
9448                && STRINGP (XCAR (elt))
9449                && fast_string_match (XCAR (elt), target) >= 0)
9450               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9451         {
9452           val = XCDR (elt);
9453           /* Here, if VAL is both a valid coding system and a valid
9454              function symbol, we return VAL as a coding system.  */
9455           if (CONSP (val))
9456             return val;
9457           if (! SYMBOLP (val))
9458             return Qnil;
9459           if (! NILP (Fcoding_system_p (val)))
9460             return Fcons (val, val);
9461           if (! NILP (Ffboundp (val)))
9462             {
9463               /* We use call1 rather than safe_call1
9464                  so as to get bug reports about functions called here
9465                  which don't handle the current interface.  */
9466               val = call1 (val, Flist (nargs, args));
9467               if (CONSP (val))
9468                 return val;
9469               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9470                 return Fcons (val, val);
9471             }
9472           return Qnil;
9473         }
9474     }
9475   return Qnil;
9476 }
9477
9478 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9479        Sset_coding_system_priority, 0, MANY, 0,
9480        doc: /* Assign higher priority to the coding systems given as arguments.
9481 If multiple coding systems belong to the same category,
9482 all but the first one are ignored.
9483
9484 usage: (set-coding-system-priority &rest coding-systems)  */)
9485   (int nargs, Lisp_Object *args)
9486 {
9487   int i, j;
9488   int changed[coding_category_max];
9489   enum coding_category priorities[coding_category_max];
9490
9491   memset (changed, 0, sizeof changed);
9492
9493   for (i = j = 0; i < nargs; i++)
9494     {
9495       enum coding_category category;
9496       Lisp_Object spec, attrs;
9497
9498       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9499       attrs = AREF (spec, 0);
9500       category = XINT (CODING_ATTR_CATEGORY (attrs));
9501       if (changed[category])
9502         /* Ignore this coding system because a coding system of the
9503            same category already had a higher priority.  */
9504         continue;
9505       changed[category] = 1;
9506       priorities[j++] = category;
9507       if (coding_categories[category].id >= 0
9508           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9509         setup_coding_system (args[i], &coding_categories[category]);
9510       Fset (AREF (Vcoding_category_table, category), args[i]);
9511     }
9512
9513   /* Now we have decided top J priorities.  Reflect the order of the
9514      original priorities to the remaining priorities.  */
9515
9516   for (i = j, j = 0; i < coding_category_max; i++, j++)
9517     {
9518       while (j < coding_category_max
9519              && changed[coding_priorities[j]])
9520         j++;
9521       if (j == coding_category_max)
9522         abort ();
9523       priorities[i] = coding_priorities[j];
9524     }
9525
9526   memcpy (coding_priorities, priorities, sizeof priorities);
9527
9528   /* Update `coding-category-list'.  */
9529   Vcoding_category_list = Qnil;
9530   for (i = coding_category_max - 1; i >= 0; i--)
9531     Vcoding_category_list
9532       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9533                Vcoding_category_list);
9534
9535   return Qnil;
9536 }
9537
9538 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9539        Scoding_system_priority_list, 0, 1, 0,
9540        doc: /* Return a list of coding systems ordered by their priorities.
9541 The list contains a subset of coding systems; i.e. coding systems
9542 assigned to each coding category (see `coding-category-list').
9543
9544 HIGHESTP non-nil means just return the highest priority one.  */)
9545   (Lisp_Object highestp)
9546 {
9547   int i;
9548   Lisp_Object val;
9549
9550   for (i = 0, val = Qnil; i < coding_category_max; i++)
9551     {
9552       enum coding_category category = coding_priorities[i];
9553       int id = coding_categories[category].id;
9554       Lisp_Object attrs;
9555
9556       if (id < 0)
9557         continue;
9558       attrs = CODING_ID_ATTRS (id);
9559       if (! NILP (highestp))
9560         return CODING_ATTR_BASE_NAME (attrs);
9561       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9562     }
9563   return Fnreverse (val);
9564 }
9565
9566 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9567
9568 static Lisp_Object
9569 make_subsidiaries (Lisp_Object base)
9570 {
9571   Lisp_Object subsidiaries;
9572   int base_name_len = SBYTES (SYMBOL_NAME (base));
9573   char *buf = (char *) alloca (base_name_len + 6);
9574   int i;
9575
9576   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9577   subsidiaries = Fmake_vector (make_number (3), Qnil);
9578   for (i = 0; i < 3; i++)
9579     {
9580       memcpy (buf + base_name_len, suffixes[i], strlen (suffixes[i]) + 1);
9581       ASET (subsidiaries, i, intern (buf));
9582     }
9583   return subsidiaries;
9584 }
9585
9586
/* Lisp primitive that defines a coding system from the argument
   vector ARGS (NARGS elements, indexed by the coding_arg_* constants).
   It validates the arguments, fills an attribute vector, chooses the
   coding category from the coding type, and registers the coding
   system in Vcoding_system_hash_table, Vcoding_system_list and
   Vcoding_system_alist.  When the eol-type argument is nil, three
   subsidiary coding systems are registered as well.  Returns nil, or
   signals wrong-number-of-arguments when NARGS is too small for the
   requested coding type.  */
9587 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9588        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9589        doc: /* For internal use only.
9590 usage: (define-coding-system-internal ...)  */)
9591   (int nargs, Lisp_Object *args)
9592 {
9593   Lisp_Object name;
9594   Lisp_Object spec_vec;         /* [ ATTRS ALIASES EOL_TYPE ] */
9595   Lisp_Object attrs;            /* Vector of attributes.  */
9596   Lisp_Object eol_type;
9597   Lisp_Object aliases;
9598   Lisp_Object coding_type, charset_list, safe_charsets;
9599   enum coding_category category;
9600   Lisp_Object tail, val;
9601   int max_charset_id = 0;
9602   int i;
9603
       /* Every coding type needs at least the common arguments.  */
9604   if (nargs < coding_arg_max)
9605     goto short_args;
9606
9607   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9608
9609   name = args[coding_arg_name];
9610   CHECK_SYMBOL (name);
9611   CODING_ATTR_BASE_NAME (attrs) = name;
9612
       /* The mnemonic is either a string or a character.  */
9613   val = args[coding_arg_mnemonic];
9614   if (! STRINGP (val))
9615     CHECK_CHARACTER (val);
9616   CODING_ATTR_MNEMONIC (attrs) = val;
9617
9618   coding_type = args[coding_arg_coding_type];
9619   CHECK_SYMBOL (coding_type);
9620   CODING_ATTR_TYPE (attrs) = coding_type;
9621
       /* The charset-list argument is either a symbol or a list of
          charsets.  The symbols iso-2022 and emacs-mule select the
          predefined lists Viso_2022_charset_list and
          Vemacs_mule_charset_list, and are accepted only together with
          the coding type of the same name.  Otherwise the list is
          copied and each charset in the copy is replaced by its ID.
          In both cases max_charset_id is raised to the largest ID
          found in the list (it stays 0 for an empty list).  */
9622   charset_list = args[coding_arg_charset_list];
9623   if (SYMBOLP (charset_list))
9624     {
9625       if (EQ (charset_list, Qiso_2022))
9626         {
9627           if (! EQ (coding_type, Qiso_2022))
9628             error ("Invalid charset-list");
9629           charset_list = Viso_2022_charset_list;
9630         }
9631       else if (EQ (charset_list, Qemacs_mule))
9632         {
9633           if (! EQ (coding_type, Qemacs_mule))
9634             error ("Invalid charset-list");
9635           charset_list = Vemacs_mule_charset_list;
9636         }
9637       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9638         if (max_charset_id < XFASTINT (XCAR (tail)))
9639           max_charset_id = XFASTINT (XCAR (tail));
9640     }
9641   else
9642     {
9643       charset_list = Fcopy_sequence (charset_list);
9644       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9645         {
9646           struct charset *charset;
9647
9648           val = XCAR (tail);
9649           CHECK_CHARSET_GET_CHARSET (val, charset);
               /* An iso-2022 coding system rejects a charset whose ISO
                  final is negative, and an emacs-mule one rejects a
                  charset whose emacs-mule ID is negative.  */
9650           if (EQ (coding_type, Qiso_2022)
9651               ? CHARSET_ISO_FINAL (charset) < 0
9652               : EQ (coding_type, Qemacs_mule)
9653               ? CHARSET_EMACS_MULE_ID (charset) < 0
9654               : 0)
9655             error ("Can't handle charset `%s'",
9656                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9657
9658           XSETCAR (tail, make_number (charset->id));
9659           if (max_charset_id < charset->id)
9660             max_charset_id = charset->id;
9661         }
9662     }
9663   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9664
       /* Build a string with one byte per charset ID up to
          max_charset_id: 0 for a charset in CHARSET_LIST, 255 for any
          other ID.  */
9665   safe_charsets = make_uninit_string (max_charset_id + 1);
9666   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9667   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9668     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9669   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9670
       /* Initial value only; several coding types below override it.  */
9671   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9672
       /* A translation table argument must be a char-table, a cons or a
          symbol.  */
9673   val = args[coding_arg_decode_translation_table];
9674   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9675     CHECK_SYMBOL (val);
9676   CODING_ATTR_DECODE_TBL (attrs) = val;
9677
9678   val = args[coding_arg_encode_translation_table];
9679   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9680     CHECK_SYMBOL (val);
9681   CODING_ATTR_ENCODE_TBL (attrs) = val;
9682
9683   val = args[coding_arg_post_read_conversion];
9684   CHECK_SYMBOL (val);
9685   CODING_ATTR_POST_READ (attrs) = val;
9686
9687   val = args[coding_arg_pre_write_conversion];
9688   CHECK_SYMBOL (val);
9689   CODING_ATTR_PRE_WRITE (attrs) = val;
9690
       /* A nil default character means a space.  */
9691   val = args[coding_arg_default_char];
9692   if (NILP (val))
9693     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9694   else
9695     {
9696       CHECK_CHARACTER (val);
9697       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9698     }
9699
       /* Normalize the for-unibyte flag to nil or t.  */
9700   val = args[coding_arg_for_unibyte];
9701   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9702
9703   val = args[coding_arg_plist];
9704   CHECK_LIST (val);
9705   CODING_ATTR_PLIST (attrs) = val;
9706
       /* The rest of the setup depends on the coding type.  Each branch
          stores its type-specific attributes and selects CATEGORY; an
          unknown type signals an error.  */
9707   if (EQ (coding_type, Qcharset))
9708     {
9709       /* Generate a lisp vector of 256 elements.  Each element is nil,
9710          integer, or a list of charset IDs.
9711
9712          If Nth element is nil, the byte code N is invalid in this
9713          coding system.
9714
9715          If Nth element is a number NUM, N is the first byte of a
9716          charset whose ID is NUM.
9717
9718          If Nth element is a list of charset IDs, N is the first byte
9719          of one of them.  The list is sorted by dimensions of the
9720          charsets.  A charset of smaller dimension comes first. */
9721       val = Fmake_vector (make_number (256), Qnil);
9722
9723       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9724         {
9725           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9726           int dim = CHARSET_DIMENSION (charset);
               /* Index into code_space of the byte range walked below;
                  per the comment above these are the charset's possible
                  first bytes -- TODO confirm the code_space layout.  */
9727           int idx = (dim - 1) * 4;
9728
               /* One ASCII compatible charset is enough to mark the
                  coding system ASCII compatible.  */
9729           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9730             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9731
9732           for (i = charset->code_space[idx];
9733                i <= charset->code_space[idx + 1]; i++)
9734             {
9735               Lisp_Object tmp, tmp2;
9736               int dim2;
9737
9738               tmp = AREF (val, i);
9739               if (NILP (tmp))
9740                 tmp = XCAR (tail);
9741               else if (NUMBERP (tmp))
9742                 {
                       /* One charset is already recorded for this byte:
                          make a two-element list with the charset of
                          smaller dimension first.  */
9743                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9744                   if (dim < dim2)
9745                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9746                   else
9747                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9748                 }
9749               else
9750                 {
                       /* A list is already recorded: find the first
                          element whose dimension is larger than DIM.  */
9751                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9752                     {
9753                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9754                       if (dim < dim2)
9755                         break;
9756                     }
9757                   if (NILP (tmp2))
9758                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9759                   else
9760                     {
                           /* Insert in place before TMP2's element:
                              push a copy of TMP2's car behind it, then
                              overwrite TMP2's car with the new ID.  */
9761                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9762                       XSETCAR (tmp2, XCAR (tail));
9763                     }
9764                 }
9765               ASET (val, i, tmp);
9766             }
9767         }
9768       ASET (attrs, coding_attr_charset_valids, val);
9769       category = coding_category_charset;
9770     }
9771   else if (EQ (coding_type, Qccl))
9772     {
9773       Lisp_Object valids;
9774
9775       if (nargs < coding_arg_ccl_max)
9776         goto short_args;
9777
           /* A CCL program given as a vector is stored as a copy.  */
9778       val = args[coding_arg_ccl_decoder];
9779       CHECK_CCL_PROGRAM (val);
9780       if (VECTORP (val))
9781         val = Fcopy_sequence (val);
9782       ASET (attrs, coding_attr_ccl_decoder, val);
9783
9784       val = args[coding_arg_ccl_encoder];
9785       CHECK_CCL_PROGRAM (val);
9786       if (VECTORP (val))
9787         val = Fcopy_sequence (val);
9788       ASET (attrs, coding_attr_ccl_encoder, val);
9789
           /* VALIDS is a 256-byte string.  Byte N is set to 1 when code
              N is named by the valids argument, whose elements are
              either an integer in 0..255 or a cons (FROM . TO) with
              FROM <= TO <= 255.  */
9790       val = args[coding_arg_ccl_valids];
9791       valids = Fmake_string (make_number (256), make_number (0));
9792       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9793         {
9794           int from, to;
9795
9796           val = Fcar (tail);
9797           if (INTEGERP (val))
9798             {
9799               from = to = XINT (val);
9800               if (from < 0 || from > 255)
9801                 args_out_of_range_3 (val, make_number (0), make_number (255));
9802             }
9803           else
9804             {
9805               CHECK_CONS (val);
9806               CHECK_NATNUM_CAR (val);
9807               CHECK_NATNUM_CDR (val);
9808               from = XINT (XCAR (val));
9809               if (from > 255)
9810                 args_out_of_range_3 (XCAR (val),
9811                                      make_number (0), make_number (255));
9812               to = XINT (XCDR (val));
9813               if (to < from || to > 255)
9814                 args_out_of_range_3 (XCDR (val),
9815                                      XCAR (val), make_number (255));
9816             }
9817           for (i = from; i <= to; i++)
9818             SSET (valids, i, 1);
9819         }
9820       ASET (attrs, coding_attr_ccl_valids, valids);
9821
9822       category = coding_category_ccl;
9823     }
9824   else if (EQ (coding_type, Qutf_16))
9825     {
9826       Lisp_Object bom, endian;
9827
9828       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9829
9830       if (nargs < coding_arg_utf16_max)
9831         goto short_args;
9832
           /* BOM is nil, t, or a cons of two coding systems.  */
9833       bom = args[coding_arg_utf16_bom];
9834       if (! NILP (bom) && ! EQ (bom, Qt))
9835         {
9836           CHECK_CONS (bom);
9837           val = XCAR (bom);
9838           CHECK_CODING_SYSTEM (val);
9839           val = XCDR (bom);
9840           CHECK_CODING_SYSTEM (val);
9841         }
9842       ASET (attrs, coding_attr_utf_bom, bom);
9843
           /* ENDIAN is `big' or `little'; nil defaults to `big'.  */
9844       endian = args[coding_arg_utf16_endian];
9845       CHECK_SYMBOL (endian);
9846       if (NILP (endian))
9847         endian = Qbig;
9848       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9849         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9850       ASET (attrs, coding_attr_utf_16_endian, endian);
9851
           /* A cons BOM selects the auto category.  Otherwise a nil BOM
              selects the `nosig' category for the endianness, and a t
              BOM the category without that suffix.  */
9852       category = (CONSP (bom)
9853                   ? coding_category_utf_16_auto
9854                   : NILP (bom)
9855                   ? (EQ (endian, Qbig)
9856                      ? coding_category_utf_16_be_nosig
9857                      : coding_category_utf_16_le_nosig)
9858                   : (EQ (endian, Qbig)
9859                      ? coding_category_utf_16_be
9860                      : coding_category_utf_16_le));
9861     }
9862   else if (EQ (coding_type, Qiso_2022))
9863     {
9864       Lisp_Object initial, reg_usage, request, flags;
           /* NOTE: this I shadows the function-level I inside this
              branch.  After the loop below it holds the flags value.  */
9865       int i;
9866
9867       if (nargs < coding_arg_iso2022_max)
9868         goto short_args;
9869
           /* INITIAL is a copy of the argument vector in which each of
              the first four elements is converted from a charset to its
              ID, with -1 standing for nil.  */
9870       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9871       CHECK_VECTOR (initial);
9872       for (i = 0; i < 4; i++)
9873         {
9874           val = Faref (initial, make_number (i));
9875           if (! NILP (val))
9876             {
9877               struct charset *charset;
9878
9879               CHECK_CHARSET_GET_CHARSET (val, charset);
9880               ASET (initial, i, make_number (CHARSET_ID (charset)));
                   /* Only element 0 decides ASCII compatibility here.  */
9881               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9882                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9883             }
9884           else
9885             ASET (initial, i, make_number (-1));
9886         }
9887
           /* REG_USAGE must be a cons of two integers.  */
9888       reg_usage = args[coding_arg_iso2022_reg_usage];
9889       CHECK_CONS (reg_usage);
9890       CHECK_NUMBER_CAR (reg_usage);
9891       CHECK_NUMBER_CDR (reg_usage);
9892
           /* REQUEST is a list of conses (CHARSET . REGISTER) with
              REGISTER below 4; each charset is replaced by its ID.
              NOTE(review): presumably Fcopy_sequence copies only the
              list spine, in which case XSETCAR below also modifies the
              conses of the caller's argument -- confirm this is
              intended.  */
9893       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9894       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9895         {
9896           int id;
9897           Lisp_Object tmp;
9898
9899           val = Fcar (tail);
9900           CHECK_CONS (val);
9901           tmp = XCAR (val);
9902           CHECK_CHARSET_GET_ID (tmp, id);
9903           CHECK_NATNUM_CDR (val);
               /* NOTE(review): the value of XINT is passed to a %d
                  conversion; confirm the integer widths agree on all
                  supported platforms.  */
9904           if (XINT (XCDR (val)) >= 4)
9905             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9906           XSETCAR (val, make_number (id));
9907         }
9908
           /* I keeps the flags as given; the stored FLAGS additionally
              gets CODING_ISO_FLAG_FULL_SUPPORT when the charset-list
              argument was the symbol iso-2022.  */
9909       flags = args[coding_arg_iso2022_flags];
9910       CHECK_NATNUM (flags);
9911       i = XINT (flags);
9912       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9913         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9914
9915       ASET (attrs, coding_attr_iso_initial, initial);
9916       ASET (attrs, coding_attr_iso_usage, reg_usage);
9917       ASET (attrs, coding_attr_iso_request, request);
9918       ASET (attrs, coding_attr_iso_flags, flags);
9919       setup_iso_safe_charsets (attrs);
9920
           /* With the seven-bits flag the category is one of the iso-7
              variants; otherwise it is one of the iso-8 variants,
              chosen from the flags and from INITIAL[1].  */
9921       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9922         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9923                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9924                     ? coding_category_iso_7_else
9925                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9926                     ? coding_category_iso_7
9927                     : coding_category_iso_7_tight);
9928       else
9929         {
9930           int id = XINT (AREF (initial, 1));
9931
9932           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9933                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9934                        || id < 0)
9935                       ? coding_category_iso_8_else
9936                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9937                       ? coding_category_iso_8_1
9938                       : coding_category_iso_8_2);
9939         }
           /* Every category other than iso-8-1 and iso-8-2 has its
              ASCII compatible flag cleared.  */
9940       if (category != coding_category_iso_8_1
9941           && category != coding_category_iso_8_2)
9942         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9943     }
9944   else if (EQ (coding_type, Qemacs_mule))
9945     {
           /* The symbol emacs-mule as charset-list marks full support.  */
9946       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9947         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9948       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9949       category = coding_category_emacs_mule;
9950     }
9951   else if (EQ (coding_type, Qshift_jis))
9952     {
9953
9954       struct charset *charset;
9955
           /* CHARSET_LIST must hold three or four charsets whose
              dimensions are 1, 1, 2 and, if present, 2.  The local
              variable CHARSET_LIST is advanced while checking; the list
              stored in ATTRS above is not modified by that.  */
9956       if (XINT (Flength (charset_list)) != 3
9957           && XINT (Flength (charset_list)) != 4)
9958         error ("There should be three or four charsets");
9959
9960       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9961       if (CHARSET_DIMENSION (charset) != 1)
9962         error ("Dimension of charset %s is not one",
9963                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9964       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9965         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9966
9967       charset_list = XCDR (charset_list);
9968       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9969       if (CHARSET_DIMENSION (charset) != 1)
9970         error ("Dimension of charset %s is not one",
9971                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9972
9973       charset_list = XCDR (charset_list);
9974       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9975       if (CHARSET_DIMENSION (charset) != 2)
9976         error ("Dimension of charset %s is not two",
9977                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9978
9979       charset_list = XCDR (charset_list);
9980       if (! NILP (charset_list))
9981         {
9982           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9983           if (CHARSET_DIMENSION (charset) != 2)
9984             error ("Dimension of charset %s is not two",
9985                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9986         }
9987
9988       category = coding_category_sjis;
           /* Remember the most recently defined shift-jis coding system.  */
9989       Vsjis_coding_system = name;
9990     }
9991   else if (EQ (coding_type, Qbig5))
9992     {
9993       struct charset *charset;
9994
           /* CHARSET_LIST must hold exactly two charsets, of dimension
              1 and 2 in that order.  */
9995       if (XINT (Flength (charset_list)) != 2)
9996         error ("There should be just two charsets");
9997
9998       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9999       if (CHARSET_DIMENSION (charset) != 1)
10000         error ("Dimension of charset %s is not one",
10001                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10002       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10003         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10004
10005       charset_list = XCDR (charset_list);
10006       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10007       if (CHARSET_DIMENSION (charset) != 2)
10008         error ("Dimension of charset %s is not two",
10009                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10010
10011       category = coding_category_big5;
            /* Remember the most recently defined big5 coding system.  */
10012       Vbig5_coding_system = name;
10013     }
10014   else if (EQ (coding_type, Qraw_text))
10015     {
10016       category = coding_category_raw_text;
10017       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10018     }
10019   else if (EQ (coding_type, Qutf_8))
10020     {
10021       Lisp_Object bom;
10022
10023       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10024
10025       if (nargs < coding_arg_utf8_max)
10026         goto short_args;
10027
            /* BOM is nil, t, or a cons of two coding systems.  */
10028       bom = args[coding_arg_utf8_bom];
10029       if (! NILP (bom) && ! EQ (bom, Qt))
10030         {
10031           CHECK_CONS (bom);
10032           val = XCAR (bom);
10033           CHECK_CODING_SYSTEM (val);
10034           val = XCDR (bom);
10035           CHECK_CODING_SYSTEM (val);
10036         }
10037       ASET (attrs, coding_attr_utf_bom, bom);
10038
10039       category = (CONSP (bom) ? coding_category_utf_8_auto
10040                   : NILP (bom) ? coding_category_utf_8_nosig
10041                   : coding_category_utf_8_sig);
10042     }
10043   else if (EQ (coding_type, Qundecided))
10044     category = coding_category_undecided;
10045   else
10046     error ("Invalid coding system type: %s",
10047            SDATA (SYMBOL_NAME (coding_type)));
10048
        /* Record the category as an attribute, and push both the
           category symbol and the final ASCII compatible flag onto the
           front of the property list.  */
10049   CODING_ATTR_CATEGORY (attrs) = make_number (category);
10050   CODING_ATTR_PLIST (attrs)
10051     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10052                                 CODING_ATTR_PLIST (attrs)));
10053   CODING_ATTR_PLIST (attrs)
10054     = Fcons (QCascii_compatible_p,
10055              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10056                     CODING_ATTR_PLIST (attrs)));
10057
        /* The eol-type argument must be nil, unix, dos or mac.  */
10058   eol_type = args[coding_arg_eol_type];
10059   if (! NILP (eol_type)
10060       && ! EQ (eol_type, Qunix)
10061       && ! EQ (eol_type, Qdos)
10062       && ! EQ (eol_type, Qmac))
10063     error ("Invalid eol-type");
10064
10065   aliases = Fcons (name, Qnil);
10066
        /* With a nil eol-type, EOL_TYPE becomes the vector of the three
           subsidiary names, and each subsidiary is registered with a
           spec that shares ATTRS but has its own alias list and a fixed
           eol-type.  */
10067   if (NILP (eol_type))
10068     {
10069       eol_type = make_subsidiaries (name);
10070       for (i = 0; i < 3; i++)
10071         {
10072           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10073
10074           this_name = AREF (eol_type, i);
10075           this_aliases = Fcons (this_name, Qnil);
10076           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
                /* All three slots start out as ATTRS; slots 1 and 2 are
                   overwritten right below.  */
10077           this_spec = Fmake_vector (make_number (3), attrs);
10078           ASET (this_spec, 1, this_aliases);
10079           ASET (this_spec, 2, this_eol_type);
10080           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10081           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
                /* Add the name to the alist only if it is not there yet.  */
10082           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10083           if (NILP (val))
10084             Vcoding_system_alist
10085               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10086                        Vcoding_system_alist);
10087         }
10088     }
10089
        /* Build and register the spec [ ATTRS ALIASES EOL_TYPE ] of the
           coding system itself.  */
10090   spec_vec = Fmake_vector (make_number (3), attrs);
10091   ASET (spec_vec, 1, aliases);
10092   ASET (spec_vec, 2, eol_type);
10093
10094   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10095   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10096   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10097   if (NILP (val))
10098     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10099                                   Vcoding_system_alist);
10100
        /* Set up the category's entry from NAME when the category has no
           coding system yet (negative id) or when the one it has bears
           this same name.  */
10101   {
10102     int id = coding_categories[category].id;
10103
10104     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10105       setup_coding_system (name, &coding_categories[category]);
10106   }
10107
10108   return Qnil;
10109
        /* Reached when NARGS is too small for the coding type.  */
10110  short_args:
10111   return Fsignal (Qwrong_number_of_arguments,
10112                   Fcons (intern ("define-coding-system-internal"),
10113                          make_number (nargs)));
10114 }
10115
10116
10117 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10118        3, 3, 0,
10119        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10120   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10121 {
10122   Lisp_Object spec, attrs;
10123
10124   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10125   attrs = AREF (spec, 0);
10126   if (EQ (prop, QCmnemonic))
10127     {
10128       if (! STRINGP (val))
10129         CHECK_CHARACTER (val);
10130       CODING_ATTR_MNEMONIC (attrs) = val;
10131     }
10132   else if (EQ (prop, QCdefault_char))
10133     {
10134       if (NILP (val))
10135         val = make_number (' ');
10136       else
10137         CHECK_CHARACTER (val);
10138       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10139     }
10140   else if (EQ (prop, QCdecode_translation_table))
10141     {
10142       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10143         CHECK_SYMBOL (val);
10144       CODING_ATTR_DECODE_TBL (attrs) = val;
10145     }
10146   else if (EQ (prop, QCencode_translation_table))
10147     {
10148       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10149         CHECK_SYMBOL (val);
10150       CODING_ATTR_ENCODE_TBL (attrs) = val;
10151     }
10152   else if (EQ (prop, QCpost_read_conversion))
10153     {
10154       CHECK_SYMBOL (val);
10155       CODING_ATTR_POST_READ (attrs) = val;
10156     }
10157   else if (EQ (prop, QCpre_write_conversion))
10158     {
10159       CHECK_SYMBOL (val);
10160       CODING_ATTR_PRE_WRITE (attrs) = val;
10161     }
10162   else if (EQ (prop, QCascii_compatible_p))
10163     {
10164       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10165     }
10166
10167   CODING_ATTR_PLIST (attrs)
10168     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10169   return val;
10170 }
10171
10172
10173 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10174        Sdefine_coding_system_alias, 2, 2, 0,
10175        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10176   (Lisp_Object alias, Lisp_Object coding_system)
10177 {
10178   Lisp_Object spec, aliases, eol_type, val;
10179
10180   CHECK_SYMBOL (alias);
10181   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10182   aliases = AREF (spec, 1);
10183   /* ALIASES should be a list of length more than zero, and the first
10184      element is a base coding system.  Append ALIAS at the tail of the
10185      list.  */
10186   while (!NILP (XCDR (aliases)))
10187     aliases = XCDR (aliases);
10188   XSETCDR (aliases, Fcons (alias, Qnil));
10189
10190   eol_type = AREF (spec, 2);
10191   if (VECTORP (eol_type))
10192     {
10193       Lisp_Object subsidiaries;
10194       int i;
10195
10196       subsidiaries = make_subsidiaries (alias);
10197       for (i = 0; i < 3; i++)
10198         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10199                                      AREF (eol_type, i));
10200     }
10201
10202   Fputhash (alias, spec, Vcoding_system_hash_table);
10203   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10204   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10205   if (NILP (val))
10206     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10207                                   Vcoding_system_alist);
10208
10209   return Qnil;
10210 }
10211
10212 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10213        1, 1, 0,
10214        doc: /* Return the base of CODING-SYSTEM.
10215 Any alias or subsidiary coding system is not a base coding system.  */)
10216   (Lisp_Object coding_system)
10217 {
10218   Lisp_Object spec, attrs;
10219
10220   if (NILP (coding_system))
10221     return (Qno_conversion);
10222   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10223   attrs = AREF (spec, 0);
10224   return CODING_ATTR_BASE_NAME (attrs);
10225 }
10226
10227 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10228        1, 1, 0,
10229        doc: "Return the property list of CODING-SYSTEM.")
10230   (Lisp_Object coding_system)
10231 {
10232   Lisp_Object spec, attrs;
10233
10234   if (NILP (coding_system))
10235     coding_system = Qno_conversion;
10236   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10237   attrs = AREF (spec, 0);
10238   return CODING_ATTR_PLIST (attrs);
10239 }
10240
10241
10242 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10243        1, 1, 0,
10244        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10245   (Lisp_Object coding_system)
10246 {
10247   Lisp_Object spec;
10248
10249   if (NILP (coding_system))
10250     coding_system = Qno_conversion;
10251   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10252   return AREF (spec, 1);
10253 }
10254
10255 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10256        Scoding_system_eol_type, 1, 1, 0,
10257        doc: /* Return eol-type of CODING-SYSTEM.
10258 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10259
10260 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10261 and CR respectively.
10262
10263 A vector value indicates that a format of end-of-line should be
10264 detected automatically.  Nth element of the vector is the subsidiary
10265 coding system whose eol-type is N.  */)
10266   (Lisp_Object coding_system)
10267 {
10268   Lisp_Object spec, eol_type;
10269   int n;
10270
10271   if (NILP (coding_system))
10272     coding_system = Qno_conversion;
10273   if (! CODING_SYSTEM_P (coding_system))
10274     return Qnil;
10275   spec = CODING_SYSTEM_SPEC (coding_system);
10276   eol_type = AREF (spec, 2);
10277   if (VECTORP (eol_type))
10278     return Fcopy_sequence (eol_type);
10279   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10280   return make_number (n);
10281 }
10282
10283 #endif /* emacs */
10284
10285 \f
10286 /*** 12. Postamble ***/
10287
10288 void
10289 init_coding_once (void)
10290 {
10291   int i;
10292
10293   for (i = 0; i < coding_category_max; i++)
10294     {
10295       coding_categories[i].id = -1;
10296       coding_priorities[i] = i;
10297     }
10298
10299   /* ISO2022 specific initialize routine.  */
10300   for (i = 0; i < 0x20; i++)
10301     iso_code_class[i] = ISO_control_0;
10302   for (i = 0x21; i < 0x7F; i++)
10303     iso_code_class[i] = ISO_graphic_plane_0;
10304   for (i = 0x80; i < 0xA0; i++)
10305     iso_code_class[i] = ISO_control_1;
10306   for (i = 0xA1; i < 0xFF; i++)
10307     iso_code_class[i] = ISO_graphic_plane_1;
10308   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10309   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10310   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10311   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10312   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10313   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10314   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10315   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10316   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10317
10318   for (i = 0; i < 256; i++)
10319     {
10320       emacs_mule_bytes[i] = 1;
10321     }
10322   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10323   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10324   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10325   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10326 }
10327
10328 #ifdef emacs
10329
10330 void
10331 syms_of_coding (void)
10332 {
10333   staticpro (&Vcoding_system_hash_table);
10334   {
10335     Lisp_Object args[2];
10336     args[0] = QCtest;
10337     args[1] = Qeq;
10338     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10339   }
10340
10341   staticpro (&Vsjis_coding_system);
10342   Vsjis_coding_system = Qnil;
10343
10344   staticpro (&Vbig5_coding_system);
10345   Vbig5_coding_system = Qnil;
10346
10347   staticpro (&Vcode_conversion_reused_workbuf);
10348   Vcode_conversion_reused_workbuf = Qnil;
10349
10350   staticpro (&Vcode_conversion_workbuf_name);
10351   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10352
10353   reused_workbuf_in_use = 0;
10354
10355   DEFSYM (Qcharset, "charset");
10356   DEFSYM (Qtarget_idx, "target-idx");
10357   DEFSYM (Qcoding_system_history, "coding-system-history");
10358   Fset (Qcoding_system_history, Qnil);
10359
10360   /* Target FILENAME is the first argument.  */
10361   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10362   /* Target FILENAME is the third argument.  */
10363   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10364
10365   DEFSYM (Qcall_process, "call-process");
10366   /* Target PROGRAM is the first argument.  */
10367   Fput (Qcall_process, Qtarget_idx, make_number (0));
10368
10369   DEFSYM (Qcall_process_region, "call-process-region");
10370   /* Target PROGRAM is the third argument.  */
10371   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10372
10373   DEFSYM (Qstart_process, "start-process");
10374   /* Target PROGRAM is the third argument.  */
10375   Fput (Qstart_process, Qtarget_idx, make_number (2));
10376
10377   DEFSYM (Qopen_network_stream, "open-network-stream");
10378   /* Target SERVICE is the fourth argument.  */
10379   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10380
10381   DEFSYM (Qcoding_system, "coding-system");
10382   DEFSYM (Qcoding_aliases, "coding-aliases");
10383
10384   DEFSYM (Qeol_type, "eol-type");
10385   DEFSYM (Qunix, "unix");
10386   DEFSYM (Qdos, "dos");
10387
10388   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10389   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10390   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10391   DEFSYM (Qdefault_char, "default-char");
10392   DEFSYM (Qundecided, "undecided");
10393   DEFSYM (Qno_conversion, "no-conversion");
10394   DEFSYM (Qraw_text, "raw-text");
10395
10396   DEFSYM (Qiso_2022, "iso-2022");
10397
10398   DEFSYM (Qutf_8, "utf-8");
10399   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10400
10401   DEFSYM (Qutf_16, "utf-16");
10402   DEFSYM (Qbig, "big");
10403   DEFSYM (Qlittle, "little");
10404
10405   DEFSYM (Qshift_jis, "shift-jis");
10406   DEFSYM (Qbig5, "big5");
10407
10408   DEFSYM (Qcoding_system_p, "coding-system-p");
10409
10410   DEFSYM (Qcoding_system_error, "coding-system-error");
10411   Fput (Qcoding_system_error, Qerror_conditions,
10412         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10413   Fput (Qcoding_system_error, Qerror_message,
10414         make_pure_c_string ("Invalid coding system"));
10415
10416   /* Intern this now in case it isn't already done.
10417      Setting this variable twice is harmless.
10418      But don't staticpro it here--that is done in alloc.c.  */
10419   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10420
10421   DEFSYM (Qtranslation_table, "translation-table");
10422   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10423   DEFSYM (Qtranslation_table_id, "translation-table-id");
10424   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10425   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10426
10427   DEFSYM (Qvalid_codes, "valid-codes");
10428
10429   DEFSYM (Qemacs_mule, "emacs-mule");
10430
10431   DEFSYM (QCcategory, ":category");
10432   DEFSYM (QCmnemonic, ":mnemonic");
10433   DEFSYM (QCdefault_char, ":default-char");
10434   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10435   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10436   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10437   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10438   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10439
10440   Vcoding_category_table
10441     = Fmake_vector (make_number (coding_category_max), Qnil);
10442   staticpro (&Vcoding_category_table);
10443   /* Followings are target of code detection.  */
10444   ASET (Vcoding_category_table, coding_category_iso_7,
10445         intern_c_string ("coding-category-iso-7"));
10446   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10447         intern_c_string ("coding-category-iso-7-tight"));
10448   ASET (Vcoding_category_table, coding_category_iso_8_1,
10449         intern_c_string ("coding-category-iso-8-1"));
10450   ASET (Vcoding_category_table, coding_category_iso_8_2,
10451         intern_c_string ("coding-category-iso-8-2"));
10452   ASET (Vcoding_category_table, coding_category_iso_7_else,
10453         intern_c_string ("coding-category-iso-7-else"));
10454   ASET (Vcoding_category_table, coding_category_iso_8_else,
10455         intern_c_string ("coding-category-iso-8-else"));
10456   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10457         intern_c_string ("coding-category-utf-8-auto"));
10458   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10459         intern_c_string ("coding-category-utf-8"));
10460   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10461         intern_c_string ("coding-category-utf-8-sig"));
10462   ASET (Vcoding_category_table, coding_category_utf_16_be,
10463         intern_c_string ("coding-category-utf-16-be"));
10464   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10465         intern_c_string ("coding-category-utf-16-auto"));
10466   ASET (Vcoding_category_table, coding_category_utf_16_le,
10467         intern_c_string ("coding-category-utf-16-le"));
10468   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10469         intern_c_string ("coding-category-utf-16-be-nosig"));
10470   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10471         intern_c_string ("coding-category-utf-16-le-nosig"));
10472   ASET (Vcoding_category_table, coding_category_charset,
10473         intern_c_string ("coding-category-charset"));
10474   ASET (Vcoding_category_table, coding_category_sjis,
10475         intern_c_string ("coding-category-sjis"));
10476   ASET (Vcoding_category_table, coding_category_big5,
10477         intern_c_string ("coding-category-big5"));
10478   ASET (Vcoding_category_table, coding_category_ccl,
10479         intern_c_string ("coding-category-ccl"));
10480   ASET (Vcoding_category_table, coding_category_emacs_mule,
10481         intern_c_string ("coding-category-emacs-mule"));
10482   /* Followings are NOT target of code detection.  */
10483   ASET (Vcoding_category_table, coding_category_raw_text,
10484         intern_c_string ("coding-category-raw-text"));
10485   ASET (Vcoding_category_table, coding_category_undecided,
10486         intern_c_string ("coding-category-undecided"));
10487
10488   DEFSYM (Qinsufficient_source, "insufficient-source");
10489   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10490   DEFSYM (Qinvalid_source, "invalid-source");
10491   DEFSYM (Qinterrupted, "interrupted");
10492   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10493   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10494
10495   defsubr (&Scoding_system_p);
10496   defsubr (&Sread_coding_system);
10497   defsubr (&Sread_non_nil_coding_system);
10498   defsubr (&Scheck_coding_system);
10499   defsubr (&Sdetect_coding_region);
10500   defsubr (&Sdetect_coding_string);
10501   defsubr (&Sfind_coding_systems_region_internal);
10502   defsubr (&Sunencodable_char_position);
10503   defsubr (&Scheck_coding_systems_region);
10504   defsubr (&Sdecode_coding_region);
10505   defsubr (&Sencode_coding_region);
10506   defsubr (&Sdecode_coding_string);
10507   defsubr (&Sencode_coding_string);
10508   defsubr (&Sdecode_sjis_char);
10509   defsubr (&Sencode_sjis_char);
10510   defsubr (&Sdecode_big5_char);
10511   defsubr (&Sencode_big5_char);
10512   defsubr (&Sset_terminal_coding_system_internal);
10513   defsubr (&Sset_safe_terminal_coding_system_internal);
10514   defsubr (&Sterminal_coding_system);
10515   defsubr (&Sset_keyboard_coding_system_internal);
10516   defsubr (&Skeyboard_coding_system);
10517   defsubr (&Sfind_operation_coding_system);
10518   defsubr (&Sset_coding_system_priority);
10519   defsubr (&Sdefine_coding_system_internal);
10520   defsubr (&Sdefine_coding_system_alias);
10521   defsubr (&Scoding_system_put);
10522   defsubr (&Scoding_system_base);
10523   defsubr (&Scoding_system_plist);
10524   defsubr (&Scoding_system_aliases);
10525   defsubr (&Scoding_system_eol_type);
10526   defsubr (&Scoding_system_priority_list);
10527
10528   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10529                doc: /* List of coding systems.
10530
10531 Do not alter the value of this variable manually.  This variable should be
10532 updated by the functions `define-coding-system' and
10533 `define-coding-system-alias'.  */);
10534   Vcoding_system_list = Qnil;
10535
10536   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10537                doc: /* Alist of coding system names.
10538 Each element is one element list of coding system name.
10539 This variable is given to `completing-read' as COLLECTION argument.
10540
10541 Do not alter the value of this variable manually.  This variable should be
10542 updated by the functions `make-coding-system' and
10543 `define-coding-system-alias'.  */);
10544   Vcoding_system_alist = Qnil;
10545
10546   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10547                doc: /* List of coding-categories (symbols) ordered by priority.
10548
10549 On detecting a coding system, Emacs tries code detection algorithms
10550 associated with each coding-category one by one in this order.  When
10551 one algorithm agrees with a byte sequence of source text, the coding
10552 system bound to the corresponding coding-category is selected.
10553
10554 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10555   {
10556     int i;
10557
10558     Vcoding_category_list = Qnil;
10559     for (i = coding_category_max - 1; i >= 0; i--)
10560       Vcoding_category_list
10561         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10562                  Vcoding_category_list);
10563   }
10564
10565   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10566                doc: /* Specify the coding system for read operations.
10567 It is useful to bind this variable with `let', but do not set it globally.
10568 If the value is a coding system, it is used for decoding on read operation.
10569 If not, an appropriate element is used from one of the coding system alists.
10570 There are three such tables: `file-coding-system-alist',
10571 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10572   Vcoding_system_for_read = Qnil;
10573
10574   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10575                doc: /* Specify the coding system for write operations.
10576 Programs bind this variable with `let', but you should not set it globally.
10577 If the value is a coding system, it is used for encoding of output,
10578 when writing it to a file and when sending it to a file or subprocess.
10579
10580 If this does not specify a coding system, an appropriate element
10581 is used from one of the coding system alists.
10582 There are three such tables: `file-coding-system-alist',
10583 `process-coding-system-alist', and `network-coding-system-alist'.
10584 For output to files, if the above procedure does not specify a coding system,
10585 the value of `buffer-file-coding-system' is used.  */);
10586   Vcoding_system_for_write = Qnil;
10587
10588   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10589                doc: /*
10590 Coding system used in the latest file or process I/O.  */);
10591   Vlast_coding_system_used = Qnil;
10592
10593   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10594                doc: /*
10595 Error status of the last code conversion.
10596
10597 When an error was detected in the last code conversion, this variable
10598 is set to one of the following symbols.
10599   `insufficient-source'
10600   `inconsistent-eol'
10601   `invalid-source'
10602   `interrupted'
10603   `insufficient-memory'
10604 When no error was detected, the value doesn't change.  So, to check
10605 the error status of a code conversion by this variable, you must
10606 explicitly set this variable to nil before performing code
10607 conversion.  */);
10608   Vlast_code_conversion_error = Qnil;
10609
10610   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10611                doc: /*
10612 *Non-nil means always inhibit code conversion of end-of-line format.
10613 See info node `Coding Systems' and info node `Text and Binary' concerning
10614 such conversion.  */);
10615   inhibit_eol_conversion = 0;
10616
10617   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10618                doc: /*
10619 Non-nil means process buffer inherits coding system of process output.
10620 Bind it to t if the process output is to be treated as if it were a file
10621 read from some filesystem.  */);
10622   inherit_process_coding_system = 0;
10623
10624   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10625                doc: /*
10626 Alist to decide a coding system to use for a file I/O operation.
10627 The format is ((PATTERN . VAL) ...),
10628 where PATTERN is a regular expression matching a file name,
10629 VAL is a coding system, a cons of coding systems, or a function symbol.
10630 If VAL is a coding system, it is used for both decoding and encoding
10631 the file contents.
10632 If VAL is a cons of coding systems, the car part is used for decoding,
10633 and the cdr part is used for encoding.
10634 If VAL is a function symbol, the function must return a coding system
10635 or a cons of coding systems which are used as above.  The function is
10636 called with an argument that is a list of the arguments with which
10637 `find-operation-coding-system' was called.  If the function can't decide
10638 a coding system, it can return `undecided' so that the normal
10639 code-detection is performed.
10640
10641 See also the function `find-operation-coding-system'
10642 and the variable `auto-coding-alist'.  */);
10643   Vfile_coding_system_alist = Qnil;
10644
10645   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10646                doc: /*
10647 Alist to decide a coding system to use for a process I/O operation.
10648 The format is ((PATTERN . VAL) ...),
10649 where PATTERN is a regular expression matching a program name,
10650 VAL is a coding system, a cons of coding systems, or a function symbol.
10651 If VAL is a coding system, it is used for both decoding what received
10652 from the program and encoding what sent to the program.
10653 If VAL is a cons of coding systems, the car part is used for decoding,
10654 and the cdr part is used for encoding.
10655 If VAL is a function symbol, the function must return a coding system
10656 or a cons of coding systems which are used as above.
10657
10658 See also the function `find-operation-coding-system'.  */);
10659   Vprocess_coding_system_alist = Qnil;
10660
10661   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10662                doc: /*
10663 Alist to decide a coding system to use for a network I/O operation.
10664 The format is ((PATTERN . VAL) ...),
10665 where PATTERN is a regular expression matching a network service name
10666 or is a port number to connect to,
10667 VAL is a coding system, a cons of coding systems, or a function symbol.
10668 If VAL is a coding system, it is used for both decoding what received
10669 from the network stream and encoding what sent to the network stream.
10670 If VAL is a cons of coding systems, the car part is used for decoding,
10671 and the cdr part is used for encoding.
10672 If VAL is a function symbol, the function must return a coding system
10673 or a cons of coding systems which are used as above.
10674
10675 See also the function `find-operation-coding-system'.  */);
10676   Vnetwork_coding_system_alist = Qnil;
10677
10678   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10679                doc: /* Coding system to use with system messages.
10680 Also used for decoding keyboard input on X Window system.  */);
10681   Vlocale_coding_system = Qnil;
10682
10683   /* The eol mnemonics are reset in startup.el system-dependently.  */
10684   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10685                doc: /*
10686 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10687   eol_mnemonic_unix = make_pure_c_string (":");
10688
10689   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10690                doc: /*
10691 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10692   eol_mnemonic_dos = make_pure_c_string ("\\");
10693
10694   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10695                doc: /*
10696 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10697   eol_mnemonic_mac = make_pure_c_string ("/");
10698
10699   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10700                doc: /*
10701 *String displayed in mode line when end-of-line format is not yet determined.  */);
10702   eol_mnemonic_undecided = make_pure_c_string (":");
10703
10704   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10705                doc: /*
10706 *Non-nil enables character translation while encoding and decoding.  */);
10707   Venable_character_translation = Qt;
10708
10709   DEFVAR_LISP ("standard-translation-table-for-decode",
10710                &Vstandard_translation_table_for_decode,
10711                doc: /* Table for translating characters while decoding.  */);
10712   Vstandard_translation_table_for_decode = Qnil;
10713
10714   DEFVAR_LISP ("standard-translation-table-for-encode",
10715                &Vstandard_translation_table_for_encode,
10716                doc: /* Table for translating characters while encoding.  */);
10717   Vstandard_translation_table_for_encode = Qnil;
10718
10719   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10720                doc: /* Alist of charsets vs revision numbers.
10721 While encoding, if a charset (car part of an element) is found,
10722 designate it with the escape sequence identifying revision (cdr part
10723 of the element).  */);
10724   Vcharset_revision_table = Qnil;
10725
10726   DEFVAR_LISP ("default-process-coding-system",
10727                &Vdefault_process_coding_system,
10728                doc: /* Cons of coding systems used for process I/O by default.
10729 The car part is used for decoding a process output,
10730 the cdr part is used for encoding a text to be sent to a process.  */);
10731   Vdefault_process_coding_system = Qnil;
10732
10733   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10734                doc: /*
10735 Table of extra Latin codes in the range 128..159 (inclusive).
10736 This is a vector of length 256.
10737 If Nth element is non-nil, the existence of code N in a file
10738 \(or output of subprocess) doesn't prevent it to be detected as
10739 a coding system of ISO 2022 variant which has a flag
10740 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10741 or reading output of a subprocess.
10742 Only 128th through 159th elements have a meaning.  */);
10743   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10744
10745   DEFVAR_LISP ("select-safe-coding-system-function",
10746                &Vselect_safe_coding_system_function,
10747                doc: /*
10748 Function to call to select safe coding system for encoding a text.
10749
10750 If set, this function is called to force a user to select a proper
10751 coding system which can encode the text in the case that a default
10752 coding system used in each operation can't encode the text.  The
10753 function should take care that the buffer is not modified while
10754 the coding system is being selected.
10755
10756 The default value is `select-safe-coding-system' (which see).  */);
10757   Vselect_safe_coding_system_function = Qnil;
10758
10759   DEFVAR_BOOL ("coding-system-require-warning",
10760                &coding_system_require_warning,
10761                doc: /* Internal use only.
10762 If non-nil, on writing a file, `select-safe-coding-system-function' is
10763 called even if `coding-system-for-write' is non-nil.  The command
10764 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10765   coding_system_require_warning = 0;
10766
10767
10768   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10769                &inhibit_iso_escape_detection,
10770                doc: /*
10771 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10772
10773 When Emacs reads text, it tries to detect how the text is encoded.
10774 This code detection is sensitive to escape sequences.  If Emacs sees
10775 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10776 of the ISO2022 encodings, and decodes text by the corresponding coding
10777 system (e.g. `iso-2022-7bit').
10778
10779 However, there may be a case that you want to read escape sequences in
10780 a file as is.  In such a case, you can set this variable to non-nil.
10781 Then the code detection will ignore any escape sequences, and no text is
10782 detected as encoded in some ISO-2022 encoding.  The result is that all
10783 escape sequences become visible in a buffer.
10784
10785 The default value is nil, and it is strongly recommended not to change
10786 it.  That is because many Emacs Lisp source files that contain
10787 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10788 in Emacs's distribution, and they won't be decoded correctly on
10789 reading if you suppress escape sequence detection.
10790
10791 The other way to read escape sequences in a file without decoding is
10792 to explicitly specify some coding system that doesn't use ISO-2022
10793 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10794   inhibit_iso_escape_detection = 0;
10795
10796   DEFVAR_BOOL ("inhibit-null-byte-detection",
10797                &inhibit_null_byte_detection,
10798                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10799 By default, Emacs treats it as binary data, and does not attempt to
10800 decode it.  The effect is as if you specified `no-conversion' for
10801 reading that text.
10802
10803 Set this to non-nil when a regular text happens to include null bytes.
10804 Examples are Index nodes of Info files and null-byte delimited output
10805 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10806 decode text as usual.  */);
10807   inhibit_null_byte_detection = 0;
10808
10809   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10810                doc: /* Char table for translating self-inserting characters.
10811 This is applied to the result of input methods, not their input.
10812 See also `keyboard-translate-table'.
10813
10814 Use of this variable for character code unification was rendered
10815 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10816 internal character representation.  */);
10817     Vtranslation_table_for_input = Qnil;
10818
10819   {
10820     Lisp_Object args[coding_arg_max];
10821     Lisp_Object plist[16];
10822     int i;
10823
10824     for (i = 0; i < coding_arg_max; i++)
10825       args[i] = Qnil;
10826
10827     plist[0] = intern_c_string (":name");
10828     plist[1] = args[coding_arg_name] = Qno_conversion;
10829     plist[2] = intern_c_string (":mnemonic");
10830     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10831     plist[4] = intern_c_string (":coding-type");
10832     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10833     plist[6] = intern_c_string (":ascii-compatible-p");
10834     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10835     plist[8] = intern_c_string (":default-char");
10836     plist[9] = args[coding_arg_default_char] = make_number (0);
10837     plist[10] = intern_c_string (":for-unibyte");
10838     plist[11] = args[coding_arg_for_unibyte] = Qt;
10839     plist[12] = intern_c_string (":docstring");
10840     plist[13] = make_pure_c_string ("Do no conversion.\n\
10841 \n\
10842 When you visit a file with this coding, the file is read into a\n\
10843 unibyte buffer as is, thus each byte of a file is treated as a\n\
10844 character.");
10845     plist[14] = intern_c_string (":eol-type");
10846     plist[15] = args[coding_arg_eol_type] = Qunix;
10847     args[coding_arg_plist] = Flist (16, plist);
10848     Fdefine_coding_system_internal (coding_arg_max, args);
10849
10850     plist[1] = args[coding_arg_name] = Qundecided;
10851     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10852     plist[5] = args[coding_arg_coding_type] = Qundecided;
10853     /* This is already set.
10854        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10855     plist[8] = intern_c_string (":charset-list");
10856     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10857     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10858     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10859     plist[15] = args[coding_arg_eol_type] = Qnil;
10860     args[coding_arg_plist] = Flist (16, plist);
10861     Fdefine_coding_system_internal (coding_arg_max, args);
10862   }
10863
10864   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10865
10866   {
10867     int i;
10868
10869     for (i = 0; i < coding_category_max; i++)
10870       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10871   }
10872 #if defined (DOS_NT)
10873   system_eol_type = Qdos;
10874 #else
10875   system_eol_type = Qunix;
10876 #endif
10877   staticpro (&system_eol_type);
10878 }
10879
10880 char *
10881 emacs_strerror (int error_number)
10882 {
10883   char *str;
10884
10885   synchronize_system_messages_locale ();
10886   str = strerror (error_number);
10887
10888   if (! NILP (Vlocale_coding_system))
10889     {
10890       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10891                                                       Vlocale_coding_system,
10892                                                       0);
10893       str = (char *) SDATA (dec);
10894     }
10895
10896   return str;
10897 }
10898
10899 #endif /* emacs */