src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008, 2009 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008, 2009
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software: you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation, either version 3 of the License, or
  17 (at your option) any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  60   C level, a coding system is represented by a vector of attributes
   61   stored in the hash table Vcoding_system_hash_table.  The conversion
   62   from coding system symbol to attributes vector is done by looking
   63   up Vcoding_system_hash_table by the symbol.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we setup a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (coding, detect_info)
 158      struct coding_system *coding;
 159      struct coding_detection_info *detect_info;
 160 {
 161   const unsigned char *src = coding->source;
 162   const unsigned char *src_end = coding->source + coding->src_bytes;
 163   int multibytep = coding->src_multibyte;
 164   int consumed_chars = 0;
 165   int found = 0;
 166   ...;
 167
 168   while (1)
 169     {
 170       /* Get one byte from the source.  If the souce is exausted, jump
 171          to no_more_source:.  */
 172       ONE_MORE_BYTE (c);
 173
 174       if (! __C_conforms_to_XXX___ (c))
 175         break;
 176       if (! __C_strongly_suggests_XXX__ (c))
 177         found = CATEGORY_MASK_XXX;
 178     }
 179   /* The byte sequence is invalid for XXX.  */
 180   detect_info->rejected |= CATEGORY_MASK_XXX;
 181   return 0;
 182
 183  no_more_source:
 184   /* The source exausted successfully.  */
 185   detect_info->found |= found;
 186   return 1;
 187 }
 188 #endif
 189
 190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 191
 192   These functions decode a byte sequence specified as a source by
 193   CODING.  The resulting multibyte text goes to a place pointed to by
 194   CODING->charbuf, the length of which should not exceed
  195   CODING->charbuf_size.
 196
 197   These functions set the information of original and decoded texts in
 198   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 199   They also set CODING->result to one of CODING_RESULT_XXX indicating
 200   how the decoding is finished.
 201
 202   Below is the template of these functions.  */
 203
 204 #if 0
 205 static void
 206 decode_coding_XXXX (coding)
 207      struct coding_system *coding;
 208 {
 209   const unsigned char *src = coding->source + coding->consumed;
 210   const unsigned char *src_end = coding->source + coding->src_bytes;
 211   /* SRC_BASE remembers the start position in source in each loop.
 212      The loop will be exited when there's not enough source code, or
 213      when there's no room in CHARBUF for a decoded character.  */
 214   const unsigned char *src_base;
 215   /* A buffer to produce decoded characters.  */
 216   int *charbuf = coding->charbuf + coding->charbuf_used;
 217   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 218   int multibytep = coding->src_multibyte;
 219
 220   while (1)
 221     {
 222       src_base = src;
 223       if (charbuf < charbuf_end)
 224         /* No more room to produce a decoded character.  */
 225         break;
 226       ONE_MORE_BYTE (c);
 227       /* Decode it. */
 228     }
 229
 230  no_more_source:
 231   if (src_base < src_end
 232       && coding->mode & CODING_MODE_LAST_BLOCK)
 233     /* If the source ends by partial bytes to construct a character,
 234        treat them as eight-bit raw data.  */
 235     while (src_base < src_end && charbuf < charbuf_end)
 236       *charbuf++ = *src_base++;
 237   /* Remember how many bytes and characters we consumed.  If the
 238      source is multibyte, the bytes and chars are not identical.  */
 239   coding->consumed = coding->consumed_char = src_base - coding->source;
 240   /* Remember how many characters we produced.  */
 241   coding->charbuf_used = charbuf - coding->charbuf;
 242 }
 243 #endif
 244
 245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 246
 247   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 248   internal multibyte format by CODING.  The resulting byte sequence
 249   goes to a place pointed to by DESTINATION, the length of which
 250   should not exceed DST_BYTES.
 251
 252   These functions set the information of original and encoded texts in
 253   the members produced, produced_char, consumed, and consumed_char of
 254   the structure *CODING.  They also set the member result to one of
 255   CODING_RESULT_XXX indicating how the encoding finished.
 256
  257   DST_BYTES zero means that the source area and destination area
  258   overlap, which means that we can produce encoded text only until it
  259   reaches the head of the not-yet-encoded source text.
 260
 261   Below is a template of these functions.  */
 262 #if 0
 263 static void
 264 encode_coding_XXX (coding)
 265      struct coding_system *coding;
 266 {
 267   int multibytep = coding->dst_multibyte;
 268   int *charbuf = coding->charbuf;
 269   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 270   unsigned char *dst = coding->destination + coding->produced;
 271   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 272   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 273   int produced_chars = 0;
 274
 275   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 276     {
 277       int c = *charbuf;
 278       /* Encode C into DST, and increment DST.  */
 279     }
 280  label_no_more_destination:
 281   /* How many chars and bytes we produced.  */
 282   coding->produced_char += produced_chars;
 283   coding->produced = dst - coding->destination;
 284 }
 285 #endif
 286
 287 \f
 288 /*** 1. Preamble ***/
 289
 290 #include <config.h>
 291 #include <stdio.h>
 292
 293 #include "lisp.h"
 294 #include "buffer.h"
 295 #include "character.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
  304 Lisp_Object Vcoding_system_hash_table;  /* presumably symbol -> attribute vector; see section 0 -- TODO confirm */
  305
  306 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
  307 Lisp_Object Qunix, Qdos;
  308 extern Lisp_Object Qmac;        /* frame.c */
  309 Lisp_Object Qbuffer_file_coding_system;
  310 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  311 Lisp_Object Qdefault_char;
  312 Lisp_Object Qno_conversion, Qundecided;
  313 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
  314 Lisp_Object Qbig, Qlittle;
  315 Lisp_Object Qcoding_system_history;
  316 Lisp_Object Qvalid_codes;
  317 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
  318 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
  319 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
  320 Lisp_Object QCascii_compatible_p;
  321
  322 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
  323 Lisp_Object Qcall_process, Qcall_process_region;
  324 Lisp_Object Qstart_process, Qopen_network_stream;
  325 Lisp_Object Qtarget_idx;
  326 /* NOTE(review): these names parallel the CODING_RESULT_XXX codes -- confirm.  */
  327 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
  328 Lisp_Object Qinterrupted, Qinsufficient_memory;
  329
  330 extern Lisp_Object Qcompletion_ignore_case;
  331
  332 /* If a symbol has this property, evaluate the value to define the
  333    symbol as a coding system.  */
  334 static Lisp_Object Qcoding_system_define_form;
  335
  336 int coding_system_require_warning;
  337
  338 Lisp_Object Vselect_safe_coding_system_function;
  339
  340 /* Mnemonic string for each format of end-of-line.  */
  341 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
  342 /* Mnemonic string to indicate that the format of end-of-line is not
  343    yet decided.  */
  344 Lisp_Object eol_mnemonic_undecided;
  345
  346 /* Format of end-of-line decided by the system.  This is Qunix on
  347    Unix and Mac, Qdos on DOS/Windows.
  348    This has an effect only for external encoding (i.e. for output to
  349    file and process), not for in-buffer or Lisp string encoding.  */
  350 static Lisp_Object system_eol_type;
 351
  352 #ifdef emacs
  353
  354 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
  355
  356 Lisp_Object Qcoding_system_p, Qcoding_system_error;
  357
  358 /* Coding systems emacs-mule and raw-text are for converting only
  359    end-of-line format.  */
  360 Lisp_Object Qemacs_mule, Qraw_text;
  361 Lisp_Object Qutf_8_emacs;
  362
  363 /* Coding systems are passed between Emacs Lisp programs and C internal
  364    routines by the following three variables.  */
  365 /* Coding system for reading files and receiving data from a process.  */
  366 Lisp_Object Vcoding_system_for_read;
  367 /* Coding system for writing files and sending data to a process.  */
  368 Lisp_Object Vcoding_system_for_write;
  369 /* Coding system actually used in the latest I/O.  */
  370 Lisp_Object Vlast_coding_system_used;
  371 /* Set to non-nil when an error is detected during code conversion.  */
  372 Lisp_Object Vlast_code_conversion_error;
  373 /* A vector of length 256 which contains information about special
  374    Latin codes (especially for dealing with Microsoft codes).  */
  375 Lisp_Object Vlatin_extra_code_table;
  376
  377 /* Flag to inhibit code conversion of end-of-line format.  */
  378 int inhibit_eol_conversion;
  379
  380 /* Flag to inhibit ISO2022 escape sequence detection.  */
  381 int inhibit_iso_escape_detection;
  382
  383 /* Flag to inhibit detection of binary files through null bytes.  */
  384 int inhibit_null_byte_detection;
  385
  386 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
  387 int inherit_process_coding_system;
  388
  389 /* Coding system to be used to encode text for terminal display when
  390    the terminal coding system is nil.  */
  391 struct coding_system safe_terminal_coding;
  392
  393 Lisp_Object Vfile_coding_system_alist;
  394 Lisp_Object Vprocess_coding_system_alist;
  395 Lisp_Object Vnetwork_coding_system_alist;
  396
  397 Lisp_Object Vlocale_coding_system;
  398
  399 #endif /* emacs */
 400
  401 /* Flag to tell whether we look up a translation table on character
  402    code conversion.  */
  403 Lisp_Object Venable_character_translation;
  404 /* Standard translation table to look up on decoding (reading).  */
  405 Lisp_Object Vstandard_translation_table_for_decode;
  406 /* Standard translation table to look up on encoding (writing).  */
  407 Lisp_Object Vstandard_translation_table_for_encode;
  408
  409 Lisp_Object Qtranslation_table;
  410 Lisp_Object Qtranslation_table_id;
  411 Lisp_Object Qtranslation_table_for_decode;
  412 Lisp_Object Qtranslation_table_for_encode;
  413
  414 /* Alist pairing charsets with revision numbers.  */
  415 static Lisp_Object Vcharset_revision_table;
  416
  417 /* Default coding systems used for process I/O.  */
  418 Lisp_Object Vdefault_process_coding_system;
  419
  420 /* Char table for translating Quail and self-inserting input.  */
  421 Lisp_Object Vtranslation_table_for_input;
  422
  423 /* Two special coding systems: one for SJIS, one for big5.  */
  424 Lisp_Object Vsjis_coding_system;
  425 Lisp_Object Vbig5_coding_system;
 426
  427 /* ISO2022 section */
  428 /* Integer at index REG of CODING's coding_attr_iso_initial attribute.  */
  429 #define CODING_ISO_INITIAL(coding, reg)                 \
  430   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
  431                      coding_attr_iso_initial),          \
  432                reg)))
  433 /* CODING's safe_charsets entry for CHARSET_ID, or -1 if CHARSET_ID is
  434    above CODING's max_charset_id or the entry is 255.  */
  435 #define CODING_ISO_REQUEST(coding, charset_id)          \
  436   (((charset_id) <= (coding)->max_charset_id            \
  437     ? ((coding)->safe_charsets[charset_id] != 255       \
  438        ? (coding)->safe_charsets[charset_id]            \
  439        : -1)                                            \
  440     : -1))
  441
  442 /* Accessors for the members of CODING's spec.iso_2022.  */
  443 #define CODING_ISO_FLAGS(coding)        \
  444   ((coding)->spec.iso_2022.flags)
  445 #define CODING_ISO_DESIGNATION(coding, reg)     \
  446   ((coding)->spec.iso_2022.current_designation[reg])
  447 #define CODING_ISO_INVOCATION(coding, plane)    \
  448   ((coding)->spec.iso_2022.current_invocation[plane])
  449 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
  450   ((coding)->spec.iso_2022.single_shifting)
  451 #define CODING_ISO_BOL(coding)  \
  452   ((coding)->spec.iso_2022.bol)
  453 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  454   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
  455 #define CODING_ISO_CMP_STATUS(coding)   \
  456   (&(coding)->spec.iso_2022.cmp_status)
  457 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
  458   ((coding)->spec.iso_2022.ctext_extended_segment_len)
  459 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
  460   ((coding)->spec.iso_2022.embedded_utf_8)
 461
  462 /* Control characters of ISO2022.  */
  463                         /* code */      /* function */
  464 #define ISO_CODE_LF     0x0A            /* line-feed */
  465 #define ISO_CODE_CR     0x0D            /* carriage-return */
  466 #define ISO_CODE_SO     0x0E            /* shift-out */
  467 #define ISO_CODE_SI     0x0F            /* shift-in */
  468 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
  469 #define ISO_CODE_ESC    0x1B            /* escape */
  470 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
  471 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
  472 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
  473
  474 /* Every 1-byte code of ISO2022 is classified into one of the
  475    following.  */
  476 enum iso_code_class_type
  477   {
  478     ISO_control_0,              /* Control codes in the range
  479                                    0x00..0x1F and 0x7F, except for the
  480                                    following 4 codes.  */
  481     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  482     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  483     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  484     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  485     ISO_control_1,              /* Control codes in the range
  486                                    0x80..0x9F, except for the
  487                                    following 3 codes.  */
  488     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  489     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  490     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  491     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  492     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  493     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  494     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  495   };
 496
  497 /** The macros CODING_ISO_FLAG_XXX each define a flag bit of the
  498     `iso-flags' attribute of an iso2022 coding system.  */
  499
  500 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
  501    instead of the correct short-form sequence (e.g. ESC $ A).  */
  502 #define CODING_ISO_FLAG_LONG_FORM       0x0001
  503
  504 /* If set, reset graphic planes and registers at end-of-line to the
  505    initial state.  */
  506 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
  507
  508 /* If set, reset graphic planes and registers before any control
  509    characters to the initial state.  */
  510 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
  511
  512 /* If set, encode by 7-bit environment.  */
  513 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
  514
  515 /* If set, use locking-shift function.  */
  516 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
  517
  518 /* If set, use single-shift function.  Overrides
  519    CODING_ISO_FLAG_LOCKING_SHIFT.  */
  520 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
  521
  522 /* If set, use designation escape sequence.  */
  523 #define CODING_ISO_FLAG_DESIGNATION     0x0040
  524
  525 /* If set, produce revision number sequence.  */
  526 #define CODING_ISO_FLAG_REVISION        0x0080
  527
  528 /* If set, produce ISO6429's direction specifying sequence.  */
  529 #define CODING_ISO_FLAG_DIRECTION       0x0100
  530
  531 /* If set, assume designation states are reset at beginning of line on
  532    output.  */
  533 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
  534
  535 /* If set, designation sequence should be placed at beginning of line
  536    on output.  */
  537 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
  538
  539 /* If set, do not encode unsafe characters on output.  */
  540 #define CODING_ISO_FLAG_SAFE            0x0800
  541
  542 /* If set, extra latin codes (128..159) are accepted as a valid code
  543    on input.  */
  544 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
  545
  546 #define CODING_ISO_FLAG_COMPOSITION     0x2000
  547
  548 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
  549
  550 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
  551
  552 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
  553
  554 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
  555
  556 /* A character to be produced on output if encoding of the original
  557    character is prohibited by CODING_ISO_FLAG_SAFE.  */
  558 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 559
  560 /* UTF-8 section: accessor for CODING's spec.utf_8_bom.  */
  561 #define CODING_UTF_8_BOM(coding)        \
  562   ((coding)->spec.utf_8_bom)
  563
  564 /* UTF-16 section: accessors for the members of CODING's spec.utf_16.  */
  565 #define CODING_UTF_16_BOM(coding)       \
  566   ((coding)->spec.utf_16.bom)
  567
  568 #define CODING_UTF_16_ENDIAN(coding)    \
  569   ((coding)->spec.utf_16.endian)
  570
  571 #define CODING_UTF_16_SURROGATE(coding) \
  572   ((coding)->spec.utf_16.surrogate)
  573
  574
  575 /* CCL section: accessors for the CCL attributes of CODING's id.  */
  576 #define CODING_CCL_DECODER(coding)      \
  577   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
  578 #define CODING_CCL_ENCODER(coding)      \
  579   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
  580 #define CODING_CCL_VALIDS(coding)                                          \
  581   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 582
  583 /* Index for each coding category in `coding_categories'.  Each value
  584    is also the bit position of its CATEGORY_MASK_XXX flag.  */
  585 enum coding_category
  586   {
  587     coding_category_iso_7,
  588     coding_category_iso_7_tight,
  589     coding_category_iso_8_1,
  590     coding_category_iso_8_2,
  591     coding_category_iso_7_else,
  592     coding_category_iso_8_else,
  593     coding_category_utf_8_auto,
  594     coding_category_utf_8_nosig,
  595     coding_category_utf_8_sig,
  596     coding_category_utf_16_auto,
  597     coding_category_utf_16_be,
  598     coding_category_utf_16_le,
  599     coding_category_utf_16_be_nosig,
  600     coding_category_utf_16_le_nosig,
  601     coding_category_charset,
  602     coding_category_sjis,
  603     coding_category_big5,
  604     coding_category_ccl,
  605     coding_category_emacs_mule,
  606     /* All above are targets of code detection.  */
  607     coding_category_raw_text,
  608     coding_category_undecided,
  609     coding_category_max         /* Number of categories; used as array size.  */
  610   };
 611
  612 /* Definitions of flag bits used in detect_coding_XXXX.  */
  613 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
  614 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
  615 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
  616 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
  617 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
  618 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
  619 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
  620 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
  621 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
  622 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
  623 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
  624 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
  625 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
  626 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
  627 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
  628 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
  629 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
  630 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
  631 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
  632 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
  633 /* Union of the masks of all categories that are targets of detection
  634    (CATEGORY_MASK_RAW_TEXT is excluded).  Returned by detect_coding_mask ()
  635    when it finds nothing other than ASCII characters.  */
  636 #define CATEGORY_MASK_ANY               \
  637   (CATEGORY_MASK_ISO_7                  \
  638    | CATEGORY_MASK_ISO_7_TIGHT          \
  639    | CATEGORY_MASK_ISO_8_1              \
  640    | CATEGORY_MASK_ISO_8_2              \
  641    | CATEGORY_MASK_ISO_7_ELSE           \
  642    | CATEGORY_MASK_ISO_8_ELSE           \
  643    | CATEGORY_MASK_UTF_8_AUTO           \
  644    | CATEGORY_MASK_UTF_8_NOSIG          \
  645    | CATEGORY_MASK_UTF_8_SIG            \
  646    | CATEGORY_MASK_UTF_16_AUTO          \
  647    | CATEGORY_MASK_UTF_16_BE            \
  648    | CATEGORY_MASK_UTF_16_LE            \
  649    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  650    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
  651    | CATEGORY_MASK_CHARSET              \
  652    | CATEGORY_MASK_SJIS                 \
  653    | CATEGORY_MASK_BIG5                 \
  654    | CATEGORY_MASK_CCL                  \
  655    | CATEGORY_MASK_EMACS_MULE)
  656
  657 /* Convenience unions of related category masks.  */
  658 #define CATEGORY_MASK_ISO_7BIT \
  659   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
  660
  661 #define CATEGORY_MASK_ISO_8BIT \
  662   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
  663
  664 #define CATEGORY_MASK_ISO_ELSE \
  665   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
  666
  667 #define CATEGORY_MASK_ISO_ESCAPE        \
  668   (CATEGORY_MASK_ISO_7                  \
  669    | CATEGORY_MASK_ISO_7_TIGHT          \
  670    | CATEGORY_MASK_ISO_7_ELSE           \
  671    | CATEGORY_MASK_ISO_8_ELSE)
  672
  673 #define CATEGORY_MASK_ISO       \
  674   (  CATEGORY_MASK_ISO_7BIT     \
  675      | CATEGORY_MASK_ISO_8BIT   \
  676      | CATEGORY_MASK_ISO_ELSE)
  677
  678 #define CATEGORY_MASK_UTF_16            \
  679   (CATEGORY_MASK_UTF_16_AUTO            \
  680    | CATEGORY_MASK_UTF_16_BE            \
  681    | CATEGORY_MASK_UTF_16_LE            \
  682    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  683    | CATEGORY_MASK_UTF_16_LE_NOSIG)
  684
  685 #define CATEGORY_MASK_UTF_8     \
  686   (CATEGORY_MASK_UTF_8_AUTO     \
  687    | CATEGORY_MASK_UTF_8_NOSIG  \
  688    | CATEGORY_MASK_UTF_8_SIG)
 689
  690 /* List of symbols `coding-category-xxx' ordered by priority.  This
  691    variable is exposed to Emacs Lisp.  */
  692 static Lisp_Object Vcoding_category_list;
  693
  694 /* Table of coding categories (Lisp symbols).  This variable is for
  695    internal use only.  */
  696 static Lisp_Object Vcoding_category_table;
  697
  698 /* Table of coding-categories ordered by priority.  */
  699 static enum coding_category coding_priorities[coding_category_max];
  700
  701 /* Nth element is a coding context for the coding system bound to the
  702    Nth coding category.  */
  703 static struct coding_system coding_categories[coding_category_max];
 704
  705 /*** Commonly used macros and functions ***/
  706 /* min and max evaluate one of their arguments twice; avoid side effects.  */
  707 #ifndef min
  708 #define min(a, b) ((a) < (b) ? (a) : (b))
  709 #endif
  710 #ifndef max
  711 #define max(a, b) ((a) > (b) ? (a) : (b))
  712 #endif
  713 /* Set ATTRS to CODING_ID_ATTRS of CODING's id, and CHARSET_LIST to CODING_ATTR_CHARSET_LIST of ATTRS.  */
  714 #define CODING_GET_INFO(coding, attrs, charset_list)    \
  715   do {                                                  \
  716     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
  717     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  718   } while (0)
 719
 720
  721 /* Safely get one byte from the source text pointed to by SRC, which
  722    ends at SRC_END, set C to it, and increment consumed_chars.  At end
  723    of source, jump to `no_more_source', recording
  724    CODING_RESULT_INSUFFICIENT_SRC first if SRC_BASE < SRC.  If
  725    multibytep is nonzero and C >= 0x80: a lead byte 0xC0 or 0xC1 is
  726    merged with the following byte; any other makes C the negated
  727    string_char result and records CODING_RESULT_INVALID_SRC.  Needs in
  728    scope: src, src_end, src_base, multibytep, coding, consumed_chars.  */
  729 #define ONE_MORE_BYTE(c)                                \
  730   do {                                                  \
  731     if (src == src_end)                                 \
  732       {                                                 \
  733         if (src_base < src)                             \
  734           record_conversion_result                      \
  735             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  736         goto no_more_source;                            \
  737       }                                                 \
  738     c = *src++;                                         \
  739     if (multibytep && (c & 0x80))                       \
  740       {                                                 \
  741         if ((c & 0xFE) == 0xC0)                         \
  742           c = ((c & 1) << 6) | *src++;                  \
  743         else                                            \
  744           {                                             \
  745             src--;                                      \
  746             c = - string_char (src, &src, NULL);        \
  747             record_conversion_result                    \
  748               (coding, CODING_RESULT_INVALID_SRC);      \
  749           }                                             \
  750       }                                                 \
  751     consumed_chars++;                                   \
  752   } while (0)
 753
  754 /* Safely get two bytes from the source text pointed to by SRC, which
  755    ends at SRC_END, and set C1 and C2 to those bytes while skipping
  756    any leading multibyte characters.  If there are not enough bytes in
  757    the source, jump to `no_more_source'.  If multibytep is nonzero and
  758    a multibyte character is found for C2, set C2 to -1 (the remaining
  759    bytes of that character are not skipped).  The caller should
  760    declare and set these variables appropriately in advance:
  761         src, src_end, multibytep
  762    This macro is intended for use in detect_coding_utf_16.  */
  763
  764 #define TWO_MORE_BYTES(c1, c2)                          \
  765   do {                                                  \
  766     do {                                                \
  767       if (src == src_end)                               \
  768         goto no_more_source;                            \
  769       c1 = *src++;                                      \
  770       if (multibytep && (c1 & 0x80))                    \
  771         {                                               \
  772           if ((c1 & 0xFE) == 0xC0)                      \
  773             c1 = ((c1 & 1) << 6) | *src++;              \
  774           else                                          \
  775             {                                           \
  776               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
  777               c1 = -1;                                  \
  778             }                                           \
  779         }                                               \
  780     } while (c1 < 0);                                   \
  781     if (src == src_end)                                 \
  782       goto no_more_source;                              \
  783     c2 = *src++;                                        \
  784     if (multibytep && (c2 & 0x80))                      \
  785       {                                                 \
  786         if ((c2 & 0xFE) == 0xC0)                        \
  787           c2 = ((c2 & 1) << 6) | *src++;                \
  788         else                                            \
  789           c2 = -1;                                      \
  790       }                                                 \
  791   } while (0)
 792
 793
 794 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 795   do {                                                  \
 796     c = *src++;                                         \
 797     if (multibytep && (c & 0x80))                       \
 798       {                                                 \
 799         if ((c & 0xFE) == 0xC0)                         \
 800           c = ((c & 1) << 6) | *src++;                  \
 801         else                                            \
 802           {                                             \
 803             src--;                                      \
 804             c = - string_char (src, &src, NULL);        \
 805             record_conversion_result                    \
 806               (coding, CODING_RESULT_INVALID_SRC);      \
 807           }                                             \
 808       }                                                 \
 809     consumed_chars++;                                   \
 810   } while (0)
 811
 812
 813 /* Store a byte C in the place pointed by DST and increment DST to the
 814    next free point, and increment PRODUCED_CHARS.  The caller should
 815    assure that C is 0..127, and declare and set the variable `dst'
 816    appropriately in advance.
 817 */
 818
 819
 820 #define EMIT_ONE_ASCII_BYTE(c)  \
 821   do {                          \
 822     produced_chars++;           \
 823     *dst++ = (c);               \
 824   } while (0)
 825
 826
 827 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 828
 829 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 830   do {                                  \
 831     produced_chars += 2;                \
 832     *dst++ = (c1), *dst++ = (c2);       \
 833   } while (0)
 834
 835
 836 /* Store a byte C in the place pointed by DST and increment DST to the
 837    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 838    nonzero, store in an appropriate multibyte from.  The caller should
 839    declare and set the variables `dst' and `multibytep' appropriately
 840    in advance.  */
 841
 842 #define EMIT_ONE_BYTE(c)                \
 843   do {                                  \
 844     produced_chars++;                   \
 845     if (multibytep)                     \
 846       {                                 \
 847         int ch = (c);                   \
 848         if (ch >= 0x80)                 \
 849           ch = BYTE8_TO_CHAR (ch);      \
 850         CHAR_STRING_ADVANCE (ch, dst);  \
 851       }                                 \
 852     else                                \
 853       *dst++ = (c);                     \
 854   } while (0)
 855
 856
 857 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 858
 859 #define EMIT_TWO_BYTES(c1, c2)          \
 860   do {                                  \
 861     produced_chars += 2;                \
 862     if (multibytep)                     \
 863       {                                 \
 864         int ch;                         \
 865                                         \
 866         ch = (c1);                      \
 867         if (ch >= 0x80)                 \
 868           ch = BYTE8_TO_CHAR (ch);      \
 869         CHAR_STRING_ADVANCE (ch, dst);  \
 870         ch = (c2);                      \
 871         if (ch >= 0x80)                 \
 872           ch = BYTE8_TO_CHAR (ch);      \
 873         CHAR_STRING_ADVANCE (ch, dst);  \
 874       }                                 \
 875     else                                \
 876       {                                 \
 877         *dst++ = (c1);                  \
 878         *dst++ = (c2);                  \
 879       }                                 \
 880   } while (0)
 881
 882
 883 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 884   do {                                  \
 885     EMIT_ONE_BYTE (c1);                 \
 886     EMIT_TWO_BYTES (c2, c3);            \
 887   } while (0)
 888
 889
 890 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 891   do {                                          \
 892     EMIT_TWO_BYTES (c1, c2);                    \
 893     EMIT_TWO_BYTES (c3, c4);                    \
 894   } while (0)
 895
 896
/* Prototypes for static functions.  They are grouped below by the
   coding-system family named in each function (see the table of
   contents at the top of this file), followed by the helpers shared
   by all of them.  */

/* Records a conversion result code in a coding system.  */
static void record_conversion_result P_ ((struct coding_system *coding,
                                          enum coding_result_code result));

/* UTF-8.  */
static int detect_coding_utf_8 P_ ((struct coding_system *,
                                    struct coding_detection_info *info));
static void decode_coding_utf_8 P_ ((struct coding_system *));
static int encode_coding_utf_8 P_ ((struct coding_system *));

/* UTF-16.  */
static int detect_coding_utf_16 P_ ((struct coding_system *,
                                     struct coding_detection_info *info));
static void decode_coding_utf_16 P_ ((struct coding_system *));
static int encode_coding_utf_16 P_ ((struct coding_system *));

/* ISO2022.  */
static int detect_coding_iso_2022 P_ ((struct coding_system *,
                                       struct coding_detection_info *info));
static void decode_coding_iso_2022 P_ ((struct coding_system *));
static int encode_coding_iso_2022 P_ ((struct coding_system *));

/* emacs-mule (old Emacs' internal format).  */
static int detect_coding_emacs_mule P_ ((struct coding_system *,
                                         struct coding_detection_info *info));
static void decode_coding_emacs_mule P_ ((struct coding_system *));
static int encode_coding_emacs_mule P_ ((struct coding_system *));

/* Shift-JIS.  */
static int detect_coding_sjis P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_sjis P_ ((struct coding_system *));
static int encode_coding_sjis P_ ((struct coding_system *));

/* BIG5.  */
static int detect_coding_big5 P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_big5 P_ ((struct coding_system *));
static int encode_coding_big5 P_ ((struct coding_system *));

/* CCL.  */
static int detect_coding_ccl P_ ((struct coding_system *,
                                  struct coding_detection_info *info));
static void decode_coding_ccl P_ ((struct coding_system *));
static int encode_coding_ccl P_ ((struct coding_system *));

/* Raw text.  */
static void decode_coding_raw_text P_ ((struct coding_system *));
static int encode_coding_raw_text P_ ((struct coding_system *));

/* Source/destination pointer maintenance and destination growth.  */
static void coding_set_source P_ ((struct coding_system *));
static void coding_set_destination P_ ((struct coding_system *));
static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
static void coding_alloc_by_making_gap P_ ((struct coding_system *,
                                            EMACS_INT, EMACS_INT));
static unsigned char *alloc_destination P_ ((struct coding_system *,
                                             EMACS_INT, unsigned char *));

/* Remaining helpers (defined later in this file).  */
static void setup_iso_safe_charsets P_ ((Lisp_Object));
static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
                                                     int *, int *,
                                                     unsigned char *));
static int detect_eol P_ ((const unsigned char *,
                           EMACS_INT, enum coding_category));
static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
static void decode_eol P_ ((struct coding_system *));
static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *));
static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
static INLINE void produce_charset P_ ((struct coding_system *, int *,
                                        EMACS_INT));
static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
static int decode_coding P_ ((struct coding_system *));
static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
                                                      struct coding_system *,
                                                      int *, EMACS_INT *));
static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
                                                  struct coding_system *,
                                                  int *, EMACS_INT *));
static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
static int encode_coding P_ ((struct coding_system *));
static Lisp_Object make_conversion_work_buffer P_ ((int));
static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
static INLINE int char_encodable_p P_ ((int, Lisp_Object));
static Lisp_Object make_subsidiaries P_ ((Lisp_Object));

 972
 973 static void
 974 record_conversion_result (struct coding_system *coding,
 975                           enum coding_result_code result)
 976 {
 977   coding->result = result;
 978   switch (result)
 979     {
 980     case CODING_RESULT_INSUFFICIENT_SRC:
 981       Vlast_code_conversion_error = Qinsufficient_source;
 982       break;
 983     case CODING_RESULT_INCONSISTENT_EOL:
 984       Vlast_code_conversion_error = Qinconsistent_eol;
 985       break;
 986     case CODING_RESULT_INVALID_SRC:
 987       Vlast_code_conversion_error = Qinvalid_source;
 988       break;
 989     case CODING_RESULT_INTERRUPT:
 990       Vlast_code_conversion_error = Qinterrupted;
 991       break;
 992     case CODING_RESULT_INSUFFICIENT_MEM:
 993       Vlast_code_conversion_error = Qinsufficient_memory;
 994       break;
 995     default:
 996       Vlast_code_conversion_error = intern ("Unknown error");
 997     }
 998 }
 999
/* Set C to the result of DECODE_CHAR (CHARSET, CODE).  If that call
   leaves `charset_map_loaded' nonzero, recompute CODING->source with
   coding_set_source and shift SRC, SRC_BASE and SRC_END by however
   far CODING->source moved, so that they keep pointing at the same
   positions in the source text.  */

#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    charset_map_loaded = 0;                                                  \
    c = DECODE_CHAR (charset, code);                                         \
    if (charset_map_loaded)                                                  \
      {                                                                      \
        const unsigned char *old_source = coding->source;                    \
        EMACS_INT moved_by;                                                  \
                                                                             \
        coding_set_source (coding);                                          \
        moved_by = coding->source - old_source;                              \
        src_end += moved_by;                                                 \
        src_base += moved_by;                                                \
        src += moved_by;                                                     \
      }                                                                      \
  } while (0)
1016
1017
/* If there are not at least BYTES bytes of room left at dst (that is,
   if dst + BYTES reaches dst_end), allocate more memory for
   coding->destination with alloc_destination and update dst and
   dst_end.  The amount requested is BYTES plus the number of elements
   still pending between charbuf and charbuf_end.  We don't have to
   take care of coding->source which will be relocated.  It is
   handled by calling coding_set_source in encode_coding.  */

#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst_end <= dst + (bytes))                               \
      {                                                         \
        int grow_by = (charbuf_end - charbuf) + (bytes);        \
                                                                \
        dst = alloc_destination (coding, grow_by, dst);         \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
1033
1034
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   C is chosen among the 1- to 5-byte forms by comparing it with
   MAX_1_BYTE_CHAR ... MAX_5_BYTE_CHAR; anything larger is stored with
   BYTE8_STRING.  C is evaluated more than once, so it must not have
   side effects.  Every use of C is parenthesized so that an argument
   such as `a | b' is encoded as a whole.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);          \
  } while (0)
1064
1065
/* Return the character code of character whose multibyte form is at
   P, and advance P to the end of the multibyte form.  This is like
   STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

   The length of the form (1 to 5 bytes) is decided from the high bits
   of the leading byte.  A two-byte form whose leading byte is below
   0xC2 yields a code with 0x3FFF80 or'ed in.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
  (((p)[0] & 0x80) == 0                                         \
   ? *(p)++                                                     \
   : ((p)[0] & 0x20) == 0                                       \
   ? ((p) += 2,                                                 \
      (((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)        \
       | (((p)[-2] & 0x1F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p)[0] & 0x10) == 0                                       \
   ? ((p) += 3,                                                 \
      (((p)[-1] & 0x3F)                                         \
       | (((p)[-2] & 0x3F) << 6)                                \
       | (((p)[-3] & 0x0F) << 12)))                             \
   : ((p)[0] & 0x08) == 0                                       \
   ? ((p) += 4,                                                 \
      (((p)[-1] & 0x3F)                                         \
       | (((p)[-2] & 0x3F) << 6)                                \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-4] & 0x0F) << 18)))                             \
   : ((p) += 5,                                                 \
      (((p)[-1] & 0x3F)                                         \
       | (((p)[-2] & 0x3F) << 6)                                \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-4] & 0x3F) << 18))))
1094
1095
1096 static void
1097 coding_set_source (coding)
1098      struct coding_system *coding;
1099 {
1100   if (BUFFERP (coding->src_object))
1101     {
1102       struct buffer *buf = XBUFFER (coding->src_object);
1103
1104       if (coding->src_pos < 0)
1105         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1106       else
1107         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1108     }
1109   else if (STRINGP (coding->src_object))
1110     {
1111       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1112     }
1113   else
1114     /* Otherwise, the source is C string and is never relocated
1115        automatically.  Thus we don't have to update anything.  */
1116     ;
1117 }
1118
1119 static void
1120 coding_set_destination (coding)
1121      struct coding_system *coding;
1122 {
1123   if (BUFFERP (coding->dst_object))
1124     {
1125       if (coding->src_pos < 0)
1126         {
1127           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1128           coding->dst_bytes = (GAP_END_ADDR
1129                                - (coding->src_bytes - coding->consumed)
1130                                - coding->destination);
1131         }
1132       else
1133         {
1134           /* We are sure that coding->dst_pos_byte is before the gap
1135              of the buffer. */
1136           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1137                                  + coding->dst_pos_byte - BEG_BYTE);
1138           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1139                                - coding->destination);
1140         }
1141     }
1142   else
1143     /* Otherwise, the destination is C string and is never relocated
1144        automatically.  Thus we don't have to update anything.  */
1145     ;
1146 }
1147
1148
1149 static void
1150 coding_alloc_by_realloc (coding, bytes)
1151      struct coding_system *coding;
1152      EMACS_INT bytes;
1153 {
1154   coding->destination = (unsigned char *) xrealloc (coding->destination,
1155                                                     coding->dst_bytes + bytes);
1156   coding->dst_bytes += bytes;
1157 }
1158
/* Enlarge the gap of the destination buffer of CODING by calling
   make_gap (BYTES).

   GAP_HEAD_USED is the number of bytes at the head of the gap that
   are to be treated as already occupied; it matters only when the
   source and destination objects are the same (see below).

   When the destination differs from the source, the destination
   buffer is made current around the call to make_gap and the
   previously current buffer is restored afterwards.  */

static void
coding_alloc_by_making_gap (coding, gap_head_used, bytes)
     struct coding_system *coding;
     EMACS_INT gap_head_used, bytes;
{
  if (EQ (coding->src_object, coding->dst_object))
    {
      /* The gap may contain the produced data at the head and not-yet
         consumed data at the tail.  To preserve those data, we at
         first make the gap size to zero, then increase the gap
         size.  */
      EMACS_INT add = GAP_SIZE;

      /* Temporarily move the gap start past the GAP_HEAD_USED bytes
         and account for the whole old gap as buffer text.  */
      GPT += gap_head_used, GPT_BYTE += gap_head_used;
      GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
      make_gap (bytes);
      /* Undo the temporary bookkeeping, in reverse order.  */
      GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
      GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
    }
  else
    {
      Lisp_Object this_buffer;

      /* make_gap is called with the destination buffer current.  */
      this_buffer = Fcurrent_buffer ();
      set_buffer_internal (XBUFFER (coding->dst_object));
      make_gap (bytes);
      set_buffer_internal (XBUFFER (this_buffer));
    }
}
1188
1189
1190 static unsigned char *
1191 alloc_destination (coding, nbytes, dst)
1192      struct coding_system *coding;
1193      EMACS_INT nbytes;
1194      unsigned char *dst;
1195 {
1196   EMACS_INT offset = dst - coding->destination;
1197
1198   if (BUFFERP (coding->dst_object))
1199     {
1200       struct buffer *buf = XBUFFER (coding->dst_object);
1201
1202       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1203     }
1204   else
1205     coding_alloc_by_realloc (coding, nbytes);
1206   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1207   coding_set_destination (coding);
1208   dst = coding->destination + offset;
1209   return dst;
1210 }
1211
1212 /** Macros for annotations.  */
1213
1214 /* An annotation data is stored in the array coding->charbuf in this
1215    format:
1216      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1217    LENGTH is the number of elements in the annotation.
1218    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1219    NCHARS is the number of characters in the text annotated.
1220
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1225      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1226
1227    NBYTES is the number of bytes specified in the header part of
1228    old-style emacs-mule encoding, or 0 for the other kind of
1229    composition.
1230
1231    METHOD is one of enum composition_method.
1232
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.
1235
1236    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1237    follows.
1238
1239    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1240    recover from an invalid annotation, and should be skipped by
1241    produce_annotation.  */
1242
/* Maximum length of the header of annotation data, counted in
   elements of coding->charbuf.  NOTE(review): 5 equals the length
   that ADD_COMPOSITION_DATA records (-LENGTH, ANNOTATION_MASK,
   NCHARS, NBYTES, METHOD); confirm against the users of this
   constant.  */
#define MAX_ANNOTATION_LENGTH 5
1245
/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and mark CODING as annotated.  The caller
   must have a variable `coding' in scope.

   The definition deliberately does not end with a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' behaves as a single statement, also
   between `if' and `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1253
/* Store a composition annotation at BUF and advance BUF past it: the
   header written by ADD_ANNOTATION_DATA (length 5, mask
   CODING_ANNOTATE_COMPOSITION_MASK, NCHARS) followed by NBYTES and
   METHOD.  BUF is parenthesized in the same way as in
   ADD_ANNOTATION_DATA so that any pointer lvalue expression may be
   passed.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1260
1261
/* Store a charset annotation at BUF and advance BUF past it: the
   header written by ADD_ANNOTATION_DATA (length 4, mask
   CODING_ANNOTATE_CHARSET_MASK, NCHARS) followed by the charset ID.
   BUF is parenthesized in the same way as in ADD_ANNOTATION_DATA so
   that any pointer lvalue expression may be passed.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1267
1268 \f
1269 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1270
1271
1272
1273 \f
1274 /*** 3. UTF-8 ***/
1275
1276 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1277    Check if a text is encoded in UTF-8.  If it is, return 1, else
1278    return 0.  */
1279
/* Classification of a UTF-8 code unit C by its high bits.  Note that
   UTF_8_1_OCTET_P is a plain `<' comparison, so it is also true for
   negative C; callers that use negative values as markers must test
   for them first.  */

/* Nonzero if C is below 0x80 (a one-octet sequence).  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* Nonzero if C has the form 10xxxxxx (a trailing octet).  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* Nonzero if C has the form 110xxxxx (leads a 2-octet sequence).  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* Nonzero if C has the form 1110xxxx (leads a 3-octet sequence).  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* Nonzero if C has the form 11110xxx (leads a 4-octet sequence).  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* Nonzero if C has the form 111110xx (leads a 5-octet sequence).  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The byte order mark character, and the three octets of its UTF-8
   encoding.  */
#define UTF_BOM 0xFEFF
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1291
/* Check whether the source text of CODING looks like UTF-8 and record
   the verdict in DETECT_INFO.

   CATEGORY_MASK_UTF_8 is always added to DETECT_INFO->checked.  The
   first CODING->head_ascii bytes are skipped.  The rest is scanned
   sequence by sequence, checking only that each leading octet is
   followed by the right number of trailing octets (sequences of up to
   5 octets are accepted; no check for overlong forms is made here).

   Return 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
   if a malformed sequence is met, or if the source ends in the middle
   of a sequence and CODING_MODE_LAST_BLOCK is set.  Otherwise return
   1; in that case the _SIG and _NOSIG categories are marked as found
   when the text starts with the three BOM octets, else _SIG is
   rejected and _NOSIG is marked as found only if at least one
   multi-octet sequence was seen.  */

static int
detect_coding_utf_8 (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  /* Updated by ONE_MORE_BYTE; not otherwise used in this function.  */
  int consumed_chars = 0;
  int bom_found = 0;
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  /* The loop is left either by `break' (malformed sequence) or by
     ONE_MORE_BYTE jumping to no_more_source (end of source).  */
  while (1)
    {
      int c, c1, c2, c3, c4;

      src_base = src;
      ONE_MORE_BYTE (c);
      /* Negative values (multibyte characters reported by
         ONE_MORE_BYTE) and one-octet values need no further check.  */
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a sequence starting at the very first source byte
             can be a BOM.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      break;
    }
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* src_base < src means that the source ended inside a sequence.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* The first character 0xFEFF doesn't necessarily mean a BOM.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1375
1376
1377 static void
1378 decode_coding_utf_8 (coding)
1379      struct coding_system *coding;
1380 {
1381   const unsigned char *src = coding->source + coding->consumed;
1382   const unsigned char *src_end = coding->source + coding->src_bytes;
1383   const unsigned char *src_base;
1384   int *charbuf = coding->charbuf + coding->charbuf_used;
1385   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1386   int consumed_chars = 0, consumed_chars_base = 0;
1387   int multibytep = coding->src_multibyte;
1388   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1389   Lisp_Object attr, charset_list;
1390   int eol_crlf =
1391     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1392   int byte_after_cr = -1;
1393
1394   CODING_GET_INFO (coding, attr, charset_list);
1395
1396   if (bom != utf_without_bom)
1397     {
1398       int c1, c2, c3;
1399
1400       src_base = src;
1401       ONE_MORE_BYTE (c1);
1402       if (! UTF_8_3_OCTET_LEADING_P (c1))
1403         src = src_base;
1404       else
1405         {
1406           ONE_MORE_BYTE (c2);
1407           if (! UTF_8_EXTRA_OCTET_P (c2))
1408             src = src_base;
1409           else
1410             {
1411               ONE_MORE_BYTE (c3);
1412               if (! UTF_8_EXTRA_OCTET_P (c3))
1413                 src = src_base;
1414               else
1415                 {
1416                   if ((c1 != UTF_8_BOM_1)
1417                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1418                     src = src_base;
1419                   else
1420                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1421                 }
1422             }
1423         }
1424     }
1425   CODING_UTF_8_BOM (coding) = utf_without_bom;
1426
1427
1428
1429   while (1)
1430     {
1431       int c, c1, c2, c3, c4, c5;
1432
1433       src_base = src;
1434       consumed_chars_base = consumed_chars;
1435
1436       if (charbuf >= charbuf_end)
1437         {
1438           if (byte_after_cr >= 0)
1439             src_base--;
1440           break;
1441         }
1442
1443       if (byte_after_cr >= 0)
1444         c1 = byte_after_cr, byte_after_cr = -1;
1445       else
1446         ONE_MORE_BYTE (c1);
1447       if (c1 < 0)
1448         {
1449           c = - c1;
1450         }
1451       else if (UTF_8_1_OCTET_P(c1))
1452         {
1453           if (eol_crlf && c1 == '\r')
1454             ONE_MORE_BYTE (byte_after_cr);
1455           c = c1;
1456         }
1457       else
1458         {
1459           ONE_MORE_BYTE (c2);
1460           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1461             goto invalid_code;
1462           if (UTF_8_2_OCTET_LEADING_P (c1))
1463             {
1464               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1465               /* Reject overlong sequences here and below.  Encoders
1466                  producing them are incorrect, they can be misleading,
1467                  and they mess up read/write invariance.  */
1468               if (c < 128)
1469                 goto invalid_code;
1470             }
1471           else
1472             {
1473               ONE_MORE_BYTE (c3);
1474               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1475                 goto invalid_code;
1476               if (UTF_8_3_OCTET_LEADING_P (c1))
1477                 {
1478                   c = (((c1 & 0xF) << 12)
1479                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1480                   if (c < 0x800
1481                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1482                     goto invalid_code;
1483                 }
1484               else
1485                 {
1486                   ONE_MORE_BYTE (c4);
1487                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1488                     goto invalid_code;
1489                   if (UTF_8_4_OCTET_LEADING_P (c1))
1490                     {
1491                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1492                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1493                     if (c < 0x10000)
1494                       goto invalid_code;
1495                     }
1496                   else
1497                     {
1498                       ONE_MORE_BYTE (c5);
1499                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1500                         goto invalid_code;
1501                       if (UTF_8_5_OCTET_LEADING_P (c1))
1502                         {
1503                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1504                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1505                                | (c5 & 0x3F));
1506                           if ((c > MAX_CHAR) || (c < 0x200000))
1507                             goto invalid_code;
1508                         }
1509                       else
1510                         goto invalid_code;
1511                     }
1512                 }
1513             }
1514         }
1515
1516       *charbuf++ = c;
1517       continue;
1518
1519     invalid_code:
1520       src = src_base;
1521       consumed_chars = consumed_chars_base;
1522       ONE_MORE_BYTE (c);
1523       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1524       coding->errors++;
1525     }
1526
1527  no_more_source:
1528   coding->consumed_char += consumed_chars_base;
1529   coding->consumed = src_base - coding->source;
1530   coding->charbuf_used = charbuf - coding->charbuf;
1531 }
1532
1533
     /* Encode the characters in CODING->charbuf (CODING->charbuf_used
        elements) and store the bytes at CODING->destination, starting
        at offset CODING->produced.  A character for which CHAR_BYTE8_P
        is true is stored as the single byte given by CHAR_TO_BYTE8; any
        other character is stored with CHAR_STRING_ADVANCE_NO_UNIFY.
        The conversion result is recorded as CODING_RESULT_SUCCESS, and
        CODING->produced and CODING->produced_char are updated.  Always
        return 0.  */
1534 static int
1535 encode_coding_utf_8 (coding)
1536      struct coding_system *coding;
1537 {
1538   int multibytep = coding->dst_multibyte;
1539   int *charbuf = coding->charbuf;
1540   int *charbuf_end = charbuf + coding->charbuf_used;
1541   unsigned char *dst = coding->destination + coding->produced;
1542   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1543   int produced_chars = 0;
1544   int c;
1545
       /* Emit the three BOM bytes, then switch the setting to
          utf_without_bom so that this test fails from now on.  */
1546   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1547     {
1548       ASSURE_DESTINATION (3);
1549       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1550       CODING_UTF_8_BOM (coding) = utf_without_bom;
1551     }
1552
       /* CODING->dst_multibyte is nonzero: every byte goes through
          EMIT_ONE_BYTE.  NOTE(review): SAFE_ROOM is twice
          MAX_MULTIBYTE_LENGTH here, presumably because EMIT_ONE_BYTE may
          store more than one byte per emitted byte -- confirm against
          the macro.  */
1553   if (multibytep)
1554     {
1555       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1556
1557       while (charbuf < charbuf_end)
1558         {
1559           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1560
1561           ASSURE_DESTINATION (safe_room);
1562           c = *charbuf++;
1563           if (CHAR_BYTE8_P (c))
1564             {
1565               c = CHAR_TO_BYTE8 (c);
1566               EMIT_ONE_BYTE (c);
1567             }
1568           else
1569             {
                   /* Build the byte sequence in STR first, then emit it
                      one byte at a time.  */
1570               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1571               for (p = str; p < pend; p++)
1572                 EMIT_ONE_BYTE (*p);
1573             }
1574         }
1575     }
       /* CODING->dst_multibyte is zero: store the bytes directly through
          DST and count one produced character per source character.  */
1576   else
1577     {
1578       int safe_room = MAX_MULTIBYTE_LENGTH;
1579
1580       while (charbuf < charbuf_end)
1581         {
1582           ASSURE_DESTINATION (safe_room);
1583           c = *charbuf++;
1584           if (CHAR_BYTE8_P (c))
1585             *dst++ = CHAR_TO_BYTE8 (c);
1586           else
1587             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1588           produced_chars++;
1589         }
1590     }
1591   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1592   coding->produced_char += produced_chars;
1593   coding->produced = dst - coding->destination;
1594   return 0;
1595 }
1596
1597
1598 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1599    Check if a text is encoded in one of UTF-16 based coding systems.
1600    If it is, return 1, else return 0.  */
1601
     /* Nonzero if the bits of VAL selected by 0xFC00 equal 0xD800.  For
        a 16-bit VAL this means VAL is in the range 0xD800..0xDBFF.  */
1602 #define UTF_16_HIGH_SURROGATE_P(val) \
1603   (((val) & 0xFC00) == 0xD800)
1604
     /* Nonzero if the bits of VAL selected by 0xFC00 equal 0xDC00.  For
        a 16-bit VAL this means VAL is in the range 0xDC00..0xDFFF.  */
1605 #define UTF_16_LOW_SURROGATE_P(val) \
1606   (((val) & 0xFC00) == 0xDC00)
1607
     /* Nonzero if VAL is 0xFFFE or 0xFFFF, or if it satisfies
        UTF_16_LOW_SURROGATE_P.  VAL is evaluated more than once.  */
1608 #define UTF_16_INVALID_P(val)   \
1609   (((val) == 0xFFFE)            \
1610    || ((val) == 0xFFFF)         \
1611    || UTF_16_LOW_SURROGATE_P (val))
1612
1613
     /* Check whether the source text of CODING may be encoded in one of
        the UTF-16 based coding systems, and record the findings in
        DETECT_INFO.  CATEGORY_MASK_UTF_16 is always added to
        DETECT_INFO->checked.

        Return 0, with CATEGORY_MASK_UTF_16 added to
        DETECT_INFO->rejected, when the last block has an odd number of
        source characters, when the second byte read is negative, or
        when the byte scan below stops before the source is exhausted.

        Return 1 otherwise: either a BOM was seen (the matching
        categories are added to DETECT_INFO->found and the conflicting
        ones to DETECT_INFO->rejected), or control reached the label
        no_more_source.  NOTE(review): TWO_MORE_BYTES is presumably what
        jumps to that label when fewer than two bytes remain -- confirm
        against the macro.  */
1614 static int
1615 detect_coding_utf_16 (coding, detect_info)
1616      struct coding_system *coding;
1617      struct coding_detection_info *detect_info;
1618 {
1619   const unsigned char *src = coding->source, *src_base = src;
1620   const unsigned char *src_end = coding->source + coding->src_bytes;
1621   int multibytep = coding->src_multibyte;
1622   int consumed_chars = 0;
1623   int c1, c2;
1624
1625   detect_info->checked |= CATEGORY_MASK_UTF_16;
1626   if (coding->mode & CODING_MODE_LAST_BLOCK
1627       && (coding->src_chars & 1))
1628     {
1629       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1630       return 0;
1631     }
1632
1633   TWO_MORE_BYTES (c1, c2);
       /* Bytes FF FE: little-endian BOM.  */
1634   if ((c1 == 0xFF) && (c2 == 0xFE))
1635     {
1636       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1637                              | CATEGORY_MASK_UTF_16_AUTO);
1638       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1639                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1640                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1641     }
       /* Bytes FE FF: big-endian BOM.  */
1642   else if ((c1 == 0xFE) && (c2 == 0xFF))
1643     {
1644       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1645                              | CATEGORY_MASK_UTF_16_AUTO);
1646       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1647                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1648                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1649     }
1650   else if (c2 < 0)
1651     {
1652       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1653       return 0;
1654     }
1655   else
1656     {
1657       /* We check the dispersion of Eth and Oth bytes where E is even and
1658          O is odd.  If both are high, we assume binary data.*/
1659       unsigned char e[256], o[256];
1660       unsigned e_num = 1, o_num = 1;
1661
1662       memset (e, 0, 256);
1663       memset (o, 0, 256);
1664       e[c1] = 1;
1665       o[c2] = 1;
1666
           /* No BOM was found, so the categories that require one are
              rejected right away.  */
1667       detect_info->rejected
1668         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1669
1670       while (1)
1671         {
1672           TWO_MORE_BYTES (c1, c2);
1673           if (c2 < 0)
1674             break;
1675           if (! e[c1])
1676             {
1677               e[c1] = 1;
1678               e_num++;
1679               if (e_num >= 128)
1680                 break;
1681             }
1682           if (! o[c2])
1683             {
                   /* Mark C2, the value that was just tested.  Marking
                      C1 here would leave o[C2] clear, so the same odd
                      byte would be counted again on every occurrence
                      and O_NUM would reach 128 on ordinary text.  */
1684               o[c2] = 1;
1685               o_num++;
1686               if (o_num >= 128)
1687                 break;
1688             }
1689         }
1690       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1691       return 0;
1692     }
1693
1694  no_more_source:
1695   return 1;
1696 }
1697
     /* Decode UTF-16 bytes from CODING->source, starting at offset
        CODING->consumed, into CODING->charbuf, starting at index
        CODING->charbuf_used.  The byte order comes from
        CODING_UTF_16_ENDIAN.  A pending high surrogate is kept in
        CODING_UTF_16_SURROGATE so that a later code unit can complete
        the pair.  On exit CODING->consumed_char, CODING->consumed and
        CODING->charbuf_used are updated.  */
1698 static void
1699 decode_coding_utf_16 (coding)
1700      struct coding_system *coding;
1701 {
1702   const unsigned char *src = coding->source + coding->consumed;
1703   const unsigned char *src_end = coding->source + coding->src_bytes;
1704   const unsigned char *src_base;
1705   int *charbuf = coding->charbuf + coding->charbuf_used;
1706   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1707   int consumed_chars = 0, consumed_chars_base = 0;
1708   int multibytep = coding->src_multibyte;
1709   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1710   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1711   int surrogate = CODING_UTF_16_SURROGATE (coding);
1712   Lisp_Object attr, charset_list;
1713   int eol_crlf =
1714     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1715   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1716
1717   CODING_GET_INFO (coding, attr, charset_list);
1718
1719   if (bom == utf_with_bom)
1720     {
1721       int c, c1, c2;
1722
1723       src_base = src;
1724       ONE_MORE_BYTE (c1);
1725       ONE_MORE_BYTE (c2);
           /* The two bytes are always combined in big-endian order here,
              which is why a little-endian BOM is compared with 0xFFFE
              below.  */
1726       c = (c1 << 8) | c2;
1727
1728       if (endian == utf_16_big_endian
1729           ? c != 0xFEFF : c != 0xFFFE)
1730         {
1731           /* The first two bytes are not BOM.  Treat them as bytes
1732              for a normal character.  */
1733           src = src_base;
1734           coding->errors++;
1735         }
1736       CODING_UTF_16_BOM (coding) = utf_without_bom;
1737     }
1738   else if (bom == utf_detect_bom)
1739     {
1740       /* We have already tried to detect BOM and failed in
1741          detect_coding.  */
1742       CODING_UTF_16_BOM (coding) = utf_without_bom;
1743     }
1744
1745   while (1)
1746     {
1747       int c, c1, c2;
1748
1749       src_base = src;
1750       consumed_chars_base = consumed_chars;
1751
           /* Stop unless at least three elements are free: the
              unpaired-surrogate path below stores up to three elements
              in one iteration.  */
1752       if (charbuf + 2 >= charbuf_end)
1753         {
               /* The two bytes read ahead after a CR have not been
                  decoded yet; exclude them from the consumed count.  */
1754           if (byte_after_cr1 >= 0)
1755             src_base -= 2;
1756           break;
1757         }
1758
           /* Use the bytes read ahead after a CR, if any, before
              reading new ones.  */
1759       if (byte_after_cr1 >= 0)
1760         c1 = byte_after_cr1, byte_after_cr1 = -1;
1761       else
1762         ONE_MORE_BYTE (c1);
           /* NOTE(review): a negative value presumably stands for a
              non-byte character found in a multibyte source; it is
              stored negated -- confirm against ONE_MORE_BYTE.  */
1763       if (c1 < 0)
1764         {
1765           *charbuf++ = -c1;
1766           continue;
1767         }
1768       if (byte_after_cr2 >= 0)
1769         c2 = byte_after_cr2, byte_after_cr2 = -1;
1770       else
1771         ONE_MORE_BYTE (c2);
1772       if (c2 < 0)
1773         {
1774           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1775           *charbuf++ = -c2;
1776           continue;
1777         }
1778       c = (endian == utf_16_big_endian
1779            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1780
           /* A high surrogate is pending from an earlier code unit.  */
1781       if (surrogate)
1782         {
1783           if (! UTF_16_LOW_SURROGATE_P (c))
1784             {
                   /* C does not complete the pair: store the two bytes
                      of the pending surrogate as separate elements, in
                      source byte order, and count an error.  */
1785               if (endian == utf_16_big_endian)
1786                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1787               else
1788                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1789               *charbuf++ = c1;
1790               *charbuf++ = c2;
1791               coding->errors++;
                   /* NOTE(review): when C is not a high surrogate
                      either, SURROGATE is left set, so the next code
                      unit takes this branch again -- confirm that this
                      is intended.  */
1792               if (UTF_16_HIGH_SURROGATE_P (c))
1793                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1794               else
1795                 *charbuf++ = c;
1796             }
1797           else
1798             {
                   /* Combine the pair into one character at or above
                      0x10000 and clear the pending state.  */
1799               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1800               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1801               *charbuf++ = 0x10000 + c;
1802             }
1803         }
1804       else
1805         {
1806           if (UTF_16_HIGH_SURROGATE_P (c))
1807             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1808           else
1809             {
                   /* After a CR, read the next two bytes ahead of time;
                      the next iteration uses them as C1 and C2.  */
1810               if (eol_crlf && c == '\r')
1811                 {
1812                   ONE_MORE_BYTE (byte_after_cr1);
1813                   ONE_MORE_BYTE (byte_after_cr2);
1814                 }
1815               *charbuf++ = c;
1816             }
1817         }
1818     }
1819
1820  no_more_source:
1821   coding->consumed_char += consumed_chars_base;
1822   coding->consumed = src_base - coding->source;
1823   coding->charbuf_used = charbuf - coding->charbuf;
1824 }
1825
     /* Encode the characters in CODING->charbuf (CODING->charbuf_used
        elements) as UTF-16 and store the bytes at CODING->destination,
        starting at offset CODING->produced.  The byte order is taken
        from CODING_UTF_16_ENDIAN.  Unless CODING_UTF_16_BOM is
        utf_without_bom, a BOM is emitted first and the setting is
        switched to utf_without_bom.  A character above MAX_UNICODE_CHAR
        is replaced by CODING->default_char.  A character below 0x10000
        is emitted as one 16-bit unit, any other as a surrogate pair.
        CODING->produced and CODING->produced_char are updated.  Always
        return 0.  */
1826 static int
1827 encode_coding_utf_16 (coding)
1828      struct coding_system *coding;
1829 {
1830   int multibytep = coding->dst_multibyte;
1831   int *charbuf = coding->charbuf;
1832   int *charbuf_end = charbuf + coding->charbuf_used;
1833   unsigned char *dst = coding->destination + coding->produced;
1834   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1835   int safe_room = 8;
1836   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1837   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1838   int produced_chars = 0;
1839   Lisp_Object attrs, charset_list;
1840   int c;
1841
1842   CODING_GET_INFO (coding, attrs, charset_list);
1843
1844   if (bom != utf_without_bom)
1845     {
1846       ASSURE_DESTINATION (safe_room);
1847       if (big_endian)
1848         EMIT_TWO_BYTES (0xFE, 0xFF);
1849       else
1850         EMIT_TWO_BYTES (0xFF, 0xFE);
1851       CODING_UTF_16_BOM (coding) = utf_without_bom;
1852     }
1853
1854   while (charbuf < charbuf_end)
1855     {
1856       ASSURE_DESTINATION (safe_room);
1857       c = *charbuf++;
           /* Only characters strictly above MAX_UNICODE_CHAR are
              replaced.  MAX_UNICODE_CHAR itself is the largest
              character that a surrogate pair can represent, so it must
              be encoded, not substituted.  */
1858       if (c > MAX_UNICODE_CHAR)
1859         c = coding->default_char;
1860
1861       if (c < 0x10000)
1862         {
1863           if (big_endian)
1864             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1865           else
1866             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1867         }
1868       else
1869         {
1870           int c1, c2;
1871
               /* Split the offset from 0x10000 into its upper bits and
                  its lower ten bits, giving the high surrogate C1 and
                  the low surrogate C2.  */
1872           c -= 0x10000;
1873           c1 = (c >> 10) + 0xD800;
1874           c2 = (c & 0x3FF) + 0xDC00;
1875           if (big_endian)
1876             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1877           else
1878             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1879         }
1880     }
1881   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1882   coding->produced = dst - coding->destination;
1883   coding->produced_char += produced_chars;
1884   return 0;
1885 }
1886
1887 \f
1888 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1889
1890 /* Emacs' internal format for representation of multiple character
1891    sets is a kind of multi-byte encoding, i.e. characters are
1892    represented by variable-length sequences of one-byte codes.
1893
1894    ASCII characters and control characters (e.g. `tab', `newline') are
1895    represented by one-byte sequences which are their ASCII codes, in
1896    the range 0x00 through 0x7F.
1897
1898    8-bit characters of the range 0x80..0x9F are represented by
1899    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1900    code + 0x20).
1901
1902    8-bit characters of the range 0xA0..0xFF are represented by
1903    one-byte sequences which are their 8-bit code.
1904
1905    The other characters are represented by a sequence of `base
1906    leading-code', optional `extended leading-code', and one or two
1907    `position-code's.  The length of the sequence is determined by the
1908    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1909    whereas extended leading-code and position-code take the range 0xA0
1910    through 0xFF.  See `charset.h' for more details about leading-code
1911    and position-code.
1912
1913    --- CODE RANGE of Emacs' internal format ---
1914    character set        range
1915    -------------        -----
1916    ascii                0x00..0x7F
1917    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1918    eight-bit-graphic    0xA0..0xFF
1919    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1920    ---------------------------------------------
1921
1922    As this is the internal character representation, the format is
1923    usually not used externally (i.e. in a file or in a data sent to a
1924    process).  But, it is possible to have a text externally in this
1925    format (i.e. by encoding by the coding system `emacs-mule').
1926
1927    In that case, a sequence of one-byte codes has a slightly different
1928    form.
1929
1930    At first, all characters in eight-bit-control are represented by
1931    one-byte sequences which are their 8-bit code.
1932
1933    Next, character composition data are represented by the byte
1934    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1935    where,
1936         METHOD is 0xF2 plus one of composition method (enum
1937         composition_method),
1938
1939         BYTES is 0xA0 plus a byte length of this composition data,
1940
1941         CHARS is 0xA0 plus a number of characters composed by this
1942         data,
1943
1944         COMPONENTs are characters in multibyte form or composition
1945         rules encoded by two bytes of ASCII codes.
1946
1947    In addition, for backward compatibility, the following formats are
1948    also recognized as composition data on decoding.
1949
1950    0x80 MSEQ ...
1951    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1952
1953    Here,
1954         MSEQ is a multibyte form in one of these special formats:
1955           ASCII: 0xA0 ASCII_CODE+0x80,
1956           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1957         RULE is a one byte code of the range 0xA0..0xF0 that
1958         represents a composition rule.
1959   */
1960
     /* Table indexed by the first byte of an emacs-mule sequence.  The
        functions below treat the entry as the total number of bytes in
        that sequence; emacs_mule_char accepts only the values 1 to 4
        and aborts on anything else.  The table is filled in elsewhere
        (not in this part of the file).  */
1961 char emacs_mule_bytes[256];
1962
1963
1964 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1965    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1966    else return 0.
        CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
        When 0 is returned the category is also added to
        DETECT_INFO->rejected.  When 1 is returned, the category is added
        to DETECT_INFO->found only if at least one multibyte sequence or
        composition was recognized.  */
1967
1968 static int
1969 detect_coding_emacs_mule (coding, detect_info)
1970      struct coding_system *coding;
1971      struct coding_detection_info *detect_info;
1972 {
1973   const unsigned char *src = coding->source, *src_base;
1974   const unsigned char *src_end = coding->source + coding->src_bytes;
1975   int multibytep = coding->src_multibyte;
1976   int consumed_chars = 0;
1977   int c;
1978   int found = 0;
1979
1980   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1981   /* A coding system of this category is always ASCII compatible.  */
1982   src += coding->head_ascii;
1983
1984   while (1)
1985     {
1986       src_base = src;
1987       ONE_MORE_BYTE (c);
1988       if (c < 0)
1989         continue;
1990       if (c == 0x80)
1991         {
1992           /* Perhaps the start of composite character.  We simply skip
1993              it because analyzing it is too heavy for detecting.  But,
1994              at least, we check that the composite character
1995              consists of more than 4 bytes.  */
1996           const unsigned char *src_base;
1997
1998         repeat:
1999           src_base = src;
2000           do
2001             {
2002               ONE_MORE_BYTE (c);
2003             }
2004           while (c >= 0xA0);
2005
2006           if (src - src_base <= 4)
2007             break;
2008           found = CATEGORY_MASK_EMACS_MULE;
2009           if (c == 0x80)
2010             goto repeat;
2011         }
2012
2013       if (c < 0x80)
2014         {
2015           if (c < 0x20
2016               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2017             break;
2018         }
2019       else
2020         {
               /* Index the table by C, the leading byte just read.  The
                  outer SRC_BASE cannot be used for this: when a
                  composition was skipped above, it still points at the
                  0x80 starter and not at the byte held in C.  */
2021           int more_bytes = emacs_mule_bytes[c] - 1;
2022
2023           while (more_bytes > 0)
2024             {
2025               ONE_MORE_BYTE (c);
2026               if (c < 0xA0)
2027                 {
2028                   src--;        /* Unread the last byte.  */
2029                   break;
2030                 }
2031               more_bytes--;
2032             }
               /* The sequence was cut short by a byte below 0xA0.  */
2033           if (more_bytes != 0)
2034             break;
2035           found = CATEGORY_MASK_EMACS_MULE;
2036         }
2037     }
2038   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2039   return 0;
2040
2041  no_more_source:
       /* Reject if the last block ends in the middle of a sequence.  */
2042   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2043     {
2044       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2045       return 0;
2046     }
2047   detect_info->found |= found;
2048   return 1;
2049 }
2050
2051
2052 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2053    character.  If CMP_STATUS indicates that we must expect MSEQ or
2054    RULE described above, decode it and return the negative value of
2055    the decoded character or rule.  If an invalid byte is found, return
2056    -1.  If SRC is too short, return -2.
        On success store in *NBYTES the number of bytes read from SRC and
        in *NCHARS the number of characters consumed.  When a character
        was decoded and ID is non-null, also store the id of its charset
        in *ID; *ID is not set when a RULE is returned.  */
2057
2058 int
2059 emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status)
2060      struct coding_system *coding;
2061      const unsigned char *src;
2062      int *nbytes, *nchars, *id;
2063      struct composition_status *cmp_status;
2064 {
2065   const unsigned char *src_end = coding->source + coding->src_bytes;
2066   const unsigned char *src_base = src;
2067   int multibytep = coding->src_multibyte;
2068   struct charset *charset;
2069   unsigned code;
2070   int c;
2071   int consumed_chars = 0;
2072   int mseq_found = 0;
2073
2074   ONE_MORE_BYTE (c);
       /* NOTE(review): a negative value presumably stands for a non-byte
          character found in a multibyte source -- confirm against
          ONE_MORE_BYTE.  It is negated and returned without further
          decoding, with the charset taken from emacs_mule_charset[0].  */
2075   if (c < 0)
2076     {
2077       c = -c;
2078       charset = emacs_mule_charset[0];
2079     }
2080   else
2081     {
           /* A first byte of 0xA0 or above is accepted only while an
              old-form composition is being read; otherwise it is
              invalid.  */
2082       if (c >= 0xA0)
2083         {
2084           if (cmp_status->state != COMPOSING_NO
2085               && cmp_status->old_form)
2086             {
2087               if (cmp_status->state == COMPOSING_CHAR)
2088                 {
                       /* MSEQ for ASCII: 0xA0 followed by the ASCII code
                          plus 0x80.  */
2089                   if (c == 0xA0)
2090                     {
2091                       ONE_MORE_BYTE (c);
2092                       c -= 0x80;
2093                       if (c < 0)
2094                         goto invalid_code;
2095                     }
                       /* MSEQ for other characters: the leading code was
                          stored with 0x20 added.  */
2096                   else
2097                     c -= 0x20;
2098                   mseq_found = 1;
2099                 }
2100               else
2101                 {
                       /* A character is not expected in this state, so
                          the byte is returned negated without being
                          decoded (presumably it is a RULE byte).  */
2102                   *nbytes = src - src_base;
2103                   *nchars = consumed_chars;
2104                   return -c;
2105                 }
2106             }
2107           else
2108             goto invalid_code;
2109         }
2110
           /* Dispatch on the sequence length recorded for the leading
              byte.  Every following byte must be 0xA0 or above.  */
2111       switch (emacs_mule_bytes[c])
2112         {
             /* Leading code and one position code.  */
2113         case 2:
2114           if (! (charset = emacs_mule_charset[c]))
2115             goto invalid_code;
2116           ONE_MORE_BYTE (c);
2117           if (c < 0xA0)
2118             goto invalid_code;
2119           code = c & 0x7F;
2120           break;
2121
             /* Either a private leading code, a byte selecting the
                charset and one position code, or a leading code and two
                position codes.  */
2122         case 3:
2123           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2124               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2125             {
2126               ONE_MORE_BYTE (c);
2127               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
2128                 goto invalid_code;
2129               ONE_MORE_BYTE (c);
2130               if (c < 0xA0)
2131                 goto invalid_code;
2132               code = c & 0x7F;
2133             }
2134           else
2135             {
2136               if (! (charset = emacs_mule_charset[c]))
2137                 goto invalid_code;
2138               ONE_MORE_BYTE (c);
2139               if (c < 0xA0)
2140                 goto invalid_code;
2141               code = (c & 0x7F) << 8;
2142               ONE_MORE_BYTE (c);
2143               if (c < 0xA0)
2144                 goto invalid_code;
2145               code |= c & 0x7F;
2146             }
2147           break;
2148
             /* Leading byte, a byte selecting the charset, and two
                position codes.  */
2149         case 4:
2150           ONE_MORE_BYTE (c);
2151           if (c < 0 || ! (charset = emacs_mule_charset[c]))
2152             goto invalid_code;
2153           ONE_MORE_BYTE (c);
2154           if (c < 0xA0)
2155             goto invalid_code;
2156           code = (c & 0x7F) << 8;
2157           ONE_MORE_BYTE (c);
2158           if (c < 0xA0)
2159             goto invalid_code;
2160           code |= c & 0x7F;
2161           break;
2162
             /* A single byte: ASCII or eight-bit.  */
2163         case 1:
2164           code = c;
2165           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
2166                                      ? charset_ascii : charset_eight_bit);
2167           break;
2168
2169         default:
2170           abort ();
2171         }
2172       c = DECODE_CHAR (charset, code);
2173       if (c < 0)
2174         goto invalid_code;
2175     }
2176   *nbytes = src - src_base;
2177   *nchars = consumed_chars;
2178   if (id)
2179     *id = charset->id;
       /* A character decoded from an MSEQ is returned negated.  */
2180   return (mseq_found ? -c : c);
2181
2182  no_more_source:
2183   return -2;
2184
2185  invalid_code:
2186   return -1;
2187 }
2188
2189
2190 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2191
2192 /* Handle these composition sequences ('|': the end of header elements,
2193    BYTES and CHARS >= 0xA0):
2194
2195    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2196    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2197    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2198
2199    and these old forms:
2200
2201    (4) relative composition: 0x80 | MSEQ ... MSEQ
2202    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2203
2204    When the starter 0x80 and the following header elements are found,
2205    this annotation header is produced.
2206
2207         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2208
2209    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2210    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2211
2212    Then, upon reading the following elements, these codes are produced
2213    until the composition end is found:
2214
2215    (1) CHAR ... CHAR
2216    (2) ALT ... ALT CHAR ... CHAR
2217    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2218    (4) CHAR ... CHAR
2219    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2220
2221    When the composition end is found, LENGTH and NCHARS in the
2222    annotation header is updated as below:
2223
2224    (1) LENGTH: unchanged, NCHARS: unchanged
2225    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2226    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2227    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2228    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2229
2230    If an error is found while composing, the annotation header is
2231    changed to the original composition header (plus filler -1s) as
2232    below:
2233
2234    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2235    (5)          [ 0x80 0xFF -1 -1 -1 ]
2236
2237    and the sequence [ -2 DECODED-RULE ] is changed to the original
2238    byte sequence as below:
2239         o the original byte sequence is B: [ B -1 ]
2240         o the original byte sequence is B1 B2: [ B1 B2 ]
2241
2242    Most of the routines are implemented by macros because many
2243    variables and labels in the caller decode_coding_emacs_mule must be
2244    accessible, and they are usually called just once (thus doesn't
2245    increase the size of compiled object).  */
2246
2247 /* Decode a composition rule represented by C as a component of a
2248    composition sequence of Emacs 20 style.  Set RULE to the decoded
2249    rule.  C is modified: 0xA0 is subtracted from it, and control
        jumps to the label invalid_code unless the result is in the range
        0..80.  The result is split into GREF = C / 9 and NREF = C % 9, a
        value of 4 in either of them is replaced by 10, and RULE is set
        to COMPOSITION_ENCODE_RULE (GREF, NREF).  */
2250
2251 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2252   do {                                                  \
2253     int gref, nref;                                     \
2254                                                         \
2255     c -= 0xA0;                                          \
2256     if (c < 0 || c >= 81)                               \
2257       goto invalid_code;                                \
2258     gref = c / 9, nref = c % 9;                         \
2259     if (gref == 4) gref = 10;                           \
2260     if (nref == 4) nref = 10;                           \
2261     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2262   } while (0)
2263
2264
2265 /* Decode a composition rule represented by C and the following byte
2266    at SRC as a component of composition sequence of Emacs 21 style.
2267    Set RULE to the decoded rule.  GREF is C minus 0x20; the next byte
        is then read into C with ONE_MORE_BYTE and NREF is that byte
        minus 0x20.  Control jumps to the label invalid_code unless both
        GREF and NREF are in the range 0..80.  RULE is set to
        COMPOSITION_ENCODE_RULE (GREF, NREF).  */
2268
2269 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2270   do {                                                  \
2271     int gref, nref;                                     \
2272                                                         \
2273     gref = c - 0x20;                                    \
2274     if (gref < 0 || gref >= 81)                         \
2275       goto invalid_code;                                \
2276     ONE_MORE_BYTE (c);                                  \
2277     nref = c - 0x20;                                    \
2278     if (nref < 0 || nref >= 81)                         \
2279       goto invalid_code;                                \
2280     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2281   } while (0)
2282
2283
2284 /* Start of Emacs 21 style format.  On entry C holds the METHOD byte
2285    (0xF2 plus the composition method).  The next two bytes at SRC are
2286    BYTES (0xA0 plus the byte length of this composition information)
2287    and CHARS (0xA0 plus the number of characters composed).
        Control jumps to the label invalid_code when BYTES - 0xA0 is
        below 3, when the method is COMPOSITION_RELATIVE and
        BYTES - 0xA0 is not 4, or when CHARS - 0xA0 is not in the range
        1..MAX_COMPOSITION_COMPONENTS - 1.  Otherwise the fields of
        CMP_STATUS are initialized and an annotation header is added
        with ADD_COMPOSITION_DATA.  */
2288
2289 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2290   do {                                                                  \
2291     enum composition_method method = c - 0xF2;                          \
2292     int *charbuf_base = charbuf;                                        \
2293     int nbytes, nchars;                                                 \
2294                                                                         \
2295     ONE_MORE_BYTE (c);                                                  \
2296     if (c < 0)                                                          \
2297       goto invalid_code;                                                \
2298     nbytes = c - 0xA0;                                                  \
2299     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2300       goto invalid_code;                                                \
2301     ONE_MORE_BYTE (c);                                                  \
2302     nchars = c - 0xA0;                                                  \
2303     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2304       goto invalid_code;                                                \
2305     cmp_status->old_form = 0;                                           \
2306     cmp_status->method = method;                                        \
2307     if (method == COMPOSITION_RELATIVE)                                 \
2308       cmp_status->state = COMPOSING_CHAR;                               \
2309     else                                                                \
2310       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2311     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2312     cmp_status->nchars = nchars;                                        \
2313     cmp_status->ncomps = nbytes - 4;                                    \
2314     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2315   } while (0)
2316
2317
2318 /* Start of Emacs 20 style format for relative composition.  Mark
        CMP_STATUS as an old-form COMPOSITION_RELATIVE composition in
        state COMPOSING_CHAR with no characters or components counted
        yet, and add an annotation header whose NCHARS and NBYTES are
        both 0.  */
2319
2320 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2321   do {                                                          \
2322     cmp_status->old_form = 1;                                   \
2323     cmp_status->method = COMPOSITION_RELATIVE;                  \
2324     cmp_status->state = COMPOSING_CHAR;                         \
2325     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2326     cmp_status->nchars = cmp_status->ncomps = 0;                \
2327     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2328   } while (0)
2329
2330
2331 /* Start of Emacs 20 style format for rule-base composition.  Mark
        CMP_STATUS as an old-form COMPOSITION_WITH_RULE composition in
        state COMPOSING_CHAR with no characters or components counted
        yet, and add an annotation header whose NCHARS and NBYTES are
        both 0.  */
2332
2333 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2334   do {                                                          \
2335     cmp_status->old_form = 1;                                   \
2336     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2337     cmp_status->state = COMPOSING_CHAR;                         \
2338     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2339     cmp_status->nchars = cmp_status->ncomps = 0;                \
2340     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2341   } while (0)
2342
2343
     /* Read the byte that follows the composition starter into C and
        start the matching kind of composition:

          0xF2 + COMPOSITION_RELATIVE .. 0xF2 + COMPOSITION_WITH_RULE_ALTCHARS
            Emacs 21 style (DECODE_EMACS_MULE_21_COMPOSITION);
          0xA0 .. 0xBF
            Emacs 20 style relative composition; SRC is moved back so
            that this byte is read again as a component;
          0xFF
            Emacs 20 style rule-base composition.

        Control jumps to the label invalid_code for a negative C and for
        any other byte value.  */
2344 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2345   do {                                                  \
2346     const unsigned char *current_src = src;             \
2347                                                         \
2348     ONE_MORE_BYTE (c);                                  \
2349     if (c < 0)                                          \
2350       goto invalid_code;                                \
2351     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2352         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2353       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2354     else if (c < 0xA0)                                  \
2355       goto invalid_code;                                \
2356     else if (c < 0xC0)                                  \
2357       {                                                 \
2358         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2359         /* Re-read C as a composition component.  */    \
2360         src = current_src;                              \
2361       }                                                 \
2362     else if (c == 0xFF)                                 \
2363       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2364     else                                                \
2365       goto invalid_code;                                \
2366   } while (0)
2367
/* Close the composition being decoded.  Expands where `cmp_status'
   and `charbuf' are in scope.  IDX is minus the accumulated
   CMP_STATUS->length, so CHARBUF[IDX] addresses the entry that many
   slots before CHARBUF.  For an old-form composition the character
   count is stored at CHARBUF[IDX + 2]; otherwise, when the method is
   greater than COMPOSITION_RELATIVE, CHARBUF[IDX] is set to
   CHARBUF[IDX + 2] minus the length.  In every case the state is
   reset to COMPOSING_NO.  */
2368 #define EMACS_MULE_COMPOSITION_END()                            \
2369   do {                                                          \
2370     int idx = - cmp_status->length;                             \
2371                                                                 \
2372     if (cmp_status->old_form)                                   \
2373       charbuf[idx + 2] = cmp_status->nchars;                    \
2374     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2375       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2376     cmp_status->state = COMPOSING_NO;                           \
2377   } while (0)
2378
2379
/* Force-close the composition described by CMP_STATUS.  CHARBUF is
   the decoder's write pointer (that is what
   EMACS_MULE_MAYBE_FINISH_COMPOSITION passes); entries already
   written are patched at negative offsets from it.

   If the composition is old-form and already holds at least one
   character, it is kept: the character count is stored in its
   header, and a dangling rule is turned back into an eight-bit
   character.  Otherwise the header entries are overwritten with
   eight-bit characters built with BYTE8_TO_CHAR.

   CMP_STATUS->state is always reset to COMPOSING_NO.  The return
   value (0, 1 or 4) is what EMACS_MULE_MAYBE_FINISH_COMPOSITION adds
   to the decoder's `char_offset'.  */
2380 static int
2381 emacs_mule_finish_composition (charbuf, cmp_status)
2382      int *charbuf;
2383      struct composition_status *cmp_status;
2384 {
       /* Minus the accumulated length: CHARBUF[IDX] is that many entries
          before CHARBUF.  */
2385   int idx = - cmp_status->length;
2386   int new_chars;
2387
2388   if (cmp_status->old_form && cmp_status->nchars > 0)
2389     {
           /* Same bookkeeping as EMACS_MULE_COMPOSITION_END does for an
              old-form composition.  */
2390       charbuf[idx + 2] = cmp_status->nchars;
2391       new_chars = 0;
2392       if (cmp_status->method == COMPOSITION_WITH_RULE
2393           && cmp_status->state == COMPOSING_CHAR)
2394         {
2395           /* The last rule was invalid.  */
               /* Replace the two most recently written entries with one
                  eight-bit character made from the stored rule value
                  plus 0xA0, followed by -1.
                  NOTE(review): presumably 0xA0 undoes an offset removed
                  when the rule was decoded -- confirm against
                  DECODE_EMACS_MULE_COMPOSITION_RULE_20.  */
2396           int rule = charbuf[-1] + 0xA0;
2397
2398           charbuf[-2] = BYTE8_TO_CHAR (rule);
2399           charbuf[-1] = -1;
2400           new_chars = 1;
2401         }
2402     }
2403   else
2404     {
           /* Overwrite the entries starting at IDX.  0x80 is the byte
              that introduces a composition in the decoder below.  */
2405       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2406
2407       if (cmp_status->method == COMPOSITION_WITH_RULE)
2408         {
               /* 0xFF is the byte that selects the Emacs 20 rule-base
                  form in DECODE_EMACS_MULE_COMPOSITION_START.  */
2409           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2410           charbuf[idx++] = -3;
2411           charbuf[idx++] = 0;
2412           new_chars = 1;
2413         }
2414       else
2415         {
               /* Read both stored values before the slots holding them
                  are overwritten below.
                  NOTE(review): presumably these rebuild the original
                  method, byte-count and char-count bytes -- confirm
                  against DECODE_EMACS_MULE_21_COMPOSITION.  */
2416           int nchars = charbuf[idx + 1] + 0xA0;
2417           int nbytes = charbuf[idx + 2] + 0xA0;
2418
2419           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2420           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2421           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2422           charbuf[idx++] = -1;
2423           new_chars = 4;
2424         }
2425     }
2426   cmp_status->state = COMPOSING_NO;
2427   return new_chars;
2428 }
2429
/* If a composition is in progress (state is not COMPOSING_NO), close
   it with emacs_mule_finish_composition and add the value it returns
   to `char_offset'.  Expands where `cmp_status', `charbuf' and
   `char_offset' are in scope.  */
2430 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2431   do {                                                                    \
2432     if (cmp_status->state != COMPOSING_NO)                                \
2433       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2434   } while (0)
2435
2436
/* Decode the source bytes of CODING, from offset CODING->consumed up
   to CODING->src_bytes, into CODING->charbuf starting at index
   CODING->charbuf_used.

   Composition sequences, introduced by the byte 0x80, are tracked in
   CODING->spec.emacs_mule.cmp_status.  A composition still open when
   decoding stops is closed if CODING_MODE_LAST_BLOCK is set;
   otherwise its entries are saved in CMP_STATUS->carryover and put
   back into the buffer at the start of the next call.

   On exit CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.  */
2437 static void
2438 decode_coding_emacs_mule (coding)
2439      struct coding_system *coding;
2440 {
2441   const unsigned char *src = coding->source + coding->consumed;
2442   const unsigned char *src_end = coding->source + coding->src_bytes;
2443   const unsigned char *src_base;
2444   int *charbuf = coding->charbuf + coding->charbuf_used;
       /* Stop MAX_ANNOTATION_LENGTH entries short of the real end of the
          buffer.  */
2445   int *charbuf_end
2446     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2447   int consumed_chars = 0, consumed_chars_base;
2448   int multibytep = coding->src_multibyte;
2449   Lisp_Object attrs, charset_list;
2450   int char_offset = coding->produced_char;
       /* LAST_ID is the charset of the current run of characters and
          LAST_OFFSET the value of CHAR_OFFSET where that run began.  */
2451   int last_offset = char_offset;
2452   int last_id = charset_ascii;
2453   int eol_crlf =
2454     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_CRLF is set, or -1.  */
2455   int byte_after_cr = -1;
2456   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2457
2458   CODING_GET_INFO (coding, attrs, charset_list);
2459
       /* A composition was left open by the previous call: put the
          carried-over entries back at the front of the output.  */
2460   if (cmp_status->state != COMPOSING_NO)
2461     {
2462       int i;
2463
2464       for (i = 0; i < cmp_status->length; i++)
2465         *charbuf++ = cmp_status->carryover[i];
2466       coding->annotated = 1;
2467     }
2468
2469   while (1)
2470     {
2471       int c, id;
2472
           /* Remember where this iteration starts so that it can be
              undone or reported as the consumed position.  */
2473       src_base = src;
2474       consumed_chars_base = consumed_chars;
2475
2476       if (charbuf >= charbuf_end)
2477         {
               /* Output is full.  A byte read ahead after CR has not been
                  processed yet, so step back over it.  */
2478           if (byte_after_cr >= 0)
2479             src_base--;
2480           break;
2481         }
2482
2483       if (byte_after_cr >= 0)
2484         c = byte_after_cr, byte_after_cr = -1;
2485       else
2486         ONE_MORE_BYTE (c);
2487
           /* A negative C or the composition introducer 0x80 closes any
              composition in progress.  A negative C is stored negated;
              0x80 starts a new composition.
              NOTE(review): ONE_MORE_BYTE is defined elsewhere;
              presumably a negative C stands for a raw byte of a
              multibyte source -- confirm.  */
2488       if (c < 0 || c == 0x80)
2489         {
2490           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2491           if (c < 0)
2492             {
2493               *charbuf++ = -c;
2494               char_offset++;
2495             }
2496           else
2497             DECODE_EMACS_MULE_COMPOSITION_START ();
2498           continue;
2499         }
2500
2501       if (c < 0x80)
2502         {
               /* ASCII.  With DOS end-of-line decoding, read the byte
                  after a CR ahead into BYTE_AFTER_CR.  */
2503           if (eol_crlf && c == '\r')
2504             ONE_MORE_BYTE (byte_after_cr);
2505           id = charset_ascii;
2506           if (cmp_status->state != COMPOSING_NO)
2507             {
                   /* An ASCII character closes an old-form composition;
                      in the component states of the new form it counts
                      as one component.  */
2508               if (cmp_status->old_form)
2509                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2510               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2511                 cmp_status->ncomps--;
2512             }
2513         }
2514       else
2515         {
2516           int nchars, nbytes;
2517
               /* Multibyte sequence starting at SRC_BASE.  A result of -1
                  is treated as invalid input, -2 stops decoding, and any
                  other negative value falls through to the state
                  handling below.  */
2518           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2519                                cmp_status);
2520           if (c < 0)
2521             {
2522               if (c == -1)
2523                 goto invalid_code;
2524               if (c == -2)
2525                 break;
2526             }
2527           src = src_base + nbytes;
2528           consumed_chars = consumed_chars_base + nchars;
2529           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2530             cmp_status->ncomps -= nchars;
2531         }
2532
2533       /* Now if C >= 0, we found a normally encoded character, if C <
2534          0, we found an old-style composition component character or
2535          rule.  */
2536
2537       if (cmp_status->state == COMPOSING_NO)
2538         {
               /* Not composing.  When the charset changes, annotate the
                  run that just ended unless it was ASCII, then start a
                  new run.  */
2539           if (last_id != id)
2540             {
2541               if (last_id != charset_ascii)
2542                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2543                                   last_id);
2544               last_id = id;
2545               last_offset = char_offset;
2546             }
2547           *charbuf++ = c;
2548           char_offset++;
2549         }
2550       else if (cmp_status->state == COMPOSING_CHAR)
2551         {
2552           if (cmp_status->old_form)
2553             {
2554               if (c >= 0)
2555                 {
                       /* A normally encoded character ends an old-form
                          composition and is emitted as itself.  */
2556                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2557                   *charbuf++ = c;
2558                   char_offset++;
2559                 }
2560               else
2561                 {
                       /* Old-style component character: store it and
                          count it.  The composition ends once
                          MAX_COMPOSITION_COMPONENTS characters have been
                          collected; with COMPOSITION_WITH_RULE a rule is
                          expected next.  */
2562                   *charbuf++ = -c;
2563                   cmp_status->nchars++;
2564                   cmp_status->length++;
2565                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2566                     EMACS_MULE_COMPOSITION_END ();
2567                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2568                     cmp_status->state = COMPOSING_RULE;
2569                 }
2570             }
2571           else
2572             {
                   /* New form: NCHARS counts down the characters still
                      expected; the composition ends when it reaches
                      zero.  */
2573               *charbuf++ = c;
2574               cmp_status->length++;
2575               cmp_status->nchars--;
2576               if (cmp_status->nchars == 0)
2577                 EMACS_MULE_COMPOSITION_END ();
2578             }
2579         }
2580       else if (cmp_status->state == COMPOSING_RULE)
2581         {
2582           int rule;
2583
2584           if (c >= 0)
2585             {
                   /* A normal character where a rule was expected ends
                      the composition.  */
2586               EMACS_MULE_COMPOSITION_END ();
2587               *charbuf++ = c;
2588               char_offset++;
2589             }
2590           else
2591             {
                   /* Decode the rule and store it as the pair -2, RULE;
                      a component character is expected next.  */
2592               c = -c;
2593               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2594               if (rule < 0)
2595                 goto invalid_code;
2596               *charbuf++ = -2;
2597               *charbuf++ = rule;
2598               cmp_status->length += 2;
2599               cmp_status->state = COMPOSING_CHAR;
2600             }
2601         }
2602       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2603         {
               /* NCOMPS was already decremented above.  Zero means all
                  components were read and the composed characters
                  follow; a negative value closes the composition.  */
2604           *charbuf++ = c;
2605           cmp_status->length++;
2606           if (cmp_status->ncomps == 0)
2607             cmp_status->state = COMPOSING_CHAR;
2608           else if (cmp_status->ncomps > 0)
2609             {
2610               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2611                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2612             }
2613           else
2614             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2615         }
2616       else                      /* COMPOSING_COMPONENT_RULE */
2617         {
2618           int rule;
2619
               /* Decode the rule, store it as the pair -2, RULE and count
                  it as one component.  */
2620           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2621           if (rule < 0)
2622             goto invalid_code;
2623           *charbuf++ = -2;
2624           *charbuf++ = rule;
2625           cmp_status->length += 2;
2626           cmp_status->ncomps--;
2627           if (cmp_status->ncomps > 0)
2628             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2629           else
2630             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2631         }
2632       continue;
2633
           /* Rewind to the start of this iteration and try again.
              NOTE(review): no `goto retry' is visible in this function;
              presumably a macro defined elsewhere jumps here --
              confirm.  */
2634     retry:
2635       src = src_base;
2636       consumed_chars = consumed_chars_base;
2637       continue;
2638
           /* Invalid sequence: close any open composition, rewind to the
              start of this iteration, and emit just its first byte, as
              ASCII or as an eight-bit character.  Each such byte counts
              as one error.  */
2639     invalid_code:
2640       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2641       src = src_base;
2642       consumed_chars = consumed_chars_base;
2643       ONE_MORE_BYTE (c);
2644       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2645       char_offset++;
2646       coding->errors++;
2647     }
2648
       /* NOTE(review): nothing visible in this function jumps to this
          label; presumably ONE_MORE_BYTE does when the source is
          exhausted -- confirm.  */
2649  no_more_source:
2650   if (cmp_status->state != COMPOSING_NO)
2651     {
2652       if (coding->mode & CODING_MODE_LAST_BLOCK)
2653         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2654       else
2655         {
2656           int i;
2657
               /* Take the open composition's entries back out of the
                  output and save them for the next call.  */
2658           charbuf -= cmp_status->length;
2659           for (i = 0; i < cmp_status->length; i++)
2660             cmp_status->carryover[i] = charbuf[i];
2661         }
2662     }
2663   if (last_id != charset_ascii)
2664     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Report progress up to the start of the iteration that ended the
          loop (the *_BASE values), not up to SRC.  */
2665   coding->consumed_char += consumed_chars_base;
2666   coding->consumed = src_base - coding->source;
2667   coding->charbuf_used = charbuf - coding->charbuf;
2668 }
2669
2670
/* Store in CODES the leading byte(s) emitted before the code bytes
   of a character whose emacs-mule charset id is ID.  CODES must name
   an array of at least two bytes.  An id below 0xA0 is itself the
   only leading byte, and CODES[1] is set to 0; a larger id goes into
   CODES[1], with CODES[0] set to 0x9A, 0x9B, 0x9C or 0x9D according
   to the range the id falls in.

   Both parameters are parenthesized in the expansion, so any
   expression may be passed, but ID is evaluated more than once and
   must be free of side effects.  The expansion has no trailing
   semicolon, so a call followed by `;' is exactly one statement and
   can be the body of an `if' that has an `else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2684
2685
/* Encode the CODING->charbuf_used entries of CODING->charbuf in
   emacs-mule format and write the bytes to CODING->destination
   starting at offset CODING->produced.

   A negative entry starts an annotation that is consumed without
   producing output; a charset annotation may set a preferred
   charset for the characters that follow.  ASCII characters and
   eight-bit characters are emitted as single bytes.  Any other
   character is emitted as one or two leading bytes (see
   EMACS_MULE_LEADING_CODES) followed by one or two code bytes with
   the high bit set.

   Records CODING_RESULT_SUCCESS, updates CODING->produced_char and
   CODING->produced, and returns 0.  */
2686 static int
2687 encode_coding_emacs_mule (coding)
2688      struct coding_system *coding;
2689 {
2690   int multibytep = coding->dst_multibyte;
2691   int *charbuf = coding->charbuf;
2692   int *charbuf_end = charbuf + coding->charbuf_used;
2693   unsigned char *dst = coding->destination + coding->produced;
2694   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each entry is handled.  */
2695   int safe_room = 8;
2696   int produced_chars = 0;
2697   Lisp_Object attrs, charset_list;
2698   int c;
2699   int preferred_charset_id = -1;
2700
2701   CODING_GET_INFO (coding, attrs, charset_list);
       /* Always encode with Vemacs_mule_charset_list, storing it in the
          coding system's attributes if it is not there already.  */
2702   if (! EQ (charset_list, Vemacs_mule_charset_list))
2703     {
2704       CODING_ATTR_CHARSET_LIST (attrs)
2705         = charset_list = Vemacs_mule_charset_list;
2706     }
2707
2708   while (charbuf < charbuf_end)
2709     {
2710       ASSURE_DESTINATION (safe_room);
2711       c = *charbuf++;
2712
2713       if (c < 0)
2714         {
2715           /* Handle an annotation.  */
               /* -C is the number of entries the annotation occupies,
                  counting the entry just read; the rest is skipped
                  below.  *CHARBUF holds the annotation type.  */
2716           switch (*charbuf)
2717             {
2718             case CODING_ANNOTATE_COMPOSITION_MASK:
2719               /* Not yet implemented.  */
2720               break;
2721             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Prefer the annotated charset only if it is a member
                      of CHARSET_LIST.
                      NOTE(review): the id is read from CHARBUF[3] --
                      confirm this index against the layout written by
                      ADD_CHARSET_DATA, which is defined elsewhere.  */
2722               preferred_charset_id = charbuf[3];
2723               if (preferred_charset_id >= 0
2724                   && NILP (Fmemq (make_number (preferred_charset_id),
2725                                   charset_list)))
2726                 preferred_charset_id = -1;
2727               break;
2728             default:
2729               abort ();
2730             }
2731           charbuf += -c - 1;
2732           continue;
2733         }
2734
2735       if (ASCII_CHAR_P (c))
2736         EMIT_ONE_ASCII_BYTE (c);
2737       else if (CHAR_BYTE8_P (c))
2738         {
2739           c = CHAR_TO_BYTE8 (c);
2740           EMIT_ONE_BYTE (c);
2741         }
2742       else
2743         {
2744           struct charset *charset;
2745           unsigned code;
2746           int dimension;
2747           int emacs_mule_id;
2748           unsigned char leading_codes[2];
2749
               /* Use the preferred charset when it contains C; otherwise
                  look for a charset in CHARSET_LIST.  */
2750           if (preferred_charset_id >= 0)
2751             {
2752               charset = CHARSET_FROM_ID (preferred_charset_id);
2753               if (CHAR_CHARSET_P (c, charset))
2754                 code = ENCODE_CHAR (charset, c);
2755               else
2756                 charset = char_charset (c, charset_list, &code);
2757             }
2758           else
2759             charset = char_charset (c, charset_list, &code);
2760           if (! charset)
2761             {
                   /* No charset found for C: substitute
                      CODING->default_char.
                      NOTE(review): the result of the second lookup is
                      used below without a null check; this assumes a
                      non-ASCII default character always has a charset
                      in CHARSET_LIST -- confirm.  */
2762               c = coding->default_char;
2763               if (ASCII_CHAR_P (c))
2764                 {
2765                   EMIT_ONE_ASCII_BYTE (c);
2766                   continue;
2767                 }
2768               charset = char_charset (c, charset_list, &code);
2769             }
2770           dimension = CHARSET_DIMENSION (charset);
2771           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2772           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2773           EMIT_ONE_BYTE (leading_codes[0]);
               /* A zero second leading code means there is none.  */
2774           if (leading_codes[1])
2775             EMIT_ONE_BYTE (leading_codes[1]);
               /* Emit the code bytes with the high bit set: one byte for
                  a charset of dimension 1, otherwise two.  */
2776           if (dimension == 1)
2777             EMIT_ONE_BYTE (code | 0x80);
2778           else
2779             {
2780               code |= 0x8080;
2781               EMIT_ONE_BYTE (code >> 8);
2782               EMIT_ONE_BYTE (code & 0xFF);
2783             }
2784         }
2785     }
2786   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2787   coding->produced_char += produced_chars;
2788   coding->produced = dst - coding->destination;
2789   return 0;
2790 }
2791
2792 \f
2793 /*** 7. ISO2022 handlers ***/
2794
2795 /* The following note describes the coding system ISO2022 briefly.
2796    Since the intention of this note is to help understand the
2797    functions in this file, some parts are NOT ACCURATE or are OVERLY
2798    SIMPLIFIED.  For thorough understanding, please refer to the
2799    original document of ISO2022.  This is equivalent to the standard
2800    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2801
2802    ISO2022 provides many mechanisms to encode several character sets
2803    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2804    is encoded using bytes less than 128.  This may make the encoded
2805    text a little bit longer, but the text passes more easily through
2806    several types of gateway, some of which strip off the MSB (Most
2807    Significant Bit).
2808
2809    There are two kinds of character sets: control character sets and
2810    graphic character sets.  The former contain control characters such
2811    as `newline' and `escape' to provide control functions (control
2812    functions are also provided by escape sequences).  The latter
2813    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2814    two control character sets and many graphic character sets.
2815
2816    Graphic character sets are classified into one of the following
2817    four classes, according to the number of bytes (DIMENSION) and
2818    number of characters in one dimension (CHARS) of the set:
2819    - DIMENSION1_CHARS94
2820    - DIMENSION1_CHARS96
2821    - DIMENSION2_CHARS94
2822    - DIMENSION2_CHARS96
2823
2824    In addition, each character set is assigned an identification tag,
2825    unique for each set, called the "final character" (denoted as <F>
2826    hereafter).  The <F> of each character set is decided by ECMA(*)
2827    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2828    (0x30..0x3F are for private use only).
2829
2830    Note (*): ECMA = European Computer Manufacturers Association
2831
2832    Here are examples of graphic character sets [NAME(<F>)]:
2833         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2834         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2835         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2836         o DIMENSION2_CHARS96 -- none for the moment
2837
2838    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2839         C0 [0x00..0x1F] -- control character plane 0
2840         GL [0x20..0x7F] -- graphic character plane 0
2841         C1 [0x80..0x9F] -- control character plane 1
2842         GR [0xA0..0xFF] -- graphic character plane 1
2843
2844    A control character set is directly designated and invoked to C0 or
2845    C1 by an escape sequence.  The most common case is that:
2846    - ISO646's  control character set is designated/invoked to C0, and
2847    - ISO6429's control character set is designated/invoked to C1,
2848    and usually these designations/invocations are omitted in encoded
2849    text.  In a 7-bit environment, only C0 can be used, and a control
2850    character for C1 is encoded by an appropriate escape sequence to
2851    fit into the environment.  All control characters for C1 are
2852    defined to have corresponding escape sequences.
2853
2854    A graphic character set is at first designated to one of four
2855    graphic registers (G0 through G3), then these graphic registers are
2856    invoked to GL or GR.  These designations and invocations can be
2857    done independently.  The most common case is that G0 is invoked to
2858    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2859    these invocations and designations are omitted in encoded text.
2860    In a 7-bit environment, only GL can be used.
2861
2862    When a graphic character set of CHARS94 is invoked to GL, codes
2863    0x20 and 0x7F of the GL area work as control characters SPACE and
2864    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2865    be used.
2866
2867    There are two ways of invocation: locking-shift and single-shift.
2868    With locking-shift, the invocation lasts until the next different
2869    invocation, whereas with single-shift, the invocation affects the
2870    following character only and doesn't affect the locking-shift
2871    state.  Invocations are done by the following control characters or
2872    escape sequences:
2873
2874    ----------------------------------------------------------------------
2875    abbrev  function                  cntrl escape seq   description
2876    ----------------------------------------------------------------------
2877    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2878    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2879    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2880    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2881    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2882    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2883    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2884    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2885    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2886    ----------------------------------------------------------------------
2887    (*) These are not used by any known coding system.
2888
2889    Control characters for these functions are defined by macros
2890    ISO_CODE_XXX in `coding.h'.
2891
2892    Designations are done by the following escape sequences:
2893    ----------------------------------------------------------------------
2894    escape sequence      description
2895    ----------------------------------------------------------------------
2896    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2897    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2898    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2899    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2900    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2901    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2902    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2903    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2904    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2905    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2906    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2907    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2908    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2909    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2910    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2911    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2912    ----------------------------------------------------------------------
2913
2914    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2915    of dimension 1, chars 94, and final character <F>, etc...
2916
2917    Note (*): Although these designations are not allowed in ISO2022,
2918    Emacs accepts them on decoding, and produces them on encoding
2919    CHARS96 character sets in a coding system which is characterized as
2920    7-bit environment, non-locking-shift, and non-single-shift.
2921
2922    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2923    '(' must be omitted.  We refer to this as "short-form" hereafter.
2924
2925    Now you may notice that there are a lot of ways of encoding the
2926    same multilingual text in ISO2022.  Actually, there exist many
2927    coding systems such as Compound Text (used in X11's inter client
2928    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2929    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2930    localized platforms), and all of these are variants of ISO2022.
2931
2932    In addition to the above, Emacs handles two more kinds of escape
2933    sequences: ISO6429's direction specification and Emacs' private
2934    sequence for specifying character composition.
2935
2936    ISO6429's direction specification takes the following form:
2937         o CSI ']'      -- end of the current direction
2938         o CSI '0' ']'  -- end of the current direction
2939         o CSI '1' ']'  -- start of left-to-right text
2940         o CSI '2' ']'  -- start of right-to-left text
2941    The control character CSI (0x9B: control sequence introducer) is
2942    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2943
2944    Character composition specification takes the following form:
2945         o ESC '0' -- start relative composition
2946         o ESC '1' -- end composition
2947         o ESC '2' -- start rule-base composition (*)
2948         o ESC '3' -- start relative composition with alternate chars  (**)
2949         o ESC '4' -- start rule-base composition with alternate chars  (**)
2950   Since these are not standard escape sequences of any ISO standard,
2951   the use of them with these meanings is restricted to Emacs only.
2952
2953   (*) This form is used only in Emacs 20.7 and older versions,
2954   but newer versions can safely decode it.
2955   (**) This form is used only in Emacs 21.1 and newer versions,
2956   and older versions can't decode it.
2957
2958   Here's a list of example usages of these composition escape
2959   sequences (categorized by `enum composition_method').
2960
2961   COMPOSITION_RELATIVE:
2962         ESC 0 CHAR [ CHAR ] ESC 1
2963   COMPOSITION_WITH_RULE:
2964         ESC 2 CHAR [ RULE CHAR ] ESC 1
2965   COMPOSITION_WITH_ALTCHARS:
2966         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2967   COMPOSITION_WITH_RULE_ALTCHARS:
2968         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2969
/* Table of 256 `enum iso_code_class_type' values.
   NOTE(review): presumably indexed by byte value; the code that
   fills and reads it is outside this part of the file -- confirm.  */
2970 enum iso_code_class_type iso_code_class[256];
2971
/* Nonzero if charset ID is within CODING's safe_charsets table (ID
   is not greater than CODING->max_charset_id) and its entry there is
   not 255.  setup_iso_safe_charsets fills that table with 255 before
   assigning entries, so 255 marks a charset with no entry.  ID is
   evaluated twice.  */
2972 #define SAFE_CHARSET_P(coding, id)      \
2973   ((id) <= (coding)->max_charset_id     \
2974    && (coding)->safe_charsets[id] != 255)
2975
2976
/* Nonzero if CODING_ISO_INITIAL with index 1 is non-negative for the
   coding system stored under CATEGORY in `coding_categories'.
   NOTE(review): presumably this means a charset is initially
   designated to G1, which makes shift-out usable -- confirm against
   the definition of CODING_ISO_INITIAL.  */
2977 #define SHIFT_OUT_OK(category)  \
2978   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2979
/* Compute the safe-charsets string of the ISO-2022 coding system
   whose attribute vector is ATTRS and store it in the
   coding_attr_safe_charsets slot of ATTRS.

   The string has one byte per charset id, from 0 to the largest id
   in the coding system's charset list.  The byte is 255 for a
   charset that has no entry; otherwise it is the value found for
   that charset in the coding_attr_iso_request alist or, failing
   that, the default taken from coding_attr_iso_usage for 94- or
   96-character sets.

   Nothing is recomputed if the slot already holds a string, unless
   the charset list had to be replaced first.  */
2980 static void
2981 setup_iso_safe_charsets (attrs)
2982      Lisp_Object attrs;
2983 {
2984   Lisp_Object charset_list, safe_charsets;
2985   Lisp_Object request;
2986   Lisp_Object reg_usage;
2987   Lisp_Object tail;
2988   int reg94, reg96;
2989   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2990   int max_charset_id;
2991
2992   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
       /* A coding system flagged CODING_ISO_FLAG_FULL_SUPPORT must use
          Viso_2022_charset_list.  If the list has to be replaced, clear
          the stored safe charsets so that they are rebuilt below.  */
2993   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2994       && ! EQ (charset_list, Viso_2022_charset_list))
2995     {
2996       CODING_ATTR_CHARSET_LIST (attrs)
2997         = charset_list = Viso_2022_charset_list;
2998       ASET (attrs, coding_attr_safe_charsets, Qnil);
2999     }
3000
       /* Already computed.  */
3001   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
3002     return;
3003
       /* The largest charset id in the list determines the length of
          the string.  */
3004   max_charset_id = 0;
3005   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3006     {
3007       int id = XINT (XCAR (tail));
3008       if (max_charset_id < id)
3009         max_charset_id = id;
3010     }
3011
       /* Start with every entry set to 255, the value SAFE_CHARSET_P
          treats as "not safe".  */
3012   safe_charsets = make_uninit_string (max_charset_id + 1);
3013   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
3014   request = AREF (attrs, coding_attr_iso_request);
       /* The usage attribute is a cons: its car is the default for
          94-character sets and its cdr the default for 96-character
          sets.  */
3015   reg_usage = AREF (attrs, coding_attr_iso_usage);
3016   reg94 = XINT (XCAR (reg_usage));
3017   reg96 = XINT (XCDR (reg_usage));
3018
3019   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3020     {
3021       Lisp_Object id;
3022       Lisp_Object reg;
3023       struct charset *charset;
3024
3025       id = XCAR (tail);
3026       charset = CHARSET_FROM_ID (XINT (id));
           /* A value requested for this charset wins.  Otherwise use the
              default for its class, but only when that default is less
              than 4.
              NOTE(review): presumably these values are graphic register
              numbers G0..G3 -- confirm.  */
3027       reg = Fcdr (Fassq (id, request));
3028       if (! NILP (reg))
3029         SSET (safe_charsets, XINT (id), XINT (reg));
3030       else if (charset->iso_chars_96)
3031         {
3032           if (reg96 < 4)
3033             SSET (safe_charsets, XINT (id), reg96);
3034         }
3035       else
3036         {
3037           if (reg94 < 4)
3038             SSET (safe_charsets, XINT (id), reg94);
3039         }
3040     }
3041   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3042 }
3043
3044
3045 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3046    Check if a text is encoded in one of ISO-2022 based coding systems.
3047    If it is, return 1, else return 0.  */
3048
3049 static int
3050 detect_coding_iso_2022 (coding, detect_info)
3051      struct coding_system *coding;
3052      struct coding_detection_info *detect_info;
3053 {
3054   const unsigned char *src = coding->source, *src_base = src;
3055   const unsigned char *src_end = coding->source + coding->src_bytes;
3056   int multibytep = coding->src_multibyte;
3057   int single_shifting = 0;
3058   int id;
3059   int c, c1;
3060   int consumed_chars = 0;
3061   int i;
3062   int rejected = 0;
3063   int found = 0;
3064   int composition_count = -1;
3065
3066   detect_info->checked |= CATEGORY_MASK_ISO;
3067
3068   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3069     {
3070       struct coding_system *this = &(coding_categories[i]);
3071       Lisp_Object attrs, val;
3072
3073       if (this->id < 0)
3074         continue;
3075       attrs = CODING_ID_ATTRS (this->id);
3076       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3077           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3078         setup_iso_safe_charsets (attrs);
3079       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3080       this->max_charset_id = SCHARS (val) - 1;
3081       this->safe_charsets = SDATA (val);
3082     }
3083
3084   /* A coding system of this category is always ASCII compatible.  */
3085   src += coding->head_ascii;
3086
3087   while (rejected != CATEGORY_MASK_ISO)
3088     {
3089       src_base = src;
3090       ONE_MORE_BYTE (c);
3091       switch (c)
3092         {
3093         case ISO_CODE_ESC:
3094           if (inhibit_iso_escape_detection)
3095             break;
3096           single_shifting = 0;
3097           ONE_MORE_BYTE (c);
3098           if (c >= '(' && c <= '/')
3099             {
3100               /* Designation sequence for a charset of dimension 1.  */
3101               ONE_MORE_BYTE (c1);
3102               if (c1 < ' ' || c1 >= 0x80
3103                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3104                 /* Invalid designation sequence.  Just ignore.  */
3105                 break;
3106             }
3107           else if (c == '$')
3108             {
3109               /* Designation sequence for a charset of dimension 2.  */
3110               ONE_MORE_BYTE (c);
3111               if (c >= '@' && c <= 'B')
3112                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3113                 id = iso_charset_table[1][0][c];
3114               else if (c >= '(' && c <= '/')
3115                 {
3116                   ONE_MORE_BYTE (c1);
3117                   if (c1 < ' ' || c1 >= 0x80
3118                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3119                     /* Invalid designation sequence.  Just ignore.  */
3120                     break;
3121                 }
3122               else
3123                 /* Invalid designation sequence.  Just ignore it.  */
3124                 break;
3125             }
3126           else if (c == 'N' || c == 'O')
3127             {
3128               /* ESC <Fe> for SS2 or SS3.  */
3129               single_shifting = 1;
3130               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3131               break;
3132             }
3133           else if (c == '1')
3134             {
3135               /* End of composition.  */
3136               if (composition_count < 0
3137                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3138                 /* Invalid */
3139                 break;
3140               composition_count = -1;
3141               found |= CATEGORY_MASK_ISO;
3142             }
3143           else if (c >= '0' && c <= '4')
3144             {
3145               /* ESC <Fp> for start/end composition.  */
3146               composition_count = 0;
3147               break;
3148             }
3149           else
3150             {
3151               /* Invalid escape sequence.  Just ignore it.  */
3152               break;
3153             }
3154
3155           /* We found a valid designation sequence for CHARSET.  */
3156           rejected |= CATEGORY_MASK_ISO_8BIT;
3157           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3158                               id))
3159             found |= CATEGORY_MASK_ISO_7;
3160           else
3161             rejected |= CATEGORY_MASK_ISO_7;
3162           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3163                               id))
3164             found |= CATEGORY_MASK_ISO_7_TIGHT;
3165           else
3166             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3167           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3168                               id))
3169             found |= CATEGORY_MASK_ISO_7_ELSE;
3170           else
3171             rejected |= CATEGORY_MASK_ISO_7_ELSE;
3172           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3173                               id))
3174             found |= CATEGORY_MASK_ISO_8_ELSE;
3175           else
3176             rejected |= CATEGORY_MASK_ISO_8_ELSE;
3177           break;
3178
3179         case ISO_CODE_SO:
3180         case ISO_CODE_SI:
3181           /* Locking shift out/in.  */
3182           if (inhibit_iso_escape_detection)
3183             break;
3184           single_shifting = 0;
3185           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3186           break;
3187
3188         case ISO_CODE_CSI:
3189           /* Control sequence introducer.  */
3190           single_shifting = 0;
3191           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3192           found |= CATEGORY_MASK_ISO_8_ELSE;
3193           goto check_extra_latin;
3194
3195         case ISO_CODE_SS2:
3196         case ISO_CODE_SS3:
3197           /* Single shift.   */
3198           if (inhibit_iso_escape_detection)
3199             break;
3200           single_shifting = 0;
3201           rejected |= CATEGORY_MASK_ISO_7BIT;
3202           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3203               & CODING_ISO_FLAG_SINGLE_SHIFT)
3204             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3205           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3206               & CODING_ISO_FLAG_SINGLE_SHIFT)
3207             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3208           if (single_shifting)
3209             break;
3210           goto check_extra_latin;
3211
3212         default:
3213           if (c < 0)
3214             continue;
3215           if (c < 0x80)
3216             {
3217               if (composition_count >= 0)
3218                 composition_count++;
3219               single_shifting = 0;
3220               break;
3221             }
3222           if (c >= 0xA0)
3223             {
3224               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3225               found |= CATEGORY_MASK_ISO_8_1;
3226               /* Check the length of succeeding codes of the range
3227                  0xA0..0FF.  If the byte length is even, we include
3228                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3229                  only when we are not single shifting.  */
3230               if (! single_shifting
3231                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3232                 {
3233                   int i = 1;
3234                   while (src < src_end)
3235                     {
3236                       ONE_MORE_BYTE (c);
3237                       if (c < 0xA0)
3238                         break;
3239                       i++;
3240                     }
3241
3242                   if (i & 1 && src < src_end)
3243                     {
3244                       rejected |= CATEGORY_MASK_ISO_8_2;
3245                       if (composition_count >= 0)
3246                         composition_count += i;
3247                     }
3248                   else
3249                     {
3250                       found |= CATEGORY_MASK_ISO_8_2;
3251                       if (composition_count >= 0)
3252                         composition_count += i / 2;
3253                     }
3254                 }
3255               break;
3256             }
3257         check_extra_latin:
3258           single_shifting = 0;
3259           if (! VECTORP (Vlatin_extra_code_table)
3260               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3261             {
3262               rejected = CATEGORY_MASK_ISO;
3263               break;
3264             }
3265           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3266               & CODING_ISO_FLAG_LATIN_EXTRA)
3267             found |= CATEGORY_MASK_ISO_8_1;
3268           else
3269             rejected |= CATEGORY_MASK_ISO_8_1;
3270           rejected |= CATEGORY_MASK_ISO_8_2;
3271         }
3272     }
3273   detect_info->rejected |= CATEGORY_MASK_ISO;
3274   return 0;
3275
3276  no_more_source:
3277   detect_info->rejected |= rejected;
3278   detect_info->found |= (found & ~rejected);
3279   return 1;
3280 }
3281
3282
3283 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3284    escape sequence should be kept.  Uses the caller's `coding'.  */
3285 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3286   do {                                                                  \
3287     int id, prev;                                                       \
3288     /* Unknown or unsafe charset: mark REG invalid (-2).  */            \
3289     if (final < '0' || final >= 128                                     \
3290         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3291         || !SAFE_CHARSET_P (coding, id))                                \
3292       {                                                                 \
3293         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3294         chars_96 = -1;                                                  \
3295         break;  /* Skip the rest of the macro.  */                      \
3296       }                                                                 \
3297     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3298     if (id == charset_jisx0201_roman)                                   \
3299       {                                                                 \
3300         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3301           id = charset_ascii;                                           \
3302       }                                                                 \
3303     else if (id == charset_jisx0208_1978)                               \
3304       {                                                                 \
3305         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3306           id = charset_jisx0208;                                        \
3307       }                                                                 \
3308     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3309     /* If there was an invalid designation to REG previously, and this  \
3310        designation is ASCII to REG, we should keep this designation     \
3311        sequence.  */                                                    \
3312     if (prev == -2 && id == charset_ascii)                              \
3313       chars_96 = -1;                                                    \
3314   } while (0)
3315
3316
3317 /* Handle these composition sequences (ALT: alternate char):
3318
3319    (1) relative composition: ESC 0 CHAR ... ESC 1
3320    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3321    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3322    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3323
3324    When the start sequence (ESC 0/2/3/4) is found, this annotation
3325    header is produced.
3326
3327         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3328
3329    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3330    produced until the end sequence (ESC 1) is found:
3331
3332    (1) CHAR ... CHAR
3333    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3334    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3335    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3336
3337    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3338    annotation header are updated as below:
3339
3340    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3341    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3342    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3343    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3344
3345    If an error is found while composing, the annotation header is
3346    changed to:
3347
3348         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3349
3350    and the sequence [ -2 DECODED-RULE ] is changed to the original
3351    byte sequence as below:
3352         o the original byte sequence is B: [ B -1 ]
3353         o the original byte sequence is B1 B2: [ B1 B2 ]
3354    and the sequence [ -1 -1 ] is changed to the original byte
3355    sequence:
3356         [ ESC '0' ]
3357 */
3358
3359 /* Decode a composition rule C1 (the caller's `c1') and maybe one more
3360    byte from the source, and set RULE to the encoded composition rule,
3361    NBYTES to the length of the composition rule.  If the rule is invalid,
3362    set RULE to some negative value (NBYTES is not set when C1 < 32).  */
3363
3364 #define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
3365   do {                                                                  \
3366     rule = c1 - 32;                                                     \
3367     if (rule < 0)                                                       \
3368       break;  /* C1 < 32: RULE stays negative.  */                      \
3369     if (rule < 81)              /* old format (before ver.21) */        \
3370       {                                                                 \
3371         int gref = (rule) / 9;                                          \
3372         int nref = (rule) % 9;                                          \
3373         if (gref == 4) gref = 10;                                       \
3374         if (nref == 4) nref = 10;                                       \
3375         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3376         nbytes = 1;                                                     \
3377       }                                                                 \
3378     else                        /* new format (after ver.21) */         \
3379       {                                                                 \
3380         int c;  /* Second byte of the rule.  */                         \
3381                                                                         \
3382         ONE_MORE_BYTE (c);                                              \
3383         rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32);             \
3384         if (rule >= 0)                                                  \
3385           rule += 0x100;   /* to distinguish it from the old format */  \
3386         nbytes = 2;                                                     \
3387       }                                                                 \
3388   } while (0)
3389 /* Write at charbuf[idx] the original byte(s) of encoded composition rule RULE.  */
3390 #define ENCODE_COMPOSITION_RULE(rule)                           \
3391   do {                                                          \
3392     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3393     /* Uses the caller's charbuf, idx and new_chars.  */        \
3394     if (rule < 0x100)           /* old format */                \
3395       {                                                         \
3396         if (gref == 10) gref = 4;                               \
3397         if (nref == 10) nref = 4;                               \
3398         charbuf[idx] = 32 + gref * 9 + nref;                    \
3399         charbuf[idx + 1] = -1;  /* Yields [ B -1 ].  */         \
3400         new_chars++;                                            \
3401       }                                                         \
3402     else                                /* new format */        \
3403       {                                                         \
3404         charbuf[idx] = 32 + 81 + gref;                          \
3405         charbuf[idx + 1] = 32 + nref;  /* Yields [ B1 B2 ].  */ \
3406         new_chars += 2;                                         \
3407       }                                                         \
3408   } while (0)
3409
3410 /* Finish the current composition as invalid.  */
3411 /* Rewrites the annotation header and the -2/-1 markers before CHARBUF.  */
3412 static int finish_composition P_ ((int *, struct composition_status *));
3413 /* Returns nchars plus the codes restored for rules and the ESC 0 marker.  */
3414 static int
3415 finish_composition (charbuf, cmp_status)
3416      int *charbuf;
3417      struct composition_status *cmp_status;
3418 {
3419   int idx = - cmp_status->length;  /* Negative index of the header slot.  */
3420   int new_chars;
3421
3422   /* Recover the original ESC sequence */
3423   charbuf[idx++] = ISO_CODE_ESC;
3424   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3425                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3426                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3427                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3428                     : '4');
3429   charbuf[idx++] = -2;  /* -2 0 -1 fill the rest of the five-slot header.  */
3430   charbuf[idx++] = 0;
3431   charbuf[idx++] = -1;
3432   new_chars = cmp_status->nchars;
3433   if (cmp_status->method >= COMPOSITION_WITH_RULE)  /* NOTE(review): relies on the method enum order; presumably all but RELATIVE.  */
3434     for (; idx < 0; idx++)
3435       {
3436         int elt = charbuf[idx];
3437
3438         if (elt == -2)  /* Start of a [ -2 DECODED-RULE ] pair.  */
3439           {
3440             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);  /* Rewrites both slots and bumps new_chars.  */
3441             idx++;  /* Skip the second slot of the pair.  */
3442           }
3443         else if (elt == -1)  /* First slot of the [ -1 -1 ] pair.  */
3444           {
3445             charbuf[idx++] = ISO_CODE_ESC;
3446             charbuf[idx] = '0';
3447             new_chars += 2;
3448           }
3449       }
3450   cmp_status->state = COMPOSING_NO;
3451   return new_chars;
3452 }
3453
3454 /* If characters are under composition, finish the composition as invalid.  */
3455 #define MAYBE_FINISH_COMPOSITION()                              \
3456   do {                                                          \
3457     if (cmp_status->state != COMPOSING_NO)                      \
3458       char_offset += finish_composition (charbuf, cmp_status);  \
3459   } while (0)
3460
3461 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3462
3463    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3464    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3465    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3466    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3467    (An ESC 0 ending the component part of ESC 3/4 only stores [ -1 -1 ].)
3468    Produce this annotation sequence now:
3469
3470    [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3471    NOTE(review): the composition overview comment shows five elements (-LENGTH == -5, a 0 before METHOD) -- confirm.  */
3472
3473 #define DECODE_COMPOSITION_START(c1)                                       \
3474   do {                                                                     \
3475     if (c1 == '0'                                                          \
3476         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3477              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3478             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3479                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3480       { /* ESC 0 ends the components: store the [ -1 -1 ] marker.  */      \
3481         *charbuf++ = -1;                                                   \
3482         *charbuf++= -1;                                                    \
3483         cmp_status->state = COMPOSING_CHAR;                                \
3484         cmp_status->length += 2;                                           \
3485       }                                                                    \
3486     else                                                                   \
3487       { /* New composition: finish a pending one as invalid first.  */     \
3488         MAYBE_FINISH_COMPOSITION ();                                       \
3489         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3490                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3491                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3492                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3493         cmp_status->state                                                  \
3494           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3495         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3496         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3497         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3498         coding->annotated = 1;                                             \
3499       }                                                                    \
3500   } while (0)
3501
3502
3503 /* Handle composition end sequence ESC 1 (may goto invalid_code).  */
3504 /* The header's first slot is -LENGTH, so `-=' grows LENGTH.  */
3505 #define DECODE_COMPOSITION_END()                                        \
3506   do {                                                                  \
3507     if (cmp_status->nchars == 0                                         \
3508         || ((cmp_status->state == COMPOSING_CHAR)                       \
3509             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3510       { /* No CHAR yet, or ESC 1 arrived in the wrong state.  */        \
3511         MAYBE_FINISH_COMPOSITION ();                                    \
3512         goto invalid_code;                                              \
3513       }                                                                 \
3514     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3515       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3516     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3517       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3518     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3519     char_offset += cmp_status->nchars;                                  \
3520     cmp_status->state = COMPOSING_NO;                                   \
3521   } while (0)
3522
3523 /* Store a composition rule RULE in charbuf, and update cmp_status.  */
3524 /* Stores [ -2 RULE ]; `state--' presumably selects the CHAR state (TODO confirm).  */
3525 #define STORE_COMPOSITION_RULE(rule)    \
3526   do {                                  \
3527     *charbuf++ = -2;                    \
3528     *charbuf++ = rule;                  \
3529     cmp_status->length += 2;            \
3530     cmp_status->state--;                \
3531   } while (0)
3532
3533 /* Store a composed char or a component char C in charbuf, and update
3534    cmp_status (length, and nchars or ncomps depending on the state).  */
3535
3536 #define STORE_COMPOSITION_CHAR(c)                                       \
3537   do {                                                                  \
3538     *charbuf++ = (c);                                                   \
3539     cmp_status->length++;                                               \
3540     if (cmp_status->state == COMPOSING_CHAR)                            \
3541       cmp_status->nchars++;  /* A composed CHAR.  */                    \
3542     else                                                                \
3543       cmp_status->ncomps++;  /* A component (ALT).  */                  \
3544     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3545         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3546             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3547       cmp_status->state++;  /* Presumably: expect a RULE next.  */      \
3548   } while (0)
3549
3550
3551 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3552
3553 static void
3554 decode_coding_iso_2022 (coding)
3555      struct coding_system *coding;
3556 {
3557   const unsigned char *src = coding->source + coding->consumed;
3558   const unsigned char *src_end = coding->source + coding->src_bytes;
3559   const unsigned char *src_base;
3560   int *charbuf = coding->charbuf + coding->charbuf_used;
3561   int *charbuf_end
3562     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3563   int consumed_chars = 0, consumed_chars_base;
3564   int multibytep = coding->src_multibyte;
3565   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3566   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3567   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3568   int charset_id_2, charset_id_3;
3569   struct charset *charset;
3570   int c;
3571   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3572   Lisp_Object attrs, charset_list;
3573   int char_offset = coding->produced_char;
3574   int last_offset = char_offset;
3575   int last_id = charset_ascii;
3576   int eol_crlf =
3577     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3578   int byte_after_cr = -1;
3579   int i;
3580
3581   CODING_GET_INFO (coding, attrs, charset_list);
3582   setup_iso_safe_charsets (attrs);
3583   /* Charset list may have been changed.  */
3584   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3585   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3586
3587   if (cmp_status->state != COMPOSING_NO)
3588     {
3589       for (i = 0; i < cmp_status->length; i++)
3590         *charbuf++ = cmp_status->carryover[i];
3591       coding->annotated = 1;
3592     }
3593
3594   while (1)
3595     {
3596       int c1, c2;
3597
3598       src_base = src;
3599       consumed_chars_base = consumed_chars;
3600
3601       if (charbuf >= charbuf_end)
3602         {
3603           if (byte_after_cr >= 0)
3604             src_base--;
3605           break;
3606         }
3607
3608       if (byte_after_cr >= 0)
3609         c1 = byte_after_cr, byte_after_cr = -1;
3610       else
3611         ONE_MORE_BYTE (c1);
3612       if (c1 < 0)
3613         goto invalid_code;
3614
3615       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3616         {
3617           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3618           char_offset++;
3619           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3620           continue;
3621         }
3622
3623       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3624         {
3625           if (c1 == ISO_CODE_ESC)
3626             {
3627               if (src + 1 >= src_end)
3628                 goto no_more_source;
3629               *charbuf++ = ISO_CODE_ESC;
3630               char_offset++;
3631               if (src[0] == '%' && src[1] == '@')
3632                 {
3633                   src += 2;
3634                   consumed_chars += 2;
3635                   char_offset += 2;
3636                   /* We are sure charbuf can contain two more chars. */
3637                   *charbuf++ = '%';
3638                   *charbuf++ = '@';
3639                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3640                 }
3641             }
3642           else
3643             {
3644               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3645               char_offset++;
3646             }
3647           continue;
3648         }
3649
3650       if ((cmp_status->state == COMPOSING_RULE
3651            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3652           && c1 != ISO_CODE_ESC)
3653         {
3654           int rule, nbytes;
3655
3656           DECODE_COMPOSITION_RULE (rule, nbytes);
3657           if (rule < 0)
3658             goto invalid_code;
3659           STORE_COMPOSITION_RULE (rule);
3660           continue;
3661         }
3662
3663       /* We produce at most one character.  */
3664       switch (iso_code_class [c1])
3665         {
3666         case ISO_0x20_or_0x7F:
3667           if (charset_id_0 < 0
3668               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3669             /* This is SPACE or DEL.  */
3670             charset = CHARSET_FROM_ID (charset_ascii);
3671           else
3672             charset = CHARSET_FROM_ID (charset_id_0);
3673           break;
3674
3675         case ISO_graphic_plane_0:
3676           if (charset_id_0 < 0)
3677             charset = CHARSET_FROM_ID (charset_ascii);
3678           else
3679             charset = CHARSET_FROM_ID (charset_id_0);
3680           break;
3681
3682         case ISO_0xA0_or_0xFF:
3683           if (charset_id_1 < 0
3684               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3685               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3686             goto invalid_code;
3687           /* This is a graphic character, we fall down ... */
3688
3689         case ISO_graphic_plane_1:
3690           if (charset_id_1 < 0)
3691             goto invalid_code;
3692           charset = CHARSET_FROM_ID (charset_id_1);
3693           break;
3694
3695         case ISO_control_0:
3696           if (eol_crlf && c1 == '\r')
3697             ONE_MORE_BYTE (byte_after_cr);
3698           MAYBE_FINISH_COMPOSITION ();
3699           charset = CHARSET_FROM_ID (charset_ascii);
3700           break;
3701
3702         case ISO_control_1:
3703           goto invalid_code;
3704
3705         case ISO_shift_out:
3706           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3707               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3708             goto invalid_code;
3709           CODING_ISO_INVOCATION (coding, 0) = 1;
3710           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3711           continue;
3712
3713         case ISO_shift_in:
3714           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3715             goto invalid_code;
3716           CODING_ISO_INVOCATION (coding, 0) = 0;
3717           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3718           continue;
3719
3720         case ISO_single_shift_2_7:
3721         case ISO_single_shift_2:
3722           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3723             goto invalid_code;
3724           /* SS2 is handled as an escape sequence of ESC 'N' */
3725           c1 = 'N';
3726           goto label_escape_sequence;
3727
3728         case ISO_single_shift_3:
3729           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3730             goto invalid_code;
3731           /* SS2 is handled as an escape sequence of ESC 'O' */
3732           c1 = 'O';
3733           goto label_escape_sequence;
3734
3735         case ISO_control_sequence_introducer:
3736           /* CSI is handled as an escape sequence of ESC '[' ...  */
3737           c1 = '[';
3738           goto label_escape_sequence;
3739
3740         case ISO_escape:
3741           ONE_MORE_BYTE (c1);
3742         label_escape_sequence:
3743           /* Escape sequences handled here are invocation,
3744              designation, direction specification, and character
3745              composition specification.  */
3746           switch (c1)
3747             {
3748             case '&':           /* revision of following character set */
3749               ONE_MORE_BYTE (c1);
3750               if (!(c1 >= '@' && c1 <= '~'))
3751                 goto invalid_code;
3752               ONE_MORE_BYTE (c1);
3753               if (c1 != ISO_CODE_ESC)
3754                 goto invalid_code;
3755               ONE_MORE_BYTE (c1);
3756               goto label_escape_sequence;
3757
3758             case '$':           /* designation of 2-byte character set */
3759               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3760                 goto invalid_code;
3761               {
3762                 int reg, chars96;
3763
3764                 ONE_MORE_BYTE (c1);
3765                 if (c1 >= '@' && c1 <= 'B')
3766                   {     /* designation of JISX0208.1978, GB2312.1980,
3767                            or JISX0208.1980 */
3768                     reg = 0, chars96 = 0;
3769                   }
3770                 else if (c1 >= 0x28 && c1 <= 0x2B)
3771                   { /* designation of DIMENSION2_CHARS94 character set */
3772                     reg = c1 - 0x28, chars96 = 0;
3773                     ONE_MORE_BYTE (c1);
3774                   }
3775                 else if (c1 >= 0x2C && c1 <= 0x2F)
3776                   { /* designation of DIMENSION2_CHARS96 character set */
3777                     reg = c1 - 0x2C, chars96 = 1;
3778                     ONE_MORE_BYTE (c1);
3779                   }
3780                 else
3781                   goto invalid_code;
3782                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3783                 /* We must update these variables now.  */
3784                 if (reg == 0)
3785                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3786                 else if (reg == 1)
3787                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3788                 if (chars96 < 0)
3789                   goto invalid_code;
3790               }
3791               continue;
3792
3793             case 'n':           /* invocation of locking-shift-2 */
3794               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3795                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3796                 goto invalid_code;
3797               CODING_ISO_INVOCATION (coding, 0) = 2;
3798               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3799               continue;
3800
3801             case 'o':           /* invocation of locking-shift-3 */
3802               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3803                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3804                 goto invalid_code;
3805               CODING_ISO_INVOCATION (coding, 0) = 3;
3806               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3807               continue;
3808
3809             case 'N':           /* invocation of single-shift-2 */
3810               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3811                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3812                 goto invalid_code;
3813               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3814               if (charset_id_2 < 0)
3815                 charset = CHARSET_FROM_ID (charset_ascii);
3816               else
3817                 charset = CHARSET_FROM_ID (charset_id_2);
3818               ONE_MORE_BYTE (c1);
3819               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3820                 goto invalid_code;
3821               break;
3822
3823             case 'O':           /* invocation of single-shift-3 */
3824               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3825                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3826                 goto invalid_code;
3827               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3828               if (charset_id_3 < 0)
3829                 charset = CHARSET_FROM_ID (charset_ascii);
3830               else
3831                 charset = CHARSET_FROM_ID (charset_id_3);
3832               ONE_MORE_BYTE (c1);
3833               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3834                 goto invalid_code;
3835               break;
3836
3837             case '0': case '2': case '3': case '4': /* start composition */
3838               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3839                 goto invalid_code;
3840               if (last_id != charset_ascii)
3841                 {
3842                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3843                   last_id = charset_ascii;
3844                   last_offset = char_offset;
3845                 }
3846               DECODE_COMPOSITION_START (c1);
3847               continue;
3848
3849             case '1':           /* end composition */
3850               if (cmp_status->state == COMPOSING_NO)
3851                 goto invalid_code;
3852               DECODE_COMPOSITION_END ();
3853               continue;
3854
3855             case '[':           /* specification of direction */
3856               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3857                 goto invalid_code;
3858               /* For the moment, nested direction is not supported.
3859                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3860                  left-to-right, and nozero means right-to-left.  */
3861               ONE_MORE_BYTE (c1);
3862               switch (c1)
3863                 {
3864                 case ']':       /* end of the current direction */
3865                   coding->mode &= ~CODING_MODE_DIRECTION;
3866
3867                 case '0':       /* end of the current direction */
3868                 case '1':       /* start of left-to-right direction */
3869                   ONE_MORE_BYTE (c1);
3870                   if (c1 == ']')
3871                     coding->mode &= ~CODING_MODE_DIRECTION;
3872                   else
3873                     goto invalid_code;
3874                   break;
3875
3876                 case '2':       /* start of right-to-left direction */
3877                   ONE_MORE_BYTE (c1);
3878                   if (c1 == ']')
3879                     coding->mode |= CODING_MODE_DIRECTION;
3880                   else
3881                     goto invalid_code;
3882                   break;
3883
3884                 default:
3885                   goto invalid_code;
3886                 }
3887               continue;
3888
3889             case '%':
3890               ONE_MORE_BYTE (c1);
3891               if (c1 == '/')
3892                 {
3893                   /* CTEXT extended segment:
3894                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3895                      We keep these bytes as is for the moment.
3896                      They may be decoded by post-read-conversion.  */
3897                   int dim, M, L;
3898                   int size;
3899
3900                   ONE_MORE_BYTE (dim);
3901                   if (dim < 0 || dim > 4)
3902                     goto invalid_code;
3903                   ONE_MORE_BYTE (M);
3904                   if (M < 128)
3905                     goto invalid_code;
3906                   ONE_MORE_BYTE (L);
3907                   if (L < 128)
3908                     goto invalid_code;
3909                   size = ((M - 128) * 128) + (L - 128);
3910                   if (charbuf + 6 > charbuf_end)
3911                     goto break_loop;
3912                   *charbuf++ = ISO_CODE_ESC;
3913                   *charbuf++ = '%';
3914                   *charbuf++ = '/';
3915                   *charbuf++ = dim;
3916                   *charbuf++ = BYTE8_TO_CHAR (M);
3917                   *charbuf++ = BYTE8_TO_CHAR (L);
3918                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3919                 }
3920               else if (c1 == 'G')
3921                 {
3922                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3923                      ESC % G --UTF-8-BYTES-- ESC % @
3924                      We keep these bytes as is for the moment.
3925                      They may be decoded by post-read-conversion.  */
3926                   if (charbuf + 3 > charbuf_end)
3927                     goto break_loop;
3928                   *charbuf++ = ISO_CODE_ESC;
3929                   *charbuf++ = '%';
3930                   *charbuf++ = 'G';
3931                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3932                 }
3933               else
3934                 goto invalid_code;
3935               continue;
3936               break;
3937
3938             default:
3939               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3940                 goto invalid_code;
3941               {
3942                 int reg, chars96;
3943
3944                 if (c1 >= 0x28 && c1 <= 0x2B)
3945                   { /* designation of DIMENSION1_CHARS94 character set */
3946                     reg = c1 - 0x28, chars96 = 0;
3947                     ONE_MORE_BYTE (c1);
3948                   }
3949                 else if (c1 >= 0x2C && c1 <= 0x2F)
3950                   { /* designation of DIMENSION1_CHARS96 character set */
3951                     reg = c1 - 0x2C, chars96 = 1;
3952                     ONE_MORE_BYTE (c1);
3953                   }
3954                 else
3955                   goto invalid_code;
3956                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3957                 /* We must update these variables now.  */
3958                 if (reg == 0)
3959                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3960                 else if (reg == 1)
3961                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3962                 if (chars96 < 0)
3963                   goto invalid_code;
3964               }
3965               continue;
3966             }
3967         }
3968
3969       if (cmp_status->state == COMPOSING_NO
3970           && charset->id != charset_ascii
3971           && last_id != charset->id)
3972         {
3973           if (last_id != charset_ascii)
3974             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3975           last_id = charset->id;
3976           last_offset = char_offset;
3977         }
3978
3979       /* Now we know CHARSET and 1st position code C1 of a character.
3980          Produce a decoded character while getting 2nd position code
3981          C2 if necessary.  */
3982       c1 &= 0x7F;
3983       if (CHARSET_DIMENSION (charset) > 1)
3984         {
3985           ONE_MORE_BYTE (c2);
3986           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3987             /* C2 is not in a valid range.  */
3988             goto invalid_code;
3989           c1 = (c1 << 8) | (c2 & 0x7F);
3990           if (CHARSET_DIMENSION (charset) > 2)
3991             {
3992               ONE_MORE_BYTE (c2);
3993               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3994                 /* C2 is not in a valid range.  */
3995                 goto invalid_code;
3996               c1 = (c1 << 8) | (c2 & 0x7F);
3997             }
3998         }
3999
4000       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4001       if (c < 0)
4002         {
4003           MAYBE_FINISH_COMPOSITION ();
4004           for (; src_base < src; src_base++, char_offset++)
4005             {
4006               if (ASCII_BYTE_P (*src_base))
4007                 *charbuf++ = *src_base;
4008               else
4009                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4010             }
4011         }
4012       else if (cmp_status->state == COMPOSING_NO)
4013         {
4014           *charbuf++ = c;
4015           char_offset++;
4016         }
4017       else if ((cmp_status->state == COMPOSING_CHAR
4018                 ? cmp_status->nchars
4019                 : cmp_status->ncomps)
4020                >= MAX_COMPOSITION_COMPONENTS)
4021         {
4022           /* Too long composition.  */
4023           MAYBE_FINISH_COMPOSITION ();
4024           *charbuf++ = c;
4025           char_offset++;
4026         }
4027       else
4028         STORE_COMPOSITION_CHAR (c);
4029       continue;
4030
4031     invalid_code:
4032       MAYBE_FINISH_COMPOSITION ();
4033       src = src_base;
4034       consumed_chars = consumed_chars_base;
4035       ONE_MORE_BYTE (c);
4036       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4037       char_offset++;
4038       coding->errors++;
4039       continue;
4040
4041     break_loop:
4042       break;
4043     }
4044
4045  no_more_source:
4046   if (cmp_status->state != COMPOSING_NO)
4047     {
4048       if (coding->mode & CODING_MODE_LAST_BLOCK)
4049         MAYBE_FINISH_COMPOSITION ();
4050       else
4051         {
4052           charbuf -= cmp_status->length;
4053           for (i = 0; i < cmp_status->length; i++)
4054             cmp_status->carryover[i] = charbuf[i];
4055         }
4056     }
4057   else if (last_id != charset_ascii)
4058     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4059   coding->consumed_char += consumed_chars_base;
4060   coding->consumed = src_base - coding->source;
4061   coding->charbuf_used = charbuf - coding->charbuf;
4062 }
4063
4064
4065 /* ISO2022 encoding stuff.  */
4066
4067 /*
4068    Saying just "ISO2022" is not enough for encoding; we have to
4069    specify more details.  In Emacs, each coding system that is an
4070    ISO2022 variant has the following specifications:
4071         1. Initial designation to G0 thru G3.
4072         2. Allows short-form designation?
4073         3. ASCII should be designated to G0 before control characters?
4074         4. ASCII should be designated to G0 at end of line?
4075         5. 7-bit environment or 8-bit environment?
4076         6. Use locking-shift?
4077         7. Use Single-shift?
4078    And the following two are only for Japanese:
4079         8. Use ASCII in place of JIS0201-1976-Roman?
4080         9. Use JISX0208-1983 in place of JISX0208-1978?
4081    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4082    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4083    details.
4084 */
4085
4086 /* Produce codes (escape sequence) for designating CHARSET to graphic
4087    register REG at DST, and increment DST.  If <final-char> of CHARSET is
4088    '@', 'A', or 'B' and the coding system CODING allows, produce
4089    designation sequence of short-form.

        The bytes are emitted in this order:
          1. `ESC & <'@' + revision>', only when CODING has
             CODING_ISO_FLAG_REVISION and CHARSET_ISO_REVISION (CHARSET)
             is not negative;
          2. ESC;
          3. for a charset of dimension 1, one intermediate byte picked
             by REG from "()*+" (94-char sets) or ",-./" (96-char sets);
             otherwise `$' followed by the same kind of intermediate
             byte, which is left out for the short form;
          4. the final byte CHARSET_ISO_FINAL (CHARSET).
        Afterwards CODING_ISO_DESIGNATION (CODING, REG) is set to the id
        of CHARSET.

        CHARSET, REG and CODING are each evaluated more than once.  The
        EMIT_* macros are defined elsewhere; presumably they use `dst',
        `produced_chars' and `multibytep' of the expansion site --
        confirm against their definitions.  */
4090
4091 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4092   do {                                                                  \
4093     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
4094     char *intermediate_char_94 = "()*+";                                \
4095     char *intermediate_char_96 = ",-./";                                \
4096     int revision = -1;                                                  \
4097     int c;                                                              \
4098                                                                         \
4099     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4100       revision = CHARSET_ISO_REVISION (charset);                        \
4101                                                                         \
4102     if (revision >= 0)                                                  \
4103       {                                                                 \
4104         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4105         EMIT_ONE_BYTE ('@' + revision);                                 \
4106       }                                                                 \
4107     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4108     if (CHARSET_DIMENSION (charset) == 1)                               \
4109       {                                                                 \
4110         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4111           c = intermediate_char_94[reg];                                \
4112         else                                                            \
4113           c = intermediate_char_96[reg];                                \
4114         EMIT_ONE_ASCII_BYTE (c);                                        \
4115       }                                                                 \
4116     else                                                                \
4117       {                                                                 \
4118         EMIT_ONE_ASCII_BYTE ('$');                                      \
4119         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4120           {                                                             \
                 /* The intermediate byte may be dropped (short form)        \
                    only for REG 0 with a final byte in '@'..'B'.  */        \
4121             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4122                 || reg != 0                                             \
4123                 || final_char < '@' || final_char > 'B')                \
4124               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4125           }                                                             \
4126         else                                                            \
4127           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4128       }                                                                 \
4129     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4130                                                                         \
4131     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4132   } while (0)
4133
4134
4135 /* The following two macros produce codes (control character or escape
4136    sequence) for ISO2022 single-shift functions (single-shift-2 and
4137    single-shift-3).

        When CODING has CODING_ISO_FLAG_SEVEN_BITS, the two-byte escape
        form (ESC N or ESC O) is emitted; otherwise the single byte
        ISO_CODE_SS2 or ISO_CODE_SS3.  Either way
        CODING_ISO_SINGLE_SHIFTING (coding) is set to 1, which makes the
        next ENCODE_ISO_CHARACTER_DIMENSION1/2 emit its character in the
        shifted form and clear the flag again.  Both macros refer to a
        variable named `coding' at the expansion site.  */
4138
4139 #define ENCODE_SINGLE_SHIFT_2                                           \
4140   do {                                                                  \
4141     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4142       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4143     else                                                                \
4144       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4145     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4146   } while (0)
4147
4148
4149 #define ENCODE_SINGLE_SHIFT_3                                           \
       /* Single-shift-3: ESC O in 7-bit form, else ISO_CODE_SS3.  */        \
4150   do {                                                                  \
4151     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4152       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4153     else                                                                \
4154       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4155     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4156   } while (0)
4157
4158
4159 /* The following four macros produce codes (control character or
4160    escape sequence) for ISO2022 locking-shift functions (shift-in,
4161    shift-out, locking-shift-2, and locking-shift-3).

        Each one also stores the number of the graphic register it has
        just invoked in CODING_ISO_INVOCATION (coding, 0): 0 for
        shift-in, 1 for shift-out, 2 and 3 for locking-shift-2 and -3.
        They refer to a variable named `coding' at the expansion site.  */
4162
4163 #define ENCODE_SHIFT_IN                                 \
4164   do {                                                  \
4165     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4166     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4167   } while (0)
4168
4169
4170 #define ENCODE_SHIFT_OUT                                \
       /* Emit SO; register 1 is now invoked.  */            \
4171   do {                                                  \
4172     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4173     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4174   } while (0)
4175
4176
4177 #define ENCODE_LOCKING_SHIFT_2                          \
       /* Emit ESC n; register 2 is now invoked.  */         \
4178   do {                                                  \
4179     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4180     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4181   } while (0)
4182
4183
4184 #define ENCODE_LOCKING_SHIFT_3                          \
       /* Emit ESC o (ESC n is LS2); register 3 invoked.  */ \
4185   do {                                                  \
4186     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
4187     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4188   } while (0)
4189
4190
4191 /* Produce codes for a DIMENSION1 character whose character set is
4192    CHARSET and whose position-code is C1.  Designation and invocation
4193    sequences are also produced in advance if necessary.

        When CODING has CODING_ISO_FLAG_USE_ROMAN, ASCII is replaced by
        jisx0201-roman, and the macro argument CHARSET is reassigned
        accordingly, so it must be an lvalue.  The `do ... while (1)'
        loop is left by `break' as soon as the byte has been emitted:
        right after a single shift (which is then cleared), or when the
        charset is the one invoked as CODING_ISO_INVOKED_CHARSET
        (coding, 0) (byte with the high bit clear) or (coding, 1) (byte
        with the high bit set).  Otherwise encode_invocation_designation
        is called and the loop is retried.  C1 is parenthesized at each
        use so that any expression may be passed.  Uses `coding', `dst'
        and `produced_chars' of the expansion site.  */
4194
4195 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4196   do {                                                                  \
4197     int id = CHARSET_ID (charset);                                      \
4198                                                                         \
4199     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4200         && id == charset_ascii)                                         \
4201       {                                                                 \
4202         id = charset_jisx0201_roman;                                    \
4203         charset = CHARSET_FROM_ID (id);                                 \
4204       }                                                                 \
4205                                                                         \
4206     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4207       {                                                                 \
4208         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4209           EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
4210         else                                                            \
4211           EMIT_ONE_BYTE ((c1) | 0x80);                                  \
4212         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4213         break;                                                          \
4214       }                                                                 \
4215     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4216       {                                                                 \
4217         EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
4218         break;                                                          \
4219       }                                                                 \
4220     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4221       {                                                                 \
4222         EMIT_ONE_BYTE ((c1) | 0x80);                                    \
4223         break;                                                          \
4224       }                                                                 \
4225     else                                                                \
4226       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4227          must invoke it, or, at first, designate it to some graphic     \
4228          register.  Then repeat the loop to actually produce the        \
4229          character.  */                                                 \
4230       dst = encode_invocation_designation (charset, coding, dst,        \
4231                                            &produced_chars);            \
4232   } while (1)
4233
4234
4235 /* Produce codes for a DIMENSION2 character whose character set is
4236    CHARSET and whose position-codes are C1 and C2.  Designation and
4237    invocation codes are also produced in advance if necessary.

        This follows the same scheme as ENCODE_ISO_CHARACTER_DIMENSION1
        with two bytes per character.  When CODING has
        CODING_ISO_FLAG_USE_OLDJIS, jisx0208 is replaced by
        jisx0208-1978, and the macro argument CHARSET is reassigned
        accordingly, so it must be an lvalue.  The `do ... while (1)'
        loop is left by `break' once the two bytes have been emitted;
        otherwise encode_invocation_designation is called and the loop
        is retried.  Uses `coding', `dst' and `produced_chars' of the
        expansion site.  */
4238
4239 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4240   do {                                                                  \
4241     int id = CHARSET_ID (charset);                                      \
4242                                                                         \
4243     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4244         && id == charset_jisx0208)                                      \
4245       {                                                                 \
4246         id = charset_jisx0208_1978;                                     \
4247         charset = CHARSET_FROM_ID (id);                                 \
4248       }                                                                 \
4249                                                                         \
4250     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4251       {                                                                 \
4252         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4253           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4254         else                                                            \
4255           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4256         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4257         break;                                                          \
4258       }                                                                 \
4259     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4260       {                                                                 \
4261         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4262         break;                                                          \
4263       }                                                                 \
4264     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4265       {                                                                 \
4266         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4267         break;                                                          \
4268       }                                                                 \
4269     else                                                                \
4270       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4271          must invoke it, or, at first, designate it to some graphic     \
4272          register.  Then repeat the loop to actually produce the        \
4273          character.  */                                                 \
4274       dst = encode_invocation_designation (charset, coding, dst,        \
4275                                            &produced_chars);            \
4276   } while (1)
4277
4278
4279 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
       /* Encode character C of CHARSET.  The code from ENCODE_CHAR is         \
          emitted as one byte for a charset of dimension 1, otherwise as       \
          the two values `code >> 8' and `code & 0xFF'.  CHARSET must be       \
          an lvalue: the DIMENSION1/2 macros may reassign it.                  \
          NOTE(review): a charset of dimension > 2 also takes the              \
          two-byte path here -- confirm that this is intended.  */             \
4280   do {                                                                     \
4281     int code = ENCODE_CHAR ((charset),(c));                                \
4282                                                                            \
4283     if (CHARSET_DIMENSION (charset) == 1)                                  \
4284       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4285     else                                                                   \
4286       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4287   } while (0)
4288
4289
4290 /* Produce designation and invocation codes at a place pointed by DST
4291    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4292    Return new DST.

        If CHARSET is not designated to any of the four graphic
        registers, it is designated to the register that CODING requests
        for it, or to register 0 when there is no request.  Then, unless
        that register is already recorded in CODING_ISO_INVOCATION
        (coding, 0) or (coding, 1), a shift sequence for it is emitted.

        *P_NCHARS is copied into the local `produced_chars' on entry and
        stored back on exit.  The locals `multibytep' and
        `produced_chars' are not referenced directly in this function;
        presumably the EMIT_* macros use them -- confirm against their
        definitions.  */
4293
4294 unsigned char *
4295 encode_invocation_designation (charset, coding, dst, p_nchars)
4296      struct charset *charset;
4297      struct coding_system *coding;
4298      unsigned char *dst;
4299      int *p_nchars;
4300 {
4301   int multibytep = coding->dst_multibyte;
4302   int produced_chars = *p_nchars;
4303   int reg;                      /* graphic register number */
4304   int id = CHARSET_ID (charset);
4305
4306   /* At first, check designations.  */
4307   for (reg = 0; reg < 4; reg++)
4308     if (id == CODING_ISO_DESIGNATION (coding, reg))
4309       break;
4310
4311   if (reg >= 4)
4312     {
4313       /* CHARSET is not yet designated to any graphic registers.  */
4314       /* At first check the requested designation.  */
4315       reg = CODING_ISO_REQUEST (coding, id);
4316       if (reg < 0)
4317         /* Since CHARSET requests no special designation, designate it
4318            to graphic register 0.  */
4319         reg = 0;
4320
4321       ENCODE_DESIGNATION (charset, reg, coding);
4322     }
4323
       /* Here REG is the register CHARSET is designated to.  */
4324   if (CODING_ISO_INVOCATION (coding, 0) != reg
4325       && CODING_ISO_INVOCATION (coding, 1) != reg)
4326     {
4327       /* Since the graphic register REG is not invoked to any graphic
4328          planes, invoke it to graphic plane 0.  */
4329       switch (reg)
4330         {
4331         case 0:                 /* graphic register 0 */
4332           ENCODE_SHIFT_IN;
4333           break;
4334
4335         case 1:                 /* graphic register 1 */
4336           ENCODE_SHIFT_OUT;
4337           break;
4338
             /* For registers 2 and 3 a single shift is used when CODING
                allows it.  The single-shift macros set
                CODING_ISO_SINGLE_SHIFTING and leave CODING_ISO_INVOCATION
                as it is; the locking-shift macros update
                CODING_ISO_INVOCATION (coding, 0).  */
4339         case 2:                 /* graphic register 2 */
4340           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4341             ENCODE_SINGLE_SHIFT_2;
4342           else
4343             ENCODE_LOCKING_SHIFT_2;
4344           break;
4345
4346         case 3:                 /* graphic register 3 */
4347           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4348             ENCODE_SINGLE_SHIFT_3;
4349           else
4350             ENCODE_LOCKING_SHIFT_3;
4351           break;
4352         }
4353     }
4354
4355   *p_nchars = produced_chars;
4356   return dst;
4357 }
4358
4359 /* The following three macros produce codes for indicating direction
4360    of text.

        ENCODE_CONTROL_SEQUENCE_INTRODUCER emits the two-byte form
        `ESC [' when CODING has CODING_ISO_FLAG_SEVEN_BITS, and the
        single byte ISO_CODE_CSI otherwise.  The flag is tested with `&'
        like every other flag test on CODING_ISO_FLAGS, so that other
        flags being set does not hide it.  It is an object-like macro
        and refers to a variable named `coding' at the expansion
        site.  */
4361 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
4362   do {                                                                  \
4363     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4364       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
4365     else                                                                \
4366       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
4367   } while (0)
4368
4369
4370 #define ENCODE_DIRECTION_R2L()                  \
       /* Emit CSI `2' `]': start right-to-left.  */ \
4371   do {                                          \
         /* Object-like macro: takes no args.  */    \
4372     ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
4373     EMIT_TWO_ASCII_BYTES ('2', ']');            \
4374   } while (0)
4375
4376
4377 #define ENCODE_DIRECTION_L2R()                  \
       /* Emit CSI `0' `]': end of direction.  */    \
4378   do {                                          \
         /* Object-like macro: takes no args.  */    \
4379     ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
4380     EMIT_TWO_ASCII_BYTES ('0', ']');            \
4381   } while (0)
4382
4383
4384 /* Produce codes for designation and invocation to reset the graphic
4385    planes and registers to initial state.

        A shift-in is emitted when CODING_ISO_INVOCATION (coding, 0) is
        not 0.  Then each of the four registers that has an initial
        charset (CODING_ISO_INITIAL not negative) differing from its
        current designation is designated again to that initial charset.
        Refers to a variable named `coding' at the expansion site.  */
4386 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4387   do {                                                                  \
4388     int reg;                                                            \
4389     struct charset *charset;                                            \
4390                                                                         \
4391     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4392       ENCODE_SHIFT_IN;                                                  \
4393     for (reg = 0; reg < 4; reg++)                                       \
4394       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4395           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4396               != CODING_ISO_INITIAL (coding, reg)))                     \
4397         {                                                               \
4398           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4399           ENCODE_DESIGNATION (charset, reg, coding);                    \
4400         }                                                               \
4401   } while (0)
4402
4403
4404 /* Produce designation sequences of charsets in the line started from
4405    CHARBUF to a place pointed by DST, and return updated DST.
4406
4407    If the current block ends before any end-of-line, we may fail to
4408    find all the necessary designations.

        The characters from CHARBUF are examined up to the first newline,
        up to CHARBUF_END, or until a charset has been found for each of
        the four graphic registers, whichever comes first.  For each
        register, the first charset in the line that CODING requests for
        it is remembered, and is designated at the end unless the
        register already holds it.  */
4409
4410 static unsigned char *
4411 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
4412      struct coding_system *coding;
4413      int *charbuf, *charbuf_end;
4414      unsigned char *dst;
4415 {
4416   struct charset *charset;
4417   /* Table of charsets to be designated to each graphic register.  */
4418   int r[4];
4419   int c, found = 0, reg;
4420   int produced_chars = 0;
4421   int multibytep = coding->dst_multibyte;
4422   Lisp_Object attrs;
4423   Lisp_Object charset_list;
4424
4425   attrs = CODING_ID_ATTRS (coding->id);
4426   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4427   if (EQ (charset_list, Qiso_2022))
4428     charset_list = Viso_2022_charset_list;
4429
4430   for (reg = 0; reg < 4; reg++)
4431     r[reg] = -1;
4432
       /* Never read beyond CHARBUF_END: the block may end before any
          newline is seen.
          NOTE(review): negative elements, which encode_coding_iso_2022
          treats as annotations, are not skipped here -- confirm that
          they cannot occur in the scanned range.  */
4433   while (found < 4 && charbuf < charbuf_end)
4434     {
4435       int id;
4436
4437       c = *charbuf++;
4438       if (c == '\n')
4439         break;
4440       charset = char_charset (c, charset_list, NULL);
4441       id = CHARSET_ID (charset);
4442       reg = CODING_ISO_REQUEST (coding, id);
           /* Only the first charset requested for a register counts.  */
4443       if (reg >= 0 && r[reg] < 0)
4444         {
4445           found++;
4446           r[reg] = id;
4447         }
4448     }
4449
4450   if (found)
4451     {
4452       for (reg = 0; reg < 4; reg++)
4453         if (r[reg] >= 0
4454             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4455           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4456     }
4457
4458   return dst;
4459 }
4460
4461 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4462
4463 static int
4464 encode_coding_iso_2022 (coding)
4465      struct coding_system *coding;
4466 {
4467   int multibytep = coding->dst_multibyte;
4468   int *charbuf = coding->charbuf;
4469   int *charbuf_end = charbuf + coding->charbuf_used;
4470   unsigned char *dst = coding->destination + coding->produced;
4471   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4472   int safe_room = 16;
4473   int bol_designation
4474     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4475        && CODING_ISO_BOL (coding));
4476   int produced_chars = 0;
4477   Lisp_Object attrs, eol_type, charset_list;
4478   int ascii_compatible;
4479   int c;
4480   int preferred_charset_id = -1;
4481
4482   CODING_GET_INFO (coding, attrs, charset_list);
4483   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4484   if (VECTORP (eol_type))
4485     eol_type = Qunix;
4486
4487   setup_iso_safe_charsets (attrs);
4488   /* Charset list may have been changed.  */
4489   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4490   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4491
4492   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4493
4494   while (charbuf < charbuf_end)
4495     {
4496       ASSURE_DESTINATION (safe_room);
4497
4498       if (bol_designation)
4499         {
4500           unsigned char *dst_prev = dst;
4501
4502           /* We have to produce designation sequences if any now.  */
4503           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4504           bol_designation = 0;
4505           /* We are sure that designation sequences are all ASCII bytes.  */
4506           produced_chars += dst - dst_prev;
4507         }
4508
4509       c = *charbuf++;
4510
4511       if (c < 0)
4512         {
4513           /* Handle an annotation.  */
4514           switch (*charbuf)
4515             {
4516             case CODING_ANNOTATE_COMPOSITION_MASK:
4517               /* Not yet implemented.  */
4518               break;
4519             case CODING_ANNOTATE_CHARSET_MASK:
4520               preferred_charset_id = charbuf[2];
4521               if (preferred_charset_id >= 0
4522                   && NILP (Fmemq (make_number (preferred_charset_id),
4523                                   charset_list)))
4524                 preferred_charset_id = -1;
4525               break;
4526             default:
4527               abort ();
4528             }
4529           charbuf += -c - 1;
4530           continue;
4531         }
4532
4533       /* Now encode the character C.  */
4534       if (c < 0x20 || c == 0x7F)
4535         {
4536           if (c == '\n'
4537               || (c == '\r' && EQ (eol_type, Qmac)))
4538             {
4539               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4540                 ENCODE_RESET_PLANE_AND_REGISTER ();
4541               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4542                 {
4543                   int i;
4544
4545                   for (i = 0; i < 4; i++)
4546                     CODING_ISO_DESIGNATION (coding, i)
4547                       = CODING_ISO_INITIAL (coding, i);
4548                 }
4549               bol_designation
4550                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4551             }
4552           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4553             ENCODE_RESET_PLANE_AND_REGISTER ();
4554           EMIT_ONE_ASCII_BYTE (c);
4555         }
4556       else if (ASCII_CHAR_P (c))
4557         {
4558           if (ascii_compatible)
4559             EMIT_ONE_ASCII_BYTE (c);
4560           else
4561             {
4562               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4563               ENCODE_ISO_CHARACTER (charset, c);
4564             }
4565         }
4566       else if (CHAR_BYTE8_P (c))
4567         {
4568           c = CHAR_TO_BYTE8 (c);
4569           EMIT_ONE_BYTE (c);
4570         }
4571       else
4572         {
4573           struct charset *charset;
4574
4575           if (preferred_charset_id >= 0)
4576             {
4577               charset = CHARSET_FROM_ID (preferred_charset_id);
4578               if (! CHAR_CHARSET_P (c, charset))
4579                 charset = char_charset (c, charset_list, NULL);
4580             }
4581           else
4582             charset = char_charset (c, charset_list, NULL);
4583           if (!charset)
4584             {
4585               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4586                 {
4587                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4588                   charset = CHARSET_FROM_ID (charset_ascii);
4589                 }
4590               else
4591                 {
4592                   c = coding->default_char;
4593                   charset = char_charset (c, charset_list, NULL);
4594                 }
4595             }
4596           ENCODE_ISO_CHARACTER (charset, c);
4597         }
4598     }
4599
4600   if (coding->mode & CODING_MODE_LAST_BLOCK
4601       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4602     {
4603       ASSURE_DESTINATION (safe_room);
4604       ENCODE_RESET_PLANE_AND_REGISTER ();
4605     }
4606   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4607   CODING_ISO_BOL (coding) = bol_designation;
4608   coding->produced_char += produced_chars;
4609   coding->produced = dst - coding->destination;
4610   return 0;
4611 }
4612
4613 \f
4614 /*** 8,9. SJIS and BIG5 handlers ***/
4615
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them directly in
   C code.  But, in the future, they may be supported only by CCL.  */
4619
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   as "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, with its two position-codes divided and
   shifted so that they fit in the ranges below.
4626
4627    --- CODE RANGE of SJIS ---
4628    (character set)      (range)
4629    ASCII                0x00 .. 0x7F
4630    KATAKANA-JISX0201    0xA0 .. 0xDF
4631    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4632             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4633    -------------------------------
4634
4635 */
4636
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set, and each of its characters is encoded in two bytes.
4640
4641    --- CODE RANGE of BIG5 ---
4642    (character set)      (range)
4643    ASCII                0x00 .. 0x7F
4644    Big5 (1st byte)      0xA1 .. 0xFE
4645         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4646    --------------------------
4647
4648   */
4649
4650 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4651    Check if a text is encoded in SJIS.  If it is, return
4652    CATEGORY_MASK_SJIS, else return 0.  */
4653
4654 static int
4655 detect_coding_sjis (coding, detect_info)
4656      struct coding_system *coding;
4657      struct coding_detection_info *detect_info;
4658 {
4659   const unsigned char *src = coding->source, *src_base;
4660   const unsigned char *src_end = coding->source + coding->src_bytes;
4661   int multibytep = coding->src_multibyte;
4662   int consumed_chars = 0;
4663   int found = 0;
4664   int c;
4665
4666   detect_info->checked |= CATEGORY_MASK_SJIS;
4667   /* A coding system of this category is always ASCII compatible.  */
4668   src += coding->head_ascii;
4669
4670   while (1)
4671     {
4672       src_base = src;
4673       ONE_MORE_BYTE (c);
4674       if (c < 0x80)
4675         continue;
4676       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4677         {
4678           ONE_MORE_BYTE (c);
4679           if (c < 0x40 || c == 0x7F || c > 0xFC)
4680             break;
4681           found = CATEGORY_MASK_SJIS;
4682         }
4683       else if (c >= 0xA0 && c < 0xE0)
4684         found = CATEGORY_MASK_SJIS;
4685       else
4686         break;
4687     }
4688   detect_info->rejected |= CATEGORY_MASK_SJIS;
4689   return 0;
4690
4691  no_more_source:
4692   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4693     {
4694       detect_info->rejected |= CATEGORY_MASK_SJIS;
4695       return 0;
4696     }
4697   detect_info->found |= found;
4698   return 1;
4699 }
4700
4701 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4702    Check if a text is encoded in BIG5.  If it is, return
4703    CATEGORY_MASK_BIG5, else return 0.  */
4704
4705 static int
4706 detect_coding_big5 (coding, detect_info)
4707      struct coding_system *coding;
4708      struct coding_detection_info *detect_info;
4709 {
4710   const unsigned char *src = coding->source, *src_base;
4711   const unsigned char *src_end = coding->source + coding->src_bytes;
4712   int multibytep = coding->src_multibyte;
4713   int consumed_chars = 0;
4714   int found = 0;
4715   int c;
4716
4717   detect_info->checked |= CATEGORY_MASK_BIG5;
4718   /* A coding system of this category is always ASCII compatible.  */
4719   src += coding->head_ascii;
4720
4721   while (1)
4722     {
4723       src_base = src;
4724       ONE_MORE_BYTE (c);
4725       if (c < 0x80)
4726         continue;
4727       if (c >= 0xA1)
4728         {
4729           ONE_MORE_BYTE (c);
4730           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4731             return 0;
4732           found = CATEGORY_MASK_BIG5;
4733         }
4734       else
4735         break;
4736     }
4737   detect_info->rejected |= CATEGORY_MASK_BIG5;
4738   return 0;
4739
4740  no_more_source:
4741   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4742     {
4743       detect_info->rejected |= CATEGORY_MASK_BIG5;
4744       return 0;
4745     }
4746   detect_info->found |= found;
4747   return 1;
4748 }
4749
4750 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4751    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4752
4753 static void
4754 decode_coding_sjis (coding)
4755      struct coding_system *coding;
4756 {
4757   const unsigned char *src = coding->source + coding->consumed;
4758   const unsigned char *src_end = coding->source + coding->src_bytes;
4759   const unsigned char *src_base;
4760   int *charbuf = coding->charbuf + coding->charbuf_used;
4761   int *charbuf_end
4762     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4763   int consumed_chars = 0, consumed_chars_base;
4764   int multibytep = coding->src_multibyte;
4765   struct charset *charset_roman, *charset_kanji, *charset_kana;
4766   struct charset *charset_kanji2;
4767   Lisp_Object attrs, charset_list, val;
4768   int char_offset = coding->produced_char;
4769   int last_offset = char_offset;
4770   int last_id = charset_ascii;
4771   int eol_crlf =
4772     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4773   int byte_after_cr = -1;
4774
4775   CODING_GET_INFO (coding, attrs, charset_list);
4776
4777   val = charset_list;
4778   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4779   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4780   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4781   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4782
4783   while (1)
4784     {
4785       int c, c1;
4786       struct charset *charset;
4787
4788       src_base = src;
4789       consumed_chars_base = consumed_chars;
4790
4791       if (charbuf >= charbuf_end)
4792         {
4793           if (byte_after_cr >= 0)
4794             src_base--;
4795           break;
4796         }
4797
4798       if (byte_after_cr >= 0)
4799         c = byte_after_cr, byte_after_cr = -1;
4800       else
4801         ONE_MORE_BYTE (c);
4802       if (c < 0)
4803         goto invalid_code;
4804       if (c < 0x80)
4805         {
4806           if (eol_crlf && c == '\r')
4807             ONE_MORE_BYTE (byte_after_cr);
4808           charset = charset_roman;
4809         }
4810       else if (c == 0x80 || c == 0xA0)
4811         goto invalid_code;
4812       else if (c >= 0xA1 && c <= 0xDF)
4813         {
4814           /* SJIS -> JISX0201-Kana */
4815           c &= 0x7F;
4816           charset = charset_kana;
4817         }
4818       else if (c <= 0xEF)
4819         {
4820           /* SJIS -> JISX0208 */
4821           ONE_MORE_BYTE (c1);
4822           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4823             goto invalid_code;
4824           c = (c << 8) | c1;
4825           SJIS_TO_JIS (c);
4826           charset = charset_kanji;
4827         }
4828       else if (c <= 0xFC && charset_kanji2)
4829         {
4830           /* SJIS -> JISX0213-2 */
4831           ONE_MORE_BYTE (c1);
4832           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4833             goto invalid_code;
4834           c = (c << 8) | c1;
4835           SJIS_TO_JIS2 (c);
4836           charset = charset_kanji2;
4837         }
4838       else
4839         goto invalid_code;
4840       if (charset->id != charset_ascii
4841           && last_id != charset->id)
4842         {
4843           if (last_id != charset_ascii)
4844             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4845           last_id = charset->id;
4846           last_offset = char_offset;
4847         }
4848       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4849       *charbuf++ = c;
4850       char_offset++;
4851       continue;
4852
4853     invalid_code:
4854       src = src_base;
4855       consumed_chars = consumed_chars_base;
4856       ONE_MORE_BYTE (c);
4857       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4858       char_offset++;
4859       coding->errors++;
4860     }
4861
4862  no_more_source:
4863   if (last_id != charset_ascii)
4864     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4865   coding->consumed_char += consumed_chars_base;
4866   coding->consumed = src_base - coding->source;
4867   coding->charbuf_used = charbuf - coding->charbuf;
4868 }
4869
4870 static void
4871 decode_coding_big5 (coding)
4872      struct coding_system *coding;
4873 {
4874   const unsigned char *src = coding->source + coding->consumed;
4875   const unsigned char *src_end = coding->source + coding->src_bytes;
4876   const unsigned char *src_base;
4877   int *charbuf = coding->charbuf + coding->charbuf_used;
4878   int *charbuf_end
4879     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4880   int consumed_chars = 0, consumed_chars_base;
4881   int multibytep = coding->src_multibyte;
4882   struct charset *charset_roman, *charset_big5;
4883   Lisp_Object attrs, charset_list, val;
4884   int char_offset = coding->produced_char;
4885   int last_offset = char_offset;
4886   int last_id = charset_ascii;
4887   int eol_crlf =
4888     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4889   int byte_after_cr = -1;
4890
4891   CODING_GET_INFO (coding, attrs, charset_list);
4892   val = charset_list;
4893   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4894   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4895
4896   while (1)
4897     {
4898       int c, c1;
4899       struct charset *charset;
4900
4901       src_base = src;
4902       consumed_chars_base = consumed_chars;
4903
4904       if (charbuf >= charbuf_end)
4905         {
4906           if (byte_after_cr >= 0)
4907             src_base--;
4908           break;
4909         }
4910
4911       if (byte_after_cr >= 0)
4912         c = byte_after_cr, byte_after_cr = -1;
4913       else
4914         ONE_MORE_BYTE (c);
4915
4916       if (c < 0)
4917         goto invalid_code;
4918       if (c < 0x80)
4919         {
4920           if (eol_crlf && c == '\r')
4921             ONE_MORE_BYTE (byte_after_cr);
4922           charset = charset_roman;
4923         }
4924       else
4925         {
4926           /* BIG5 -> Big5 */
4927           if (c < 0xA1 || c > 0xFE)
4928             goto invalid_code;
4929           ONE_MORE_BYTE (c1);
4930           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4931             goto invalid_code;
4932           c = c << 8 | c1;
4933           charset = charset_big5;
4934         }
4935       if (charset->id != charset_ascii
4936           && last_id != charset->id)
4937         {
4938           if (last_id != charset_ascii)
4939             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4940           last_id = charset->id;
4941           last_offset = char_offset;
4942         }
4943       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4944       *charbuf++ = c;
4945       char_offset++;
4946       continue;
4947
4948     invalid_code:
4949       src = src_base;
4950       consumed_chars = consumed_chars_base;
4951       ONE_MORE_BYTE (c);
4952       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4953       char_offset++;
4954       coding->errors++;
4955     }
4956
4957  no_more_source:
4958   if (last_id != charset_ascii)
4959     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4960   coding->consumed_char += consumed_chars_base;
4961   coding->consumed = src_base - coding->source;
4962   coding->charbuf_used = charbuf - coding->charbuf;
4963 }
4964
4965 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4966    This function can encode charsets `ascii', `katakana-jisx0201',
4967    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4968    are sure that all these charsets are registered as official charset
4969    (i.e. do not have extended leading-codes).  Characters of other
4970    charsets are produced without any encoding.  If SJIS_P is 1, encode
4971    SJIS text, else encode BIG5 text.  */
4972
4973 static int
4974 encode_coding_sjis (coding)
4975      struct coding_system *coding;
4976 {
4977   int multibytep = coding->dst_multibyte;
4978   int *charbuf = coding->charbuf;
4979   int *charbuf_end = charbuf + coding->charbuf_used;
4980   unsigned char *dst = coding->destination + coding->produced;
4981   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4982   int safe_room = 4;
4983   int produced_chars = 0;
4984   Lisp_Object attrs, charset_list, val;
4985   int ascii_compatible;
4986   struct charset *charset_roman, *charset_kanji, *charset_kana;
4987   struct charset *charset_kanji2;
4988   int c;
4989
4990   CODING_GET_INFO (coding, attrs, charset_list);
4991   val = charset_list;
4992   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4993   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4994   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4995   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4996
4997   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4998
4999   while (charbuf < charbuf_end)
5000     {
5001       ASSURE_DESTINATION (safe_room);
5002       c = *charbuf++;
5003       /* Now encode the character C.  */
5004       if (ASCII_CHAR_P (c) && ascii_compatible)
5005         EMIT_ONE_ASCII_BYTE (c);
5006       else if (CHAR_BYTE8_P (c))
5007         {
5008           c = CHAR_TO_BYTE8 (c);
5009           EMIT_ONE_BYTE (c);
5010         }
5011       else
5012         {
5013           unsigned code;
5014           struct charset *charset = char_charset (c, charset_list, &code);
5015
5016           if (!charset)
5017             {
5018               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5019                 {
5020                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5021                   charset = CHARSET_FROM_ID (charset_ascii);
5022                 }
5023               else
5024                 {
5025                   c = coding->default_char;
5026                   charset = char_charset (c, charset_list, &code);
5027                 }
5028             }
5029           if (code == CHARSET_INVALID_CODE (charset))
5030             abort ();
5031           if (charset == charset_kanji)
5032             {
5033               int c1, c2;
5034               JIS_TO_SJIS (code);
5035               c1 = code >> 8, c2 = code & 0xFF;
5036               EMIT_TWO_BYTES (c1, c2);
5037             }
5038           else if (charset == charset_kana)
5039             EMIT_ONE_BYTE (code | 0x80);
5040           else if (charset_kanji2 && charset == charset_kanji2)
5041             {
5042               int c1, c2;
5043
5044               c1 = code >> 8;
5045               if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
5046                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5047                 {
5048                   JIS_TO_SJIS2 (code);
5049                   c1 = code >> 8, c2 = code & 0xFF;
5050                   EMIT_TWO_BYTES (c1, c2);
5051                 }
5052               else
5053                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5054             }
5055           else
5056             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5057         }
5058     }
5059   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5060   coding->produced_char += produced_chars;
5061   coding->produced = dst - coding->destination;
5062   return 0;
5063 }
5064
5065 static int
5066 encode_coding_big5 (coding)
5067      struct coding_system *coding;
5068 {
5069   int multibytep = coding->dst_multibyte;
5070   int *charbuf = coding->charbuf;
5071   int *charbuf_end = charbuf + coding->charbuf_used;
5072   unsigned char *dst = coding->destination + coding->produced;
5073   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5074   int safe_room = 4;
5075   int produced_chars = 0;
5076   Lisp_Object attrs, charset_list, val;
5077   int ascii_compatible;
5078   struct charset *charset_roman, *charset_big5;
5079   int c;
5080
5081   CODING_GET_INFO (coding, attrs, charset_list);
5082   val = charset_list;
5083   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5084   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5085   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5086
5087   while (charbuf < charbuf_end)
5088     {
5089       ASSURE_DESTINATION (safe_room);
5090       c = *charbuf++;
5091       /* Now encode the character C.  */
5092       if (ASCII_CHAR_P (c) && ascii_compatible)
5093         EMIT_ONE_ASCII_BYTE (c);
5094       else if (CHAR_BYTE8_P (c))
5095         {
5096           c = CHAR_TO_BYTE8 (c);
5097           EMIT_ONE_BYTE (c);
5098         }
5099       else
5100         {
5101           unsigned code;
5102           struct charset *charset = char_charset (c, charset_list, &code);
5103
5104           if (! charset)
5105             {
5106               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5107                 {
5108                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5109                   charset = CHARSET_FROM_ID (charset_ascii);
5110                 }
5111               else
5112                 {
5113                   c = coding->default_char;
5114                   charset = char_charset (c, charset_list, &code);
5115                 }
5116             }
5117           if (code == CHARSET_INVALID_CODE (charset))
5118             abort ();
5119           if (charset == charset_big5)
5120             {
5121               int c1, c2;
5122
5123               c1 = code >> 8, c2 = code & 0xFF;
5124               EMIT_TWO_BYTES (c1, c2);
5125             }
5126           else
5127             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5128         }
5129     }
5130   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5131   coding->produced_char += produced_chars;
5132   coding->produced = dst - coding->destination;
5133   return 0;
5134 }
5135
5136 \f
5137 /*** 10. CCL handlers ***/
5138
5139 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5140    Check if a text is encoded in a coding system of which
5141    encoder/decoder are written in CCL program.  If it is, return
5142    CATEGORY_MASK_CCL, else return 0.  */
5143
5144 static int
5145 detect_coding_ccl (coding, detect_info)
5146      struct coding_system *coding;
5147      struct coding_detection_info *detect_info;
5148 {
5149   const unsigned char *src = coding->source, *src_base;
5150   const unsigned char *src_end = coding->source + coding->src_bytes;
5151   int multibytep = coding->src_multibyte;
5152   int consumed_chars = 0;
5153   int found = 0;
5154   unsigned char *valids;
5155   int head_ascii = coding->head_ascii;
5156   Lisp_Object attrs;
5157
5158   detect_info->checked |= CATEGORY_MASK_CCL;
5159
5160   coding = &coding_categories[coding_category_ccl];
5161   valids = CODING_CCL_VALIDS (coding);
5162   attrs = CODING_ID_ATTRS (coding->id);
5163   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5164     src += head_ascii;
5165
5166   while (1)
5167     {
5168       int c;
5169
5170       src_base = src;
5171       ONE_MORE_BYTE (c);
5172       if (c < 0 || ! valids[c])
5173         break;
5174       if ((valids[c] > 1))
5175         found = CATEGORY_MASK_CCL;
5176     }
5177   detect_info->rejected |= CATEGORY_MASK_CCL;
5178   return 0;
5179
5180  no_more_source:
5181   detect_info->found |= found;
5182   return 1;
5183 }
5184
5185 static void
5186 decode_coding_ccl (coding)
5187      struct coding_system *coding;
5188 {
5189   const unsigned char *src = coding->source + coding->consumed;
5190   const unsigned char *src_end = coding->source + coding->src_bytes;
5191   int *charbuf = coding->charbuf + coding->charbuf_used;
5192   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5193   int consumed_chars = 0;
5194   int multibytep = coding->src_multibyte;
5195   struct ccl_program ccl;
5196   int source_charbuf[1024];
5197   int source_byteidx[1024];
5198   Lisp_Object attrs, charset_list;
5199
5200   CODING_GET_INFO (coding, attrs, charset_list);
5201   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
5202
5203   while (src < src_end)
5204     {
5205       const unsigned char *p = src;
5206       int *source, *source_end;
5207       int i = 0;
5208
5209       if (multibytep)
5210         while (i < 1024 && p < src_end)
5211           {
5212             source_byteidx[i] = p - src;
5213             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5214           }
5215       else
5216         while (i < 1024 && p < src_end)
5217           source_charbuf[i++] = *p++;
5218
5219       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5220         ccl.last_block = 1;
5221
5222       source = source_charbuf;
5223       source_end = source + i;
5224       while (source < source_end)
5225         {
5226           ccl_driver (&ccl, source, charbuf,
5227                       source_end - source, charbuf_end - charbuf,
5228                       charset_list);
5229           source += ccl.consumed;
5230           charbuf += ccl.produced;
5231           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
5232             break;
5233         }
5234       if (source < source_end)
5235         src += source_byteidx[source - source_charbuf];
5236       else
5237         src = p;
5238       consumed_chars += source - source_charbuf;
5239
5240       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
5241           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
5242         break;
5243     }
5244
5245   switch (ccl.status)
5246     {
5247     case CCL_STAT_SUSPEND_BY_SRC:
5248       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5249       break;
5250     case CCL_STAT_SUSPEND_BY_DST:
5251       break;
5252     case CCL_STAT_QUIT:
5253     case CCL_STAT_INVALID_CMD:
5254       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5255       break;
5256     default:
5257       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5258       break;
5259     }
5260   coding->consumed_char += consumed_chars;
5261   coding->consumed = src - coding->source;
5262   coding->charbuf_used = charbuf - coding->charbuf;
5263 }
5264
5265 static int
5266 encode_coding_ccl (coding)
5267      struct coding_system *coding;
5268 {
5269   struct ccl_program ccl;
5270   int multibytep = coding->dst_multibyte;
5271   int *charbuf = coding->charbuf;
5272   int *charbuf_end = charbuf + coding->charbuf_used;
5273   unsigned char *dst = coding->destination + coding->produced;
5274   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5275   int destination_charbuf[1024];
5276   int i, produced_chars = 0;
5277   Lisp_Object attrs, charset_list;
5278
5279   CODING_GET_INFO (coding, attrs, charset_list);
5280   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
5281
5282   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
5283   ccl.dst_multibyte = coding->dst_multibyte;
5284
5285   while (charbuf < charbuf_end)
5286     {
5287       ccl_driver (&ccl, charbuf, destination_charbuf,
5288                   charbuf_end - charbuf, 1024, charset_list);
5289       if (multibytep)
5290         {
5291           ASSURE_DESTINATION (ccl.produced * 2);
5292           for (i = 0; i < ccl.produced; i++)
5293             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5294         }
5295       else
5296         {
5297           ASSURE_DESTINATION (ccl.produced);
5298           for (i = 0; i < ccl.produced; i++)
5299             *dst++ = destination_charbuf[i] & 0xFF;
5300           produced_chars += ccl.produced;
5301         }
5302       charbuf += ccl.consumed;
5303       if (ccl.status == CCL_STAT_QUIT
5304           || ccl.status == CCL_STAT_INVALID_CMD)
5305         break;
5306     }
5307
5308   switch (ccl.status)
5309     {
5310     case CCL_STAT_SUSPEND_BY_SRC:
5311       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5312       break;
5313     case CCL_STAT_SUSPEND_BY_DST:
5314       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5315       break;
5316     case CCL_STAT_QUIT:
5317     case CCL_STAT_INVALID_CMD:
5318       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5319       break;
5320     default:
5321       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5322       break;
5323     }
5324
5325   coding->produced_char += produced_chars;
5326   coding->produced = dst - coding->destination;
5327   return 0;
5328 }
5329
5330
5331 \f
5332 /*** 10, 11. no-conversion handlers ***/
5333
5334 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5335
5336 static void
5337 decode_coding_raw_text (coding)
5338      struct coding_system *coding;
5339 {
5340   int eol_crlf =
5341     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5342
5343   coding->chars_at_source = 1;
5344   coding->consumed_char = coding->src_chars;
5345   coding->consumed = coding->src_bytes;
5346   if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
5347     {
5348       coding->consumed_char--;
5349       coding->consumed--;
5350       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5351     }
5352   else
5353     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5354 }
5355
5356 static int
5357 encode_coding_raw_text (coding)
5358      struct coding_system *coding;
5359 {
5360   int multibytep = coding->dst_multibyte;
5361   int *charbuf = coding->charbuf;
5362   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5363   unsigned char *dst = coding->destination + coding->produced;
5364   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5365   int produced_chars = 0;
5366   int c;
5367
5368   if (multibytep)
5369     {
5370       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5371
5372       if (coding->src_multibyte)
5373         while (charbuf < charbuf_end)
5374           {
5375             ASSURE_DESTINATION (safe_room);
5376             c = *charbuf++;
5377             if (ASCII_CHAR_P (c))
5378               EMIT_ONE_ASCII_BYTE (c);
5379             else if (CHAR_BYTE8_P (c))
5380               {
5381                 c = CHAR_TO_BYTE8 (c);
5382                 EMIT_ONE_BYTE (c);
5383               }
5384             else
5385               {
5386                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5387
5388                 CHAR_STRING_ADVANCE (c, p1);
5389                 while (p0 < p1)
5390                   {
5391                     EMIT_ONE_BYTE (*p0);
5392                     p0++;
5393                   }
5394               }
5395           }
5396       else
5397         while (charbuf < charbuf_end)
5398           {
5399             ASSURE_DESTINATION (safe_room);
5400             c = *charbuf++;
5401             EMIT_ONE_BYTE (c);
5402           }
5403     }
5404   else
5405     {
5406       if (coding->src_multibyte)
5407         {
5408           int safe_room = MAX_MULTIBYTE_LENGTH;
5409
5410           while (charbuf < charbuf_end)
5411             {
5412               ASSURE_DESTINATION (safe_room);
5413               c = *charbuf++;
5414               if (ASCII_CHAR_P (c))
5415                 *dst++ = c;
5416               else if (CHAR_BYTE8_P (c))
5417                 *dst++ = CHAR_TO_BYTE8 (c);
5418               else
5419                 CHAR_STRING_ADVANCE (c, dst);
5420             }
5421         }
5422       else
5423         {
5424           ASSURE_DESTINATION (charbuf_end - charbuf);
5425           while (charbuf < charbuf_end && dst < dst_end)
5426             *dst++ = *charbuf++;
5427         }
5428       produced_chars = dst - (coding->destination + coding->produced);
5429     }
5430   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5431   coding->produced_char += produced_chars;
5432   coding->produced = dst - coding->destination;
5433   return 0;
5434 }
5435
5436 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5437    Check if a text is encoded in a charset-based coding system.  If it
5438    is, return 1, else return 0.

        The bytes to examine, the multibyte flag and the count of leading
        ASCII bytes are taken from CODING.  The table of valid bytes and
        the coding system name are taken from the coding system currently
        registered for coding_category_charset.

        When 0 is returned, CATEGORY_MASK_CHARSET is added to
        DETECT_INFO->rejected.  When 1 is returned, it is added to
        DETECT_INFO->found only if some accepted byte was >= 0x80.  */
5439
5440 static int
5441 detect_coding_charset (coding, detect_info)
5442      struct coding_system *coding;
5443      struct coding_detection_info *detect_info;
5444 {
5445   const unsigned char *src = coding->source, *src_base;
5446   const unsigned char *src_end = coding->source + coding->src_bytes;
5447   int multibytep = coding->src_multibyte;
5448   int consumed_chars = 0;
5449   Lisp_Object attrs, valids, name;
5450   int found = 0;
5451   int head_ascii = coding->head_ascii;
5452   int check_latin_extra = 0;
5453
5454   detect_info->checked |= CATEGORY_MASK_CHARSET;
5455
       /* From here on CODING is the category's coding system, not the
          caller's; everything needed from the argument was copied above.  */
5456   coding = &coding_categories[coding_category_charset];
5457   attrs = CODING_ID_ATTRS (coding->id);
5458   valids = AREF (attrs, coding_attr_charset_valids);
5459   name = CODING_ID_NAME (coding->id);
       /* For coding systems named iso-8859-* or iso-latin-*, a byte in
          0x80..0x9F is accepted only if Vlatin_extra_code_table has a
          non-nil entry for it.  */
5460   if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5461                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5462       || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5463                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5464     check_latin_extra = 1;
5465
       /* Skip the leading ASCII bytes if the coding system is ASCII
          compatible.  */
5466   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5467     src += head_ascii;
5468
5469   while (1)
5470     {
5471       int c;
5472       Lisp_Object val;
5473       struct charset *charset;
5474       int dim, idx;
5475
5476       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; presumably it
              jumps to no_more_source once the input is exhausted, which
              is the only way out of this loop that accepts the text --
              confirm.  */
5477       ONE_MORE_BYTE (c);
5478       if (c < 0)
5479         continue;
           /* VAL is nil for a byte that can't start a character, a charset
              ID, or a list of charset IDs.  */
5480       val = AREF (valids, c);
5481       if (NILP (val))
5482         break;
5483       if (c >= 0x80)
5484         {
5485           if (c < 0xA0
5486               && check_latin_extra
5487               && (!VECTORP (Vlatin_extra_code_table)
5488                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5489             break;
5490           found = CATEGORY_MASK_CHARSET;
5491         }
5492       if (INTEGERP (val))
5493         {
5494           charset = CHARSET_FROM_ID (XFASTINT (val));
5495           dim = CHARSET_DIMENSION (charset);
               /* Check that each trailing byte is within the range of its
                  dimension.  */
5496           for (idx = 1; idx < dim; idx++)
5497             {
5498               if (src == src_end)
5499                 goto too_short;
5500               ONE_MORE_BYTE (c);
                   /* Each dimension occupies four slots of code_space, the
                      minimum and maximum byte values being the first two
                      (see struct charset in charset.h).  The list branch
                      below indexes it the same way; a stride of 2 would
                      read the wrong slots when DIM is 3 or more.  */
5501               if (c < charset->code_space[(dim - 1 - idx) * 4]
5502                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5503                 break;
5504             }
5505           if (idx < dim)
5506             break;
5507         }
5508       else
5509         {
               /* Try each charset of the list in turn.  IDX is not reset
                  between charsets, so bytes already checked are not read
                  again.  */
5510           idx = 1;
5511           for (; CONSP (val); val = XCDR (val))
5512             {
5513               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5514               dim = CHARSET_DIMENSION (charset);
5515               while (idx < dim)
5516                 {
5517                   if (src == src_end)
5518                     goto too_short;
5519                   ONE_MORE_BYTE (c);
5520                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5521                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5522                     break;
5523                   idx++;
5524                 }
5525               if (idx == dim)
5526                 {
                       /* This charset matched; mark success for the test
                          after the loop.  */
5527                   val = Qnil;
5528                   break;
5529                 }
5530             }
               /* VAL is still a cons only if the last charset tried did
                  not match.  */
5531           if (CONSP (val))
5532             break;
5533         }
5534     }
       /* Reached by `goto too_short' and by every `break' out of the loop
          above.  */
5535  too_short:
5536   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5537   return 0;
5538
5539  no_more_source:
5540   detect_info->found |= found;
5541   return 1;
5542 }
5543 /* Decode the bytes of CODING->source, starting at offset
        CODING->consumed, with a charset-based coding system, and append
        the decoded characters to CODING->charbuf.

        Each lead byte is looked up in the coding system's table of valid
        bytes; the entry names either one charset or a list of charsets
        to try in order.  When a sequence cannot be decoded, its first
        byte alone is passed through (see the invalid_code label) and
        CODING->errors is incremented.

        Before returning, CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated.  */
5544 static void
5545 decode_coding_charset (coding)
5546      struct coding_system *coding;
5547 {
5548   const unsigned char *src = coding->source + coding->consumed;
5549   const unsigned char *src_end = coding->source + coding->src_bytes;
5550   const unsigned char *src_base;
5551   int *charbuf = coding->charbuf + coding->charbuf_used;
       /* Stop MAX_ANNOTATION_LENGTH slots short of the end of the buffer;
          presumably this keeps room for the ADD_CHARSET_DATA annotation
          written after the loop -- confirm.  */
5552   int *charbuf_end
5553     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
5554   int consumed_chars = 0, consumed_chars_base;
5555   int multibytep = coding->src_multibyte;
5556   Lisp_Object attrs, charset_list, valids;
5557   int char_offset = coding->produced_char;
       /* LAST_ID is the most recent non-ASCII charset decoded, and
          LAST_OFFSET the character offset at which it was first seen.  */
5558   int last_offset = char_offset;
5559   int last_id = charset_ascii;
       /* Nonzero if EOL conversion is not inhibited and the EOL type of
          the coding system is dos.  */
5560   int eol_crlf =
5561     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The byte already read after a CR, or -1 if none is pending.  */
5562   int byte_after_cr = -1;
5563
5564   CODING_GET_INFO (coding, attrs, charset_list);
5565   valids = AREF (attrs, coding_attr_charset_valids);
5566
5567   while (1)
5568     {
5569       int c;
5570       Lisp_Object val;
5571       struct charset *charset;
5572       int dim;
5573       int len = 1;
5574       unsigned code;
5575
           /* Remember where this character starts so that invalid_code
              and the exit code can go back to it.  */
5576       src_base = src;
5577       consumed_chars_base = consumed_chars;
5578
5579       if (charbuf >= charbuf_end)
5580         {
               /* The pending byte has been read but not decoded; step
                  back so that it is not reported as consumed.  */
5581           if (byte_after_cr >= 0)
5582             src_base--;
5583           break;
5584         }
5585
5586       if (byte_after_cr >= 0)
5587         {
5588           c = byte_after_cr;
5589           byte_after_cr = -1;
5590         }
5591       else
5592         {
               /* NOTE(review): ONE_MORE_BYTE is defined elsewhere;
                  presumably it jumps to no_more_source when the source is
                  exhausted -- confirm.  */
5593           ONE_MORE_BYTE (c);
               /* With dos EOL, read the byte following a CR right away and
                  keep it for the next iteration.  */
5594           if (eol_crlf && c == '\r')
5595             ONE_MORE_BYTE (byte_after_cr);
5596         }
5597       if (c < 0)
5598         goto invalid_code;
5599       code = c;
5600
5601       val = AREF (valids, c);
5602       if (! INTEGERP (val) && ! CONSP (val))
5603         goto invalid_code;
5604       if (INTEGERP (val))
5605         {
               /* VAL is a single charset ID.  Collect the remaining bytes
                  of the code point, most significant byte first.  */
5606           charset = CHARSET_FROM_ID (XFASTINT (val));
5607           dim = CHARSET_DIMENSION (charset);
5608           while (len < dim)
5609             {
5610               ONE_MORE_BYTE (c);
5611               code = (code << 8) | c;
5612               len++;
5613             }
5614           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5615                               charset, code, c);
5616         }
5617       else
5618         {
5619           /* VAL is a list of charset IDs.  It is assured that the
5620              list is sorted by charset dimensions (smaller one
5621              comes first).  */
5622           while (CONSP (val))
5623             {
5624               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5625               dim = CHARSET_DIMENSION (charset);
                   /* LEN and CODE carry over from the previous charset, so
                      only the additional bytes are read.  */
5626               while (len < dim)
5627                 {
5628                   ONE_MORE_BYTE (c);
5629                   code = (code << 8) | c;
5630                   len++;
5631                 }
5632               CODING_DECODE_CHAR (coding, src, src_base,
5633                                   src_end, charset, code, c);
                   /* A non-negative C means this charset decoded CODE.  */
5634               if (c >= 0)
5635                 break;
5636               val = XCDR (val);
5637             }
5638         }
5639       if (c < 0)
5640         goto invalid_code;
           /* On a change to a different non-ASCII charset, record the
              charset of the characters decoded since LAST_OFFSET.  */
5641       if (charset->id != charset_ascii
5642           && last_id != charset->id)
5643         {
5644           if (last_id != charset_ascii)
5645             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5646           last_id = charset->id;
5647           last_offset = char_offset;
5648         }
5649
5650       *charbuf++ = c;
5651       char_offset++;
5652       continue;
5653
5654     invalid_code:
           /* Go back to the start of the sequence and pass through just
              its first byte: an ASCII byte as is, another byte via
              BYTE8_TO_CHAR, and a negative value negated.  */
5655       src = src_base;
5656       consumed_chars = consumed_chars_base;
5657       ONE_MORE_BYTE (c);
5658       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5659       char_offset++;
5660       coding->errors++;
5661     }
5662
5663  no_more_source:
       /* Record the charset of the last characters decoded, then report
          the amount consumed up to the start of the last incomplete
          character.  */
5664   if (last_id != charset_ascii)
5665     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5666   coding->consumed_char += consumed_chars_base;
5667   coding->consumed = src_base - coding->source;
5668   coding->charbuf_used = charbuf - coding->charbuf;
5669 }
5670 /* Encode the characters in CODING->charbuf with a charset-based
        coding system and append the resulting bytes to
        CODING->destination, starting at offset CODING->produced.

        A character that belongs to none of the charsets of the coding
        system is replaced by a substitution character.  Record
        CODING_RESULT_SUCCESS, update CODING->produced and
        CODING->produced_char, and return 0.  */
5671 static int
5672 encode_coding_charset (coding)
5673      struct coding_system *coding;
5674 {
5675   int multibytep = coding->dst_multibyte;
5676   int *charbuf = coding->charbuf;
5677   int *charbuf_end = charbuf + coding->charbuf_used;
5678   unsigned char *dst = coding->destination + coding->produced;
5679   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5680   int safe_room = MAX_MULTIBYTE_LENGTH;
5681   int produced_chars = 0;
5682   Lisp_Object attrs, charset_list;
5683   int ascii_compatible;
5684   int c;
5685
5686   CODING_GET_INFO (coding, attrs, charset_list);
5687   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5688
5689   while (charbuf < charbuf_end)
5690     {
5691       struct charset *charset;
5692       unsigned code;
5693
           /* NOTE(review): ASSURE_DESTINATION is defined elsewhere;
              presumably it makes sure SAFE_ROOM bytes are available at
              DST -- confirm.  */
5694       ASSURE_DESTINATION (safe_room);
5695       c = *charbuf++;
5696       if (ascii_compatible && ASCII_CHAR_P (c))
5697         EMIT_ONE_ASCII_BYTE (c);
5698       else if (CHAR_BYTE8_P (c))
5699         {
               /* A raw 8-bit byte is emitted as that byte.  */
5700           c = CHAR_TO_BYTE8 (c);
5701           EMIT_ONE_BYTE (c);
5702         }
5703       else
5704         {
               /* Find a charset in CHARSET_LIST for C and get the code
                  point in CODE.  */
5705           charset = char_charset (c, charset_list, &code);
5706           if (charset)
5707             {
                   /* Emit one byte per dimension, most significant byte
                      first.  */
5708               if (CHARSET_DIMENSION (charset) == 1)
5709                 EMIT_ONE_BYTE (code);
5710               else if (CHARSET_DIMENSION (charset) == 2)
5711                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5712               else if (CHARSET_DIMENSION (charset) == 3)
5713                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5714               else
5715                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5716                                  (code >> 8) & 0xFF, code & 0xFF);
5717             }
5718           else
5719             {
                   /* No charset can encode C.  Emit a substitute as a
                      single byte.  */
5720               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5721                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5722               else
5723                 c = coding->default_char;
5724               EMIT_ONE_BYTE (c);
5725             }
5726         }
5727     }
5728
5729   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5730   coding->produced_char += produced_chars;
5731   coding->produced = dst - coding->destination;
5732   return 0;
5733 }
5734
5735 \f
5736 /*** 10. C library functions ***/
5737
5738 /* Setup coding context CODING from information about CODING_SYSTEM.
5739    If CODING_SYSTEM is nil, `undecided' is used instead.  If
5740    CODING_SYSTEM is invalid, signal an error.

        This sets CODING->id, mode, head_ascii, common_flags, the safe
        charsets data, default_char, the detector, decoder and encoder
        functions, and the fields specific to the type of the coding
        system.  */
5741
5742 void
5743 setup_coding_system (coding_system, coding)
5744      Lisp_Object coding_system;
5745      struct coding_system *coding;
5746 {
5747   Lisp_Object attrs;
5748   Lisp_Object eol_type;
5749   Lisp_Object coding_type;
5750   Lisp_Object val;
5751
5752   if (NILP (coding_system))
5753     coding_system = Qundecided;
5754
5755   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5756
5757   attrs = CODING_ID_ATTRS (coding->id);
       /* When EOL conversion is inhibited, behave as if the EOL type were
          unix.  */
5758   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5759
5760   coding->mode = 0;
5761   coding->head_ascii = -1;
       /* The EOL type alone decides the initial flags: a vector asks for
          decoding and detection, a fixed type other than unix asks for
          decoding and encoding, and unix asks for nothing.  */
5762   if (VECTORP (eol_type))
5763     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5764                             | CODING_REQUIRE_DETECTION_MASK);
5765   else if (! EQ (eol_type, Qunix))
5766     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5767                             | CODING_REQUIRE_ENCODING_MASK);
5768   else
5769     coding->common_flags = 0;
5770   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5771     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5772   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5773     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5774   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5775     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5776
       /* CODING->safe_charsets points into the data of the Lisp string
          VAL.  */
5777   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5778   coding->max_charset_id = SCHARS (val) - 1;
5779   coding->safe_charsets = SDATA (val);
5780   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5781
       /* Install the converters for the type of the coding system.  */
5782   coding_type = CODING_ATTR_TYPE (attrs);
5783   if (EQ (coding_type, Qundecided))
5784     {
           /* Not decided yet: use the raw-text converters and ask for
              detection.  */
5785       coding->detector = NULL;
5786       coding->decoder = decode_coding_raw_text;
5787       coding->encoder = encode_coding_raw_text;
5788       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5789     }
5790   else if (EQ (coding_type, Qiso_2022))
5791     {
5792       int i;
5793       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5794
5795       /* Invoke graphic register 0 to plane 0.  */
5796       CODING_ISO_INVOCATION (coding, 0) = 0;
5797       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5798       CODING_ISO_INVOCATION (coding, 1)
5799         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5800       /* Setup the initial status of designation.  */
5801       for (i = 0; i < 4; i++)
5802         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5803       /* Not single shifting initially.  */
5804       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5805       /* Beginning of buffer should also be regarded as bol. */
5806       CODING_ISO_BOL (coding) = 1;
5807       coding->detector = detect_coding_iso_2022;
5808       coding->decoder = decode_coding_iso_2022;
5809       coding->encoder = encode_coding_iso_2022;
5810       if (flags & CODING_ISO_FLAG_SAFE)
5811         coding->mode |= CODING_MODE_SAFE_ENCODING;
5812       coding->common_flags
5813         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5814             | CODING_REQUIRE_FLUSHING_MASK);
5815       if (flags & CODING_ISO_FLAG_COMPOSITION)
5816         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5817       if (flags & CODING_ISO_FLAG_DESIGNATION)
5818         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5819       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5820         {
               /* Re-read the safe charsets after the call, replacing the
                  values stored above.  */
5821           setup_iso_safe_charsets (attrs);
5822           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5823           coding->max_charset_id = SCHARS (val) - 1;
5824           coding->safe_charsets = SDATA (val);
5825         }
5826       CODING_ISO_FLAGS (coding) = flags;
5827       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5828       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5829       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5830       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5831     }
5832   else if (EQ (coding_type, Qcharset))
5833     {
5834       coding->detector = detect_coding_charset;
5835       coding->decoder = decode_coding_charset;
5836       coding->encoder = encode_coding_charset;
5837       coding->common_flags
5838         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5839     }
5840   else if (EQ (coding_type, Qutf_8))
5841     {
           /* A cons for the BOM attribute means the BOM must be detected,
              t means with BOM, anything else means without BOM.  */
5842       val = AREF (attrs, coding_attr_utf_bom);
5843       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5844                                    : EQ (val, Qt) ? utf_with_bom
5845                                    : utf_without_bom);
5846       coding->detector = detect_coding_utf_8;
5847       coding->decoder = decode_coding_utf_8;
5848       coding->encoder = encode_coding_utf_8;
5849       coding->common_flags
5850         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5851       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5852         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5853     }
5854   else if (EQ (coding_type, Qutf_16))
5855     {
           /* The BOM attribute is interpreted as for utf-8 above.  */
5856       val = AREF (attrs, coding_attr_utf_bom);
5857       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5858                                     : EQ (val, Qt) ? utf_with_bom
5859                                     : utf_without_bom);
           /* Any endian attribute other than `big' selects little
              endian.  */
5860       val = AREF (attrs, coding_attr_utf_16_endian);
5861       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5862                                        : utf_16_little_endian);
5863       CODING_UTF_16_SURROGATE (coding) = 0;
5864       coding->detector = detect_coding_utf_16;
5865       coding->decoder = decode_coding_utf_16;
5866       coding->encoder = encode_coding_utf_16;
5867       coding->common_flags
5868         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5869       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5870         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5871     }
5872   else if (EQ (coding_type, Qccl))
5873     {
5874       coding->detector = detect_coding_ccl;
5875       coding->decoder = decode_coding_ccl;
5876       coding->encoder = encode_coding_ccl;
5877       coding->common_flags
5878         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5879             | CODING_REQUIRE_FLUSHING_MASK);
5880     }
5881   else if (EQ (coding_type, Qemacs_mule))
5882     {
5883       coding->detector = detect_coding_emacs_mule;
5884       coding->decoder = decode_coding_emacs_mule;
5885       coding->encoder = encode_coding_emacs_mule;
5886       coding->common_flags
5887         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
           /* NOTE(review): full_support is set to 1 here and again inside
              the block below, so the second assignment has no effect as
              written -- confirm which value was intended here.  */
5888       coding->spec.emacs_mule.full_support = 1;
5889       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5890           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5891         {
5892           Lisp_Object tail, safe_charsets;
5893           int max_charset_id = 0;
5894
               /* Make a string indexed by charset ID in which the charsets
                  of Vemacs_mule_charset_list are marked 0 and all others
                  255, and use it as the safe charsets data.  */
5895           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5896                tail = XCDR (tail))
5897             if (max_charset_id < XFASTINT (XCAR (tail)))
5898               max_charset_id = XFASTINT (XCAR (tail));
5899           safe_charsets = make_uninit_string (max_charset_id + 1);
5900           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5901           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5902                tail = XCDR (tail))
5903             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5904           coding->max_charset_id = max_charset_id;
5905           coding->safe_charsets = SDATA (safe_charsets);
5906           coding->spec.emacs_mule.full_support = 1;
5907         }
5908       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5909       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5910     }
5911   else if (EQ (coding_type, Qshift_jis))
5912     {
5913       coding->detector = detect_coding_sjis;
5914       coding->decoder = decode_coding_sjis;
5915       coding->encoder = encode_coding_sjis;
5916       coding->common_flags
5917         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5918     }
5919   else if (EQ (coding_type, Qbig5))
5920     {
5921       coding->detector = detect_coding_big5;
5922       coding->decoder = decode_coding_big5;
5923       coding->encoder = encode_coding_big5;
5924       coding->common_flags
5925         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5926     }
5927   else                          /* EQ (coding_type, Qraw_text) */
5928     {
5929       coding->detector = NULL;
5930       coding->decoder = decode_coding_raw_text;
5931       coding->encoder = encode_coding_raw_text;
           /* Raw text needs conversion only for EOL handling: decoding
              unless the EOL type is unix, and encoding too if the EOL
              type is fixed.  */
5932       if (! EQ (eol_type, Qunix))
5933         {
5934           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5935           if (! VECTORP (eol_type))
5936             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5937         }
5938
5939     }
5940
5941   return;
5942 }
5943
5944 /* Return a list of charsets supported by CODING.

        For an iso-2022 coding system with CODING_ISO_FLAG_FULL_SUPPORT,
        that is Viso_2022_charset_list.  For an emacs-mule coding system,
        it is Vemacs_mule_charset_list.  Otherwise it is the charset list
        obtained by CODING_GET_INFO.  */
5945
5946 Lisp_Object
5947 coding_charset_list (coding)
5948      struct coding_system *coding;
5949 {
5950   Lisp_Object attrs, charset_list;
5951
5952   CODING_GET_INFO (coding, attrs, charset_list);
5953   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5954     {
5955       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5956
5957       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5958         charset_list = Viso_2022_charset_list;
5959     }
5960   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5961     {
5962       charset_list = Vemacs_mule_charset_list;
5963     }
5964   return charset_list;
5965 }
5966
5967
5968 /* Return a list of charsets supported by CODING-SYSTEM.

        CODING-SYSTEM is checked by CHECK_CODING_SYSTEM_GET_ID first.
        The list is chosen as in coding_charset_list, but from the
        attributes of CODING-SYSTEM rather than from a coding context.  */
5969
5970 Lisp_Object
5971 coding_system_charset_list (coding_system)
5972      Lisp_Object coding_system;
5973 {
5974   int id;
5975   Lisp_Object attrs, charset_list;
5976
5977   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5978   attrs = CODING_ID_ATTRS (id);
5979
5980   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5981     {
5982       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5983
           /* With full support, every iso-2022 charset is usable.  */
5984       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5985         charset_list = Viso_2022_charset_list;
5986       else
5987         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5988     }
5989   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5990     {
5991       charset_list = Vemacs_mule_charset_list;
5992     }
5993   else
5994     {
5995       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5996     }
5997   return charset_list;
5998 }
5999
6000
6001 /* Return raw-text or one of its subsidiaries that has the same
6002    eol_type as CODING-SYSTEM.

        If CODING-SYSTEM is nil, return raw-text.  If CODING-SYSTEM is
        already of type raw-text, return it unchanged.  If its eol_type
        is a vector, i.e. not yet fixed, return raw-text itself.  */
6003
6004 Lisp_Object
6005 raw_text_coding_system (coding_system)
6006      Lisp_Object coding_system;
6007 {
6008   Lisp_Object spec, attrs;
6009   Lisp_Object eol_type, raw_text_eol_type;
6010
6011   if (NILP (coding_system))
6012     return Qraw_text;
       /* Element 0 of the spec holds the attributes, element 2 the
          eol_type.  */
6013   spec = CODING_SYSTEM_SPEC (coding_system);
6014   attrs = AREF (spec, 0);
6015
6016   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6017     return coding_system;
6018
6019   eol_type = AREF (spec, 2);
6020   if (VECTORP (eol_type))
6021     return Qraw_text;
       /* The eol_type of raw-text is indexed here as a vector of its
          subsidiaries: 0 for unix, 1 for dos, and 2 for anything else.  */
6022   spec = CODING_SYSTEM_SPEC (Qraw_text);
6023   raw_text_eol_type = AREF (spec, 2);
6024   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6025           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6026           : AREF (raw_text_eol_type, 2));
6027 }
6028
6029
6030 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
6031    does, return one of the subsidiary that has the same eol-spec as
6032    PARENT.  Otherwise, return CODING_SYSTEM.  If PARENT is nil,
6033    inherit end-of-line format from the system's setting
6034    (system_eol_type).

        If CODING_SYSTEM is nil, raw-text is used in its place.  If the
        end-of-line format to inherit is none of unix, dos and mac,
        CODING_SYSTEM is returned as is.  */
6035
6036 Lisp_Object
6037 coding_inherit_eol_type (coding_system, parent)
6038      Lisp_Object coding_system, parent;
6039 {
6040   Lisp_Object spec, eol_type;
6041
6042   if (NILP (coding_system))
6043     coding_system = Qraw_text;
6044   spec = CODING_SYSTEM_SPEC (coding_system);
6045   eol_type = AREF (spec, 2);
       /* A vector here means that the end-of-line format is not fixed;
          its elements are indexed below as the unix, dos and mac
          subsidiaries.  */
6046   if (VECTORP (eol_type))
6047     {
6048       Lisp_Object parent_eol_type;
6049
6050       if (! NILP (parent))
6051         {
6052           Lisp_Object parent_spec;
6053
6054           parent_spec = CODING_SYSTEM_SPEC (parent);
6055           parent_eol_type = AREF (parent_spec, 2);
6056         }
6057       else
6058         parent_eol_type = system_eol_type;
6059       if (EQ (parent_eol_type, Qunix))
6060         coding_system = AREF (eol_type, 0);
6061       else if (EQ (parent_eol_type, Qdos))
6062         coding_system = AREF (eol_type, 1);
6063       else if (EQ (parent_eol_type, Qmac))
6064         coding_system = AREF (eol_type, 2);
6065     }
6066   return coding_system;
6067 }
6068
6069 /* Emacs has a mechanism to automatically detect a coding system if it
6070    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6071    it's impossible to distinguish some coding systems accurately
6072    because they use the same range of codes.  So, at first, coding
6073    systems are classified into the following categories:
6074
6075    o coding-category-emacs-mule
6076
6077         The category for a coding system which has the same code range
6078         as Emacs' internal format.  Assigned the coding-system (Lisp
6079         symbol) `emacs-mule' by default.
6080
6081    o coding-category-sjis
6082
6083         The category for a coding system which has the same code range
6084         as SJIS.  Assigned the coding-system (Lisp
6085         symbol) `japanese-shift-jis' by default.
6086
6087    o coding-category-iso-7
6088
6089         The category for a coding system which has the same code range
6090         as ISO2022 of 7-bit environment.  This doesn't use any locking
6091         shift and single shift functions.  This can encode/decode all
6092         charsets.  Assigned the coding-system (Lisp symbol)
6093         `iso-2022-7bit' by default.
6094
6095    o coding-category-iso-7-tight
6096
6097         Same as coding-category-iso-7 except that this can
6098         encode/decode only the specified charsets.
6099
6100    o coding-category-iso-8-1
6101
6102         The category for a coding system which has the same code range
6103         as ISO2022 of 8-bit environment and graphic plane 1 used only
6104         for DIMENSION1 charset.  This doesn't use any locking shift
6105         and single shift functions.  Assigned the coding-system (Lisp
6106         symbol) `iso-latin-1' by default.
6107
6108    o coding-category-iso-8-2
6109
6110         The category for a coding system which has the same code range
6111         as ISO2022 of 8-bit environment and graphic plane 1 used only
6112         for DIMENSION2 charset.  This doesn't use any locking shift
6113         and single shift functions.  Assigned the coding-system (Lisp
6114         symbol) `japanese-iso-8bit' by default.
6115
6116    o coding-category-iso-7-else
6117
6118         The category for a coding system which has the same code range
6119         as ISO2022 of 7-bit environment but uses locking shift or
6120         single shift functions.  Assigned the coding-system (Lisp
6121         symbol) `iso-2022-7bit-lock' by default.
6122
6123    o coding-category-iso-8-else
6124
6125         The category for a coding system which has the same code range
6126         as ISO2022 of 8-bit environment but uses locking shift or
6127         single shift functions.  Assigned the coding-system (Lisp
6128         symbol) `iso-2022-8bit-ss2' by default.
6129
6130    o coding-category-big5
6131
6132         The category for a coding system which has the same code range
6133         as BIG5.  Assigned the coding-system (Lisp symbol)
6134         `cn-big5' by default.
6135
6136    o coding-category-utf-8
6137
6138         The category for a coding system which has the same code range
6139         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6140         symbol) `utf-8' by default.
6141
6142    o coding-category-utf-16-be
6143
6144         The category for a coding system in which a text has a
6145         Unicode signature (cf. Unicode Standard) in the order of BIG
6146         endian at the head.  Assigned the coding-system (Lisp symbol)
6147         `utf-16-be' by default.
6148
6149    o coding-category-utf-16-le
6150
6151         The category for a coding system in which a text has a
6152         Unicode signature (cf. Unicode Standard) in the order of
6153         LITTLE endian at the head.  Assigned the coding-system (Lisp
6154         symbol) `utf-16-le' by default.
6155
6156    o coding-category-ccl
6157
6158         The category for a coding system of which encoder/decoder is
6159         written in CCL programs.  The default value is nil, i.e., no
6160         coding system is assigned.
6161
6162    o coding-category-binary
6163
6164         The category for a coding system not categorized in any of the
6165         above.  Assigned the coding-system (Lisp symbol)
6166         `no-conversion' by default.
6167
6168    Each of them is a Lisp symbol whose value is the actual
6169    `coding-system' (also a Lisp symbol) assigned to it by a user.
6170    What Emacs actually detects is the category of a coding system;
6171    it then uses the `coding-system' assigned to that category.  If
6172    Emacs can't narrow the candidates down to a single category, it
6173    selects the category of the highest priority.  The priorities are
6174    also specified by a user, in the Lisp variable `coding-category-list'.
6175
6176 */
6177
6178 #define EOL_SEEN_NONE   0       /* No end-of-line seen.  */
6179 #define EOL_SEEN_LF     1       /* LF alone.  */
6180 #define EOL_SEEN_CR     2       /* CR not followed by LF.  */
6181 #define EOL_SEEN_CRLF   4       /* CR followed by LF.  */
6182
6183 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6184    SOURCE is encoded.  If CATEGORY is one of
6185    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6186    two-byte, else they are encoded by one-byte.
6187
6188    Return one of EOL_SEEN_XXX.

        At most MAX_EOL_CHECK_COUNT end-of-lines are examined.  If they
        are a mixture of CR and CRLF only, the result is EOL_SEEN_CRLF.
        Any other mixture stops the scan with EOL_SEEN_LF.  If no
        end-of-line is found, the result is EOL_SEEN_NONE.  */
6189
     /* The number of end-of-lines after which detect_eol stops.  */
6190 #define MAX_EOL_CHECK_COUNT 3
6191
6192 static int
6193 detect_eol (source, src_bytes, category)
6194      const unsigned char *source;
6195      EMACS_INT src_bytes;
6196      enum coding_category category;
6197 {
6198   const unsigned char *src = source, *src_end = src + src_bytes;
6199   unsigned char c;
6200   int total  = 0;
6201   int eol_seen = EOL_SEEN_NONE;
6202
6203   if ((1 << category) & CATEGORY_MASK_UTF_16)
6204     {
           /* Offsets (0 or 1) of the high-order and the low-order byte
              within a two-byte code unit.  */
6205       int msb, lsb;
6206
           /* In little endian the high-order byte comes second.  CATEGORY
              is a bit index, as the test above shows, so it has to be
              compared with each little endian category separately;
              comparing it with the bitwise OR of the two indices does not
              test for membership.  */
6207       msb = (category == coding_category_utf_16_le
6208              || category == coding_category_utf_16_le_nosig);
6209       lsb = 1 - msb;
6210
           /* Examine one complete code unit at a time.  */
6211       while (src + 1 < src_end)
6212         {
6213           c = src[lsb];
6214           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6215             {
6216               int this_eol;
6217
6218               if (c == '\n')
6219                 this_eol = EOL_SEEN_LF;
                   /* A CR counts as CRLF only if a complete code unit for
                      LF follows it.  */
6220               else if (src + 3 >= src_end
6221                        || src[msb + 2] != 0
6222                        || src[lsb + 2] != '\n')
6223                 this_eol = EOL_SEEN_CR;
6224               else
6225                 {
6226                   this_eol = EOL_SEEN_CRLF;
6227                   src += 2;
6228                 }
6229
6230               if (eol_seen == EOL_SEEN_NONE)
6231                 /* This is the first end-of-line.  */
6232                 eol_seen = this_eol;
6233               else if (eol_seen != this_eol)
6234                 {
6235                   /* The found type is different from what found before.
6236                      Allow for stray ^M characters in DOS EOL files.  */
6237                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6238                       || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6239                     eol_seen = EOL_SEEN_CRLF;
6240                   else
6241                     {
6242                       eol_seen = EOL_SEEN_LF;
6243                       break;
6244                     }
6245                 }
6246               if (++total == MAX_EOL_CHECK_COUNT)
6247                 break;
6248             }
6249           src += 2;
6250         }
6251     }
6252   else
6253     {
           /* CR and LF are single bytes.  */
6254       while (src < src_end)
6255         {
6256           c = *src++;
6257           if (c == '\n' || c == '\r')
6258             {
6259               int this_eol;
6260
6261               if (c == '\n')
6262                 this_eol = EOL_SEEN_LF;
6263               else if (src >= src_end || *src != '\n')
6264                 this_eol = EOL_SEEN_CR;
6265               else
6266                 this_eol = EOL_SEEN_CRLF, src++;
6267
6268               if (eol_seen == EOL_SEEN_NONE)
6269                 /* This is the first end-of-line.  */
6270                 eol_seen = this_eol;
6271               else if (eol_seen != this_eol)
6272                 {
6273                   /* The found type is different from what found before.
6274                      Allow for stray ^M characters in DOS EOL files.  */
6275                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6276                       || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6277                     eol_seen = EOL_SEEN_CRLF;
6278                   else
6279                     {
6280                       eol_seen = EOL_SEEN_LF;
6281                       break;
6282                     }
6283                 }
6284               if (++total == MAX_EOL_CHECK_COUNT)
6285                 break;
6286             }
6287         }
6288     }
6289   return eol_seen;
6290 }
6291
6292 /* Change CODING->id to the subsidiary coding system that matches
        EOL_SEEN, a combination of EOL_SEEN_XXX flags, and return the
        corresponding end-of-line type: Qunix, Qdos or Qmac.  LF takes
        precedence over CRLF, and CRLF over CR.

        If none of the three flags is set, leave CODING->id alone and
        return the end-of-line type it already has.

        NOTE(review): the end-of-line type of CODING->id is indexed with
        AREF, so it is presumably required to be a vector of the unix,
        dos and mac subsidiaries on entry -- confirm against callers.  */
6293 static Lisp_Object
6294 adjust_coding_eol_type (coding, eol_seen)
6295      struct coding_system *coding;
6296      int eol_seen;
6297 {
6298   Lisp_Object eol_type;
6299
6300   eol_type = CODING_ID_EOL_TYPE (coding->id);
6301   if (eol_seen & EOL_SEEN_LF)
6302     {
6303       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6304       eol_type = Qunix;
6305     }
6306   else if (eol_seen & EOL_SEEN_CRLF)
6307     {
6308       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6309       eol_type = Qdos;
6310     }
6311   else if (eol_seen & EOL_SEEN_CR)
6312     {
6313       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6314       eol_type = Qmac;
6315     }
6316   return eol_type;
6317 }
6318
/* Detect how a text specified in CODING is encoded.  If a coding
   system is detected, update fields of CODING by the detected coding
   system.

   Three cases are handled, keyed on the coding system currently set
   in CODING:

   - its type is Qundecided: the source bytes are scanned once, then
     the category detectors are tried in the order given by
     coding_priorities;
   - its category is coding_category_utf_8_auto: one element of the
     cons stored in the coding_attr_utf_bom attribute is chosen,
     depending on whether CATEGORY_MASK_UTF_8_SIG was found;
   - its category is coding_category_utf_16_auto: one element of that
     cons is chosen, depending on whether CATEGORY_MASK_UTF_16_LE or
     CATEGORY_MASK_UTF_16_BE was found.

   On entry the consumed/produced counters and CODING->head_ascii are
   reset to zero.  CODING->mode is saved on entry and restored before
   returning.  */

void
detect_coding (coding)
     struct coding_system *coding;
{
  const unsigned char *src, *src_end;
  int saved_mode = coding->mode;

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding_set_source (coding);

  src_end = coding->source + coding->src_bytes;
  coding->head_ascii = 0;

  /* If we have not yet decided the text encoding type, detect it
     now.  */
  if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
    {
      int c, i;
      struct coding_detection_info detect_info;
      int null_byte_found = 0, eight_bit_found = 0;

      detect_info.checked = detect_info.found = detect_info.rejected = 0;
      /* First pass: count the leading bytes that precede any 8-bit
         byte in CODING->head_ascii, and note whether a null byte or
         an 8-bit byte occurs.  The scan stops early once both kinds
         have been seen.  */
      for (src = coding->source; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              /* A control byte.  ESC, SI and SO trigger the ISO-2022
                 detector, at most once (DETECT_INFO.checked is still
                 zero only before the first detector has run).  */
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_iso_escape_detection
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conforms to
                             ISO-2022.  */
                          src = src_end;
                          coding->head_ascii = src - coding->source;
                        }
                      /* Reject every category outside
                         CATEGORY_MASK_ISO_ESCAPE.  */
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_null_byte_detection)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding->head_ascii++;
            }
          else if (! eight_bit_found)
            coding->head_ascii++;
        }

      /* Further detection is needed unless the whole source was
         counted in head_ascii with no null byte and no detector
         hit.  */
      if (null_byte_found || eight_bit_found
          || coding->head_ascii < coding->src_bytes
          || detect_info.found)
        {
          enum coding_category category;
          struct coding_system *this;

          if (coding->head_ascii == coding->src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* With a null byte present, mark every category
                     outside CATEGORY_MASK_UTF_16 as already checked
                     and rejected, so that no detector is run for
                     them below.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              /* Try the categories in priority order and stop at the
                 first one that is found.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;
                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      /* Already examined by an earlier detector call;
                         just consult the recorded result.  */
                      if (detect_info.found & (1 << category))
                        break;
                    }
                  else if ((*(this->detector)) (coding, &detect_info)
                           && detect_info.found & (1 << category))
                    {
                      /* NOTE(review): CATEGORY is reassigned here but
                         THIS is not, and only THIS is read after the
                         loop, so this reassignment appears to have no
                         effect in this function -- confirm intent.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }

          /* Choose the coding system: the found category if any; else
             no-conversion when a null byte was seen; else raw-text
             when every category was rejected; else the highest
             priority category that was not rejected.  If nothing was
             rejected at all, CODING is left unchanged.  */
          if (i < coding_category_raw_text)
            setup_coding_system (CODING_ID_NAME (this->id), coding);
          else if (null_byte_found)
            setup_coding_system (Qno_conversion, coding);
          else if ((detect_info.rejected & CATEGORY_MASK_ANY)
                   == CATEGORY_MASK_ANY)
            setup_coding_system (Qraw_text, coding);
          else if (detect_info.rejected)
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  this = coding_categories + coding_priorities[i];
                  setup_coding_system (CODING_ID_NAME (this->id), coding);
                  break;
                }
        }
    }
  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
           == coding_category_utf_8_auto)
    {
      Lisp_Object coding_systems;
      struct coding_detection_info detect_info;

      /* The attribute is used below as a cons: the car is chosen when
         CATEGORY_MASK_UTF_8_SIG was found, the cdr otherwise.  */
      coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      /* NOTE(review): DETECT_INFO.checked is not initialized in this
         branch; verify that detect_coding_utf_8 does not read it.  */
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
      if (CONSP (coding_systems)
          && detect_coding_utf_8 (coding, &detect_info))
        {
          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            setup_coding_system (XCAR (coding_systems), coding);
          else
            setup_coding_system (XCDR (coding_systems), coding);
        }
    }
  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
           == coding_category_utf_16_auto)
    {
      Lisp_Object coding_systems;
      struct coding_detection_info detect_info;

      /* The attribute is used below as a cons: the car is chosen for
         CATEGORY_MASK_UTF_16_LE, the cdr for CATEGORY_MASK_UTF_16_BE.
         If neither was found, CODING is left unchanged.  */
      coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      /* NOTE(review): DETECT_INFO.checked is not initialized in this
         branch; verify that detect_coding_utf_16 does not read it.  */
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
      if (CONSP (coding_systems)
          && detect_coding_utf_16 (coding, &detect_info))
        {
          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            setup_coding_system (XCAR (coding_systems), coding);
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            setup_coding_system (XCDR (coding_systems), coding);
        }
    }
  coding->mode = saved_mode;
}
6501
6502
6503 static void
6504 decode_eol (coding)
6505      struct coding_system *coding;
6506 {
6507   Lisp_Object eol_type;
6508   unsigned char *p, *pbeg, *pend;
6509
6510   eol_type = CODING_ID_EOL_TYPE (coding->id);
6511   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6512     return;
6513
6514   if (NILP (coding->dst_object))
6515     pbeg = coding->destination;
6516   else
6517     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6518   pend = pbeg + coding->produced;
6519
6520   if (VECTORP (eol_type))
6521     {
6522       int eol_seen = EOL_SEEN_NONE;
6523
6524       for (p = pbeg; p < pend; p++)
6525         {
6526           if (*p == '\n')
6527             eol_seen |= EOL_SEEN_LF;
6528           else if (*p == '\r')
6529             {
6530               if (p + 1 < pend && *(p + 1) == '\n')
6531                 {
6532                   eol_seen |= EOL_SEEN_CRLF;
6533                   p++;
6534                 }
6535               else
6536                 eol_seen |= EOL_SEEN_CR;
6537             }
6538         }
6539       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6540       if ((eol_seen & EOL_SEEN_CRLF) != 0
6541           && (eol_seen & EOL_SEEN_CR) != 0
6542           && (eol_seen & EOL_SEEN_LF) == 0)
6543         eol_seen = EOL_SEEN_CRLF;
6544       else if (eol_seen != EOL_SEEN_NONE
6545           && eol_seen != EOL_SEEN_LF
6546           && eol_seen != EOL_SEEN_CRLF
6547           && eol_seen != EOL_SEEN_CR)
6548         eol_seen = EOL_SEEN_LF;
6549       if (eol_seen != EOL_SEEN_NONE)
6550         eol_type = adjust_coding_eol_type (coding, eol_seen);
6551     }
6552
6553   if (EQ (eol_type, Qmac))
6554     {
6555       for (p = pbeg; p < pend; p++)
6556         if (*p == '\r')
6557           *p = '\n';
6558     }
6559   else if (EQ (eol_type, Qdos))
6560     {
6561       int n = 0;
6562
6563       if (NILP (coding->dst_object))
6564         {
6565           /* Start deleting '\r' from the tail to minimize the memory
6566              movement.  */
6567           for (p = pend - 2; p >= pbeg; p--)
6568             if (*p == '\r')
6569               {
6570                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6571                 n++;
6572               }
6573         }
6574       else
6575         {
6576           int pos_byte = coding->dst_pos_byte;
6577           int pos = coding->dst_pos;
6578           int pos_end = pos + coding->produced_char - 1;
6579
6580           while (pos < pos_end)
6581             {
6582               p = BYTE_POS_ADDR (pos_byte);
6583               if (*p == '\r' && p[1] == '\n')
6584                 {
6585                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6586                   n++;
6587                   pos_end--;
6588                 }
6589               pos++;
6590               if (coding->dst_multibyte)
6591                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6592               else
6593                 pos_byte++;
6594             }
6595         }
6596       coding->produced -= n;
6597       coding->produced_char -= n;
6598     }
6599 }
6600
6601
6602 /* Return a translation table (or list of them) from coding system
6603    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6604    decoding (ENCODEP is zero). */
6605
6606 static Lisp_Object
6607 get_translation_table (attrs, encodep, max_lookup)
6608      Lisp_Object attrs;
6609      int encodep, *max_lookup;
6610 {
6611   Lisp_Object standard, translation_table;
6612   Lisp_Object val;
6613
6614   if (encodep)
6615     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6616       standard = Vstandard_translation_table_for_encode;
6617   else
6618     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6619       standard = Vstandard_translation_table_for_decode;
6620   if (NILP (translation_table))
6621     translation_table = standard;
6622   else
6623     {
6624       if (SYMBOLP (translation_table))
6625         translation_table = Fget (translation_table, Qtranslation_table);
6626       else if (CONSP (translation_table))
6627         {
6628           translation_table = Fcopy_sequence (translation_table);
6629           for (val = translation_table; CONSP (val); val = XCDR (val))
6630             if (SYMBOLP (XCAR (val)))
6631               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6632         }
6633       if (CHAR_TABLE_P (standard))
6634         {
6635           if (CONSP (translation_table))
6636             translation_table = nconc2 (translation_table,
6637                                         Fcons (standard, Qnil));
6638           else
6639             translation_table = Fcons (translation_table,
6640                                        Fcons (standard, Qnil));
6641         }
6642     }
6643
6644   if (max_lookup)
6645     {
6646       *max_lookup = 1;
6647       if (CHAR_TABLE_P (translation_table)
6648           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6649         {
6650           val = XCHAR_TABLE (translation_table)->extras[1];
6651           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6652             *max_lookup = XFASTINT (val);
6653         }
6654       else if (CONSP (translation_table))
6655         {
6656           Lisp_Object tail, val;
6657
6658           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6659             if (CHAR_TABLE_P (XCAR (tail))
6660                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6661               {
6662                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6663                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6664                   *max_lookup = XFASTINT (val);
6665               }
6666         }
6667     }
6668   return translation_table;
6669 }
6670
/* Look up character C in TABLE and set C and TRANS accordingly.
   TABLE is a char-table or a list; anything else leaves C unchanged
   and TRANS set to Qnil.  C and TRANS must be lvalues, and the
   arguments are evaluated more than once.

   TRANS is first set to Qnil.  If TABLE is a char-table and the
   entry for C is a character, C is replaced by that character and
   TRANS is Qnil; otherwise TRANS holds the entry as is (possibly
   nil).

   If TABLE is a list, its char-table elements are consulted in
   order (other elements are skipped).  An entry that is a character
   replaces C, and the lookup goes on in the next table with the new
   C.  The first entry that is neither nil nor a character stops the
   lookup and is left in TRANS.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6695
6696
6697 /* Return a translation of character(s) at BUF according to TRANS.
6698    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6699    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6700    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6701    translation is found, and Qnil if not found..
6702    If BUF is too short to lookup characters in FROM, return Qt.  */
6703
6704 static Lisp_Object
6705 get_translation (trans, buf, buf_end)
6706      Lisp_Object trans;
6707      int *buf, *buf_end;
6708 {
6709
6710   if (INTEGERP (trans))
6711     return trans;
6712   for (; CONSP (trans); trans = XCDR (trans))
6713     {
6714       Lisp_Object val = XCAR (trans);
6715       Lisp_Object from = XCAR (val);
6716       int len = ASIZE (from);
6717       int i;
6718
6719       for (i = 0; i < len; i++)
6720         {
6721           if (buf + i == buf_end)
6722             return Qt;
6723           if (XINT (AREF (from, i)) != buf[i])
6724             break;
6725         }
6726       if (i == len)
6727         return val;
6728     }
6729   return Qnil;
6730 }
6731
6732
6733 static int
6734 produce_chars (coding, translation_table, last_block)
6735      struct coding_system *coding;
6736      Lisp_Object translation_table;
6737      int last_block;
6738 {
6739   unsigned char *dst = coding->destination + coding->produced;
6740   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6741   EMACS_INT produced;
6742   EMACS_INT produced_chars = 0;
6743   int carryover = 0;
6744
6745   if (! coding->chars_at_source)
6746     {
6747       /* Source characters are in coding->charbuf.  */
6748       int *buf = coding->charbuf;
6749       int *buf_end = buf + coding->charbuf_used;
6750
6751       if (EQ (coding->src_object, coding->dst_object))
6752         {
6753           coding_set_source (coding);
6754           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6755         }
6756
6757       while (buf < buf_end)
6758         {
6759           int c = *buf, i;
6760
6761           if (c >= 0)
6762             {
6763               int from_nchars = 1, to_nchars = 1;
6764               Lisp_Object trans = Qnil;
6765
6766               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6767               if (! NILP (trans))
6768                 {
6769                   trans = get_translation (trans, buf, buf_end);
6770                   if (INTEGERP (trans))
6771                     c = XINT (trans);
6772                   else if (CONSP (trans))
6773                     {
6774                       from_nchars = ASIZE (XCAR (trans));
6775                       trans = XCDR (trans);
6776                       if (INTEGERP (trans))
6777                         c = XINT (trans);
6778                       else
6779                         {
6780                           to_nchars = ASIZE (trans);
6781                           c = XINT (AREF (trans, 0));
6782                         }
6783                     }
6784                   else if (EQ (trans, Qt) && ! last_block)
6785                     break;
6786                 }
6787
6788               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6789                 {
6790                   dst = alloc_destination (coding,
6791                                            buf_end - buf
6792                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6793                                            dst);
6794                   if (EQ (coding->src_object, coding->dst_object))
6795                     {
6796                       coding_set_source (coding);
6797                       dst_end = (((unsigned char *) coding->source)
6798                                  + coding->consumed);
6799                     }
6800                   else
6801                     dst_end = coding->destination + coding->dst_bytes;
6802                 }
6803
6804               for (i = 0; i < to_nchars; i++)
6805                 {
6806                   if (i > 0)
6807                     c = XINT (AREF (trans, i));
6808                   if (coding->dst_multibyte
6809                       || ! CHAR_BYTE8_P (c))
6810                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6811                   else
6812                     *dst++ = CHAR_TO_BYTE8 (c);
6813                 }
6814               produced_chars += to_nchars;
6815               buf += from_nchars;
6816             }
6817           else
6818             /* This is an annotation datum.  (-C) is the length.  */
6819             buf += -c;
6820         }
6821       carryover = buf_end - buf;
6822     }
6823   else
6824     {
6825       /* Source characters are at coding->source.  */
6826       const unsigned char *src = coding->source;
6827       const unsigned char *src_end = src + coding->consumed;
6828
6829       if (EQ (coding->dst_object, coding->src_object))
6830         dst_end = (unsigned char *) src;
6831       if (coding->src_multibyte != coding->dst_multibyte)
6832         {
6833           if (coding->src_multibyte)
6834             {
6835               int multibytep = 1;
6836               EMACS_INT consumed_chars = 0;
6837
6838               while (1)
6839                 {
6840                   const unsigned char *src_base = src;
6841                   int c;
6842
6843                   ONE_MORE_BYTE (c);
6844                   if (dst == dst_end)
6845                     {
6846                       if (EQ (coding->src_object, coding->dst_object))
6847                         dst_end = (unsigned char *) src;
6848                       if (dst == dst_end)
6849                         {
6850                           EMACS_INT offset = src - coding->source;
6851
6852                           dst = alloc_destination (coding, src_end - src + 1,
6853                                                    dst);
6854                           dst_end = coding->destination + coding->dst_bytes;
6855                           coding_set_source (coding);
6856                           src = coding->source + offset;
6857                           src_end = coding->source + coding->src_bytes;
6858                           if (EQ (coding->src_object, coding->dst_object))
6859                             dst_end = (unsigned char *) src;
6860                         }
6861                     }
6862                   *dst++ = c;
6863                   produced_chars++;
6864                 }
6865             no_more_source:
6866               ;
6867             }
6868           else
6869             while (src < src_end)
6870               {
6871                 int multibytep = 1;
6872                 int c = *src++;
6873
6874                 if (dst >= dst_end - 1)
6875                   {
6876                     if (EQ (coding->src_object, coding->dst_object))
6877                       dst_end = (unsigned char *) src;
6878                     if (dst >= dst_end - 1)
6879                       {
6880                         EMACS_INT offset = src - coding->source;
6881                         EMACS_INT more_bytes;
6882
6883                         if (EQ (coding->src_object, coding->dst_object))
6884                           more_bytes = ((src_end - src) / 2) + 2;
6885                         else
6886                           more_bytes = src_end - src + 2;
6887                         dst = alloc_destination (coding, more_bytes, dst);
6888                         dst_end = coding->destination + coding->dst_bytes;
6889                         coding_set_source (coding);
6890                         src = coding->source + offset;
6891                         src_end = coding->source + coding->src_bytes;
6892                         if (EQ (coding->src_object, coding->dst_object))
6893                           dst_end = (unsigned char *) src;
6894                       }
6895                   }
6896                 EMIT_ONE_BYTE (c);
6897               }
6898         }
6899       else
6900         {
6901           if (!EQ (coding->src_object, coding->dst_object))
6902             {
6903               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6904
6905               if (require > 0)
6906                 {
6907                   EMACS_INT offset = src - coding->source;
6908
6909                   dst = alloc_destination (coding, require, dst);
6910                   coding_set_source (coding);
6911                   src = coding->source + offset;
6912                   src_end = coding->source + coding->src_bytes;
6913                 }
6914             }
6915           produced_chars = coding->consumed_char;
6916           while (src < src_end)
6917             *dst++ = *src++;
6918         }
6919     }
6920
6921   produced = dst - (coding->destination + coding->produced);
6922   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6923     insert_from_gap (produced_chars, produced);
6924   coding->produced += produced;
6925   coding->produced_char += produced_chars;
6926   return carryover;
6927 }
6928
6929 /* Compose text in CODING->object according to the annotation data at
6930    CHARBUF.  CHARBUF is an array:
6931      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6932  */
6933
6934 static INLINE void
6935 produce_composition (coding, charbuf, pos)
6936      struct coding_system *coding;
6937      int *charbuf;
6938      EMACS_INT pos;
6939 {
6940   int len;
6941   EMACS_INT to;
6942   enum composition_method method;
6943   Lisp_Object components;
6944
6945   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6946   to = pos + charbuf[2];
6947   method = (enum composition_method) (charbuf[4]);
6948
6949   if (method == COMPOSITION_RELATIVE)
6950     components = Qnil;
6951   else
6952     {
6953       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6954       int i, j;
6955
6956       if (method == COMPOSITION_WITH_RULE)
6957         len = charbuf[2] * 3 - 2;
6958       charbuf += MAX_ANNOTATION_LENGTH;
6959       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6960       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6961         {
6962           if (charbuf[i] >= 0)
6963             args[j] = make_number (charbuf[i]);
6964           else
6965             {
6966               i++;
6967               args[j] = make_number (charbuf[i] % 0x100);
6968             }
6969         }
6970       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6971     }
6972   compose_text (pos, to, components, Qnil, coding->dst_object);
6973 }
6974
6975
6976 /* Put `charset' property on text in CODING->object according to
6977    the annotation data at CHARBUF.  CHARBUF is an array:
6978      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6979  */
6980
6981 static INLINE void
6982 produce_charset (coding, charbuf, pos)
6983      struct coding_system *coding;
6984      int *charbuf;
6985      EMACS_INT pos;
6986 {
6987   EMACS_INT from = pos - charbuf[2];
6988   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6989
6990   Fput_text_property (make_number (from), make_number (pos),
6991                       Qcharset, CHARSET_NAME (charset),
6992                       coding->dst_object);
6993 }
6994
6995
/* Number of `int' elements first requested for CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the conversion work area CODING->charbuf with alloca and
   record its element count in CODING->charbuf_size.

   The first request is for CHARBUF_SIZE elements; as long as alloca
   yields a null pointer the request is halved, and the attempts stop
   once the size is no longer above 1024.  If no area was obtained,
   CODING_RESULT_INSUFFICIENT_MEM is recorded and the *enclosing
   function* returns CODING->result.

   This has to be a macro: alloca storage belongs to the stack frame
   of the function in which it is expanded, so the area is valid only
   until that function returns.
   NOTE(review): many alloca implementations never return a null
   pointer, in which case the retry loop runs exactly once -- confirm
   for the supported platforms.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    coding->charbuf = NULL;                                             \
    while (size > 1024)                                                 \
      {                                                                 \
        coding->charbuf = (int *) alloca (sizeof (int) * size);         \
        if (coding->charbuf)                                            \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result;                                          \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
7017
7018
7019 static void
7020 produce_annotation (coding, pos)
7021      struct coding_system *coding;
7022      EMACS_INT pos;
7023 {
7024   int *charbuf = coding->charbuf;
7025   int *charbuf_end = charbuf + coding->charbuf_used;
7026
7027   if (NILP (coding->dst_object))
7028     return;
7029
7030   while (charbuf < charbuf_end)
7031     {
7032       if (*charbuf >= 0)
7033         pos++, charbuf++;
7034       else
7035         {
7036           int len = -*charbuf;
7037
7038           if (len > 2)
7039             switch (charbuf[1])
7040               {
7041               case CODING_ANNOTATE_COMPOSITION_MASK:
7042                 produce_composition (coding, charbuf, pos);
7043                 break;
7044               case CODING_ANNOTATE_CHARSET_MASK:
7045                 produce_charset (coding, charbuf, pos);
7046                 break;
7047               }
7048           charbuf += len;
7049         }
7050     }
7051 }
7052
7053 /* Decode the data at CODING->src_object into CODING->dst_object.
7054    CODING->src_object is a buffer, a string, or nil.
7055    CODING->dst_object is a buffer.
7056
7057    If CODING->src_object is a buffer, it must be the current buffer.
7058    In this case, if CODING->src_pos is positive, it is a position of
7059    the source text in the buffer, otherwise, the source text is in the
7060    gap area of the buffer, and CODING->src_pos specifies the offset of
7061    the text from GPT (which must be the same as PT).  If this is the
7062    same buffer as CODING->dst_object, CODING->src_pos must be
7063    negative.
7064
7065    If CODING->src_object is a string, CODING->src_pos is an index to
7066    that string.
7067
7068    If CODING->src_object is nil, CODING->source must already point to
7069    the non-relocatable memory area.  In this case, CODING->src_pos is
7070    an offset from CODING->source.
7071
7072    The decoded data is inserted at the current point of the buffer
7073    CODING->dst_object.
7074 */
7075
static int
decode_coding (coding)
     struct coding_system *coding;
{
  Lisp_Object attrs;
  Lisp_Object undo_list;
  Lisp_Object translation_table;
  int carryover;
  int i;

  /* If the source text lives in a buffer, starts before the gap and
     extends beyond it, move the gap to the start of the source
     text.  */
  if (BUFFERP (coding->src_object)
      && coding->src_pos > 0
      && coding->src_pos < GPT
      && coding->src_pos + coding->src_chars > GPT)
    move_gap_both (coding->src_pos, coding->src_pos_byte);

  /* When decoding into a buffer, make it current, put the gap at
     point, and switch off undo recording while we work.  The saved
     undo list is put back at the end of this function, followed by a
     single record_insert call for all of the produced text.  */
  undo_list = Qt;
  if (BUFFERP (coding->dst_object))
    {
      if (current_buffer != XBUFFER (coding->dst_object))
        set_buffer_internal (XBUFFER (coding->dst_object));
      if (GPT != PT)
        move_gap_both (PT, PT_BYTE);
      undo_list = current_buffer->undo_list;
      current_buffer->undo_list = Qt;
    }

  /* Reset the progress counters and the result status.  */
  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding->chars_at_source = 0;
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->errors = 0;

  ALLOC_CONVERSION_WORK_AREA (coding);

  attrs = CODING_ID_ATTRS (coding->id);
  translation_table = get_translation_table (attrs, 0, NULL);

  /* Main loop: decode source bytes into CODING->charbuf, then turn
     the contents of CODING->charbuf into destination text.
     produce_chars returns the number of trailing characters it left
     unprocessed; they are moved to the head of CODING->charbuf so
     that the next decoder run appends after them.  */
  carryover = 0;
  do
    {
      /* Destination position at which this round's output starts;
         used for annotations below.  */
      EMACS_INT pos = coding->dst_pos + coding->produced_char;

      coding_set_source (coding);
      coding->annotated = 0;
      coding->charbuf_used = carryover;
      (*(coding->decoder)) (coding);
      coding_set_destination (coding);
      carryover = produce_chars (coding, translation_table, 0);
      if (coding->annotated)
        produce_annotation (coding, pos);
      for (i = 0; i < carryover; i++)
        coding->charbuf[i]
          = coding->charbuf[coding->charbuf_used - carryover + i];
    }
  while (coding->consumed < coding->src_bytes
         && (coding->result == CODING_RESULT_SUCCESS
             || coding->result == CODING_RESULT_INVALID_SRC));

  /* Flush characters still carried over from the last round.  The
     final argument is 1 here and 0 inside the loop; presumably it
     tells produce_chars that this is the last call -- confirm against
     produce_chars.  */
  if (carryover > 0)
    {
      coding_set_destination (coding);
      coding->charbuf_used = carryover;
      produce_chars (coding, translation_table, 1);
    }

  /* Deal with source bytes the decoder did not consume.  */
  coding->carryover_bytes = 0;
  if (coding->consumed < coding->src_bytes)
    {
      int nbytes = coding->src_bytes - coding->consumed;
      const unsigned char *src;

      coding_set_source (coding);
      coding_set_destination (coding);
      src = coding->source + coding->consumed;

      if (coding->mode & CODING_MODE_LAST_BLOCK)
        {
          /* Flush out unprocessed data as binary chars.  We are sure
             that the number of data is less than the size of
             coding->charbuf.  */
          coding->charbuf_used = 0;
          coding->chars_at_source = 0;

          while (nbytes-- > 0)
            {
              int c = *src++;

              /* Bytes with the high bit set are converted with
                 BYTE8_TO_CHAR; other bytes are stored as they are.  */
              if (c & 0x80)
                c = BYTE8_TO_CHAR (c);
              coding->charbuf[coding->charbuf_used++] = c;
            }
          produce_chars (coding, Qnil, 1);
        }
      else
        {
          /* Record unprocessed bytes in coding->carryover.  We are
             sure that the number of data is less than the size of
             coding->carryover.  */
          unsigned char *p = coding->carryover;

          /* Defensive clamp: never copy more than the array holds.  */
          if (nbytes > sizeof coding->carryover)
            nbytes = sizeof coding->carryover;
          coding->carryover_bytes = nbytes;
          while (nbytes-- > 0)
            *p++ = *src++;
        }
      /* Either way, all of the source now counts as consumed.  */
      coding->consumed = coding->src_bytes;
    }

  /* Run decode_eol unless the coding system's eol type is `unix' or
     eol conversion is inhibited.  */
  if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
      && !inhibit_eol_conversion)
    decode_eol (coding);
  if (BUFFERP (coding->dst_object))
    {
      /* Restore the undo list saved above and record the whole
         insertion as one change.  */
      current_buffer->undo_list = undo_list;
      record_insert (coding->dst_pos, coding->produced_char);
    }
  return coding->result;
}
7196
7197
7198 /* Extract an annotation datum from a composition starting at POS and
7199    ending before LIMIT of CODING->src_object (buffer or string), store
7200    the data in BUF, set *STOP to a starting position of the next
7201    composition (if any) or to LIMIT, and return the address of the
7202    next element of BUF.
7203
7204    If such an annotation is not found, set *STOP to a starting
7205    position of a composition after POS (if any) or to LIMIT, and
7206    return BUF.  */
7207
7208 static INLINE int *
7209 handle_composition_annotation (pos, limit, coding, buf, stop)
7210      EMACS_INT pos, limit;
7211      struct coding_system *coding;
7212      int *buf;
7213      EMACS_INT *stop;
7214 {
7215   EMACS_INT start, end;
7216   Lisp_Object prop;
7217
7218   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7219       || end > limit)
7220     *stop = limit;
7221   else if (start > pos)
7222     *stop = start;
7223   else
7224     {
7225       if (start == pos)
7226         {
7227           /* We found a composition.  Store the corresponding
7228              annotation data in BUF.  */
7229           int *head = buf;
7230           enum composition_method method = COMPOSITION_METHOD (prop);
7231           int nchars = COMPOSITION_LENGTH (prop);
7232
7233           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7234           if (method != COMPOSITION_RELATIVE)
7235             {
7236               Lisp_Object components;
7237               int len, i, i_byte;
7238
7239               components = COMPOSITION_COMPONENTS (prop);
7240               if (VECTORP (components))
7241                 {
7242                   len = XVECTOR (components)->size;
7243                   for (i = 0; i < len; i++)
7244                     *buf++ = XINT (AREF (components, i));
7245                 }
7246               else if (STRINGP (components))
7247                 {
7248                   len = SCHARS (components);
7249                   i = i_byte = 0;
7250                   while (i < len)
7251                     {
7252                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7253                       buf++;
7254                     }
7255                 }
7256               else if (INTEGERP (components))
7257                 {
7258                   len = 1;
7259                   *buf++ = XINT (components);
7260                 }
7261               else if (CONSP (components))
7262                 {
7263                   for (len = 0; CONSP (components);
7264                        len++, components = XCDR (components))
7265                     *buf++ = XINT (XCAR (components));
7266                 }
7267               else
7268                 abort ();
7269               *head -= len;
7270             }
7271         }
7272
7273       if (find_composition (end, limit, &start, &end, &prop,
7274                             coding->src_object)
7275           && end <= limit)
7276         *stop = start;
7277       else
7278         *stop = limit;
7279     }
7280   return buf;
7281 }
7282
7283
7284 /* Extract an annotation datum from a text property `charset' at POS of
7285    CODING->src_object (buffer of string), store the data in BUF, set
7286    *STOP to the position where the value of `charset' property changes
7287    (limiting by LIMIT), and return the address of the next element of
7288    BUF.
7289
7290    If the property value is nil, set *STOP to the position where the
7291    property value is non-nil (limiting by LIMIT), and return BUF.  */
7292
7293 static INLINE int *
7294 handle_charset_annotation (pos, limit, coding, buf, stop)
7295      EMACS_INT pos, limit;
7296      struct coding_system *coding;
7297      int *buf;
7298      EMACS_INT *stop;
7299 {
7300   Lisp_Object val, next;
7301   int id;
7302
7303   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7304   if (! NILP (val) && CHARSETP (val))
7305     id = XINT (CHARSET_SYMBOL_ID (val));
7306   else
7307     id = -1;
7308   ADD_CHARSET_DATA (buf, 0, id);
7309   next = Fnext_single_property_change (make_number (pos), Qcharset,
7310                                        coding->src_object,
7311                                        make_number (limit));
7312   *stop = XINT (next);
7313   return buf;
7314 }
7315
7316
/* Read characters from the source of CODING, starting after what has
   already been consumed, and store them in CODING->charbuf together
   with any charset/composition annotation data.  End-of-line
   conversion and the translation given by TRANSLATION_TABLE (when
   non-nil) are applied on the way.  MAX_LOOKUP is the number of
   characters that may be handed to get_translation at once.

   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated and CODING->chars_at_source is
   cleared.  */

static void
consume_chars (coding, translation_table, max_lookup)
     struct coding_system *coding;
     Lisp_Object translation_table;
     int max_lookup;
{
  int *buf = coding->charbuf;
  int *buf_end = coding->charbuf + coding->charbuf_size;
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  EMACS_INT pos = coding->src_pos + coding->consumed_char;
  EMACS_INT end_pos = coding->src_pos + coding->src_chars;
  int multibytep = coding->src_multibyte;
  Lisp_Object eol_type;
  int c;
  EMACS_INT stop, stop_composition, stop_charset;
  int *lookup_buf = NULL;

  /* The lookup buffer is needed only when a translation table is in
     effect.  */
  if (! NILP (translation_table))
    lookup_buf = alloca (sizeof (int) * max_lookup);

  /* A vector eol type is treated like `unix' here (presumably it
     means the eol format is not yet decided -- confirm).  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  if (VECTORP (eol_type))
    eol_type = Qunix;

  /* Note: composition handling is not yet implemented.  */
  /* Because the flag is cleared here, the composition tests below
     never succeed.  */
  coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;

  /* STOP is the next position at which annotations must be looked
     at; it is the smaller of STOP_COMPOSITION and STOP_CHARSET.  */
  if (NILP (coding->src_object))
    stop = stop_composition = stop_charset = end_pos;
  else
    {
      if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
        stop = stop_composition = pos;
      else
        stop = stop_composition = end_pos;
      if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
        stop = stop_charset = pos;
      else
        stop_charset = end_pos;
    }

  /* Compensate for CRLF and conversion.  */
  buf_end -= 1 + MAX_ANNOTATION_LENGTH;
  while (buf < buf_end)
    {
      Lisp_Object trans;

      if (pos == stop)
        {
          if (pos == end_pos)
            break;
          if (pos == stop_composition)
            buf = handle_composition_annotation (pos, end_pos, coding,
                                                 buf, &stop_composition);
          if (pos == stop_charset)
            buf = handle_charset_annotation (pos, end_pos, coding,
                                             buf, &stop_charset);
          stop = (stop_composition < stop_charset
                  ? stop_composition : stop_charset);
        }

      /* Fetch the next character C and advance SRC and POS.  */
      if (! multibytep)
        {
          EMACS_INT bytes;

          if (coding->encoder == encode_coding_raw_text)
            c = *src++, pos++;
          else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
            /* POS advances by the byte length of the sequence here
               (presumably because positions of a unibyte source count
               bytes -- confirm).  */
            c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
          else
            c = BYTE8_TO_CHAR (*src), src++, pos++;
        }
      else
        c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
      /* With selective display, a carriage return is treated as a
         newline.  */
      if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
        c = '\n';
      /* End-of-line conversion: for `dos', emit '\r' in front of the
         newline; for any other non-unix type, replace the newline
         with '\r'.  */
      if (! EQ (eol_type, Qunix))
        {
          if (c == '\n')
            {
              if (EQ (eol_type, Qdos))
                *buf++ = '\r';
              else
                c = '\r';
            }
        }

      trans = Qnil;
      LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
      if (NILP (trans))
        *buf++ = c;
      else
        {
          int from_nchars = 1, to_nchars = 1;
          int *lookup_buf_end;
          const unsigned char *p = src;
          int i;

          /* Collect C and up to MAX_LOOKUP - 1 following source
             characters (without consuming them) for the lookup.  */
          lookup_buf[0] = c;
          for (i = 1; i < max_lookup && p < src_end; i++)
            lookup_buf[i] = STRING_CHAR_ADVANCE (p);
          lookup_buf_end = lookup_buf + i;
          trans = get_translation (trans, lookup_buf, lookup_buf_end);
          /* The result is handled as an integer (one replacement
             character), or as a cons whose car is a vector giving the
             number of source characters matched and whose cdr is an
             integer or a vector of replacement characters.  Anything
             else stops the loop.  */
          if (INTEGERP (trans))
            c = XINT (trans);
          else if (CONSP (trans))
            {
              from_nchars = ASIZE (XCAR (trans));
              trans = XCDR (trans);
              if (INTEGERP (trans))
                c = XINT (trans);
              else
                {
                  to_nchars = ASIZE (trans);
                  /* Not enough room left for the replacement; stop
                     here and leave it for the next call.  */
                  if (buf + to_nchars > buf_end)
                    break;
                  c = XINT (AREF (trans, 0));
                }
            }
          else
            break;
          *buf++ = c;
          for (i = 1; i < to_nchars; i++)
            *buf++ = XINT (AREF (trans, i));
          /* Skip the additional source characters that took part in
             the match.  */
          for (i = 1; i < from_nchars; i++, pos++)
            src += MULTIBYTE_LENGTH_NO_CHECK (src);
        }
    }

  /* Publish how far we got.  */
  coding->consumed = src - coding->source;
  coding->consumed_char = pos - coding->src_pos;
  coding->charbuf_used = buf - coding->charbuf;
  coding->chars_at_source = 0;
}
7452
7453
7454 /* Encode the text at CODING->src_object into CODING->dst_object.
7455    CODING->src_object is a buffer or a string.
7456    CODING->dst_object is a buffer or nil.
7457
7458    If CODING->src_object is a buffer, it must be the current buffer.
7459    In this case, if CODING->src_pos is positive, it is a position of
7460    the source text in the buffer, otherwise. the source text is in the
7461    gap area of the buffer, and coding->src_pos specifies the offset of
7462    the text from GPT (which must be the same as PT).  If this is the
7463    same buffer as CODING->dst_object, CODING->src_pos must be
7464    negative and CODING should not have `pre-write-conversion'.
7465
7466    If CODING->src_object is a string, CODING should not have
7467    `pre-write-conversion'.
7468
7469    If CODING->dst_object is a buffer, the encoded data is inserted at
7470    the current point of that buffer.
7471
7472    If CODING->dst_object is nil, the encoded data is placed at the
7473    memory area specified by CODING->destination.  */
7474
7475 static int
7476 encode_coding (coding)
7477      struct coding_system *coding;
7478 {
7479   Lisp_Object attrs;
7480   Lisp_Object translation_table;
7481   int max_lookup;
7482
7483   attrs = CODING_ID_ATTRS (coding->id);
7484   if (coding->encoder == encode_coding_raw_text)
7485     translation_table = Qnil, max_lookup = 0;
7486   else
7487     translation_table = get_translation_table (attrs, 1, &max_lookup);
7488
7489   if (BUFFERP (coding->dst_object))
7490     {
7491       set_buffer_internal (XBUFFER (coding->dst_object));
7492       coding->dst_multibyte
7493         = ! NILP (current_buffer->enable_multibyte_characters);
7494     }
7495
7496   coding->consumed = coding->consumed_char = 0;
7497   coding->produced = coding->produced_char = 0;
7498   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7499   coding->errors = 0;
7500
7501   ALLOC_CONVERSION_WORK_AREA (coding);
7502
7503   do {
7504     coding_set_source (coding);
7505     consume_chars (coding, translation_table, max_lookup);
7506     coding_set_destination (coding);
7507     (*(coding->encoder)) (coding);
7508   } while (coding->consumed_char < coding->src_chars);
7509
7510   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7511     insert_from_gap (coding->produced_char, coding->produced);
7512
7513   return (coding->result);
7514 }
7515
7516
/* Name (or base name) of work buffer for code conversion.
   make_conversion_work_buffer passes it to Fget_buffer_create for
   the reused buffer and to Fgenerate_new_buffer_name for the
   others.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  If this buffer turns
   out not to be live, make_conversion_work_buffer creates it
   again.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer increments it on every call;
   code_conversion_restore resets it to 0 when the reused buffer is
   released.  */
static int reused_workbuf_in_use;
7529
7530
7531 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
7532    multibyteness of returning buffer.  */
7533
7534 static Lisp_Object
7535 make_conversion_work_buffer (multibyte)
7536      int multibyte;
7537 {
7538   Lisp_Object name, workbuf;
7539   struct buffer *current;
7540
7541   if (reused_workbuf_in_use++)
7542     {
7543       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7544       workbuf = Fget_buffer_create (name);
7545     }
7546   else
7547     {
7548       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7549         Vcode_conversion_reused_workbuf
7550           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7551       workbuf = Vcode_conversion_reused_workbuf;
7552     }
7553   current = current_buffer;
7554   set_buffer_internal (XBUFFER (workbuf));
7555   /* We can't allow modification hooks to run in the work buffer.  For
7556      instance, directory_files_internal assumes that file decoding
7557      doesn't compile new regexps.  */
7558   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7559   Ferase_buffer ();
7560   current_buffer->undo_list = Qt;
7561   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7562   set_buffer_internal (current);
7563   return workbuf;
7564 }
7565
7566
7567 static Lisp_Object
7568 code_conversion_restore (arg)
7569      Lisp_Object arg;
7570 {
7571   Lisp_Object current, workbuf;
7572   struct gcpro gcpro1;
7573
7574   GCPRO1 (arg);
7575   current = XCAR (arg);
7576   workbuf = XCDR (arg);
7577   if (! NILP (workbuf))
7578     {
7579       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7580         reused_workbuf_in_use = 0;
7581       else if (! NILP (Fbuffer_live_p (workbuf)))
7582         Fkill_buffer (workbuf);
7583     }
7584   set_buffer_internal (XBUFFER (current));
7585   UNGCPRO;
7586   return Qnil;
7587 }
7588
7589 Lisp_Object
7590 code_conversion_save (with_work_buf, multibyte)
7591      int with_work_buf, multibyte;
7592 {
7593   Lisp_Object workbuf = Qnil;
7594
7595   if (with_work_buf)
7596     workbuf = make_conversion_work_buffer (multibyte);
7597   record_unwind_protect (code_conversion_restore,
7598                          Fcons (Fcurrent_buffer (), workbuf));
7599   return workbuf;
7600 }
7601
7602 int
7603 decode_coding_gap (coding, chars, bytes)
7604      struct coding_system *coding;
7605      EMACS_INT chars, bytes;
7606 {
7607   int count = specpdl_ptr - specpdl;
7608   Lisp_Object attrs;
7609
7610   code_conversion_save (0, 0);
7611
7612   coding->src_object = Fcurrent_buffer ();
7613   coding->src_chars = chars;
7614   coding->src_bytes = bytes;
7615   coding->src_pos = -chars;
7616   coding->src_pos_byte = -bytes;
7617   coding->src_multibyte = chars < bytes;
7618   coding->dst_object = coding->src_object;
7619   coding->dst_pos = PT;
7620   coding->dst_pos_byte = PT_BYTE;
7621   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7622
7623   if (CODING_REQUIRE_DETECTION (coding))
7624     detect_coding (coding);
7625
7626   coding->mode |= CODING_MODE_LAST_BLOCK;
7627   current_buffer->text->inhibit_shrinking = 1;
7628   decode_coding (coding);
7629   current_buffer->text->inhibit_shrinking = 0;
7630
7631   attrs = CODING_ID_ATTRS (coding->id);
7632   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7633     {
7634       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7635       Lisp_Object val;
7636
7637       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7638       val = call1 (CODING_ATTR_POST_READ (attrs),
7639                    make_number (coding->produced_char));
7640       CHECK_NATNUM (val);
7641       coding->produced_char += Z - prev_Z;
7642       coding->produced += Z_BYTE - prev_Z_BYTE;
7643     }
7644
7645   unbind_to (count, Qnil);
7646   return coding->result;
7647 }
7648
7649 int
7650 encode_coding_gap (coding, chars, bytes)
7651      struct coding_system *coding;
7652      EMACS_INT chars, bytes;
7653 {
7654   int count = specpdl_ptr - specpdl;
7655
7656   code_conversion_save (0, 0);
7657
7658   coding->src_object = Fcurrent_buffer ();
7659   coding->src_chars = chars;
7660   coding->src_bytes = bytes;
7661   coding->src_pos = -chars;
7662   coding->src_pos_byte = -bytes;
7663   coding->src_multibyte = chars < bytes;
7664   coding->dst_object = coding->src_object;
7665   coding->dst_pos = PT;
7666   coding->dst_pos_byte = PT_BYTE;
7667
7668   encode_coding (coding);
7669
7670   unbind_to (count, Qnil);
7671   return coding->result;
7672 }
7673
7674
7675 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7676    SRC_OBJECT into DST_OBJECT by coding context CODING.
7677
7678    SRC_OBJECT is a buffer, a string, or Qnil.
7679
7680    If it is a buffer, the text is at point of the buffer.  FROM and TO
7681    are positions in the buffer.
7682
7683    If it is a string, the text is at the beginning of the string.
7684    FROM and TO are indices to the string.
7685
7686    If it is nil, the text is at coding->source.  FROM and TO are
7687    indices to coding->source.
7688
7689    DST_OBJECT is a buffer, Qt, or Qnil.
7690
7691    If it is a buffer, the decoded text is inserted at point of the
7692    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7693    is deleted.
7694
7695    If it is Qt, a string is made from the decoded text, and
7696    set in CODING->dst_object.
7697
7698    If it is Qnil, the decoded text is stored at CODING->destination.
7699    The caller must allocate CODING->dst_bytes bytes at
7700    CODING->destination by xmalloc.  If the decoded text is longer than
7701    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7702  */
7703
7704 void
7705 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7706                       dst_object)
7707      struct coding_system *coding;
7708      Lisp_Object src_object;
7709      EMACS_INT from, from_byte, to, to_byte;
7710      Lisp_Object dst_object;
7711 {
7712   int count = specpdl_ptr - specpdl;
7713   unsigned char *destination;
7714   EMACS_INT dst_bytes;
7715   EMACS_INT chars = to - from;
7716   EMACS_INT bytes = to_byte - from_byte;
7717   Lisp_Object attrs;
7718   int saved_pt = -1, saved_pt_byte;
7719   int need_marker_adjustment = 0;
7720   Lisp_Object old_deactivate_mark;
7721
7722   old_deactivate_mark = Vdeactivate_mark;
7723
7724   if (NILP (dst_object))
7725     {
7726       destination = coding->destination;
7727       dst_bytes = coding->dst_bytes;
7728     }
7729
7730   coding->src_object = src_object;
7731   coding->src_chars = chars;
7732   coding->src_bytes = bytes;
7733   coding->src_multibyte = chars < bytes;
7734
7735   if (STRINGP (src_object))
7736     {
7737       coding->src_pos = from;
7738       coding->src_pos_byte = from_byte;
7739     }
7740   else if (BUFFERP (src_object))
7741     {
7742       set_buffer_internal (XBUFFER (src_object));
7743       if (from != GPT)
7744         move_gap_both (from, from_byte);
7745       if (EQ (src_object, dst_object))
7746         {
7747           struct Lisp_Marker *tail;
7748
7749           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7750             {
7751               tail->need_adjustment
7752                 = tail->charpos == (tail->insertion_type ? from : to);
7753               need_marker_adjustment |= tail->need_adjustment;
7754             }
7755           saved_pt = PT, saved_pt_byte = PT_BYTE;
7756           TEMP_SET_PT_BOTH (from, from_byte);
7757           current_buffer->text->inhibit_shrinking = 1;
7758           del_range_both (from, from_byte, to, to_byte, 1);
7759           coding->src_pos = -chars;
7760           coding->src_pos_byte = -bytes;
7761         }
7762       else
7763         {
7764           coding->src_pos = from;
7765           coding->src_pos_byte = from_byte;
7766         }
7767     }
7768
7769   if (CODING_REQUIRE_DETECTION (coding))
7770     detect_coding (coding);
7771   attrs = CODING_ID_ATTRS (coding->id);
7772
7773   if (EQ (dst_object, Qt)
7774       || (! NILP (CODING_ATTR_POST_READ (attrs))
7775           && NILP (dst_object)))
7776     {
7777       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7778       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7779       coding->dst_pos = BEG;
7780       coding->dst_pos_byte = BEG_BYTE;
7781     }
7782   else if (BUFFERP (dst_object))
7783     {
7784       code_conversion_save (0, 0);
7785       coding->dst_object = dst_object;
7786       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7787       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7788       coding->dst_multibyte
7789         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7790     }
7791   else
7792     {
7793       code_conversion_save (0, 0);
7794       coding->dst_object = Qnil;
7795       /* Most callers presume this will return a multibyte result, and they
7796          won't use `binary' or `raw-text' anyway, so let's not worry about
7797          CODING_FOR_UNIBYTE.  */
7798       coding->dst_multibyte = 1;
7799     }
7800
7801   decode_coding (coding);
7802
7803   if (BUFFERP (coding->dst_object))
7804     set_buffer_internal (XBUFFER (coding->dst_object));
7805
7806   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7807     {
7808       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7809       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7810       Lisp_Object val;
7811
7812       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7813       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7814               old_deactivate_mark);
7815       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7816                         make_number (coding->produced_char));
7817       UNGCPRO;
7818       CHECK_NATNUM (val);
7819       coding->produced_char += Z - prev_Z;
7820       coding->produced += Z_BYTE - prev_Z_BYTE;
7821     }
7822
7823   if (EQ (dst_object, Qt))
7824     {
7825       coding->dst_object = Fbuffer_string ();
7826     }
7827   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7828     {
7829       set_buffer_internal (XBUFFER (coding->dst_object));
7830       if (dst_bytes < coding->produced)
7831         {
7832           destination = xrealloc (destination, coding->produced);
7833           if (! destination)
7834             {
7835               record_conversion_result (coding,
7836                                         CODING_RESULT_INSUFFICIENT_DST);
7837               unbind_to (count, Qnil);
7838               return;
7839             }
7840           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7841             move_gap_both (BEGV, BEGV_BYTE);
7842           bcopy (BEGV_ADDR, destination, coding->produced);
7843           coding->destination = destination;
7844         }
7845     }
7846
7847   if (saved_pt >= 0)
7848     {
7849       /* This is the case of:
7850          (BUFFERP (src_object) && EQ (src_object, dst_object))
7851          As we have moved PT while replacing the original buffer
7852          contents, we must recover it now.  */
7853       set_buffer_internal (XBUFFER (src_object));
7854       current_buffer->text->inhibit_shrinking = 0;
7855       if (saved_pt < from)
7856         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7857       else if (saved_pt < from + chars)
7858         TEMP_SET_PT_BOTH (from, from_byte);
7859       else if (! NILP (current_buffer->enable_multibyte_characters))
7860         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7861                           saved_pt_byte + (coding->produced - bytes));
7862       else
7863         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7864                           saved_pt_byte + (coding->produced - bytes));
7865
7866       if (need_marker_adjustment)
7867         {
7868           struct Lisp_Marker *tail;
7869
7870           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7871             if (tail->need_adjustment)
7872               {
7873                 tail->need_adjustment = 0;
7874                 if (tail->insertion_type)
7875                   {
7876                     tail->bytepos = from_byte;
7877                     tail->charpos = from;
7878                   }
7879                 else
7880                   {
7881                     tail->bytepos = from_byte + coding->produced;
7882                     tail->charpos
7883                       = (NILP (current_buffer->enable_multibyte_characters)
7884                          ? tail->bytepos : from + coding->produced_char);
7885                   }
7886               }
7887         }
7888     }
7889
7890   Vdeactivate_mark = old_deactivate_mark;
7891   unbind_to (count, coding->dst_object);
7892 }
7893
7894
7895 void
7896 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7897                       dst_object)
7898      struct coding_system *coding;
7899      Lisp_Object src_object;
7900      EMACS_INT from, from_byte, to, to_byte;
7901      Lisp_Object dst_object;
7902 {
7903   int count = specpdl_ptr - specpdl;
7904   EMACS_INT chars = to - from;
7905   EMACS_INT bytes = to_byte - from_byte;
7906   Lisp_Object attrs;
7907   int saved_pt = -1, saved_pt_byte;
7908   int need_marker_adjustment = 0;
7909   int kill_src_buffer = 0;
7910   Lisp_Object old_deactivate_mark;
7911
7912   old_deactivate_mark = Vdeactivate_mark;
7913
7914   coding->src_object = src_object;
7915   coding->src_chars = chars;
7916   coding->src_bytes = bytes;
7917   coding->src_multibyte = chars < bytes;
7918
7919   attrs = CODING_ID_ATTRS (coding->id);
7920
7921   if (EQ (src_object, dst_object))
7922     {
7923       struct Lisp_Marker *tail;
7924
7925       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7926         {
7927           tail->need_adjustment
7928             = tail->charpos == (tail->insertion_type ? from : to);
7929           need_marker_adjustment |= tail->need_adjustment;
7930         }
7931     }
7932
7933   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7934     {
7935       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7936       set_buffer_internal (XBUFFER (coding->src_object));
7937       if (STRINGP (src_object))
7938         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7939       else if (BUFFERP (src_object))
7940         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7941       else
7942         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7943
7944       if (EQ (src_object, dst_object))
7945         {
7946           set_buffer_internal (XBUFFER (src_object));
7947           saved_pt = PT, saved_pt_byte = PT_BYTE;
7948           del_range_both (from, from_byte, to, to_byte, 1);
7949           set_buffer_internal (XBUFFER (coding->src_object));
7950         }
7951
7952       {
7953         Lisp_Object args[3];
7954         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7955
7956         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7957                 old_deactivate_mark);
7958         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7959         args[1] = make_number (BEG);
7960         args[2] = make_number (Z);
7961         safe_call (3, args);
7962         UNGCPRO;
7963       }
7964       if (XBUFFER (coding->src_object) != current_buffer)
7965         kill_src_buffer = 1;
7966       coding->src_object = Fcurrent_buffer ();
7967       if (BEG != GPT)
7968         move_gap_both (BEG, BEG_BYTE);
7969       coding->src_chars = Z - BEG;
7970       coding->src_bytes = Z_BYTE - BEG_BYTE;
7971       coding->src_pos = BEG;
7972       coding->src_pos_byte = BEG_BYTE;
7973       coding->src_multibyte = Z < Z_BYTE;
7974     }
7975   else if (STRINGP (src_object))
7976     {
7977       code_conversion_save (0, 0);
7978       coding->src_pos = from;
7979       coding->src_pos_byte = from_byte;
7980     }
7981   else if (BUFFERP (src_object))
7982     {
7983       code_conversion_save (0, 0);
7984       set_buffer_internal (XBUFFER (src_object));
7985       if (EQ (src_object, dst_object))
7986         {
7987           saved_pt = PT, saved_pt_byte = PT_BYTE;
7988           coding->src_object = del_range_1 (from, to, 1, 1);
7989           coding->src_pos = 0;
7990           coding->src_pos_byte = 0;
7991         }
7992       else
7993         {
7994           if (from < GPT && to >= GPT)
7995             move_gap_both (from, from_byte);
7996           coding->src_pos = from;
7997           coding->src_pos_byte = from_byte;
7998         }
7999     }
8000   else
8001     code_conversion_save (0, 0);
8002
8003   if (BUFFERP (dst_object))
8004     {
8005       coding->dst_object = dst_object;
8006       if (EQ (src_object, dst_object))
8007         {
8008           coding->dst_pos = from;
8009           coding->dst_pos_byte = from_byte;
8010         }
8011       else
8012         {
8013           struct buffer *current = current_buffer;
8014
8015           set_buffer_temp (XBUFFER (dst_object));
8016           coding->dst_pos = PT;
8017           coding->dst_pos_byte = PT_BYTE;
8018           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8019           set_buffer_temp (current);
8020         }
8021       coding->dst_multibyte
8022         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
8023     }
8024   else if (EQ (dst_object, Qt))
8025     {
8026       coding->dst_object = Qnil;
8027       coding->dst_bytes = coding->src_chars;
8028       if (coding->dst_bytes == 0)
8029         coding->dst_bytes = 1;
8030       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
8031       coding->dst_multibyte = 0;
8032     }
8033   else
8034     {
8035       coding->dst_object = Qnil;
8036       coding->dst_multibyte = 0;
8037     }
8038
8039   encode_coding (coding);
8040
8041   if (EQ (dst_object, Qt))
8042     {
8043       if (BUFFERP (coding->dst_object))
8044         coding->dst_object = Fbuffer_string ();
8045       else
8046         {
8047           coding->dst_object
8048             = make_unibyte_string ((char *) coding->destination,
8049                                    coding->produced);
8050           xfree (coding->destination);
8051         }
8052     }
8053
8054   if (saved_pt >= 0)
8055     {
8056       /* This is the case of:
8057          (BUFFERP (src_object) && EQ (src_object, dst_object))
8058          As we have moved PT while replacing the original buffer
8059          contents, we must recover it now.  */
8060       set_buffer_internal (XBUFFER (src_object));
8061       if (saved_pt < from)
8062         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8063       else if (saved_pt < from + chars)
8064         TEMP_SET_PT_BOTH (from, from_byte);
8065       else if (! NILP (current_buffer->enable_multibyte_characters))
8066         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8067                           saved_pt_byte + (coding->produced - bytes));
8068       else
8069         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8070                           saved_pt_byte + (coding->produced - bytes));
8071
8072       if (need_marker_adjustment)
8073         {
8074           struct Lisp_Marker *tail;
8075
8076           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8077             if (tail->need_adjustment)
8078               {
8079                 tail->need_adjustment = 0;
8080                 if (tail->insertion_type)
8081                   {
8082                     tail->bytepos = from_byte;
8083                     tail->charpos = from;
8084                   }
8085                 else
8086                   {
8087                     tail->bytepos = from_byte + coding->produced;
8088                     tail->charpos
8089                       = (NILP (current_buffer->enable_multibyte_characters)
8090                          ? tail->bytepos : from + coding->produced_char);
8091                   }
8092               }
8093         }
8094     }
8095
8096   if (kill_src_buffer)
8097     Fkill_buffer (coding->src_object);
8098
8099   Vdeactivate_mark = old_deactivate_mark;
8100   unbind_to (count, Qnil);
8101 }
8102
8103
8104 Lisp_Object
8105 preferred_coding_system ()
8106 {
8107   int id = coding_categories[coding_priorities[0]].id;
8108
8109   return CODING_ID_NAME (id);
8110 }
8111
8112 \f
8113 #ifdef emacs
8114 /*** 11. Emacs Lisp library functions ***/
8115
8116 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8117        doc: /* Return t if OBJECT is nil or a coding-system.
8118 See the documentation of `define-coding-system' for information
8119 about coding-system objects.  */)
8120      (object)
8121      Lisp_Object object;
8122 {
8123   if (NILP (object)
8124       || CODING_SYSTEM_ID (object) >= 0)
8125     return Qt;
8126   if (! SYMBOLP (object)
8127       || NILP (Fget (object, Qcoding_system_define_form)))
8128     return Qnil;
8129   return Qt;
8130 }
8131
8132 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8133        Sread_non_nil_coding_system, 1, 1, 0,
8134        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8135      (prompt)
8136      Lisp_Object prompt;
8137 {
8138   Lisp_Object val;
8139   do
8140     {
8141       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8142                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8143     }
8144   while (SCHARS (val) == 0);
8145   return (Fintern (val, Qnil));
8146 }
8147
8148 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8149        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8150 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8151 Ignores case when completing coding systems (all Emacs coding systems
8152 are lower-case).  */)
8153      (prompt, default_coding_system)
8154      Lisp_Object prompt, default_coding_system;
8155 {
8156   Lisp_Object val;
8157   int count = SPECPDL_INDEX ();
8158
8159   if (SYMBOLP (default_coding_system))
8160     default_coding_system = SYMBOL_NAME (default_coding_system);
8161   specbind (Qcompletion_ignore_case, Qt);
8162   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8163                           Qt, Qnil, Qcoding_system_history,
8164                           default_coding_system, Qnil);
8165   unbind_to (count, Qnil);
8166   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8167 }
8168
8169 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8170        1, 1, 0,
8171        doc: /* Check validity of CODING-SYSTEM.
8172 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8173 It is valid if it is nil or a symbol defined as a coding system by the
8174 function `define-coding-system'.  */)
8175   (coding_system)
8176      Lisp_Object coding_system;
8177 {
8178   Lisp_Object define_form;
8179
8180   define_form = Fget (coding_system, Qcoding_system_define_form);
8181   if (! NILP (define_form))
8182     {
8183       Fput (coding_system, Qcoding_system_define_form, Qnil);
8184       safe_eval (define_form);
8185     }
8186   if (!NILP (Fcoding_system_p (coding_system)))
8187     return coding_system;
8188   xsignal1 (Qcoding_system_error, coding_system);
8189 }
8190
8191 \f
8192 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8193    HIGHEST is nonzero, return the coding system of the highest
8194    priority among the detected coding systems.  Otherwise return a
8195    list of detected coding systems sorted by their priorities.  If
8196    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8197    multibyte form but contain only ASCII and eight-bit chars.
8198    Otherwise, the bytes are raw bytes.
8199
8200    CODING-SYSTEM controls the detection as below:
8201
8202    If it is nil, detect both text-format and eol-format.  If the
8203    text-format part of CODING-SYSTEM is already specified
8204    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8205    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8206    detect only text-format.
     
        When HIGHEST is nonzero and no candidate was produced, the value
        is nil.  */
8207
8208 Lisp_Object
8209 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
8210                       coding_system)
8211      const unsigned char *src;
8212      EMACS_INT src_chars, src_bytes;
8213      int highest;
8214      int multibytep;
8215      Lisp_Object coding_system;
8216 {
8217   const unsigned char *src_end = src + src_bytes;
8218   Lisp_Object attrs, eol_type;
       /* VAL first accumulates coding system ids (as Lisp integers); the
          eol-format pass at the end replaces each id by a symbol.  */
8219   Lisp_Object val = Qnil;
8220   struct coding_system coding;
8221   int id;
8222   struct coding_detection_info detect_info;
8223   enum coding_category base_category;
8224   int null_byte_found = 0, eight_bit_found = 0;
8225
       /* A nil CODING-SYSTEM is treated as `undecided'.  */
8226   if (NILP (coding_system))
8227     coding_system = Qundecided;
8228   setup_coding_system (coding_system, &coding);
8229   attrs = CODING_ID_ATTRS (coding.id);
8230   eol_type = CODING_ID_EOL_TYPE (coding.id);
8231   coding_system = CODING_ATTR_BASE_NAME (attrs);
8232
8233   coding.source = src;
8234   coding.src_chars = src_chars;
8235   coding.src_bytes = src_bytes;
8236   coding.src_multibyte = multibytep;
8237   coding.consumed = 0;
8238   coding.mode |= CODING_MODE_LAST_BLOCK;
8239   coding.head_ascii = 0;
8240
8241   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8242
8243   /* At first, detect text-format if necessary.  */
8244   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8245   if (base_category == coding_category_undecided)
8246     {
8247       enum coding_category category;
8248       struct coding_system *this;
8249       int c, i;
8250
8251       /* Skip all ASCII bytes except for a few ISO2022 controls.
              coding.head_ascii is incremented for each 7-bit byte scanned
              while no 8-bit byte has been seen.  The scan stops early once
              both a null byte and an 8-bit byte have been seen, or when
              the ISO-2022 detector reports a result.  */
8252       for (; src < src_end; src++)
8253         {
8254           c = *src;
8255           if (c & 0x80)
8256             {
8257               eight_bit_found = 1;
8258               if (null_byte_found)
8259                 break;
8260             }
8261           else if (c < 0x20)
8262             {
                   /* The ISO-2022 detector is invoked at most once: only
                      while detect_info.checked is still zero.  */
8263               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8264                   && ! inhibit_iso_escape_detection
8265                   && ! detect_info.checked)
8266                 {
8267                   if (detect_coding_iso_2022 (&coding, &detect_info))
8268                     {
8269                       /* We have scanned the whole data.  */
8270                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8271                         {
8272                           /* We didn't find an 8-bit code.  We may
8273                              have found a null-byte, but it's very
8274                              rare that a binary file conforms to
8275                              ISO-2022.  */
8276                           src = src_end;
8277                           coding.head_ascii = src - coding.source;
8278                         }
8279                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8280                       break;
8281                     }
8282                 }
8283               else if (! c && !inhibit_null_byte_detection)
8284                 {
8285                   null_byte_found = 1;
8286                   if (eight_bit_found)
8287                     break;
8288                 }
8289               if (! eight_bit_found)
8290                 coding.head_ascii++;
8291             }
8292           else if (! eight_bit_found)
8293             coding.head_ascii++;
8294         }
8295
           /* Run the per-category detectors only when the scan above saw
              something other than plain 7-bit text, or the ISO-2022
              detector already found a category.  */
8296       if (null_byte_found || eight_bit_found
8297           || coding.head_ascii < coding.src_bytes
8298           || detect_info.found)
8299         {
8300           if (coding.head_ascii == coding.src_bytes)
8301             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8302             for (i = 0; i < coding_category_raw_text; i++)
8303               {
8304                 category = coding_priorities[i];
8305                 this = coding_categories + category;
8306                 if (detect_info.found & (1 << category))
8307                   break;
8308               }
8309           else
8310             {
                   /* With a null byte present, every category outside
                      CATEGORY_MASK_UTF_16 is marked as both checked and
                      rejected, so only those detectors can still run.  */
8311               if (null_byte_found)
8312                 {
8313                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8314                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8315                 }
                   /* Walk the categories in priority order.  When HIGHEST
                      is nonzero the loop stops at the first category that
                      is found, leaving CATEGORY and THIS describing it.  */
8316               for (i = 0; i < coding_category_raw_text; i++)
8317                 {
8318                   category = coding_priorities[i];
8319                   this = coding_categories + category;
8320
8321                   if (this->id < 0)
8322                     {
8323                       /* No coding system of this category is defined.  */
8324                       detect_info.rejected |= (1 << category);
8325                     }
8326                   else if (category >= coding_category_raw_text)
8327                     continue;
8328                   else if (detect_info.checked & (1 << category))
8329                     {
8330                       if (highest
8331                           && (detect_info.found & (1 << category)))
8332                         break;
8333                     }
8334                   else if ((*(this->detector)) (&coding, &detect_info)
8335                            && highest
8336                            && (detect_info.found & (1 << category)))
8337                     {
                           /* For the auto UTF-16 category, CATEGORY is
                              narrowed to the LE or BE category according
                              to the found mask; THIS is left unchanged.  */
8338                       if (category == coding_category_utf_16_auto)
8339                         {
8340                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8341                             category = coding_category_utf_16_le;
8342                           else
8343                             category = coding_category_utf_16_be;
8344                         }
8345                       break;
8346                     }
8347                 }
8348             }
8349         }
8350
           /* Turn the detection masks into VAL.  If everything was
              rejected, or a null byte was seen, the answer is
              `no-conversion'.  */
8351       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8352           || null_byte_found)
8353         {
8354           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8355           id = CODING_SYSTEM_ID (Qno_conversion);
8356           val = Fcons (make_number (id), Qnil);
8357         }
           /* Nothing rejected and nothing found: answer with the coding
              system of the undecided category.  */
8358       else if (! detect_info.rejected && ! detect_info.found)
8359         {
8360           detect_info.found = CATEGORY_MASK_ANY;
8361           id = coding_categories[coding_category_undecided].id;
8362           val = Fcons (make_number (id), Qnil);
8363         }
8364       else if (highest)
8365         {
               /* NOTE(review): this relies on CATEGORY and THIS still
                  holding the values left by the detection loops above --
                  confirm they are always set when detect_info.found is
                  nonzero here.  */
8366           if (detect_info.found)
8367             {
8368               detect_info.found = 1 << category;
8369               val = Fcons (make_number (this->id), Qnil);
8370             }
8371           else
                 /* Nothing was positively found: take the first category
                    in priority order that was not rejected.  */
8372             for (i = 0; i < coding_category_raw_text; i++)
8373               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8374                 {
8375                   detect_info.found = 1 << coding_priorities[i];
8376                   id = coding_categories[coding_priorities[i]].id;
8377                   val = Fcons (make_number (id), Qnil);
8378                   break;
8379                 }
8380         }
8381       else
8382         {
8383           int mask = detect_info.rejected | detect_info.found;
8384           int found = 0;
8385
               /* Both loops run from the lowest priority upwards and push
                  onto the front of VAL, so each group ends up in priority
                  order.  This first loop collects categories that were
                  neither rejected nor found.  */
8386           for (i = coding_category_raw_text - 1; i >= 0; i--)
8387             {
8388               category = coding_priorities[i];
8389               if (! (mask & (1 << category)))
8390                 {
8391                   found |= 1 << category;
8392                   id = coding_categories[category].id;
8393                   if (id >= 0)
8394                     val = Fcons (make_number (id), val);
8395                 }
8396             }
               /* The found categories are pushed afterwards, so they
                  precede the merely-possible ones in VAL.  */
8397           for (i = coding_category_raw_text - 1; i >= 0; i--)
8398             {
8399               category = coding_priorities[i];
8400               if (detect_info.found & (1 << category))
8401                 {
8402                   id = coding_categories[category].id;
8403                   val = Fcons (make_number (id), val);
8404                 }
8405             }
8406           detect_info.found |= found;
8407         }
8408     }
       /* The text-format is one of the auto-detecting UTF families: run
          only that detector and pick the concrete variant from the found
          (or rejected) masks.  VAL stays nil if the detector fails.  */
8409   else if (base_category == coding_category_utf_8_auto)
8410     {
8411       if (detect_coding_utf_8 (&coding, &detect_info))
8412         {
8413           struct coding_system *this;
8414
8415           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8416             this = coding_categories + coding_category_utf_8_sig;
8417           else
8418             this = coding_categories + coding_category_utf_8_nosig;
8419           val = Fcons (make_number (this->id), Qnil);
8420         }
8421     }
8422   else if (base_category == coding_category_utf_16_auto)
8423     {
8424       if (detect_coding_utf_16 (&coding, &detect_info))
8425         {
8426           struct coding_system *this;
8427
8428           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8429             this = coding_categories + coding_category_utf_16_le;
8430           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8431             this = coding_categories + coding_category_utf_16_be;
8432           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8433             this = coding_categories + coding_category_utf_16_be_nosig;
8434           else
8435             this = coding_categories + coding_category_utf_16_le_nosig;
8436           val = Fcons (make_number (this->id), Qnil);
8437         }
8438     }
8439   else
8440     {
           /* The text-format is already fixed by CODING-SYSTEM; no
              detection is needed.  */
8441       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8442       val = Fcons (make_number (coding.id), Qnil);
8443     }
8444
8445   /* Then, detect eol-format if necessary.  */
8446   {
         /* Each variable holds an EOL_SEEN_* value, or -1 when no
            eol-format was determined for that family.  */
8447     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8448     Lisp_Object tail;
8449
         /* A vector EOL_TYPE selects detection from the data; otherwise
            the eol-format named by EOL_TYPE is used for every family.  */
8450     if (VECTORP (eol_type))
8451       {
8452         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8453           {
                 /* With a null byte present, LF is assumed without
                    scanning the data.  */
8454             if (null_byte_found)
8455               normal_eol = EOL_SEEN_LF;
8456             else
8457               normal_eol = detect_eol (coding.source, src_bytes,
8458                                        coding_category_raw_text);
8459           }
8460         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8461                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8462           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8463                                       coding_category_utf_16_be);
8464         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8465                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8466           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8467                                       coding_category_utf_16_le);
8468       }
8469     else
8470       {
8471         if (EQ (eol_type, Qunix))
8472           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8473         else if (EQ (eol_type, Qdos))
8474           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8475         else
8476           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8477       }
8478
         /* Replace every id in VAL, in place, by a coding system symbol:
            element 0, 1 or 2 of the candidate's eol-type vector for LF,
            CRLF or CR respectively, or the candidate's own name when no
            eol-format applies.  */
8479     for (tail = val; CONSP (tail); tail = XCDR (tail))
8480       {
8481         enum coding_category category;
8482         int this_eol;
8483
8484         id = XINT (XCAR (tail));
8485         attrs = CODING_ID_ATTRS (id);
8486         category = XINT (CODING_ATTR_CATEGORY (attrs));
8487         eol_type = CODING_ID_EOL_TYPE (id);
8488         if (VECTORP (eol_type))
8489           {
8490             if (category == coding_category_utf_16_be
8491                 || category == coding_category_utf_16_be_nosig)
8492               this_eol = utf_16_be_eol;
8493             else if (category == coding_category_utf_16_le
8494                      || category == coding_category_utf_16_le_nosig)
8495               this_eol = utf_16_le_eol;
8496             else
8497               this_eol = normal_eol;
8498
8499             if (this_eol == EOL_SEEN_LF)
8500               XSETCAR (tail, AREF (eol_type, 0));
8501             else if (this_eol == EOL_SEEN_CRLF)
8502               XSETCAR (tail, AREF (eol_type, 1));
8503             else if (this_eol == EOL_SEEN_CR)
8504               XSETCAR (tail, AREF (eol_type, 2));
8505             else
8506               XSETCAR (tail, CODING_ID_NAME (id));
8507           }
8508         else
8509           XSETCAR (tail, CODING_ID_NAME (id));
8510       }
8511   }
8512
       /* With HIGHEST, return just the first candidate (nil if none).  */
8513   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8514 }
8515
8516
8517 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8518        2, 3, 0,
8519        doc: /* Detect coding system of the text in the region between START and END.
8520 Return a list of possible coding systems ordered by priority.
8521 The coding systems to try and their priorities follows what
8522 the function `coding-system-priority-list' (which see) returns.
8523
8524 If only ASCII characters are found (except for such ISO-2022 control
8525 characters as ESC), it returns a list of single element `undecided'
8526 or its subsidiary coding system according to a detected end-of-line
8527 format.
8528
8529 If optional argument HIGHEST is non-nil, return the coding system of
8530 highest priority.  */)
8531      (start, end, highest)
8532      Lisp_Object start, end, highest;
8533 {
8534   int from, to;
8535   int from_byte, to_byte;
8536
8537   CHECK_NUMBER_COERCE_MARKER (start);
8538   CHECK_NUMBER_COERCE_MARKER (end);
8539
8540   validate_region (&start, &end);
8541   from = XINT (start), to = XINT (end);
8542   from_byte = CHAR_TO_BYTE (from);
8543   to_byte = CHAR_TO_BYTE (to);
8544
8545   if (from < GPT && to >= GPT)
8546     move_gap_both (to, to_byte);
8547
8548   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8549                                to - from, to_byte - from_byte,
8550                                !NILP (highest),
8551                                !NILP (current_buffer
8552                                       ->enable_multibyte_characters),
8553                                Qnil);
8554 }
8555
8556 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8557        1, 2, 0,
8558        doc: /* Detect coding system of the text in STRING.
8559 Return a list of possible coding systems ordered by priority.
8560 The coding systems to try and their priorities follows what
8561 the function `coding-system-priority-list' (which see) returns.
8562
8563 If only ASCII characters are found (except for such ISO-2022 control
8564 characters as ESC), it returns a list of single element `undecided'
8565 or its subsidiary coding system according to a detected end-of-line
8566 format.
8567
8568 If optional argument HIGHEST is non-nil, return the coding system of
8569 highest priority.  */)
8570      (string, highest)
8571      Lisp_Object string, highest;
8572 {
8573   CHECK_STRING (string);
8574
8575   return detect_coding_system (SDATA (string),
8576                                SCHARS (string), SBYTES (string),
8577                                !NILP (highest), STRING_MULTIBYTE (string),
8578                                Qnil);
8579 }
8580
8581
8582 static INLINE int
8583 char_encodable_p (c, attrs)
8584      int c;
8585      Lisp_Object attrs;
8586 {
8587   Lisp_Object tail;
8588   struct charset *charset;
8589   Lisp_Object translation_table;
8590
8591   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8592   if (! NILP (translation_table))
8593     c = translate_char (translation_table, c);
8594   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8595        CONSP (tail); tail = XCDR (tail))
8596     {
8597       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8598       if (CHAR_CHARSET_P (c, charset))
8599         break;
8600     }
8601   return (! NILP (tail));
8602 }
8603
8604
8605 /* Return a list of coding systems that safely encode the text between
8606    START and END.  If EXCLUDE is non-nil, it is a list of coding
8607    systems not to check.  The returned list doesn't contain any such
8608    coding systems.  In any case, if the text contains only ASCII or is
8609    unibyte, return t.
     
        START may also be a string; then the whole string is examined and
        END is not looked at.  The returned list always contains
        `raw-text' and `no-conversion'.  */
8610
8611 DEFUN ("find-coding-systems-region-internal",
8612        Ffind_coding_systems_region_internal,
8613        Sfind_coding_systems_region_internal, 2, 3, 0,
8614        doc: /* Internal use only.  */)
8615      (start, end, exclude)
8616      Lisp_Object start, end, exclude;
8617 {
8618   Lisp_Object coding_attrs_list, safe_codings;
8619   EMACS_INT start_byte, end_byte;
8620   const unsigned char *p, *pbeg, *pend;
8621   int c;
8622   Lisp_Object tail, elt;
8623
8624   if (STRINGP (start))
8625     {
           /* A unibyte string, or one whose character count equals its
              byte count, needs no further checking.  */
8626       if (!STRING_MULTIBYTE (start)
8627           || SCHARS (start) == SBYTES (start))
8628         return Qt;
8629       start_byte = 0;
8630       end_byte = SBYTES (start);
8631     }
8632   else
8633     {
8634       CHECK_NUMBER_COERCE_MARKER (start);
8635       CHECK_NUMBER_COERCE_MARKER (end);
8636       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8637         args_out_of_range (start, end);
8638       if (NILP (current_buffer->enable_multibyte_characters))
8639         return Qt;
8640       start_byte = CHAR_TO_BYTE (XINT (start));
8641       end_byte = CHAR_TO_BYTE (XINT (end));
           /* Same shortcut as for strings: as many bytes as characters.  */
8642       if (XINT (end) - XINT (start) == end_byte - start_byte)
8643         return Qt;
8644
           /* If the gap lies strictly inside the region, move it to
              whichever end of the region is nearer -- presumably so the
              region can be scanned below as one pointer range.  */
8645       if (XINT (start) < GPT && XINT (end) > GPT)
8646         {
8647           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8648             move_gap_both (XINT (start), start_byte);
8649           else
8650             move_gap_both (XINT (end), end_byte);
8651         }
8652     }
8653
       /* Collect the attribute vectors of the candidates: every element
          of Vcoding_system_list that is not in EXCLUDE, is its own base
          name, and is not of type `undecided'.  The result of
          get_translation_table is stored into each attribute vector's
          coding_attr_trans_tbl slot, where char_encodable_p reads it.  */
8654   coding_attrs_list = Qnil;
8655   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8656     if (NILP (exclude)
8657         || NILP (Fmemq (XCAR (tail), exclude)))
8658       {
8659         Lisp_Object attrs;
8660
8661         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8662         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8663             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8664           {
8665             ASET (attrs, coding_attr_trans_tbl,
8666                   get_translation_table (attrs, 1, NULL));
8667             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8668           }
8669       }
8670
8671   if (STRINGP (start))
8672     p = pbeg = SDATA (start);
8673   else
8674     p = pbeg = BYTE_POS_ADDR (start_byte);
8675   pend = p + (end_byte - start_byte);
8676
       /* Trim the ASCII bytes at both ends of the range.  */
8677   while (p < pend && ASCII_BYTE_P (*p)) p++;
8678   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8679
8680   while (p < pend)
8681     {
8682       if (ASCII_BYTE_P (*p))
8683         p++;
8684       else
8685         {
8686           c = STRING_CHAR_ADVANCE (p);
8687
8688           charset_map_loaded = 0;
               /* Drop every candidate that cannot encode C.  A rejected
                  element is removed in place by copying the next cell
                  over it (TAIL is then re-examined without advancing);
                  a rejected last element has its car set to nil.  */
8689           for (tail = coding_attrs_list; CONSP (tail);)
8690             {
8691               elt = XCAR (tail);
8692               if (NILP (elt))
8693                 tail = XCDR (tail);
8694               else if (char_encodable_p (c, elt))
8695                 tail = XCDR (tail);
8696               else if (CONSP (XCDR (tail)))
8697                 {
8698                   XSETCAR (tail, XCAR (XCDR (tail)));
8699                   XSETCDR (tail, XCDR (XCDR (tail)));
8700                 }
8701               else
8702                 {
8703                   XSETCAR (tail, Qnil);
8704                   tail = XCDR (tail);
8705                 }
8706             }
               /* If charset_map_loaded became nonzero during the checks,
                  recompute the scan pointers from their offsets --
                  presumably because the text may have been relocated
                  while a charset map was loaded.  */
8707           if (charset_map_loaded)
8708             {
8709               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8710
8711               if (STRINGP (start))
8712                 pbeg = SDATA (start);
8713               else
8714                 pbeg = BYTE_POS_ADDR (start_byte);
8715               p = pbeg + p_offset;
8716               pend = pbeg + pend_offset;
8717             }
8718         }
8719     }
8720
       /* Start from the two always-present entries and push the base
          name of every surviving candidate in front of them.  */
8721   safe_codings = list2 (Qraw_text, Qno_conversion);
8722   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8723     if (! NILP (XCAR (tail)))
8724       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8725
8726   return safe_codings;
8727 }
8728
8729
8730 DEFUN ("unencodable-char-position", Funencodable_char_position,
8731        Sunencodable_char_position, 3, 5, 0,
8732        doc: /*
8733 Return position of first un-encodable character in a region.
8734 START and END specify the region and CODING-SYSTEM specifies the
8735 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8736
8737 If optional 4th argument COUNT is non-nil, it specifies at most how
8738 many un-encodable characters to search.  In this case, the value is a
8739 list of positions.
8740
8741 If optional 5th argument STRING is non-nil, it is a string to search
8742 for un-encodable characters.  In that case, START and END are indexes
8743 to the string.  */)
8744      (start, end, coding_system, count, string)
8745      Lisp_Object start, end, coding_system, count, string;
8746 {
8747   int n;
8748   struct coding_system coding;
8749   Lisp_Object attrs, charset_list, translation_table;
8750   Lisp_Object positions;
8751   int from, to;
8752   const unsigned char *p, *stop, *pend;
8753   int ascii_compatible;
8754
8755   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8756   attrs = CODING_ID_ATTRS (coding.id);
8757   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8758     return Qnil;
8759   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8760   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8761   translation_table = get_translation_table (attrs, 1, NULL);
8762
8763   if (NILP (string))
8764     {
8765       validate_region (&start, &end);
8766       from = XINT (start);
8767       to = XINT (end);
8768       if (NILP (current_buffer->enable_multibyte_characters)
8769           || (ascii_compatible
8770               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8771         return Qnil;
8772       p = CHAR_POS_ADDR (from);
8773       pend = CHAR_POS_ADDR (to);
8774       if (from < GPT && to >= GPT)
8775         stop = GPT_ADDR;
8776       else
8777         stop = pend;
8778     }
8779   else
8780     {
8781       CHECK_STRING (string);
8782       CHECK_NATNUM (start);
8783       CHECK_NATNUM (end);
8784       from = XINT (start);
8785       to = XINT (end);
8786       if (from > to
8787           || to > SCHARS (string))
8788         args_out_of_range_3 (string, start, end);
8789       if (! STRING_MULTIBYTE (string))
8790         return Qnil;
8791       p = SDATA (string) + string_char_to_byte (string, from);
8792       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8793       if (ascii_compatible && (to - from) == (pend - p))
8794         return Qnil;
8795     }
8796
8797   if (NILP (count))
8798     n = 1;
8799   else
8800     {
8801       CHECK_NATNUM (count);
8802       n = XINT (count);
8803     }
8804
8805   positions = Qnil;
8806   while (1)
8807     {
8808       int c;
8809
8810       if (ascii_compatible)
8811         while (p < stop && ASCII_BYTE_P (*p))
8812           p++, from++;
8813       if (p >= stop)
8814         {
8815           if (p >= pend)
8816             break;
8817           stop = pend;
8818           p = GAP_END_ADDR;
8819         }
8820
8821       c = STRING_CHAR_ADVANCE (p);
8822       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8823           && ! char_charset (translate_char (translation_table, c),
8824                              charset_list, NULL))
8825         {
8826           positions = Fcons (make_number (from), positions);
8827           n--;
8828           if (n == 0)
8829             break;
8830         }
8831
8832       from++;
8833     }
8834
8835   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8836 }
8837
8838
8839 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8840        Scheck_coding_systems_region, 3, 3, 0,
8841        doc: /* Check if the region is encodable by coding systems.
8842
8843 START and END are buffer positions specifying the region.
8844 CODING-SYSTEM-LIST is a list of coding systems to check.
8845
8846 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8847 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8848 whole region, POS0, POS1, ... are buffer positions where non-encodable
8849 characters are found.
8850
8851 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8852 value is nil.
8853
8854 START may be a string.  In that case, check if the string is
8855 encodable, and the value contains indices to the string instead of
8856 buffer positions.  END is ignored.
8857
8858 If the current buffer (or START if it is a string) is unibyte, the value
8859 is nil.  */)
8860      (start, end, coding_system_list)
8861      Lisp_Object start, end, coding_system_list;
8862 {
8863   Lisp_Object list;
8864   EMACS_INT start_byte, end_byte;
8865   int pos;
8866   const unsigned char *p, *pbeg, *pend;
8867   int c;
8868   Lisp_Object tail, elt, attrs, spec;
8869
8870   if (STRINGP (start))
8871     {
           /* A unibyte string, or a multibyte string whose character
              count equals its byte count, yields nil without scanning.  */
8872       if (!STRING_MULTIBYTE (start)
8873           || SCHARS (start) == SBYTES (start))
8874         return Qnil;
8875       start_byte = 0;
8876       end_byte = SBYTES (start);
8877       pos = 0;
8878     }
8879   else
8880     {
8881       CHECK_NUMBER_COERCE_MARKER (start);
8882       CHECK_NUMBER_COERCE_MARKER (end);
8883       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8884         args_out_of_range (start, end);
8885       if (NILP (current_buffer->enable_multibyte_characters))
8886         return Qnil;
8887       start_byte = CHAR_TO_BYTE (XINT (start));
8888       end_byte = CHAR_TO_BYTE (XINT (end));
           /* Same shortcut as for strings: character count equals byte
              count, so there is nothing to report.  */
8889       if (XINT (end) - XINT (start) == end_byte - start_byte)
8890         return Qnil;
8891
           /* The region straddles the gap: move the gap to whichever end
              of the region is nearer to it (presumably so that the
              region can be scanned through a single pointer below).  */
8892       if (XINT (start) < GPT && XINT (end) > GPT)
8893         {
8894           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8895             move_gap_both (XINT (start), start_byte);
8896           else
8897             move_gap_both (XINT (end), end_byte);
8898         }
8899       pos = XINT (start);
8900     }
8901
       /* Build a working alist of (CODING-SYSTEM ATTRS); positions of
          unencodable characters are later pushed right after ATTRS.  */
8902   list = Qnil;
8903   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8904     {
8905       elt = XCAR (tail);
           /* Validate ELT before indexing its spec: for something that is
              not a coding system the spec is not a vector, and AREF on
              it would be invalid.  This signals an error instead.  */
           CHECK_CODING_SYSTEM_GET_SPEC (elt, spec);
8906       attrs = AREF (spec, 0);
8907       ASET (attrs, coding_attr_trans_tbl,
8908             get_translation_table (attrs, 1, NULL));
8909       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8910     }
8911
8912   if (STRINGP (start))
8913     p = pbeg = SDATA (start);
8914   else
8915     p = pbeg = BYTE_POS_ADDR (start_byte);
8916   pend = p + (end_byte - start_byte);
8917
       /* Skip the leading and the trailing runs of ASCII bytes.  */
8918   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8919   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8920
       /* POS is advanced once per character, ASCII or not.  */
8921   while (p < pend)
8922     {
8923       if (ASCII_BYTE_P (*p))
8924         p++;
8925       else
8926         {
8927           c = STRING_CHAR_ADVANCE (p);
8928
8929           charset_map_loaded = 0;
8930           for (tail = list; CONSP (tail); tail = XCDR (tail))
8931             {
                   /* ELT is (ATTRS POS ...); record POS right after ATTRS,
                      so positions accumulate in reverse order.  */
8932               elt = XCDR (XCAR (tail));
8933               if (! char_encodable_p (c, XCAR (elt)))
8934                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8935             }
8936           if (charset_map_loaded)
8937             {
                   /* The flag was raised during the checks above;
                      recompute the pointers from their offsets
                      (presumably because the text may have moved).  */
8938               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8939
8940               if (STRINGP (start))
8941                 pbeg = SDATA (start);
8942               else
8943                 pbeg = BYTE_POS_ADDR (start_byte);
8944               p = pbeg + p_offset;
8945               pend = pbeg + pend_offset;
8946             }
8947         }
8948       pos++;
8949     }
8950
       /* Keep only the coding systems for which at least one position
          was recorded, dropping ATTRS and restoring ascending order.  */
8951   tail = list;
8952   list = Qnil;
8953   for (; CONSP (tail); tail = XCDR (tail))
8954     {
8955       elt = XCAR (tail);
8956       if (CONSP (XCDR (XCDR (elt))))
8957         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8958                       list);
8959     }
8960
8961   return list;
8962 }
8963
8964
8965 Lisp_Object
8966 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
8967      Lisp_Object start, end, coding_system, dst_object;
8968      int encodep, norecord;
8969 {
       /* Encode (ENCODEP nonzero) or decode (ENCODEP zero) the text of
          the current buffer between START and END by CODING_SYSTEM; nil
          stands for Qno_conversion.  DST_OBJECT nil means the current
          buffer is also the destination; otherwise it must be t or a
          buffer.  Unless NORECORD is nonzero, store the name of the
          coding system that was set up in Vlast_coding_system_used.
          Return the number of produced characters when the destination
          is a buffer, and coding.dst_object otherwise.  */
8970   struct coding_system coding;
8971   EMACS_INT from, from_byte, to, to_byte;
8972   Lisp_Object src_object;
8973
8974   CHECK_NUMBER_COERCE_MARKER (start);
8975   CHECK_NUMBER_COERCE_MARKER (end);
8976   if (NILP (coding_system))
8977     coding_system = Qno_conversion;
8978   else
8979     CHECK_CODING_SYSTEM (coding_system);
8980   src_object = Fcurrent_buffer ();
8981   if (NILP (dst_object))
8982     dst_object = src_object;
8983   else if (! EQ (dst_object, Qt))
8984     CHECK_BUFFER (dst_object);
8985
8986   validate_region (&start, &end);
8987   from = XFASTINT (start);
8988   from_byte = CHAR_TO_BYTE (from);
8989   to = XFASTINT (end);
8990   to_byte = CHAR_TO_BYTE (to);
8991
8992   setup_coding_system (coding_system, &coding);
       /* The whole region is converted in one call, so mark it as the
          last block.  */
8993   coding.mode |= CODING_MODE_LAST_BLOCK;
8994
8995   if (encodep)
8996     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8997                           dst_object);
8998   else
8999     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9000                           dst_object);
9001   if (! norecord)
9002     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9003
9004   return (BUFFERP (dst_object)
9005           ? make_number (coding.produced_char)
9006           : coding.dst_object);
9007 }
9008
9009
9010 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9011        3, 4, "r\nzCoding system: ",
9012        doc: /* Decode the current region from the specified coding system.
9013 When called from a program, takes four arguments:
9014         START, END, CODING-SYSTEM, and DESTINATION.
9015 START and END are buffer positions.
9016
9017 Optional 4th argument DESTINATION specifies where the decoded text goes.
9018 If nil, the region between START and END is replaced by the decoded text.
9019 If buffer, the decoded text is inserted in that buffer after point (point
9020 does not move).
9021 In those cases, the length of the decoded text is returned.
9022 If DESTINATION is t, the decoded text is returned.
9023
9024 This function sets `last-coding-system-used' to the precise coding system
9025 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9026 not fully specified.)  */)
9027      (start, end, coding_system, destination)
9028      Lisp_Object start, end, coding_system, destination;
9029 {
       /* Last two arguments: ENCODEP is 0 (decode) and NORECORD is 0, so
          code_convert_region updates Vlast_coding_system_used.  */
9030   return code_convert_region (start, end, coding_system, destination, 0, 0);
9031 }
9032
9033 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9034        3, 4, "r\nzCoding system: ",
9035        doc: /* Encode the current region by specified coding system.
9036 When called from a program, takes four arguments:
9037         START, END, CODING-SYSTEM and DESTINATION.
9038 START and END are buffer positions.
9039
9040 Optional 4th argument DESTINATION specifies where the encoded text goes.
9041 If nil, the region between START and END is replaced by the encoded text.
9042 If buffer, the encoded text is inserted in that buffer after point (point
9043 does not move).
9044 In those cases, the length of the encoded text is returned.
9045 If DESTINATION is t, the encoded text is returned.
9046
9047 This function sets `last-coding-system-used' to the precise coding system
9048 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9049 not fully specified.)  */)
9050   (start, end, coding_system, destination)
9051      Lisp_Object start, end, coding_system, destination;
9052 {
       /* Last two arguments: ENCODEP is 1 (encode) and NORECORD is 0, so
          code_convert_region updates Vlast_coding_system_used.  */
9053   return code_convert_region (start, end, coding_system, destination, 1, 0);
9054 }
9055
9056 Lisp_Object
9057 code_convert_string (string, coding_system, dst_object,
9058                      encodep, nocopy, norecord)
9059      Lisp_Object string, coding_system, dst_object;
9060      int encodep, nocopy, norecord;
9061 {
       /* Encode (ENCODEP nonzero) or decode (ENCODEP zero) STRING by
          CODING_SYSTEM; nil stands for Qno_conversion.  DST_OBJECT nil is
          treated as t; otherwise it must be t or a buffer.  NOCOPY
          nonzero means STRING itself may be returned when there is
          nothing to convert.  Unless NORECORD is nonzero, update
          Vlast_coding_system_used.  Return the number of produced
          characters when the destination is a buffer, and
          coding.dst_object otherwise.  */
9062   struct coding_system coding;
9063   EMACS_INT chars, bytes;
9064
9065   CHECK_STRING (string);
9066   if (NILP (coding_system))
9067     {
9068       if (! norecord)
9069         Vlast_coding_system_used = Qno_conversion;
           /* Trivial case: no coding system and no destination.  Hand
              back STRING itself only when the caller allowed it with
              NOCOPY; otherwise return a fresh copy.  (The two arms used
              to be swapped.)  */
9070       if (NILP (dst_object))
9071         return (nocopy ? string : Fcopy_sequence (string));
9072     }
9073
9074   if (NILP (coding_system))
9075     coding_system = Qno_conversion;
9076   else
9077     CHECK_CODING_SYSTEM (coding_system);
9078   if (NILP (dst_object))
9079     dst_object = Qt;
9080   else if (! EQ (dst_object, Qt))
9081     CHECK_BUFFER (dst_object);
9082
9083   setup_coding_system (coding_system, &coding);
       /* The whole string is converted in one call, so mark it as the
          last block.  */
9084   coding.mode |= CODING_MODE_LAST_BLOCK;
9085   chars = SCHARS (string);
9086   bytes = SBYTES (string);
9087   if (encodep)
9088     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9089   else
9090     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9091   if (! norecord)
9092     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9093
9094   return (BUFFERP (dst_object)
9095           ? make_number (coding.produced_char)
9096           : coding.dst_object);
9097 }
9098
9099
9100 /* Encode or decode STRING according to CODING_SYSTEM.
9101    Do not set Vlast_coding_system_used.
        ENCODEP nonzero means encode, zero means decode.  The destination
        handed to code_convert_string is t, NOCOPY is 0 and NORECORD is 1.
9102
9103    This function is called only from macros DECODE_FILE and
9104    ENCODE_FILE, thus we ignore character composition.  */
9105
9106 Lisp_Object
9107 code_convert_string_norecord (string, coding_system, encodep)
9108      Lisp_Object string, coding_system;
9109      int encodep;
9110 {
9111   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9112 }
9113
9114
9115 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9116        2, 4, 0,
9117        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9118
9119 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9120 if the decoding operation is trivial.
9121
9122 Optional fourth arg BUFFER non-nil means that the decoded text is
9123 inserted in that buffer after point (point does not move).  In this
9124 case, the return value is the length of the decoded text.
9125
9126 This function sets `last-coding-system-used' to the precise coding system
9127 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9128 not fully specified.)  */)
9129   (string, coding_system, nocopy, buffer)
9130      Lisp_Object string, coding_system, nocopy, buffer;
9131 {
       /* Trailing arguments: ENCODEP is 0 (decode), NOCOPY is the Lisp
          flag turned into a C boolean, and NORECORD is 0 so that
          Vlast_coding_system_used is updated.  */
9132   return code_convert_string (string, coding_system, buffer,
9133                               0, ! NILP (nocopy), 0);
9134 }
9135
9136 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9137        2, 4, 0,
9138        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9139
9140 Optional third arg NOCOPY non-nil means it is OK to return STRING
9141 itself if the encoding operation is trivial.
9142
9143 Optional fourth arg BUFFER non-nil means that the encoded text is
9144 inserted in that buffer after point (point does not move).  In this
9145 case, the return value is the length of the encoded text.
9146
9147 This function sets `last-coding-system-used' to the precise coding system
9148 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9149 not fully specified.)  */)
9150      (string, coding_system, nocopy, buffer)
9151      Lisp_Object string, coding_system, nocopy, buffer;
9152 {
       /* Trailing arguments: ENCODEP is 1 (encode), NOCOPY is the Lisp
          flag turned into a C boolean, and NORECORD is 0.  NORECORD used
          to be 1, which kept Vlast_coding_system_used untouched and so
          contradicted the doc string above and the behavior of
          decode-coding-string.  */
9153   return code_convert_string (string, coding_system, buffer,
9154                               1, ! NILP (nocopy), 0);
9155 }
9156
9157 \f
9158 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9159        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9160 Return the corresponding character.  */)
9161      (code)
9162      Lisp_Object code;
9163 {
9164   Lisp_Object spec, attrs, val;
9165   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9166   int c;
9167
9168   CHECK_NATNUM (code);
9169   c = XFASTINT (code);
9170   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9171   attrs = AREF (spec, 0);
9172
       /* An ASCII code decodes to itself when the coding system is
          ASCII compatible.  */
9173   if (ASCII_BYTE_P (c)
9174       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9175     return code;
9176
       /* The charset list holds, in this order, the IDs of the roman,
          kana and kanji charsets.  */
9177   val = CODING_ATTR_CHARSET_LIST (attrs);
9178   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9179   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9180   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9181
9182   if (c <= 0x7F)
9183     charset = charset_roman;
       /* Single-byte kana.  The upper bound is inclusive: 0xDF is the
          last half-width katakana code and used to fall through to the
          two-byte branch, where it was rejected as invalid.  */
9184   else if (c >= 0xA0 && c <= 0xDF)
9185     {
9186       charset = charset_kana;
9187       c -= 0x80;
9188     }
9189   else
9190     {
           /* Two-byte code: S1 is the lead byte, S2 the trail byte.  */
9191       int s1 = c >> 8, s2 = c & 0xFF;
9192
9193       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9194           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
             /* Report the integer value; CODE itself is a Lisp_Object and
                must not be passed for %d.  */
9195         error ("Invalid code: %d", (int) XFASTINT (code));
9196       SJIS_TO_JIS (c);
9197       charset = charset_kanji;
9198     }
9199   c = DECODE_CHAR (charset, c);
9200   if (c < 0)
         /* C has been overwritten by now, so recover the original number
            from CODE.  */
9201     error ("Invalid code: %d", (int) XFASTINT (code));
9202   return make_number (c);
9203 }
9204
9205
9206 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9207        doc: /* Encode a Japanese character CH to shift_jis encoding.
9208 Return the corresponding code in SJIS.  */)
9209      (ch)
9210     Lisp_Object ch;
9211 {
9212   Lisp_Object spec, attrs, charset_list;
9213   int c;
9214   struct charset *charset;
9215   unsigned code;
9216
9217   CHECK_CHARACTER (ch);
9218   c = XFASTINT (ch);
9219   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9220   attrs = AREF (spec, 0);
9221
       /* An ASCII character encodes to itself when the coding system is
          ASCII compatible.  */
9222   if (ASCII_CHAR_P (c)
9223       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9224     return ch;
9225
9226   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9227   charset = char_charset (c, charset_list, &code);
       /* char_charset can return a null pointer when given a charset
          list (other callers in this file test its result for that), so
          check it before CHARSET_INVALID_CODE looks at CHARSET.  CODE is
          not read in that case.  */
9228   if (! charset || code == CHARSET_INVALID_CODE (charset))
9229     error ("Can't encode by shift_jis encoding: %d", c);
9230   JIS_TO_SJIS (code);
9231
9232   return make_number (code);
9233 }
9234
9235 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9236        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9237 Return the corresponding character.  */)
9238      (code)
9239      Lisp_Object code;
9240 {
9241   Lisp_Object spec, attrs, val;
9242   struct charset *charset_roman, *charset_big5, *charset;
9243   int c;
9244
9245   CHECK_NATNUM (code);
9246   c = XFASTINT (code);
9247   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9248   attrs = AREF (spec, 0);
9249
       /* An ASCII code decodes to itself when the coding system is
          ASCII compatible.  */
9250   if (ASCII_BYTE_P (c)
9251       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9252     return code;
9253
       /* The charset list holds, in this order, the IDs of the roman and
          Big5 charsets.  */
9254   val = CODING_ATTR_CHARSET_LIST (attrs);
9255   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9256   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9257
9258   if (c <= 0x7F)
9259     charset = charset_roman;
9260   else
9261     {
           /* B1 is the lead byte, B2 the trail byte.  B2 must keep all
              eight bits: the range tests below compare it against 0xA1
              and 0xFE, and the former 0x7F mask made valid trail bytes
              0xA1..0xBF look like 0x21..0x3F and be rejected.  */
9262       int b1 = c >> 8, b2 = c & 0xFF;
9263       if (b1 < 0xA1 || b1 > 0xFE
9264           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
             /* Report the integer value; CODE itself is a Lisp_Object and
                must not be passed for %d.  */
9265         error ("Invalid code: %d", (int) XFASTINT (code));
9266       charset = charset_big5;
9267     }
9268   c = DECODE_CHAR (charset, (unsigned )c);
9269   if (c < 0)
         /* C has been overwritten by now, so recover the original number
            from CODE.  */
9270     error ("Invalid code: %d", (int) XFASTINT (code));
9271   return make_number (c);
9272 }
9273
9274 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9275        doc: /* Encode the Big5 character CH to BIG5 coding system.
9276 Return the corresponding character code in Big5.  */)
9277      (ch)
9278      Lisp_Object ch;
9279 {
9280   Lisp_Object spec, attrs, charset_list;
9281   struct charset *charset;
9282   int c;
9283   unsigned code;
9284
9285   CHECK_CHARACTER (ch);
9286   c = XFASTINT (ch);
9287   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9288   attrs = AREF (spec, 0);
       /* An ASCII character encodes to itself when the coding system is
          ASCII compatible.  */
9289   if (ASCII_CHAR_P (c)
9290       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9291     return ch;
9292
9293   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9294   charset = char_charset (c, charset_list, &code);
       /* char_charset can return a null pointer when given a charset
          list (other callers in this file test its result for that), so
          check it before CHARSET_INVALID_CODE looks at CHARSET.  CODE is
          not read in that case.  */
9295   if (! charset || code == CHARSET_INVALID_CODE (charset))
9296     error ("Can't encode by Big5 encoding: %d", c);
9297
9298   return make_number (code);
9299 }
9300
9301 \f
9302 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9303        Sset_terminal_coding_system_internal, 1, 2, 0,
9304        doc: /* Internal use only.  */)
9305      (coding_system, terminal)
9306      Lisp_Object coding_system;
9307      Lisp_Object terminal;
9308 {
       /* Set up the terminal-coding structure of TERMINAL (as resolved
          by get_terminal) for CODING_SYSTEM.  Always returns nil.  */
9309   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9310   CHECK_SYMBOL (coding_system);
9311   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9312   /* We had better not send unsafe characters to terminal.  */
9313   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9314   /* Character composition should be disabled.  */
9315   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
       /* Source is flagged multibyte, destination unibyte.  */
9316   terminal_coding->src_multibyte = 1;
9317   terminal_coding->dst_multibyte = 0;
9318   return Qnil;
9319 }
9320
9321 DEFUN ("set-safe-terminal-coding-system-internal",
9322        Fset_safe_terminal_coding_system_internal,
9323        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9324        doc: /* Internal use only.  */)
9325      (coding_system)
9326      Lisp_Object coding_system;
9327 {
       /* Set up the file-level safe_terminal_coding structure for
          CODING_SYSTEM.  Always returns nil.  */
9328   CHECK_SYMBOL (coding_system);
9329   setup_coding_system (Fcheck_coding_system (coding_system),
9330                        &safe_terminal_coding);
9331   /* Character composition should be disabled.  */
9332   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
       /* Source is flagged multibyte, destination unibyte.  */
9333   safe_terminal_coding.src_multibyte = 1;
9334   safe_terminal_coding.dst_multibyte = 0;
9335   return Qnil;
9336 }
9337
9338 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9339        Sterminal_coding_system, 0, 1, 0,
9340        doc: /* Return coding system specified for terminal output on the given terminal.
9341 TERMINAL may be a terminal id, a frame, or nil for the selected
9342 frame's terminal device.  */)
9343      (terminal)
9344      Lisp_Object terminal;
9345 {
       /* Look up the name recorded for the terminal's output coding
          structure.  */
9346   struct coding_system *terminal_coding
9347     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9348   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9349
9350   /* For backward compatibility, return nil if it is `undecided'. */
9351   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9352 }
9353
9354 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9355        Sset_keyboard_coding_system_internal, 1, 2, 0,
9356        doc: /* Internal use only.  */)
9357      (coding_system, terminal)
9358      Lisp_Object coding_system;
9359      Lisp_Object terminal;
9360 {
       /* Set up the keyboard-coding structure of TERMINAL (as resolved
          by get_terminal) for CODING_SYSTEM.  Always returns nil.  */
9361   struct terminal *t = get_terminal (terminal, 1);
9362   CHECK_SYMBOL (coding_system);
9363   setup_coding_system (Fcheck_coding_system (coding_system),
9364                        TERMINAL_KEYBOARD_CODING (t));
9365   /* Character composition should be disabled.  */
9366   TERMINAL_KEYBOARD_CODING (t)->common_flags
9367     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9368   return Qnil;
9369 }
9370
9371 DEFUN ("keyboard-coding-system",
9372        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9373        doc: /* Return coding system specified for decoding keyboard input.  */)
9374      (terminal)
9375      Lisp_Object terminal;
9376 {
       /* TERMINAL is optional and is resolved by get_terminal, exactly
          as in terminal-coding-system; the value is the name recorded
          for that terminal's keyboard coding structure.  */
9377   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9378                          (get_terminal (terminal, 1))->id);
9379 }
9380
9381 \f
9382 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9383        Sfind_operation_coding_system,  1, MANY, 0,
9384        doc: /* Choose a coding system for an operation based on the target name.
9385 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9386 DECODING-SYSTEM is the coding system to use for decoding
9387 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9388 for encoding (in case OPERATION does encoding).
9389
9390 The first argument OPERATION specifies an I/O primitive:
9391   For file I/O, `insert-file-contents' or `write-region'.
9392   For process I/O, `call-process', `call-process-region', or `start-process'.
9393   For network I/O, `open-network-stream'.
9394
9395 The remaining arguments should be the same arguments that were passed
9396 to the primitive.  Depending on which primitive, one of those arguments
9397 is selected as the TARGET.  For example, if OPERATION does file I/O,
9398 whichever argument specifies the file name is TARGET.
9399
9400 TARGET has a meaning which depends on OPERATION:
9401   For file I/O, TARGET is a file name (except for the special case below).
9402   For process I/O, TARGET is a process name.
9403   For network I/O, TARGET is a service name or a port number.
9404
9405 This function looks up what is specified for TARGET in
9406 `file-coding-system-alist', `process-coding-system-alist',
9407 or `network-coding-system-alist' depending on OPERATION.
9408 They may specify a coding system, a cons of coding systems,
9409 or a function symbol to call.
9410 In the last case, we call the function with one argument,
9411 which is a list of all the arguments given to this function.
9412 If the function can't decide a coding system, it can return
9413 `undecided' so that the normal code-detection is performed.
9414
9415 If OPERATION is `insert-file-contents', the argument corresponding to
9416 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9417 file name to look up, and BUFFER is a buffer that contains the file's
9418 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9419 function to call for FILENAME, that function should examine the
9420 contents of BUFFER instead of reading the file.
9421
9422 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9423      (nargs, args)
9424      int nargs;
9425      Lisp_Object *args;
9426 {
9427   Lisp_Object operation, target_idx, target, val;
9428   register Lisp_Object chain;
9429
9430   if (nargs < 2)
9431     error ("Too few arguments");
9432   operation = args[0];
       /* The index of the target argument is stored as a property of
          OPERATION.  It is used to subscript ARGS below, so a negative
          value is rejected here too.  */
9433   if (!SYMBOLP (operation)
9434       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx))
           || XINT (target_idx) < 0)
9435     error ("Invalid first argument");
       /* The target is args[target_idx + 1], which exists only when
          NARGS is at least target_idx + 2.  The former test used `<' and
          let NARGS == target_idx + 1 through, reading one element past
          the end of ARGS.  */
9436   if (nargs <= 1 + XINT (target_idx))
9437     error ("Too few arguments for operation: %s",
9438            SDATA (SYMBOL_NAME (operation)));
9439   target = args[XINT (target_idx) + 1];
9440   if (!(STRINGP (target)
9441         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9442             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9443         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9444     error ("Invalid %dth argument", XINT (target_idx) + 1);
       /* For (FILENAME . BUFFER), only FILENAME is matched against the
          alist.  */
9445   if (CONSP (target))
9446     target = XCAR (target);
9447
       /* Pick the alist that corresponds to the kind of operation.  */
9448   chain = ((EQ (operation, Qinsert_file_contents)
9449             || EQ (operation, Qwrite_region))
9450            ? Vfile_coding_system_alist
9451            : (EQ (operation, Qopen_network_stream)
9452               ? Vnetwork_coding_system_alist
9453               : Vprocess_coding_system_alist));
9454   if (NILP (chain))
9455     return Qnil;
9456
       /* Only the first matching element is considered: every path
          through the body of the `if' below returns.  */
9457   for (; CONSP (chain); chain = XCDR (chain))
9458     {
9459       Lisp_Object elt;
9460
9461       elt = XCAR (chain);
           /* A string key is matched as a pattern against a string
              target; an integer target is compared to the key with EQ.  */
9462       if (CONSP (elt)
9463           && ((STRINGP (target)
9464                && STRINGP (XCAR (elt))
9465                && fast_string_match (XCAR (elt), target) >= 0)
9466               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9467         {
9468           val = XCDR (elt);
9469           /* Here, if VAL is both a valid coding system and a valid
9470              function symbol, we return VAL as a coding system.  */
9471           if (CONSP (val))
9472             return val;
9473           if (! SYMBOLP (val))
9474             return Qnil;
9475           if (! NILP (Fcoding_system_p (val)))
9476             return Fcons (val, val);
9477           if (! NILP (Ffboundp (val)))
9478             {
9479               /* We use call1 rather than safe_call1
9480                  so as to get bug reports about functions called here
9481                  which don't handle the current interface.  */
9482               val = call1 (val, Flist (nargs, args));
9483               if (CONSP (val))
9484                 return val;
9485               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9486                 return Fcons (val, val);
9487             }
9488           return Qnil;
9489         }
9490     }
9491   return Qnil;
9492 }
9493
9494 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9495        Sset_coding_system_priority, 0, MANY, 0,
9496        doc: /* Assign higher priority to the coding systems given as arguments.
9497 If multiple coding systems belong to the same category,
9498 all but the first one are ignored.
9499
9500 usage: (set-coding-system-priority &rest coding-systems)  */)
9501      (nargs, args)
9502      int nargs;
9503      Lisp_Object *args;
9504 {
9505   int i, j;
       /* changed[CAT] is nonzero once category CAT has been given a
          priority by one of the arguments; PRIORITIES is the new order
          being built.  */
9506   int changed[coding_category_max];
9507   enum coding_category priorities[coding_category_max];
9508
9509   bzero (changed, sizeof changed);
9510
       /* First pass: the categories of the arguments, in argument order,
          take the top J slots.  */
9511   for (i = j = 0; i < nargs; i++)
9512     {
9513       enum coding_category category;
9514       Lisp_Object spec, attrs;
9515
9516       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9517       attrs = AREF (spec, 0);
9518       category = XINT (CODING_ATTR_CATEGORY (attrs));
9519       if (changed[category])
9520         /* Ignore this coding system because a coding system of the
9521            same category already had a higher priority.  */
9522         continue;
9523       changed[category] = 1;
9524       priorities[j++] = category;
           /* Re-run the setup only if the category already has a coding
              system (id >= 0) and it is a different one.  */
9525       if (coding_categories[category].id >= 0
9526           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9527         setup_coding_system (args[i], &coding_categories[category]);
9528       Fset (AREF (Vcoding_category_table, category), args[i]);
9529     }
9530
9531   /* Now we have decided top J priorities.  Reflect the order of the
9532      original priorities to the remaining priorities.  */
9533
       /* I walks the slots still to fill; J walks the old order, skipping
          categories already placed.  Running out of old entries would
          mean the bookkeeping is inconsistent, hence the abort.  */
9534   for (i = j, j = 0; i < coding_category_max; i++, j++)
9535     {
9536       while (j < coding_category_max
9537              && changed[coding_priorities[j]])
9538         j++;
9539       if (j == coding_category_max)
9540         abort ();
9541       priorities[i] = coding_priorities[j];
9542     }
9543
9544   bcopy (priorities, coding_priorities, sizeof priorities);
9545
9546   /* Update `coding-category-list'.  */
       /* Consing from the lowest priority upward leaves the highest
          priority category at the head of the list.  */
9547   Vcoding_category_list = Qnil;
9548   for (i = coding_category_max - 1; i >= 0; i--)
9549     Vcoding_category_list
9550       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9551                Vcoding_category_list);
9552
9553   return Qnil;
9554 }
9555
9556 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9557        Scoding_system_priority_list, 0, 1, 0,
9558        doc: /* Return a list of coding systems ordered by their priorities.
9559 The list contains a subset of coding systems; i.e. coding systems
9560 assigned to each coding category (see `coding-category-list').
9561
9562 HIGHESTP non-nil means just return the highest priority one.  */)
9563      (highestp)
9564      Lisp_Object highestp;
9565 {
9566   int i;
9567   Lisp_Object val;
9568
       /* Walk the categories from highest to lowest priority.  */
9569   for (i = 0, val = Qnil; i < coding_category_max; i++)
9570     {
9571       enum coding_category category = coding_priorities[i];
9572       int id = coding_categories[category].id;
9573       Lisp_Object attrs;
9574
           /* A negative id means the category has no coding system.  */
9575       if (id < 0)
9576         continue;
9577       attrs = CODING_ID_ATTRS (id);
           /* With HIGHESTP, the first category that has a coding system
              decides the result.  */
9578       if (! NILP (highestp))
9579         return CODING_ATTR_BASE_NAME (attrs);
9580       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9581     }
       /* VAL was built by consing at the front, so reverse it to get
          priority order.  This is nil when no category has a coding
          system, whether or not HIGHESTP was given.  */
9582   return Fnreverse (val);
9583 }
9584
9585 static char *suffixes[] = { "-unix", "-dos", "-mac" };
     /* Suffixes appended to a base name by make_subsidiaries, in the
        order of the elements of the vector it returns.  */
9586
9587 static Lisp_Object
9588 make_subsidiaries (base)
9589      Lisp_Object base;
9590 {
       /* Return a vector of three symbols whose names are the name of
          the symbol BASE followed by each entry of `suffixes'.  */
9591   Lisp_Object subsidiaries;
9592   int base_name_len = SBYTES (SYMBOL_NAME (base));
       /* 6 bytes of slack hold the longest suffix ("-unix", 5 bytes)
          plus its terminating NUL.  */
9593   char *buf = (char *) alloca (base_name_len + 6);
9594   int i;
9595
9596   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
9597   subsidiaries = Fmake_vector (make_number (3), Qnil);
9598   for (i = 0; i < 3; i++)
9599     {
           /* Overwrite the previous suffix in place; the `+ 1' copies
              the terminating NUL as well.  */
9600       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9601       ASET (subsidiaries, i, intern (buf));
9602     }
9603   return subsidiaries;
9604 }
9605
9606
9607 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9608        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9609        doc: /* For internal use only.
9610 usage: (define-coding-system-internal ...)  */)
9611      (nargs, args)
9612      int nargs;
9613      Lisp_Object *args;
9614 {
9615   Lisp_Object name;
9616   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9617   Lisp_Object attrs;            /* Vector of attributes.  */
9618   Lisp_Object eol_type;
9619   Lisp_Object aliases;
9620   Lisp_Object coding_type, charset_list, safe_charsets;
9621   enum coding_category category;
9622   Lisp_Object tail, val;
9623   int max_charset_id = 0;
9624   int i;
9625
9626   if (nargs < coding_arg_max)
9627     goto short_args;
9628
9629   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9630
9631   name = args[coding_arg_name];
9632   CHECK_SYMBOL (name);
9633   CODING_ATTR_BASE_NAME (attrs) = name;
9634
9635   val = args[coding_arg_mnemonic];
9636   if (! STRINGP (val))
9637     CHECK_CHARACTER (val);
9638   CODING_ATTR_MNEMONIC (attrs) = val;
9639
9640   coding_type = args[coding_arg_coding_type];
9641   CHECK_SYMBOL (coding_type);
9642   CODING_ATTR_TYPE (attrs) = coding_type;
9643
9644   charset_list = args[coding_arg_charset_list];
9645   if (SYMBOLP (charset_list))
9646     {
9647       if (EQ (charset_list, Qiso_2022))
9648         {
9649           if (! EQ (coding_type, Qiso_2022))
9650             error ("Invalid charset-list");
9651           charset_list = Viso_2022_charset_list;
9652         }
9653       else if (EQ (charset_list, Qemacs_mule))
9654         {
9655           if (! EQ (coding_type, Qemacs_mule))
9656             error ("Invalid charset-list");
9657           charset_list = Vemacs_mule_charset_list;
9658         }
9659       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9660         if (max_charset_id < XFASTINT (XCAR (tail)))
9661           max_charset_id = XFASTINT (XCAR (tail));
9662     }
9663   else
9664     {
9665       charset_list = Fcopy_sequence (charset_list);
9666       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9667         {
9668           struct charset *charset;
9669
9670           val = XCAR (tail);
9671           CHECK_CHARSET_GET_CHARSET (val, charset);
9672           if (EQ (coding_type, Qiso_2022)
9673               ? CHARSET_ISO_FINAL (charset) < 0
9674               : EQ (coding_type, Qemacs_mule)
9675               ? CHARSET_EMACS_MULE_ID (charset) < 0
9676               : 0)
9677             error ("Can't handle charset `%s'",
9678                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9679
9680           XSETCAR (tail, make_number (charset->id));
9681           if (max_charset_id < charset->id)
9682             max_charset_id = charset->id;
9683         }
9684     }
9685   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9686
9687   safe_charsets = make_uninit_string (max_charset_id + 1);
9688   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9689   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9690     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9691   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9692
9693   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9694
9695   val = args[coding_arg_decode_translation_table];
9696   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9697     CHECK_SYMBOL (val);
9698   CODING_ATTR_DECODE_TBL (attrs) = val;
9699
9700   val = args[coding_arg_encode_translation_table];
9701   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9702     CHECK_SYMBOL (val);
9703   CODING_ATTR_ENCODE_TBL (attrs) = val;
9704
9705   val = args[coding_arg_post_read_conversion];
9706   CHECK_SYMBOL (val);
9707   CODING_ATTR_POST_READ (attrs) = val;
9708
9709   val = args[coding_arg_pre_write_conversion];
9710   CHECK_SYMBOL (val);
9711   CODING_ATTR_PRE_WRITE (attrs) = val;
9712
9713   val = args[coding_arg_default_char];
9714   if (NILP (val))
9715     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9716   else
9717     {
9718       CHECK_CHARACTER (val);
9719       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9720     }
9721
9722   val = args[coding_arg_for_unibyte];
9723   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9724
9725   val = args[coding_arg_plist];
9726   CHECK_LIST (val);
9727   CODING_ATTR_PLIST (attrs) = val;
9728
9729   if (EQ (coding_type, Qcharset))
9730     {
9731       /* Generate a lisp vector of 256 elements.  Each element is nil,
9732          integer, or a list of charset IDs.
9733
9734          If Nth element is nil, the byte code N is invalid in this
9735          coding system.
9736
9737          If Nth element is a number NUM, N is the first byte of a
9738          charset whose ID is NUM.
9739
9740          If Nth element is a list of charset IDs, N is the first byte
9741          of one of them.  The list is sorted by dimensions of the
         charsets.  A charset of smaller dimension comes first. */
9743       val = Fmake_vector (make_number (256), Qnil);
9744
9745       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9746         {
9747           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9748           int dim = CHARSET_DIMENSION (charset);
9749           int idx = (dim - 1) * 4;
9750
9751           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9752             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9753
9754           for (i = charset->code_space[idx];
9755                i <= charset->code_space[idx + 1]; i++)
9756             {
9757               Lisp_Object tmp, tmp2;
9758               int dim2;
9759
9760               tmp = AREF (val, i);
9761               if (NILP (tmp))
9762                 tmp = XCAR (tail);
9763               else if (NUMBERP (tmp))
9764                 {
9765                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9766                   if (dim < dim2)
9767                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9768                   else
9769                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9770                 }
9771               else
9772                 {
9773                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9774                     {
9775                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9776                       if (dim < dim2)
9777                         break;
9778                     }
9779                   if (NILP (tmp2))
9780                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9781                   else
9782                     {
9783                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9784                       XSETCAR (tmp2, XCAR (tail));
9785                     }
9786                 }
9787               ASET (val, i, tmp);
9788             }
9789         }
9790       ASET (attrs, coding_attr_charset_valids, val);
9791       category = coding_category_charset;
9792     }
9793   else if (EQ (coding_type, Qccl))
9794     {
9795       Lisp_Object valids;
9796
9797       if (nargs < coding_arg_ccl_max)
9798         goto short_args;
9799
9800       val = args[coding_arg_ccl_decoder];
9801       CHECK_CCL_PROGRAM (val);
9802       if (VECTORP (val))
9803         val = Fcopy_sequence (val);
9804       ASET (attrs, coding_attr_ccl_decoder, val);
9805
9806       val = args[coding_arg_ccl_encoder];
9807       CHECK_CCL_PROGRAM (val);
9808       if (VECTORP (val))
9809         val = Fcopy_sequence (val);
9810       ASET (attrs, coding_attr_ccl_encoder, val);
9811
9812       val = args[coding_arg_ccl_valids];
9813       valids = Fmake_string (make_number (256), make_number (0));
9814       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9815         {
9816           int from, to;
9817
9818           val = Fcar (tail);
9819           if (INTEGERP (val))
9820             {
9821               from = to = XINT (val);
9822               if (from < 0 || from > 255)
9823                 args_out_of_range_3 (val, make_number (0), make_number (255));
9824             }
9825           else
9826             {
9827               CHECK_CONS (val);
9828               CHECK_NATNUM_CAR (val);
9829               CHECK_NATNUM_CDR (val);
9830               from = XINT (XCAR (val));
9831               if (from > 255)
9832                 args_out_of_range_3 (XCAR (val),
9833                                      make_number (0), make_number (255));
9834               to = XINT (XCDR (val));
9835               if (to < from || to > 255)
9836                 args_out_of_range_3 (XCDR (val),
9837                                      XCAR (val), make_number (255));
9838             }
9839           for (i = from; i <= to; i++)
9840             SSET (valids, i, 1);
9841         }
9842       ASET (attrs, coding_attr_ccl_valids, valids);
9843
9844       category = coding_category_ccl;
9845     }
9846   else if (EQ (coding_type, Qutf_16))
9847     {
9848       Lisp_Object bom, endian;
9849
9850       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9851
9852       if (nargs < coding_arg_utf16_max)
9853         goto short_args;
9854
9855       bom = args[coding_arg_utf16_bom];
9856       if (! NILP (bom) && ! EQ (bom, Qt))
9857         {
9858           CHECK_CONS (bom);
9859           val = XCAR (bom);
9860           CHECK_CODING_SYSTEM (val);
9861           val = XCDR (bom);
9862           CHECK_CODING_SYSTEM (val);
9863         }
9864       ASET (attrs, coding_attr_utf_bom, bom);
9865
9866       endian = args[coding_arg_utf16_endian];
9867       CHECK_SYMBOL (endian);
9868       if (NILP (endian))
9869         endian = Qbig;
9870       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9871         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9872       ASET (attrs, coding_attr_utf_16_endian, endian);
9873
9874       category = (CONSP (bom)
9875                   ? coding_category_utf_16_auto
9876                   : NILP (bom)
9877                   ? (EQ (endian, Qbig)
9878                      ? coding_category_utf_16_be_nosig
9879                      : coding_category_utf_16_le_nosig)
9880                   : (EQ (endian, Qbig)
9881                      ? coding_category_utf_16_be
9882                      : coding_category_utf_16_le));
9883     }
9884   else if (EQ (coding_type, Qiso_2022))
9885     {
9886       Lisp_Object initial, reg_usage, request, flags;
9887       int i;
9888
9889       if (nargs < coding_arg_iso2022_max)
9890         goto short_args;
9891
9892       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9893       CHECK_VECTOR (initial);
9894       for (i = 0; i < 4; i++)
9895         {
9896           val = Faref (initial, make_number (i));
9897           if (! NILP (val))
9898             {
9899               struct charset *charset;
9900
9901               CHECK_CHARSET_GET_CHARSET (val, charset);
9902               ASET (initial, i, make_number (CHARSET_ID (charset)));
9903               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9904                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9905             }
9906           else
9907             ASET (initial, i, make_number (-1));
9908         }
9909
9910       reg_usage = args[coding_arg_iso2022_reg_usage];
9911       CHECK_CONS (reg_usage);
9912       CHECK_NUMBER_CAR (reg_usage);
9913       CHECK_NUMBER_CDR (reg_usage);
9914
9915       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9916       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9917         {
9918           int id;
9919           Lisp_Object tmp;
9920
9921           val = Fcar (tail);
9922           CHECK_CONS (val);
9923           tmp = XCAR (val);
9924           CHECK_CHARSET_GET_ID (tmp, id);
9925           CHECK_NATNUM_CDR (val);
9926           if (XINT (XCDR (val)) >= 4)
9927             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9928           XSETCAR (val, make_number (id));
9929         }
9930
9931       flags = args[coding_arg_iso2022_flags];
9932       CHECK_NATNUM (flags);
9933       i = XINT (flags);
9934       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9935         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9936
9937       ASET (attrs, coding_attr_iso_initial, initial);
9938       ASET (attrs, coding_attr_iso_usage, reg_usage);
9939       ASET (attrs, coding_attr_iso_request, request);
9940       ASET (attrs, coding_attr_iso_flags, flags);
9941       setup_iso_safe_charsets (attrs);
9942
9943       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9944         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9945                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9946                     ? coding_category_iso_7_else
9947                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9948                     ? coding_category_iso_7
9949                     : coding_category_iso_7_tight);
9950       else
9951         {
9952           int id = XINT (AREF (initial, 1));
9953
9954           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9955                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9956                        || id < 0)
9957                       ? coding_category_iso_8_else
9958                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9959                       ? coding_category_iso_8_1
9960                       : coding_category_iso_8_2);
9961         }
9962       if (category != coding_category_iso_8_1
9963           && category != coding_category_iso_8_2)
9964         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9965     }
9966   else if (EQ (coding_type, Qemacs_mule))
9967     {
9968       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9969         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9970       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9971       category = coding_category_emacs_mule;
9972     }
9973   else if (EQ (coding_type, Qshift_jis))
9974     {
9975
9976       struct charset *charset;
9977
9978       if (XINT (Flength (charset_list)) != 3
9979           && XINT (Flength (charset_list)) != 4)
9980         error ("There should be three or four charsets");
9981
9982       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9983       if (CHARSET_DIMENSION (charset) != 1)
9984         error ("Dimension of charset %s is not one",
9985                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9986       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9987         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9988
9989       charset_list = XCDR (charset_list);
9990       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9991       if (CHARSET_DIMENSION (charset) != 1)
9992         error ("Dimension of charset %s is not one",
9993                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9994
9995       charset_list = XCDR (charset_list);
9996       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9997       if (CHARSET_DIMENSION (charset) != 2)
9998         error ("Dimension of charset %s is not two",
9999                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10000
10001       charset_list = XCDR (charset_list);
10002       if (! NILP (charset_list))
10003         {
10004           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10005           if (CHARSET_DIMENSION (charset) != 2)
10006             error ("Dimension of charset %s is not two",
10007                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10008         }
10009
10010       category = coding_category_sjis;
10011       Vsjis_coding_system = name;
10012     }
10013   else if (EQ (coding_type, Qbig5))
10014     {
10015       struct charset *charset;
10016
10017       if (XINT (Flength (charset_list)) != 2)
10018         error ("There should be just two charsets");
10019
10020       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10021       if (CHARSET_DIMENSION (charset) != 1)
10022         error ("Dimension of charset %s is not one",
10023                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10024       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10025         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10026
10027       charset_list = XCDR (charset_list);
10028       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10029       if (CHARSET_DIMENSION (charset) != 2)
10030         error ("Dimension of charset %s is not two",
10031                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10032
10033       category = coding_category_big5;
10034       Vbig5_coding_system = name;
10035     }
10036   else if (EQ (coding_type, Qraw_text))
10037     {
10038       category = coding_category_raw_text;
10039       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10040     }
10041   else if (EQ (coding_type, Qutf_8))
10042     {
10043       Lisp_Object bom;
10044
10045       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10046
10047       if (nargs < coding_arg_utf8_max)
10048         goto short_args;
10049
10050       bom = args[coding_arg_utf8_bom];
10051       if (! NILP (bom) && ! EQ (bom, Qt))
10052         {
10053           CHECK_CONS (bom);
10054           val = XCAR (bom);
10055           CHECK_CODING_SYSTEM (val);
10056           val = XCDR (bom);
10057           CHECK_CODING_SYSTEM (val);
10058         }
10059       ASET (attrs, coding_attr_utf_bom, bom);
10060
10061       category = (CONSP (bom) ? coding_category_utf_8_auto
10062                   : NILP (bom) ? coding_category_utf_8_nosig
10063                   : coding_category_utf_8_sig);
10064     }
10065   else if (EQ (coding_type, Qundecided))
10066     category = coding_category_undecided;
10067   else
10068     error ("Invalid coding system type: %s",
10069            SDATA (SYMBOL_NAME (coding_type)));
10070
10071   CODING_ATTR_CATEGORY (attrs) = make_number (category);
10072   CODING_ATTR_PLIST (attrs)
10073     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10074                                 CODING_ATTR_PLIST (attrs)));
10075   CODING_ATTR_PLIST (attrs)
10076     = Fcons (QCascii_compatible_p,
10077              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10078                     CODING_ATTR_PLIST (attrs)));
10079
10080   eol_type = args[coding_arg_eol_type];
10081   if (! NILP (eol_type)
10082       && ! EQ (eol_type, Qunix)
10083       && ! EQ (eol_type, Qdos)
10084       && ! EQ (eol_type, Qmac))
10085     error ("Invalid eol-type");
10086
10087   aliases = Fcons (name, Qnil);
10088
10089   if (NILP (eol_type))
10090     {
10091       eol_type = make_subsidiaries (name);
10092       for (i = 0; i < 3; i++)
10093         {
10094           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10095
10096           this_name = AREF (eol_type, i);
10097           this_aliases = Fcons (this_name, Qnil);
10098           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10099           this_spec = Fmake_vector (make_number (3), attrs);
10100           ASET (this_spec, 1, this_aliases);
10101           ASET (this_spec, 2, this_eol_type);
10102           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10103           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10104           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10105           if (NILP (val))
10106             Vcoding_system_alist
10107               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10108                        Vcoding_system_alist);
10109         }
10110     }
10111
10112   spec_vec = Fmake_vector (make_number (3), attrs);
10113   ASET (spec_vec, 1, aliases);
10114   ASET (spec_vec, 2, eol_type);
10115
10116   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10117   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10118   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10119   if (NILP (val))
10120     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10121                                   Vcoding_system_alist);
10122
10123   {
10124     int id = coding_categories[category].id;
10125
10126     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10127       setup_coding_system (name, &coding_categories[category]);
10128   }
10129
10130   return Qnil;
10131
10132  short_args:
10133   return Fsignal (Qwrong_number_of_arguments,
10134                   Fcons (intern ("define-coding-system-internal"),
10135                          make_number (nargs)));
10136 }
10137
10138
10139 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10140        3, 3, 0,
10141        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10142   (coding_system, prop, val)
10143      Lisp_Object coding_system, prop, val;
10144 {
10145   Lisp_Object spec, attrs;
10146
10147   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10148   attrs = AREF (spec, 0);
10149   if (EQ (prop, QCmnemonic))
10150     {
10151       if (! STRINGP (val))
10152         CHECK_CHARACTER (val);
10153       CODING_ATTR_MNEMONIC (attrs) = val;
10154     }
10155   else if (EQ (prop, QCdefault_char))
10156     {
10157       if (NILP (val))
10158         val = make_number (' ');
10159       else
10160         CHECK_CHARACTER (val);
10161       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10162     }
10163   else if (EQ (prop, QCdecode_translation_table))
10164     {
10165       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10166         CHECK_SYMBOL (val);
10167       CODING_ATTR_DECODE_TBL (attrs) = val;
10168     }
10169   else if (EQ (prop, QCencode_translation_table))
10170     {
10171       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10172         CHECK_SYMBOL (val);
10173       CODING_ATTR_ENCODE_TBL (attrs) = val;
10174     }
10175   else if (EQ (prop, QCpost_read_conversion))
10176     {
10177       CHECK_SYMBOL (val);
10178       CODING_ATTR_POST_READ (attrs) = val;
10179     }
10180   else if (EQ (prop, QCpre_write_conversion))
10181     {
10182       CHECK_SYMBOL (val);
10183       CODING_ATTR_PRE_WRITE (attrs) = val;
10184     }
10185   else if (EQ (prop, QCascii_compatible_p))
10186     {
10187       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10188     }
10189
10190   CODING_ATTR_PLIST (attrs)
10191     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10192   return val;
10193 }
10194
10195
10196 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10197        Sdefine_coding_system_alias, 2, 2, 0,
10198        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10199      (alias, coding_system)
10200      Lisp_Object alias, coding_system;
10201 {
10202   Lisp_Object spec, aliases, eol_type, val;
10203
10204   CHECK_SYMBOL (alias);
10205   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10206   aliases = AREF (spec, 1);
10207   /* ALIASES should be a list of length more than zero, and the first
10208      element is a base coding system.  Append ALIAS at the tail of the
10209      list.  */
10210   while (!NILP (XCDR (aliases)))
10211     aliases = XCDR (aliases);
10212   XSETCDR (aliases, Fcons (alias, Qnil));
10213
10214   eol_type = AREF (spec, 2);
10215   if (VECTORP (eol_type))
10216     {
10217       Lisp_Object subsidiaries;
10218       int i;
10219
10220       subsidiaries = make_subsidiaries (alias);
10221       for (i = 0; i < 3; i++)
10222         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10223                                      AREF (eol_type, i));
10224     }
10225
10226   Fputhash (alias, spec, Vcoding_system_hash_table);
10227   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10228   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10229   if (NILP (val))
10230     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10231                                   Vcoding_system_alist);
10232
10233   return Qnil;
10234 }
10235
10236 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10237        1, 1, 0,
10238        doc: /* Return the base of CODING-SYSTEM.
10239 Any alias or subsidiary coding system is not a base coding system.  */)
10240   (coding_system)
10241      Lisp_Object coding_system;
10242 {
10243   Lisp_Object spec, attrs;
10244
10245   if (NILP (coding_system))
10246     return (Qno_conversion);
10247   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10248   attrs = AREF (spec, 0);
10249   return CODING_ATTR_BASE_NAME (attrs);
10250 }
10251
10252 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10253        1, 1, 0,
10254        doc: "Return the property list of CODING-SYSTEM.")
10255      (coding_system)
10256      Lisp_Object coding_system;
10257 {
10258   Lisp_Object spec, attrs;
10259
10260   if (NILP (coding_system))
10261     coding_system = Qno_conversion;
10262   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10263   attrs = AREF (spec, 0);
10264   return CODING_ATTR_PLIST (attrs);
10265 }
10266
10267
10268 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10269        1, 1, 0,
10270        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10271      (coding_system)
10272      Lisp_Object coding_system;
10273 {
10274   Lisp_Object spec;
10275
10276   if (NILP (coding_system))
10277     coding_system = Qno_conversion;
10278   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10279   return AREF (spec, 1);
10280 }
10281
10282 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10283        Scoding_system_eol_type, 1, 1, 0,
10284        doc: /* Return eol-type of CODING-SYSTEM.
10285 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10286
10287 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10288 and CR respectively.
10289
10290 A vector value indicates that a format of end-of-line should be
10291 detected automatically.  Nth element of the vector is the subsidiary
10292 coding system whose eol-type is N.  */)
10293      (coding_system)
10294      Lisp_Object coding_system;
10295 {
10296   Lisp_Object spec, eol_type;
10297   int n;
10298
10299   if (NILP (coding_system))
10300     coding_system = Qno_conversion;
10301   if (! CODING_SYSTEM_P (coding_system))
10302     return Qnil;
10303   spec = CODING_SYSTEM_SPEC (coding_system);
10304   eol_type = AREF (spec, 2);
10305   if (VECTORP (eol_type))
10306     return Fcopy_sequence (eol_type);
10307   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10308   return make_number (n);
10309 }
10310
10311 #endif /* emacs */
10312
10313 \f
/*** 12. Postamble ***/
10315
10316 void
10317 init_coding_once ()
10318 {
10319   int i;
10320
10321   for (i = 0; i < coding_category_max; i++)
10322     {
10323       coding_categories[i].id = -1;
10324       coding_priorities[i] = i;
10325     }
10326
10327   /* ISO2022 specific initialize routine.  */
10328   for (i = 0; i < 0x20; i++)
10329     iso_code_class[i] = ISO_control_0;
10330   for (i = 0x21; i < 0x7F; i++)
10331     iso_code_class[i] = ISO_graphic_plane_0;
10332   for (i = 0x80; i < 0xA0; i++)
10333     iso_code_class[i] = ISO_control_1;
10334   for (i = 0xA1; i < 0xFF; i++)
10335     iso_code_class[i] = ISO_graphic_plane_1;
10336   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10337   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10338   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10339   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10340   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10341   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10342   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10343   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10344   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10345
10346   for (i = 0; i < 256; i++)
10347     {
10348       emacs_mule_bytes[i] = 1;
10349     }
10350   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10351   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10352   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10353   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10354 }
10355
10356 #ifdef emacs
10357
10358 void
10359 syms_of_coding ()
10360 {
10361   staticpro (&Vcoding_system_hash_table);
10362   {
10363     Lisp_Object args[2];
10364     args[0] = QCtest;
10365     args[1] = Qeq;
10366     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10367   }
10368
10369   staticpro (&Vsjis_coding_system);
10370   Vsjis_coding_system = Qnil;
10371
10372   staticpro (&Vbig5_coding_system);
10373   Vbig5_coding_system = Qnil;
10374
10375   staticpro (&Vcode_conversion_reused_workbuf);
10376   Vcode_conversion_reused_workbuf = Qnil;
10377
10378   staticpro (&Vcode_conversion_workbuf_name);
10379   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
10380
10381   reused_workbuf_in_use = 0;
10382
10383   DEFSYM (Qcharset, "charset");
10384   DEFSYM (Qtarget_idx, "target-idx");
10385   DEFSYM (Qcoding_system_history, "coding-system-history");
10386   Fset (Qcoding_system_history, Qnil);
10387
10388   /* Target FILENAME is the first argument.  */
10389   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10390   /* Target FILENAME is the third argument.  */
10391   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10392
10393   DEFSYM (Qcall_process, "call-process");
10394   /* Target PROGRAM is the first argument.  */
10395   Fput (Qcall_process, Qtarget_idx, make_number (0));
10396
10397   DEFSYM (Qcall_process_region, "call-process-region");
10398   /* Target PROGRAM is the third argument.  */
10399   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10400
10401   DEFSYM (Qstart_process, "start-process");
10402   /* Target PROGRAM is the third argument.  */
10403   Fput (Qstart_process, Qtarget_idx, make_number (2));
10404
10405   DEFSYM (Qopen_network_stream, "open-network-stream");
10406   /* Target SERVICE is the fourth argument.  */
10407   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10408
10409   DEFSYM (Qcoding_system, "coding-system");
10410   DEFSYM (Qcoding_aliases, "coding-aliases");
10411
10412   DEFSYM (Qeol_type, "eol-type");
10413   DEFSYM (Qunix, "unix");
10414   DEFSYM (Qdos, "dos");
10415
10416   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10417   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10418   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10419   DEFSYM (Qdefault_char, "default-char");
10420   DEFSYM (Qundecided, "undecided");
10421   DEFSYM (Qno_conversion, "no-conversion");
10422   DEFSYM (Qraw_text, "raw-text");
10423
10424   DEFSYM (Qiso_2022, "iso-2022");
10425
10426   DEFSYM (Qutf_8, "utf-8");
10427   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10428
10429   DEFSYM (Qutf_16, "utf-16");
10430   DEFSYM (Qbig, "big");
10431   DEFSYM (Qlittle, "little");
10432
10433   DEFSYM (Qshift_jis, "shift-jis");
10434   DEFSYM (Qbig5, "big5");
10435
10436   DEFSYM (Qcoding_system_p, "coding-system-p");
10437
10438   DEFSYM (Qcoding_system_error, "coding-system-error");
10439   Fput (Qcoding_system_error, Qerror_conditions,
10440         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
10441   Fput (Qcoding_system_error, Qerror_message,
10442         build_string ("Invalid coding system"));
10443
10444   /* Intern this now in case it isn't already done.
10445      Setting this variable twice is harmless.
10446      But don't staticpro it here--that is done in alloc.c.  */
10447   Qchar_table_extra_slots = intern ("char-table-extra-slots");
10448
10449   DEFSYM (Qtranslation_table, "translation-table");
10450   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10451   DEFSYM (Qtranslation_table_id, "translation-table-id");
10452   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10453   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10454
10455   DEFSYM (Qvalid_codes, "valid-codes");
10456
10457   DEFSYM (Qemacs_mule, "emacs-mule");
10458
10459   DEFSYM (QCcategory, ":category");
10460   DEFSYM (QCmnemonic, ":mnemonic");
10461   DEFSYM (QCdefault_char, ":default-char");
10462   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10463   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10464   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10465   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10466   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10467
10468   Vcoding_category_table
10469     = Fmake_vector (make_number (coding_category_max), Qnil);
10470   staticpro (&Vcoding_category_table);
10471   /* Followings are target of code detection.  */
10472   ASET (Vcoding_category_table, coding_category_iso_7,
10473         intern ("coding-category-iso-7"));
10474   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10475         intern ("coding-category-iso-7-tight"));
10476   ASET (Vcoding_category_table, coding_category_iso_8_1,
10477         intern ("coding-category-iso-8-1"));
10478   ASET (Vcoding_category_table, coding_category_iso_8_2,
10479         intern ("coding-category-iso-8-2"));
10480   ASET (Vcoding_category_table, coding_category_iso_7_else,
10481         intern ("coding-category-iso-7-else"));
10482   ASET (Vcoding_category_table, coding_category_iso_8_else,
10483         intern ("coding-category-iso-8-else"));
10484   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10485         intern ("coding-category-utf-8-auto"));
10486   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10487         intern ("coding-category-utf-8"));
10488   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10489         intern ("coding-category-utf-8-sig"));
10490   ASET (Vcoding_category_table, coding_category_utf_16_be,
10491         intern ("coding-category-utf-16-be"));
10492   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10493         intern ("coding-category-utf-16-auto"));
10494   ASET (Vcoding_category_table, coding_category_utf_16_le,
10495         intern ("coding-category-utf-16-le"));
10496   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10497         intern ("coding-category-utf-16-be-nosig"));
10498   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10499         intern ("coding-category-utf-16-le-nosig"));
10500   ASET (Vcoding_category_table, coding_category_charset,
10501         intern ("coding-category-charset"));
10502   ASET (Vcoding_category_table, coding_category_sjis,
10503         intern ("coding-category-sjis"));
10504   ASET (Vcoding_category_table, coding_category_big5,
10505         intern ("coding-category-big5"));
10506   ASET (Vcoding_category_table, coding_category_ccl,
10507         intern ("coding-category-ccl"));
10508   ASET (Vcoding_category_table, coding_category_emacs_mule,
10509         intern ("coding-category-emacs-mule"));
10510   /* Followings are NOT target of code detection.  */
10511   ASET (Vcoding_category_table, coding_category_raw_text,
10512         intern ("coding-category-raw-text"));
10513   ASET (Vcoding_category_table, coding_category_undecided,
10514         intern ("coding-category-undecided"));
10515
10516   DEFSYM (Qinsufficient_source, "insufficient-source");
10517   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10518   DEFSYM (Qinvalid_source, "invalid-source");
10519   DEFSYM (Qinterrupted, "interrupted");
10520   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10521   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10522
10523   defsubr (&Scoding_system_p);
10524   defsubr (&Sread_coding_system);
10525   defsubr (&Sread_non_nil_coding_system);
10526   defsubr (&Scheck_coding_system);
10527   defsubr (&Sdetect_coding_region);
10528   defsubr (&Sdetect_coding_string);
10529   defsubr (&Sfind_coding_systems_region_internal);
10530   defsubr (&Sunencodable_char_position);
10531   defsubr (&Scheck_coding_systems_region);
10532   defsubr (&Sdecode_coding_region);
10533   defsubr (&Sencode_coding_region);
10534   defsubr (&Sdecode_coding_string);
10535   defsubr (&Sencode_coding_string);
10536   defsubr (&Sdecode_sjis_char);
10537   defsubr (&Sencode_sjis_char);
10538   defsubr (&Sdecode_big5_char);
10539   defsubr (&Sencode_big5_char);
10540   defsubr (&Sset_terminal_coding_system_internal);
10541   defsubr (&Sset_safe_terminal_coding_system_internal);
10542   defsubr (&Sterminal_coding_system);
10543   defsubr (&Sset_keyboard_coding_system_internal);
10544   defsubr (&Skeyboard_coding_system);
10545   defsubr (&Sfind_operation_coding_system);
10546   defsubr (&Sset_coding_system_priority);
10547   defsubr (&Sdefine_coding_system_internal);
10548   defsubr (&Sdefine_coding_system_alias);
10549   defsubr (&Scoding_system_put);
10550   defsubr (&Scoding_system_base);
10551   defsubr (&Scoding_system_plist);
10552   defsubr (&Scoding_system_aliases);
10553   defsubr (&Scoding_system_eol_type);
10554   defsubr (&Scoding_system_priority_list);
10555
10556   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10557                doc: /* List of coding systems.
10558
10559 Do not alter the value of this variable manually.  This variable should be
10560 updated by the functions `define-coding-system' and
10561 `define-coding-system-alias'.  */);
10562   Vcoding_system_list = Qnil;
10563
10564   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10565                doc: /* Alist of coding system names.
10566 Each element is a one-element list containing a coding system name.
10567 This variable is given to `completing-read' as COLLECTION argument.
10568
10569 Do not alter the value of this variable manually.  This variable should be
10570 updated by the functions `define-coding-system' and
10571 `define-coding-system-alias'.  */);
10572   Vcoding_system_alist = Qnil;
10573
10574   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10575                doc: /* List of coding-categories (symbols) ordered by priority.
10576
10577 On detecting a coding system, Emacs tries code detection algorithms
10578 associated with each coding-category one by one in this order.  When
10579 one algorithm agrees with a byte sequence of source text, the coding
10580 system bound to the corresponding coding-category is selected.
10581
10582 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10583   {
10584     int i;
10585
10586     Vcoding_category_list = Qnil;
10587     for (i = coding_category_max - 1; i >= 0; i--)
10588       Vcoding_category_list
10589         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10590                  Vcoding_category_list);
10591   }
10592
10593   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10594                doc: /* Specify the coding system for read operations.
10595 It is useful to bind this variable with `let', but do not set it globally.
10596 If the value is a coding system, it is used for decoding on read operation.
10597 If not, an appropriate element is used from one of the coding system alists.
10598 There are three such tables: `file-coding-system-alist',
10599 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10600   Vcoding_system_for_read = Qnil;
10601
10602   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10603                doc: /* Specify the coding system for write operations.
10604 Programs bind this variable with `let', but you should not set it globally.
10605 If the value is a coding system, it is used for encoding of output,
10606 when writing it to a file and when sending it to a file or subprocess.
10607
10608 If this does not specify a coding system, an appropriate element
10609 is used from one of the coding system alists.
10610 There are three such tables: `file-coding-system-alist',
10611 `process-coding-system-alist', and `network-coding-system-alist'.
10612 For output to files, if the above procedure does not specify a coding system,
10613 the value of `buffer-file-coding-system' is used.  */);
10614   Vcoding_system_for_write = Qnil;
10615
10616   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10617                doc: /*
10618 Coding system used in the latest file or process I/O.  */);
10619   Vlast_coding_system_used = Qnil;
10620
10621   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10622                doc: /*
10623 Error status of the last code conversion.
10624
10625 When an error was detected in the last code conversion, this variable
10626 is set to one of the following symbols.
10627   `insufficient-source'
10628   `inconsistent-eol'
10629   `invalid-source'
10630   `interrupted'
10631   `insufficient-memory'
10632 When no error was detected, the value doesn't change.  So, to check
10633 the error status of a code conversion by this variable, you must
10634 explicitly set this variable to nil before performing code
10635 conversion.  */);
10636   Vlast_code_conversion_error = Qnil;
10637
10638   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10639                doc: /*
10640 *Non-nil means always inhibit code conversion of end-of-line format.
10641 See info node `Coding Systems' and info node `Text and Binary' concerning
10642 such conversion.  */);
10643   inhibit_eol_conversion = 0;
10644
10645   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10646                doc: /*
10647 Non-nil means process buffer inherits coding system of process output.
10648 Bind it to t if the process output is to be treated as if it were a file
10649 read from some filesystem.  */);
10650   inherit_process_coding_system = 0;
10651
10652   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10653                doc: /*
10654 Alist to decide a coding system to use for a file I/O operation.
10655 The format is ((PATTERN . VAL) ...),
10656 where PATTERN is a regular expression matching a file name,
10657 VAL is a coding system, a cons of coding systems, or a function symbol.
10658 If VAL is a coding system, it is used for both decoding and encoding
10659 the file contents.
10660 If VAL is a cons of coding systems, the car part is used for decoding,
10661 and the cdr part is used for encoding.
10662 If VAL is a function symbol, the function must return a coding system
10663 or a cons of coding systems which are used as above.  The function is
10664 called with an argument that is a list of the arguments with which
10665 `find-operation-coding-system' was called.  If the function can't decide
10666 a coding system, it can return `undecided' so that the normal
10667 code-detection is performed.
10668
10669 See also the function `find-operation-coding-system'
10670 and the variable `auto-coding-alist'.  */);
10671   Vfile_coding_system_alist = Qnil;
10672
10673   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10674                doc: /*
10675 Alist to decide a coding system to use for a process I/O operation.
10676 The format is ((PATTERN . VAL) ...),
10677 where PATTERN is a regular expression matching a program name,
10678 VAL is a coding system, a cons of coding systems, or a function symbol.
10679 If VAL is a coding system, it is used for both decoding what is received
10680 from the program and encoding what is sent to the program.
10681 If VAL is a cons of coding systems, the car part is used for decoding,
10682 and the cdr part is used for encoding.
10683 If VAL is a function symbol, the function must return a coding system
10684 or a cons of coding systems which are used as above.
10685
10686 See also the function `find-operation-coding-system'.  */);
10687   Vprocess_coding_system_alist = Qnil;
10688
10689   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10690                doc: /*
10691 Alist to decide a coding system to use for a network I/O operation.
10692 The format is ((PATTERN . VAL) ...),
10693 where PATTERN is a regular expression matching a network service name
10694 or is a port number to connect to,
10695 VAL is a coding system, a cons of coding systems, or a function symbol.
10696 If VAL is a coding system, it is used for both decoding what is received
10697 from the network stream and encoding what is sent to the network stream.
10698 If VAL is a cons of coding systems, the car part is used for decoding,
10699 and the cdr part is used for encoding.
10700 If VAL is a function symbol, the function must return a coding system
10701 or a cons of coding systems which are used as above.
10702
10703 See also the function `find-operation-coding-system'.  */);
10704   Vnetwork_coding_system_alist = Qnil;
10705
10706   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10707                doc: /* Coding system to use with system messages.
10708 Also used for decoding keyboard input on X Window system.  */);
10709   Vlocale_coding_system = Qnil;
10710
10711   /* The eol mnemonics are reset in startup.el system-dependently.  */
10712   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10713                doc: /*
10714 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10715   eol_mnemonic_unix = build_string (":");
10716
10717   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10718                doc: /*
10719 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10720   eol_mnemonic_dos = build_string ("\\");
10721
10722   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10723                doc: /*
10724 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10725   eol_mnemonic_mac = build_string ("/");
10726
10727   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10728                doc: /*
10729 *String displayed in mode line when end-of-line format is not yet determined.  */);
10730   eol_mnemonic_undecided = build_string (":");
10731
10732   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10733                doc: /*
10734 *Non-nil enables character translation while encoding and decoding.  */);
10735   Venable_character_translation = Qt;
10736
10737   DEFVAR_LISP ("standard-translation-table-for-decode",
10738                &Vstandard_translation_table_for_decode,
10739                doc: /* Table for translating characters while decoding.  */);
10740   Vstandard_translation_table_for_decode = Qnil;
10741
10742   DEFVAR_LISP ("standard-translation-table-for-encode",
10743                &Vstandard_translation_table_for_encode,
10744                doc: /* Table for translating characters while encoding.  */);
10745   Vstandard_translation_table_for_encode = Qnil;
10746
10747   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10748                doc: /* Alist of charsets vs revision numbers.
10749 While encoding, if a charset (car part of an element) is found,
10750 designate it with the escape sequence identifying revision (cdr part
10751 of the element).  */);
10752   Vcharset_revision_table = Qnil;
10753
10754   DEFVAR_LISP ("default-process-coding-system",
10755                &Vdefault_process_coding_system,
10756                doc: /* Cons of coding systems used for process I/O by default.
10757 The car part is used for decoding a process output,
10758 the cdr part is used for encoding a text to be sent to a process.  */);
10759   Vdefault_process_coding_system = Qnil;
10760
10761   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10762                doc: /*
10763 Table of extra Latin codes in the range 128..159 (inclusive).
10764 This is a vector of length 256.
10765 If Nth element is non-nil, the existence of code N in a file
10766 \(or output of subprocess) doesn't prevent it from being detected as
10767 a coding system of ISO 2022 variant which has a flag
10768 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10769 or reading output of a subprocess.
10770 Only 128th through 159th elements have a meaning.  */);
10771   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10772
10773   DEFVAR_LISP ("select-safe-coding-system-function",
10774                &Vselect_safe_coding_system_function,
10775                doc: /*
10776 Function to call to select safe coding system for encoding a text.
10777
10778 If set, this function is called to force a user to select a proper
10779 coding system which can encode the text in the case that a default
10780 coding system used in each operation can't encode the text.  The
10781 function should take care that the buffer is not modified while
10782 the coding system is being selected.
10783
10784 The default value is `select-safe-coding-system' (which see).  */);
10785   Vselect_safe_coding_system_function = Qnil;
10786
10787   DEFVAR_BOOL ("coding-system-require-warning",
10788                &coding_system_require_warning,
10789                doc: /* Internal use only.
10790 If non-nil, on writing a file, `select-safe-coding-system-function' is
10791 called even if `coding-system-for-write' is non-nil.  The command
10792 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10793   coding_system_require_warning = 0;
10794
10795
10796   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10797                &inhibit_iso_escape_detection,
10798                doc: /*
10799 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10800
10801 When Emacs reads text, it tries to detect how the text is encoded.
10802 This code detection is sensitive to escape sequences.  If Emacs sees
10803 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10804 of the ISO2022 encodings, and decodes text by the corresponding coding
10805 system (e.g. `iso-2022-7bit').
10806
10807 However, there may be a case that you want to read escape sequences in
10808 a file as is.  In such a case, you can set this variable to non-nil.
10809 Then the code detection will ignore any escape sequences, and no text is
10810 detected as encoded in some ISO-2022 encoding.  The result is that all
10811 escape sequences become visible in a buffer.
10812
10813 The default value is nil, and it is strongly recommended not to change
10814 it.  That is because many Emacs Lisp source files that contain
10815 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10816 in Emacs's distribution, and they won't be decoded correctly on
10817 reading if you suppress escape sequence detection.
10818
10819 The other way to read escape sequences in a file without decoding is
10820 to explicitly specify some coding system that doesn't use ISO-2022
10821 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument].  */);
10822   inhibit_iso_escape_detection = 0;
10823
10824   DEFVAR_BOOL ("inhibit-null-byte-detection",
10825                &inhibit_null_byte_detection,
10826                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10827 By default, Emacs treats it as binary data, and does not attempt to
10828 decode it.  The effect is as if you specified `no-conversion' for
10829 reading that text.
10830
10831 Set this to non-nil when a regular text happens to include null bytes.
10832 Examples are Index nodes of Info files and null-byte delimited output
10833 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10834 decode text as usual.  */);
10835   inhibit_null_byte_detection = 0;
10836
10837   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10838                doc: /* Char table for translating self-inserting characters.
10839 This is applied to the result of input methods, not their input.
10840 See also `keyboard-translate-table'.
10841
10842 Use of this variable for character code unification was rendered
10843 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10844 internal character representation.  */);
10845     Vtranslation_table_for_input = Qnil;
10846
10847   {
10848     Lisp_Object args[coding_arg_max];
10849     Lisp_Object plist[16];
10850     int i;
10851
10852     for (i = 0; i < coding_arg_max; i++)
10853       args[i] = Qnil;
10854
10855     plist[0] = intern (":name");
10856     plist[1] = args[coding_arg_name] = Qno_conversion;
10857     plist[2] = intern (":mnemonic");
10858     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10859     plist[4] = intern (":coding-type");
10860     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10861     plist[6] = intern (":ascii-compatible-p");
10862     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10863     plist[8] = intern (":default-char");
10864     plist[9] = args[coding_arg_default_char] = make_number (0);
10865     plist[10] = intern (":for-unibyte");
10866     plist[11] = args[coding_arg_for_unibyte] = Qt;
10867     plist[12] = intern (":docstring");
10868     plist[13] = build_string ("Do no conversion.\n\
10869 \n\
10870 When you visit a file with this coding, the file is read into a\n\
10871 unibyte buffer as is, thus each byte of a file is treated as a\n\
10872 character.");
10873     plist[14] = intern (":eol-type");
10874     plist[15] = args[coding_arg_eol_type] = Qunix;
10875     args[coding_arg_plist] = Flist (16, plist);
10876     Fdefine_coding_system_internal (coding_arg_max, args);
10877
10878     plist[1] = args[coding_arg_name] = Qundecided;
10879     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10880     plist[5] = args[coding_arg_coding_type] = Qundecided;
10881     /* This is already set.
10882        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10883     plist[8] = intern (":charset-list");
10884     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10885     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10886     plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
10887     plist[15] = args[coding_arg_eol_type] = Qnil;
10888     args[coding_arg_plist] = Flist (16, plist);
10889     Fdefine_coding_system_internal (coding_arg_max, args);
10890   }
10891
10892   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10893
10894   {
10895     int i;
10896
10897     for (i = 0; i < coding_category_max; i++)
10898       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10899   }
10900 #if defined (MSDOS) || defined (WINDOWSNT)
10901   system_eol_type = Qdos;
10902 #else
10903   system_eol_type = Qunix;
10904 #endif
10905   staticpro (&system_eol_type);
10906 }
10907
10908 char *
10909 emacs_strerror (error_number)
10910      int error_number;
10911 {
10912   /* Return the strerror text for ERROR_NUMBER.  If
10913      `locale-coding-system' is non-nil, the text is first run through
10914      code_convert_string_norecord with that coding system, and the
10915      result then points into the converted Lisp string's data.  */
10916   Lisp_Object decoded;
10917   char *msg;
10918
10919   synchronize_system_messages_locale ();
10920   msg = strerror (error_number);
10921   if (NILP (Vlocale_coding_system))
10922     return msg;
10923   decoded = code_convert_string_norecord (build_string (msg),
10924                                           Vlocale_coding_system, 0);
10925   return (char *) SDATA (decoded);
10926 }
10927
10928 #endif /* emacs */
10929
10930 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10931    (do not change this comment) */