src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008, 2009, 2010, 2011
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software: you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation, either version 3 of the License, or
  17 (at your option) any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  60   C level, a coding system is represented by a vector of attributes
  61   stored in the hash table Vcoding_system_hash_table.  The conversion
  62   from coding system symbol to attributes vector is done by looking
  63   up Vcoding_system_hash_table by the symbol.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we setup a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (coding, detect_info)
 158      struct coding_system *coding;
 159      struct coding_detection_info *detect_info;
 160 {
 161   const unsigned char *src = coding->source;
 162   const unsigned char *src_end = coding->source + coding->src_bytes;
 163   int multibytep = coding->src_multibyte;
 164   int consumed_chars = 0;
 165   int found = 0;
 166   ...;
 167
 168   while (1)
 169     {
 170       /* Get one byte from the source.  If the source is exhausted, jump
 171          to no_more_source:.  */
 172       ONE_MORE_BYTE (c);
 173
 174       if (! __C_conforms_to_XXX___ (c))
 175         break;
 176       if (__C_strongly_suggests_XXX__ (c))
 177         found = CATEGORY_MASK_XXX;
 178     }
 179   /* The byte sequence is invalid for XXX; mark the category rejected.  */
 180   detect_info->rejected |= CATEGORY_MASK_XXX;
 181   return 0;
 182
 183  no_more_source:
 184   /* The source exhausted successfully; report what was found, if any.  */
 185   detect_info->found |= found;
 186   return 1;
 187 }
 188 #endif
 189
 190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 191
 192   These functions decode a byte sequence specified as a source by
 193   CODING.  The resulting multibyte text goes to a place pointed to by
 194   CODING->charbuf, the length of which should not exceed
 195   CODING->charbuf_size;
 196
 197   These functions set the information of original and decoded texts in
 198   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 199   They also set CODING->result to one of CODING_RESULT_XXX indicating
 200   how the decoding is finished.
 201
 202   Below is the template of these functions.  */
 203
 204 #if 0
 205 static void
 206 decode_coding_XXXX (coding)
 207      struct coding_system *coding;
 208 {
 209   const unsigned char *src = coding->source + coding->consumed;
 210   const unsigned char *src_end = coding->source + coding->src_bytes;
 211   /* SRC_BASE remembers the start position in source in each loop.
 212      The loop will be exited when there's not enough source code, or
 213      when there's no room in CHARBUF for a decoded character.  */
 214   const unsigned char *src_base;
 215   /* A buffer to produce decoded characters.  */
 216   int *charbuf = coding->charbuf + coding->charbuf_used;
 217   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 218   int multibytep = coding->src_multibyte;
 219
 220   while (1)
 221     {
 222       src_base = src;
 223       if (charbuf >= charbuf_end)
 224         /* No more room to produce a decoded character.  */
 225         break;
 226       ONE_MORE_BYTE (c);
 227       /* Decode it. */
 228     }
 229
 230  no_more_source:
 231   if (src_base < src_end
 232       && coding->mode & CODING_MODE_LAST_BLOCK)
 233     /* If the source ends by partial bytes to construct a character,
 234        treat them as eight-bit raw data.  */
 235     while (src_base < src_end && charbuf < charbuf_end)
 236       *charbuf++ = *src_base++;
 237   /* Remember how many bytes and characters we consumed.  If the
 238      source is multibyte, the bytes and chars are not identical.  */
 239   coding->consumed = coding->consumed_char = src_base - coding->source;
 240   /* Remember how many characters we produced.  */
 241   coding->charbuf_used = charbuf - coding->charbuf;
 242 }
 243 #endif
 244
 245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 246
 247   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 248   internal multibyte format by CODING.  The resulting byte sequence
 249   goes to a place pointed to by DESTINATION, the length of which
 250   should not exceed DST_BYTES.
 251
 252   These functions set the information of original and encoded texts in
 253   the members produced, produced_char, consumed, and consumed_char of
 254   the structure *CODING.  They also set the member result to one of
 255   CODING_RESULT_XXX indicating how the encoding finished.
 256
 257   DST_BYTES zero means that source area and destination area are
 258   overlapped, which means that we can produce an encoded text until it
 259   reaches the head of the not-yet-encoded source text.
 260
 261   Below is a template of these functions.  */
 262 #if 0
 263 static void
 264 encode_coding_XXX (coding)
 265      struct coding_system *coding;
 266 {
 267   int multibytep = coding->dst_multibyte;
 268   int *charbuf = coding->charbuf;
 269   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 270   unsigned char *dst = coding->destination + coding->produced;
 271   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 272   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 273   int produced_chars = 0;
 274
 275   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 276     {
 277       int c = *charbuf;
 278       /* Encode C into DST, and increment DST.  */
 279     }
 280  label_no_more_destination:
 281   /* How many chars and bytes we produced.  */
 282   coding->produced_char += produced_chars;
 283   coding->produced = dst - coding->destination;
 284 }
 285 #endif
 286
 287 \f
 288 /*** 1. Preamble ***/
 289
 290 #include <config.h>
 291 #include <stdio.h>
 292 #include <setjmp.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302 #include "frame.h"
 303 #include "termhooks.h"
 304
 305 Lisp_Object Vcoding_system_hash_table;
 306
 307 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 308 Lisp_Object Qunix, Qdos;
 309 extern Lisp_Object Qmac;        /* frame.c */
 310 Lisp_Object Qbuffer_file_coding_system;
 311 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 315 Lisp_Object Qbig, Qlittle;
 316 Lisp_Object Qcoding_system_history;
 317 Lisp_Object Qvalid_codes;
 318 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 319 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 320 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 321 Lisp_Object QCascii_compatible_p;
 322
 323 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 324 Lisp_Object Qcall_process, Qcall_process_region;
 325 Lisp_Object Qstart_process, Qopen_network_stream;
 326 Lisp_Object Qtarget_idx;
 327
 328 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 329 Lisp_Object Qinterrupted, Qinsufficient_memory;
 330
 331 extern Lisp_Object Qcompletion_ignore_case;
 332
 333 /* If a symbol has this property, evaluate the value to define the
 334    symbol as a coding system.  */
 335 static Lisp_Object Qcoding_system_define_form;
 336
 337 int coding_system_require_warning;
 338
 339 Lisp_Object Vselect_safe_coding_system_function;
 340
 341 /* Mnemonic string for each format of end-of-line.  */
 342 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 343 /* Mnemonic string to indicate format of end-of-line is not yet
 344    decided.  */
 345 Lisp_Object eol_mnemonic_undecided;
 346
 347 /* Format of end-of-line decided by system.  This is Qunix on
 348    Unix and Mac, Qdos on DOS/Windows.
 349    This has an effect only for external encoding (i.e. for output to
 350    file and process), not for in-buffer or Lisp string encoding.  */
 351 static Lisp_Object system_eol_type;
 352
 353 #ifdef emacs
 354
 355 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 356
 357 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 358
 359 /* Coding system emacs-mule and raw-text are for converting only
 360    end-of-line format.  */
 361 Lisp_Object Qemacs_mule, Qraw_text;
 362 Lisp_Object Qutf_8_emacs;
 363
 364 /* Coding-systems are handed between Emacs Lisp programs and C internal
 365    routines by the following three variables.  */
 366 /* Coding-system for reading files and receiving data from process.  */
 367 Lisp_Object Vcoding_system_for_read;
 368 /* Coding-system for writing files and sending data to process.  */
 369 Lisp_Object Vcoding_system_for_write;
 370 /* Coding-system actually used in the latest I/O.  */
 371 Lisp_Object Vlast_coding_system_used;
 372 /* Set to non-nil when an error is detected while code conversion.  */
 373 Lisp_Object Vlast_code_conversion_error;
 374 /* A vector of length 256 which contains information about special
 375    Latin codes (especially for dealing with Microsoft codes).  */
 376 Lisp_Object Vlatin_extra_code_table;
 377
 378 /* Flag to inhibit code conversion of end-of-line format.  */
 379 int inhibit_eol_conversion;
 380
 381 /* Flag to inhibit ISO2022 escape sequence detection.  */
 382 int inhibit_iso_escape_detection;
 383
 384 /* Flag to inhibit detection of binary files through null bytes.  */
 385 int inhibit_null_byte_detection;
 386
 387 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 388 int inherit_process_coding_system;
 389
 390 /* Coding system to be used to encode text for terminal display when
 391    terminal coding system is nil.  */
 392 struct coding_system safe_terminal_coding;
 393
 394 Lisp_Object Vfile_coding_system_alist;
 395 Lisp_Object Vprocess_coding_system_alist;
 396 Lisp_Object Vnetwork_coding_system_alist;
 397
 398 Lisp_Object Vlocale_coding_system;
 399
 400 #endif /* emacs */
 401
 402 /* Flag to tell if we look up translation table on character code
 403    conversion.  */
 404 Lisp_Object Venable_character_translation;
 405 /* Standard translation table to look up on decoding (reading).  */
 406 Lisp_Object Vstandard_translation_table_for_decode;
 407 /* Standard translation table to look up on encoding (writing).  */
 408 Lisp_Object Vstandard_translation_table_for_encode;
 409
 410 Lisp_Object Qtranslation_table;
 411 Lisp_Object Qtranslation_table_id;
 412 Lisp_Object Qtranslation_table_for_decode;
 413 Lisp_Object Qtranslation_table_for_encode;
 414
 415 /* Alist of charsets vs revision number.  */
 416 static Lisp_Object Vcharset_revision_table;
 417
 418 /* Default coding systems used for process I/O.  */
 419 Lisp_Object Vdefault_process_coding_system;
 420
 421 /* Char table for translating Quail and self-inserting input.  */
 422 Lisp_Object Vtranslation_table_for_input;
 423
 424 /* Two special coding systems.  */
 425 Lisp_Object Vsjis_coding_system;
 426 Lisp_Object Vbig5_coding_system;
 427
 428 /* ISO2022 section */
 429
 430 #define CODING_ISO_INITIAL(coding, reg)                 \
 431   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 432                      coding_attr_iso_initial),          \
 433                reg)))
 434
 435
 436 #define CODING_ISO_REQUEST(coding, charset_id)          \
 437   (((charset_id) <= (coding)->max_charset_id            \
 438     ? ((coding)->safe_charsets[charset_id] != 255       \
 439        ? (coding)->safe_charsets[charset_id]            \
 440        : -1)                                            \
 441     : -1))
 442
 443
 444 #define CODING_ISO_FLAGS(coding)        \
 445   ((coding)->spec.iso_2022.flags)
 446 #define CODING_ISO_DESIGNATION(coding, reg)     \
 447   ((coding)->spec.iso_2022.current_designation[reg])
 448 #define CODING_ISO_INVOCATION(coding, plane)    \
 449   ((coding)->spec.iso_2022.current_invocation[plane])
 450 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 451   ((coding)->spec.iso_2022.single_shifting)
 452 #define CODING_ISO_BOL(coding)  \
 453   ((coding)->spec.iso_2022.bol)
 454 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 455   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 456 #define CODING_ISO_CMP_STATUS(coding)   \
 457   (&(coding)->spec.iso_2022.cmp_status)
 458 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 459   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 460 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 461   ((coding)->spec.iso_2022.embedded_utf_8)
 462
 463 /* Control characters of ISO2022.  */
 464                         /* code */      /* function */
 465 #define ISO_CODE_LF     0x0A            /* line-feed */
 466 #define ISO_CODE_CR     0x0D            /* carriage-return */
 467 #define ISO_CODE_SO     0x0E            /* shift-out */
 468 #define ISO_CODE_SI     0x0F            /* shift-in */
 469 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 470 #define ISO_CODE_ESC    0x1B            /* escape */
 471 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 472 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 473 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 474
 475 /* Every (1-byte) code of ISO2022 is classified into one of the
 476    following classes.  */
 477 enum iso_code_class_type
 478   {
 479     ISO_control_0,              /* Control codes in the range
 480                                    0x00..0x1F and 0x7F, except for the
 481                                    following 4 codes.  */
 482     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 483     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 484     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 485     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 486     ISO_control_1,              /* Control codes in the range
 487                                    0x80..0x9F, except for the
 488                                    following 3 codes.  */
 489     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 490     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 491     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 492     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 493     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 494     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 495     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 496   };
 497
 498 /** Each CODING_ISO_FLAG_XXX macro defines a flag bit of the
 499     `iso-flags' attribute of an iso2022 coding system.  */
 500
 501 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 502    instead of the correct short-form sequence (e.g. ESC $ A).  */
 503 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 504
 505 /* If set, reset graphic planes and registers at end-of-line to the
 506    initial state.  */
 507 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 508
 509 /* If set, reset graphic planes and registers before any control
 510    characters to the initial state.  */
 511 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 512
 513 /* If set, encode by 7-bit environment.  */
 514 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 515
 516 /* If set, use locking-shift function.  */
 517 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 518
 519 /* If set, use single-shift function.  Overwrite
 520    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 521 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 522
 523 /* If set, use designation escape sequence.  */
 524 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 525
 526 /* If set, produce revision number sequence.  */
 527 #define CODING_ISO_FLAG_REVISION        0x0080
 528
 529 /* If set, produce ISO6429's direction specifying sequence.  */
 530 #define CODING_ISO_FLAG_DIRECTION       0x0100
 531
 532 /* If set, assume designation states are reset at beginning of line on
 533    output.  */
 534 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 535
 536 /* If set, designation sequence should be placed at beginning of line
 537    on output.  */
 538 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 539
 540 /* If set, do not encode unsafe characters on output.  */
 541 #define CODING_ISO_FLAG_SAFE            0x0800
 542
 543 /* If set, extra latin codes (128..159) are accepted as a valid code
 544    on input.  */
 545 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 546
 547 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 548
 549 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 550
 551 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 552
 553 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 554
 555 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 556
 557 /* A character to be produced on output if encoding of the original
 558    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 559 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 560
 561 /* UTF-8 section */
 562 #define CODING_UTF_8_BOM(coding)        \
 563   ((coding)->spec.utf_8_bom)
 564
 565 /* UTF-16 section */
 566 #define CODING_UTF_16_BOM(coding)       \
 567   ((coding)->spec.utf_16.bom)
 568
 569 #define CODING_UTF_16_ENDIAN(coding)    \
 570   ((coding)->spec.utf_16.endian)
 571
 572 #define CODING_UTF_16_SURROGATE(coding) \
 573   ((coding)->spec.utf_16.surrogate)
 574
 575
 576 /* CCL section */
 577 #define CODING_CCL_DECODER(coding)      \
 578   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 579 #define CODING_CCL_ENCODER(coding)      \
 580   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 581 #define CODING_CCL_VALIDS(coding)                                          \
 582   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 583
 584 /* Index for each coding category in `coding_categories' */
 585
 586 enum coding_category
 587   {
 588     coding_category_iso_7,
 589     coding_category_iso_7_tight,
 590     coding_category_iso_8_1,
 591     coding_category_iso_8_2,
 592     coding_category_iso_7_else,
 593     coding_category_iso_8_else,
 594     coding_category_utf_8_auto,
 595     coding_category_utf_8_nosig,
 596     coding_category_utf_8_sig,
 597     coding_category_utf_16_auto,
 598     coding_category_utf_16_be,
 599     coding_category_utf_16_le,
 600     coding_category_utf_16_be_nosig,
 601     coding_category_utf_16_le_nosig,
 602     coding_category_charset,
 603     coding_category_sjis,
 604     coding_category_big5,
 605     coding_category_ccl,
 606     coding_category_emacs_mule,
 607     /* All above are targets of code detection.  */
 608     coding_category_raw_text,
 609     coding_category_undecided,
 610     coding_category_max
 611   };
 612
 613 /* Definitions of flag bits used in detect_coding_XXXX.  */
 614 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 615 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 616 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 617 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 618 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 619 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 620 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 621 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 622 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 623 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 624 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 625 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 626 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 627 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 628 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 629 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 630 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 631 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 632 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 633 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 634
 635 /* This value is returned if detect_coding_mask () finds nothing other
 636    than ASCII characters.  */
 637 #define CATEGORY_MASK_ANY               \
 638   (CATEGORY_MASK_ISO_7                  \
 639    | CATEGORY_MASK_ISO_7_TIGHT          \
 640    | CATEGORY_MASK_ISO_8_1              \
 641    | CATEGORY_MASK_ISO_8_2              \
 642    | CATEGORY_MASK_ISO_7_ELSE           \
 643    | CATEGORY_MASK_ISO_8_ELSE           \
 644    | CATEGORY_MASK_UTF_8_AUTO           \
 645    | CATEGORY_MASK_UTF_8_NOSIG          \
 646    | CATEGORY_MASK_UTF_8_SIG            \
 647    | CATEGORY_MASK_UTF_16_AUTO          \
 648    | CATEGORY_MASK_UTF_16_BE            \
 649    | CATEGORY_MASK_UTF_16_LE            \
 650    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 651    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 652    | CATEGORY_MASK_CHARSET              \
 653    | CATEGORY_MASK_SJIS                 \
 654    | CATEGORY_MASK_BIG5                 \
 655    | CATEGORY_MASK_CCL                  \
 656    | CATEGORY_MASK_EMACS_MULE)
 657
 658
 659 #define CATEGORY_MASK_ISO_7BIT \
 660   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 661
 662 #define CATEGORY_MASK_ISO_8BIT \
 663   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 664
 665 #define CATEGORY_MASK_ISO_ELSE \
 666   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 667
 668 #define CATEGORY_MASK_ISO_ESCAPE        \
 669   (CATEGORY_MASK_ISO_7                  \
 670    | CATEGORY_MASK_ISO_7_TIGHT          \
 671    | CATEGORY_MASK_ISO_7_ELSE           \
 672    | CATEGORY_MASK_ISO_8_ELSE)
 673
 674 #define CATEGORY_MASK_ISO       \
 675   (  CATEGORY_MASK_ISO_7BIT     \
 676      | CATEGORY_MASK_ISO_8BIT   \
 677      | CATEGORY_MASK_ISO_ELSE)
 678
 679 #define CATEGORY_MASK_UTF_16            \
 680   (CATEGORY_MASK_UTF_16_AUTO            \
 681    | CATEGORY_MASK_UTF_16_BE            \
 682    | CATEGORY_MASK_UTF_16_LE            \
 683    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 684    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 685
 686 #define CATEGORY_MASK_UTF_8     \
 687   (CATEGORY_MASK_UTF_8_AUTO     \
 688    | CATEGORY_MASK_UTF_8_NOSIG  \
 689    | CATEGORY_MASK_UTF_8_SIG)
 690
 691 /* List of symbols `coding-category-xxx' ordered by priority.  This
 692    variable is exposed to Emacs Lisp.  */
 693 static Lisp_Object Vcoding_category_list;
 694
 695 /* Table of coding categories (Lisp symbols).  This variable is for
 696    internal use only.  */
 697 static Lisp_Object Vcoding_category_table;
 698
 699 /* Table of coding-categories ordered by priority.  */
 700 static enum coding_category coding_priorities[coding_category_max];
 701
 702 /* Nth element is a coding context for the coding system bound to the
 703    Nth coding category.  */
 704 static struct coding_system coding_categories[coding_category_max];
 705
 706 /*** Commonly used macros and functions ***/
 707
 708 #ifndef min
 709 #define min(a, b) ((a) < (b) ? (a) : (b))
 710 #endif
 711 #ifndef max
 712 #define max(a, b) ((a) > (b) ? (a) : (b))
 713 #endif
 714
 715 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 716   do {                                                  \
 717     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 718     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 719   } while (0)
 720
 721
 722 /* Safely get one byte from the source text pointed by SRC which ends
 723    at SRC_END, and set C to that byte.  If the source is exhausted, jump
 724    to `no_more_source', first recording CODING_RESULT_INSUFFICIENT_SRC
 725    when SRC_BASE < SRC.  If multibytep is nonzero and a multibyte
 726    character other than a 0xC0/0xC1 two-byte sequence is found at SRC,
 727    set C to the negated character code and record CODING_RESULT_INVALID_SRC.
 728    Caller provides: coding, src, src_base, src_end, multibytep, consumed_chars */
 729
 730 #define ONE_MORE_BYTE(c)                                \
 731   do {                                                  \
 732     if (src == src_end)                                 \
 733       {                                                 \
 734         if (src_base < src)                             \
 735           record_conversion_result                      \
 736             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 737         goto no_more_source;                            \
 738       }                                                 \
 739     c = *src++;                                         \
 740     if (multibytep && (c & 0x80))                       \
 741       {                                                 \
 742         if ((c & 0xFE) == 0xC0)                         \
 743           c = ((c & 1) << 6) | *src++;                  \
 744         else                                            \
 745           {                                             \
 746             src--;                                      \
 747             c = - string_char (src, &src, NULL);        \
 748             record_conversion_result                    \
 749               (coding, CODING_RESULT_INVALID_SRC);      \
 750           }                                             \
 751       }                                                 \
 752     consumed_chars++;                                   \
 753   } while (0)
 754
 755 /* Safely get two bytes from the source text pointed by SRC which ends
 756    at SRC_END, and set C1 and C2 to those bytes while skipping the
 757    heading multibyte characters.  If there are not enough bytes in the
 758    source, it jumps to `no_more_source'.  If multibytep is nonzero and
 759    a multibyte character (other than a 0xC0/0xC1 two-byte sequence) is
 760    found for C2, set C2 to -1.  The caller should declare and set these
 761    variables appropriately in advance:
 762         src, src_end, multibytep
 763    It is intended that this macro is used in detect_coding_utf_16.  */
 764
 765 #define TWO_MORE_BYTES(c1, c2)                          \
 766   do {                                                  \
 767     do {                                                \
 768       if (src == src_end)                               \
 769         goto no_more_source;                            \
 770       c1 = *src++;                                      \
 771       if (multibytep && (c1 & 0x80))                    \
 772         {                                               \
 773           if ((c1 & 0xFE) == 0xC0)                      \
 774             c1 = ((c1 & 1) << 6) | *src++;              \
 775           else                                          \
 776             {                                           \
 777               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 778               c1 = -1;                                  \
 779             }                                           \
 780         }                                               \
 781     } while (c1 < 0);                                   \
 782     if (src == src_end)                                 \
 783       goto no_more_source;                              \
 784     c2 = *src++;                                        \
 785     if (multibytep && (c2 & 0x80))                      \
 786       {                                                 \
 787         if ((c2 & 0xFE) == 0xC0)                        \
 788           c2 = ((c2 & 1) << 6) | *src++;                \
 789         else                                            \
 790           c2 = -1;                                      \
 791       }                                                 \
 792   } while (0)
 793
 794
     /* Fetch one byte from SRC into C and increment CONSUMED_CHARS, without
        comparing SRC against SRC_END first; the caller must already know
        that enough source remains.  When multibytep is nonzero and the
        byte has its high bit set: if (byte & 0xFE) == 0xC0 it is merged
        with the next byte into ((byte & 1) << 6) | next; otherwise SRC is
        backed up by one, the character is read with string_char (which is
        handed &src, presumably to advance it past the character), C is
        set to the negated result, and CODING_RESULT_INVALID_SRC is
        recorded.  The caller should declare and set these variables
        appropriately in advance:
             src, multibytep, consumed_chars, coding  */
 795 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 796   do {                                                  \
 797     c = *src++;                                         \
 798     if (multibytep && (c & 0x80))                       \
 799       {                                                 \
 800         if ((c & 0xFE) == 0xC0)                         \
 801           c = ((c & 1) << 6) | *src++;                  \
 802         else                                            \
 803           {                                             \
 804             src--;                                      \
 805             c = - string_char (src, &src, NULL);        \
 806             record_conversion_result                    \
 807               (coding, CODING_RESULT_INVALID_SRC);      \
 808           }                                             \
 809       }                                                 \
 810     consumed_chars++;                                   \
 811   } while (0)
 812
 813
 814 /* Emit the single byte C: count it in PRODUCED_CHARS, store it at
 815    DST, and step DST to the next free position.  C must be in the
 816    range 0..127.  The caller provides the variables `dst' and
 817    `produced_chars', already set up.  */
 818
 819
 820 #define EMIT_ONE_ASCII_BYTE(c)  \
 821   do {                          \
 822     produced_chars += 1;        \
 823     dst[0] = (c);               \
 824     dst += 1;                   \
 825   } while (0)
 826
 827
 828 /* Two-byte variant of EMIT_ONE_ASCII_BYTE: emit C1 and then C2.  */
 829
 830 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 831   do {                                  \
 832     produced_chars += 2;                \
 833     *dst++ = (c1);                      \
         *dst++ = (c2);                      \
 834   } while (0)
 835
 836
 837 /* Emit the byte C and increment PRODUCED_CHARS.  For a unibyte
 838    destination (MULTIBYTEP zero) C is stored at DST as is.  For a
 839    multibyte destination C is written in multibyte form with
 840    CHAR_STRING_ADVANCE, after a value of 0x80 or more has been mapped
 841    through BYTE8_TO_CHAR.  The caller should declare and set the
        variables `dst' and `multibytep' appropriately in advance.  */
 842
 843 #define EMIT_ONE_BYTE(c)                \
 844   do {                                  \
 845     produced_chars++;                   \
 846     if (! multibytep)                   \
 847       *dst++ = (c);                     \
 848     else                                \
 849       {                                 \
 850         int ch = (c);                   \
 851         if (ch >= 0x80)                 \
 852           ch = BYTE8_TO_CHAR (ch);      \
 853         CHAR_STRING_ADVANCE (ch, dst);  \
 854       }                                 \
 855   } while (0)
 856
 857
 858 /* Two-byte variant of EMIT_ONE_BYTE: emit C1 and then C2, adding 2 to
        PRODUCED_CHARS.  */
 859
 860 #define EMIT_TWO_BYTES(c1, c2)          \
 861   do {                                  \
 862     produced_chars += 2;                \
 863     if (! multibytep)                   \
 864       {                                 \
 865         *dst++ = (c1);                  \
 866         *dst++ = (c2);                  \
 867       }                                 \
 868     else                                \
 869       {                                 \
 870         int ch = (c1);                  \
 871                                         \
 872         if (ch >= 0x80)                 \
 873           ch = BYTE8_TO_CHAR (ch);      \
 874         CHAR_STRING_ADVANCE (ch, dst);  \
 875         ch = (c2);                      \
 876         if (ch >= 0x80)                 \
 877           ch = BYTE8_TO_CHAR (ch);      \
 878         CHAR_STRING_ADVANCE (ch, dst);  \
 879       }                                 \
 880   } while (0)
 882
 883
     /* Emit the three bytes C1, C2 and C3 in that order, as one
        EMIT_ONE_BYTE followed by one EMIT_TWO_BYTES.  The same caller
        variables as for those macros are required.  */
 884 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 885   do {                                  \
 886     EMIT_ONE_BYTE (c1);                 \
 887     EMIT_TWO_BYTES (c2, c3);            \
 888   } while (0)
 889
 890
     /* Emit the four bytes C1, C2, C3 and C4 in that order, as two
        consecutive EMIT_TWO_BYTES.  The same caller variables as for
        EMIT_TWO_BYTES are required.  */
 891 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 892   do {                                          \
 893     EMIT_TWO_BYTES (c1, c2);                    \
 894     EMIT_TWO_BYTES (c3, c4);                    \
 895   } while (0)
 896
 897
 898 /* Prototypes for static functions.  The argument lists are wrapped
        in the P_ macro.  The first groups declare, per coding-system
        family, a detect_coding_XXX, a decode_coding_XXX and an
        encode_coding_XXX function; raw-text has no detector.  */
 899 static void record_conversion_result P_ ((struct coding_system *coding,
 900                                           enum coding_result_code result));
 901 static int detect_coding_utf_8 P_ ((struct coding_system *,
 902                                     struct coding_detection_info *info));
 903 static void decode_coding_utf_8 P_ ((struct coding_system *));
 904 static int encode_coding_utf_8 P_ ((struct coding_system *));
 905
 906 static int detect_coding_utf_16 P_ ((struct coding_system *,
 907                                      struct coding_detection_info *info));
 908 static void decode_coding_utf_16 P_ ((struct coding_system *));
 909 static int encode_coding_utf_16 P_ ((struct coding_system *));
 910
 911 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 912                                        struct coding_detection_info *info));
 913 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 914 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 915
 916 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 917                                          struct coding_detection_info *info));
 918 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 919 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 920
 921 static int detect_coding_sjis P_ ((struct coding_system *,
 922                                    struct coding_detection_info *info));
 923 static void decode_coding_sjis P_ ((struct coding_system *));
 924 static int encode_coding_sjis P_ ((struct coding_system *));
 925
 926 static int detect_coding_big5 P_ ((struct coding_system *,
 927                                    struct coding_detection_info *info));
 928 static void decode_coding_big5 P_ ((struct coding_system *));
 929 static int encode_coding_big5 P_ ((struct coding_system *));
 930
 931 static int detect_coding_ccl P_ ((struct coding_system *,
 932                                   struct coding_detection_info *info));
 933 static void decode_coding_ccl P_ ((struct coding_system *));
 934 static int encode_coding_ccl P_ ((struct coding_system *));
 935
 936 static void decode_coding_raw_text P_ ((struct coding_system *));
 937 static int encode_coding_raw_text P_ ((struct coding_system *));
 938
     /* Source/destination pointer maintenance and destination growth.  */
 939 static void coding_set_source P_ ((struct coding_system *));
 940 static void coding_set_destination P_ ((struct coding_system *));
 941 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
 942 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
 943                                             EMACS_INT, EMACS_INT));
 944 static unsigned char *alloc_destination P_ ((struct coding_system *,
 945                                              EMACS_INT, unsigned char *));
     /* Remaining helpers, defined later in this file.  */
 946 static void setup_iso_safe_charsets P_ ((Lisp_Object));
 947 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
 948                                                      int *, int *,
 949                                                      unsigned char *));
 950 static int detect_eol P_ ((const unsigned char *,
 951                            EMACS_INT, enum coding_category));
 952 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
 953 static void decode_eol P_ ((struct coding_system *));
 954 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
 955 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *));
 956 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
 957 static INLINE void produce_charset P_ ((struct coding_system *, int *,
 958                                         EMACS_INT));
 959 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
 960 static int decode_coding P_ ((struct coding_system *));
 961 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
 962                                                       struct coding_system *,
 963                                                       int *, EMACS_INT *));
 964 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
 965                                                   struct coding_system *,
 966                                                   int *, EMACS_INT *));
 967 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
 968 static int encode_coding P_ ((struct coding_system *));
 969 static Lisp_Object make_conversion_work_buffer P_ ((int));
 970 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
 971 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
 972 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 973
 974 static void
 975 record_conversion_result (struct coding_system *coding,
 976                           enum coding_result_code result)
 977 {
 978   coding->result = result;
 979   switch (result)
 980     {
 981     case CODING_RESULT_INSUFFICIENT_SRC:
 982       Vlast_code_conversion_error = Qinsufficient_source;
 983       break;
 984     case CODING_RESULT_INCONSISTENT_EOL:
 985       Vlast_code_conversion_error = Qinconsistent_eol;
 986       break;
 987     case CODING_RESULT_INVALID_SRC:
 988       Vlast_code_conversion_error = Qinvalid_source;
 989       break;
 990     case CODING_RESULT_INTERRUPT:
 991       Vlast_code_conversion_error = Qinterrupted;
 992       break;
 993     case CODING_RESULT_INSUFFICIENT_MEM:
 994       Vlast_code_conversion_error = Qinsufficient_memory;
 995       break;
 996     case CODING_RESULT_INSUFFICIENT_DST:
 997       /* Don't record this error in Vlast_code_conversion_error
 998          because it happens just temporarily and is resolved when the
 999          whole conversion is finished.  */
1000       break;
1001     case CODING_RESULT_SUCCESS:
1002       break;
1003     default:
1004       Vlast_code_conversion_error = intern ("Unknown error");
1005     }
1006 }
1007
1008 /* This wrapper macro is used to preserve validity of pointers into
1009    buffer text across calls to decode_char, which could cause
1010    relocation of buffers if it loads a charset map, because loading a
1011    charset map allocates large structures.

        C is set to DECODE_CHAR (CHARSET, CODE).  The flag
        charset_map_loaded is cleared before the call; if it is nonzero
        afterwards, coding_set_source is called to recompute
        CODING->source, and SRC, SRC_BASE and SRC_END (which must be
        modifiable lvalues) are each shifted by the distance that
        CODING->source moved.  */
1012 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
1013   do {                                                                       \
1014     charset_map_loaded = 0;                                                  \
1015     c = DECODE_CHAR (charset, code);                                         \
1016     if (charset_map_loaded)                                                  \
1017       {                                                                      \
1018         const unsigned char *orig = coding->source;                          \
1019         EMACS_INT offset;                                                    \
1020                                                                              \
1021         coding_set_source (coding);                                          \
1022         offset = coding->source - orig;                                      \
1023         src += offset;                                                       \
1024         src_base += offset;                                                  \
1025         src_end += offset;                                                   \
1026       }                                                                      \
1027   } while (0)
1028
1029
1030 /* If there is not more than BYTES length of room at dst (that is, if
1031    dst + BYTES reaches or passes dst_end), allocate memory for
1032    coding->destination and update dst and dst_end.  The amount
1033    requested from alloc_destination is BYTES plus the number of
        elements still pending in charbuf (charbuf_end - charbuf).  We
        don't have to take care of coding->source which will be
        relocated.  It is handled by calling coding_set_source in
        encode_coding.  The caller should declare and set these
        variables appropriately in advance:
             dst, dst_end, charbuf, charbuf_end, coding  */
1034
1035 #define ASSURE_DESTINATION(bytes)                               \
1036   do {                                                          \
1037     if (dst + (bytes) >= dst_end)                               \
1038       {                                                         \
1039         int more_bytes = charbuf_end - charbuf + (bytes);       \
1040                                                                 \
1041         dst = alloc_destination (coding, more_bytes, dst);      \
1042         dst_end = coding->destination + coding->dst_bytes;      \
1043       }                                                         \
1044   } while (0)
1045
1046
1047 /* Store multibyte form of the character C in P, and advance P to the
1048    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
1049    never calls MAYBE_UNIFY_CHAR.

        The length of the form is chosen by comparing C against
        MAX_1_BYTE_CHAR ... MAX_5_BYTE_CHAR; a value above MAX_5_BYTE_CHAR
        is written with BYTE8_STRING ((C) - 0x3FFF80, P).

        C is evaluated several times, so it must have no side effects.
        Every use of C is parenthesized so that an expression such as
        `a | b' is encoded correctly in all branches.  P must be a
        modifiable pointer lvalue.  */
1050
1051 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
1052   do {                                          \
1053     if ((c) <= MAX_1_BYTE_CHAR)                 \
1054       *(p)++ = (c);                             \
1055     else if ((c) <= MAX_2_BYTE_CHAR)            \
1056       *(p)++ = (0xC0 | ((c) >> 6)),             \
1057         *(p)++ = (0x80 | ((c) & 0x3F));         \
1058     else if ((c) <= MAX_3_BYTE_CHAR)            \
1059       *(p)++ = (0xE0 | ((c) >> 12)),            \
1060         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
1061         *(p)++ = (0x80 | ((c) & 0x3F));         \
1062     else if ((c) <= MAX_4_BYTE_CHAR)            \
1063       *(p)++ = (0xF0 | ((c) >> 18)),            \
1064         *(p)++ = (0x80 | (((c) >> 12) & 0x3F)), \
1065         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
1066         *(p)++ = (0x80 | ((c) & 0x3F));         \
1067     else if ((c) <= MAX_5_BYTE_CHAR)            \
1068       *(p)++ = 0xF8,                            \
1069         *(p)++ = (0x80 | (((c) >> 18) & 0x0F)), \
1070         *(p)++ = (0x80 | (((c) >> 12) & 0x3F)), \
1071         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
1072         *(p)++ = (0x80 | ((c) & 0x3F));         \
1073     else                                        \
1074       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
1075   } while (0)
1076
1077
1078 /* Return the character code of character whose multibyte form is at
1079    P, and advance P to the end of the multibyte form.  This is like
1080    STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

        The length is chosen from the leading byte alone: bit 0x80 clear
        means 1 byte, else bit 0x20 clear means 2 bytes, else bit 0x10
        clear means 3 bytes, else bit 0x08 clear means 4 bytes, and
        anything else is taken as 5 bytes.  For a 2-byte form whose
        leading byte is below 0xC2, 0x3FFF80 is OR'ed into the result.
        In the 5-byte case the leading byte contributes no bits to the
        result.  The trailing bytes are not validated.

        P is evaluated several times and is modified, so it must be a
        pointer lvalue without side effects.  */
1081
1082 #define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
1083   (!((p)[0] & 0x80)                                             \
1084    ? *(p)++                                                     \
1085    : ! ((p)[0] & 0x20)                                          \
1086    ? ((p) += 2,                                                 \
1087       ((((p)[-2] & 0x1F) << 6)                                  \
1088        | ((p)[-1] & 0x3F)                                       \
1089        | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
1090    : ! ((p)[0] & 0x10)                                          \
1091    ? ((p) += 3,                                                 \
1092       ((((p)[-3] & 0x0F) << 12)                                 \
1093        | (((p)[-2] & 0x3F) << 6)                                \
1094        | ((p)[-1] & 0x3F)))                                     \
1095    : ! ((p)[0] & 0x08)                                          \
1096    ? ((p) += 4,                                                 \
1097       ((((p)[-4] & 0xF) << 18)                                  \
1098        | (((p)[-3] & 0x3F) << 12)                               \
1099        | (((p)[-2] & 0x3F) << 6)                                \
1100        | ((p)[-1] & 0x3F)))                                     \
1101    : ((p) += 5,                                                 \
1102       ((((p)[-4] & 0x3F) << 18)                                 \
1103        | (((p)[-3] & 0x3F) << 12)                               \
1104        | (((p)[-2] & 0x3F) << 6)                                \
1105        | ((p)[-1] & 0x3F))))
1106
1107
1108 static void
1109 coding_set_source (coding)
1110      struct coding_system *coding;
1111 {
1112   if (BUFFERP (coding->src_object))
1113     {
1114       struct buffer *buf = XBUFFER (coding->src_object);
1115
1116       if (coding->src_pos < 0)
1117         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1118       else
1119         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1120     }
1121   else if (STRINGP (coding->src_object))
1122     {
1123       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1124     }
1125   else
1126     /* Otherwise, the source is C string and is never relocated
1127        automatically.  Thus we don't have to update anything.  */
1128     ;
1129 }
1130
1131 static void
1132 coding_set_destination (coding)
1133      struct coding_system *coding;
1134 {
1135   if (BUFFERP (coding->dst_object))
1136     {
1137       if (coding->src_pos < 0)
1138         {
1139           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1140           coding->dst_bytes = (GAP_END_ADDR
1141                                - (coding->src_bytes - coding->consumed)
1142                                - coding->destination);
1143         }
1144       else
1145         {
1146           /* We are sure that coding->dst_pos_byte is before the gap
1147              of the buffer. */
1148           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1149                                  + coding->dst_pos_byte - BEG_BYTE);
1150           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1151                                - coding->destination);
1152         }
1153     }
1154   else
1155     /* Otherwise, the destination is C string and is never relocated
1156        automatically.  Thus we don't have to update anything.  */
1157     ;
1158 }
1159
1160
1161 static void
1162 coding_alloc_by_realloc (coding, bytes)
1163      struct coding_system *coding;
1164      EMACS_INT bytes;
1165 {
1166   coding->destination = (unsigned char *) xrealloc (coding->destination,
1167                                                     coding->dst_bytes + bytes);
1168   coding->dst_bytes += bytes;
1169 }
1170
1171 static void
1172 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1173      struct coding_system *coding;
1174      EMACS_INT gap_head_used, bytes;
1175 {
1176   if (EQ (coding->src_object, coding->dst_object))
1177     {
1178       /* The gap may contain the produced data at the head and not-yet
1179          consumed data at the tail.  To preserve those data, we at
1180          first make the gap size to zero, then increase the gap
1181          size.  */
1182       EMACS_INT add = GAP_SIZE;
1183
1184       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1185       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1186       make_gap (bytes);
1187       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1188       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1189     }
1190   else
1191     {
1192       Lisp_Object this_buffer;
1193
1194       this_buffer = Fcurrent_buffer ();
1195       set_buffer_internal (XBUFFER (coding->dst_object));
1196       make_gap (bytes);
1197       set_buffer_internal (XBUFFER (this_buffer));
1198     }
1199 }
1200
1201
1202 static unsigned char *
1203 alloc_destination (coding, nbytes, dst)
1204      struct coding_system *coding;
1205      EMACS_INT nbytes;
1206      unsigned char *dst;
1207 {
1208   EMACS_INT offset = dst - coding->destination;
1209
1210   if (BUFFERP (coding->dst_object))
1211     {
1212       struct buffer *buf = XBUFFER (coding->dst_object);
1213
1214       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1215     }
1216   else
1217     coding_alloc_by_realloc (coding, nbytes);
1218   coding_set_destination (coding);
1219   dst = coding->destination + offset;
1220   return dst;
1221 }
1222
1223 /** Macros for annotations.  */
1224
1225 /* An annotation data is stored in the array coding->charbuf in this
1226    format:
1227      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1228    LENGTH is the number of elements in the annotation.
1229    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1230    NCHARS is the number of characters in the text annotated.
1231
1232    The format of the following elements depends on ANNOTATION_MASK.
1233
1234    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1235    follow:
1236      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1237
1238    NBYTES is the number of bytes specified in the header part of
1239    old-style emacs-mule encoding, or 0 for the other kind of
1240    composition.
1241
1242    METHOD is one of enum composition_method.
1243
1244    Optional COMPOSITION-COMPONENTS are characters and composition
1245    rules.
1246
1247    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1248    follows.
1249
1250    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1251    recover from an invalid annotation, and should be skipped by
1252    produce_annotation.  */
1253
1254 /* Maximum length of the header of annotation data.  This equals the
        length that ADD_COMPOSITION_DATA records (5); ADD_CHARSET_DATA
        records 4.  */
1255 #define MAX_ANNOTATION_LENGTH 5
1256
1257 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1258   do {                                                  \
1259     *(buf)++ = -(len);                                  \
1260     *(buf)++ = (mask);                                  \
1261     *(buf)++ = (nchars);                                \
1262     coding->annotated = 1;                              \
1263   } while (0);
1264
1265 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1266   do {                                                                      \
1267     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1268     *buf++ = nbytes;                                                        \
1269     *buf++ = method;                                                        \
1270   } while (0)
1271
1272
1273 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1274   do {                                                                  \
1275     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1276     *buf++ = id;                                                        \
1277   } while (0)
1278
1279 \f
1280 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1281
1282
1283
1284 \f
1285 /*** 3. UTF-8 ***/
1286
1287 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1288    Check if a text is encoded in UTF-8.  If it is, return 1, else
1289    return 0.  (This describes detect_coding_utf_8, which is defined
        after the helper macros below.)  */
1290
     /* Byte classification by bit pattern.  C is expected to be a
        non-negative byte value:
          UTF_8_1_OCTET_P          C is below 0x80
          UTF_8_EXTRA_OCTET_P      top two bits are 10   (0x80..0xBF)
          UTF_8_2_OCTET_LEADING_P  top three bits are 110
          UTF_8_3_OCTET_LEADING_P  top four bits are 1110
          UTF_8_4_OCTET_LEADING_P  top five bits are 11110
          UTF_8_5_OCTET_LEADING_P  top six bits are 111110  */
1291 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1292 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1293 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1294 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1295 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1296 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1297
     /* The byte order mark U+FEFF, and the three bytes EF BB BF that
        are compared against the start of the text to recognize it in
        UTF-8.  */
1298 #define UTF_BOM 0xFEFF
1299 #define UTF_8_BOM_1 0xEF
1300 #define UTF_8_BOM_2 0xBB
1301 #define UTF_8_BOM_3 0xBF
1302
1303 static int
1304 detect_coding_utf_8 (coding, detect_info)
1305      struct coding_system *coding;
1306      struct coding_detection_info *detect_info;
1307 {
1308   const unsigned char *src = coding->source, *src_base;
1309   const unsigned char *src_end = coding->source + coding->src_bytes;
1310   int multibytep = coding->src_multibyte;
1311   int consumed_chars = 0;
1312   int bom_found = 0;
1313   int found = 0;
1314
1315   detect_info->checked |= CATEGORY_MASK_UTF_8;
1316   /* A coding system of this category is always ASCII compatible.  */
1317   src += coding->head_ascii;
1318
1319   while (1)
1320     {
1321       int c, c1, c2, c3, c4;
1322
1323       src_base = src;
1324       ONE_MORE_BYTE (c);
1325       if (c < 0 || UTF_8_1_OCTET_P (c))
1326         continue;
1327       ONE_MORE_BYTE (c1);
1328       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1329         break;
1330       if (UTF_8_2_OCTET_LEADING_P (c))
1331         {
1332           found = 1;
1333           continue;
1334         }
1335       ONE_MORE_BYTE (c2);
1336       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1337         break;
1338       if (UTF_8_3_OCTET_LEADING_P (c))
1339         {
1340           found = 1;
1341           if (src_base == coding->source
1342               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1343             bom_found = 1;
1344           continue;
1345         }
1346       ONE_MORE_BYTE (c3);
1347       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1348         break;
1349       if (UTF_8_4_OCTET_LEADING_P (c))
1350         {
1351           found = 1;
1352           continue;
1353         }
1354       ONE_MORE_BYTE (c4);
1355       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1356         break;
1357       if (UTF_8_5_OCTET_LEADING_P (c))
1358         {
1359           found = 1;
1360           continue;
1361         }
1362       break;
1363     }
1364   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1365   return 0;
1366
1367  no_more_source:
1368   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1369     {
1370       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1371       return 0;
1372     }
1373   if (bom_found)
1374     {
1375       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1376       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1377     }
1378   else
1379     {
1380       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1381       if (found)
1382         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1383     }
1384   return 1;
1385 }
1386
1387
1388 static void
1389 decode_coding_utf_8 (coding)
1390      struct coding_system *coding;
1391 {
1392   const unsigned char *src = coding->source + coding->consumed;
1393   const unsigned char *src_end = coding->source + coding->src_bytes;
1394   const unsigned char *src_base;
1395   int *charbuf = coding->charbuf + coding->charbuf_used;
1396   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1397   int consumed_chars = 0, consumed_chars_base = 0;
1398   int multibytep = coding->src_multibyte;
1399   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1400   Lisp_Object attr, charset_list;
1401   int eol_crlf =
1402     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1403   int byte_after_cr = -1;
1404
1405   CODING_GET_INFO (coding, attr, charset_list);
1406
1407   if (bom != utf_without_bom)
1408     {
1409       int c1, c2, c3;
1410
1411       src_base = src;
1412       ONE_MORE_BYTE (c1);
1413       if (! UTF_8_3_OCTET_LEADING_P (c1))
1414         src = src_base;
1415       else
1416         {
1417           ONE_MORE_BYTE (c2);
1418           if (! UTF_8_EXTRA_OCTET_P (c2))
1419             src = src_base;
1420           else
1421             {
1422               ONE_MORE_BYTE (c3);
1423               if (! UTF_8_EXTRA_OCTET_P (c3))
1424                 src = src_base;
1425               else
1426                 {
1427                   if ((c1 != UTF_8_BOM_1)
1428                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1429                     src = src_base;
1430                   else
1431                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1432                 }
1433             }
1434         }
1435     }
1436   CODING_UTF_8_BOM (coding) = utf_without_bom;
1437
1438
1439
1440   while (1)
1441     {
1442       int c, c1, c2, c3, c4, c5;
1443
1444       src_base = src;
1445       consumed_chars_base = consumed_chars;
1446
1447       if (charbuf >= charbuf_end)
1448         {
1449           if (byte_after_cr >= 0)
1450             src_base--;
1451           break;
1452         }
1453
1454       if (byte_after_cr >= 0)
1455         c1 = byte_after_cr, byte_after_cr = -1;
1456       else
1457         ONE_MORE_BYTE (c1);
1458       if (c1 < 0)
1459         {
1460           c = - c1;
1461         }
1462       else if (UTF_8_1_OCTET_P(c1))
1463         {
1464           if (eol_crlf && c1 == '\r')
1465             ONE_MORE_BYTE (byte_after_cr);
1466           c = c1;
1467         }
1468       else
1469         {
1470           ONE_MORE_BYTE (c2);
1471           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1472             goto invalid_code;
1473           if (UTF_8_2_OCTET_LEADING_P (c1))
1474             {
1475               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1476               /* Reject overlong sequences here and below.  Encoders
1477                  producing them are incorrect, they can be misleading,
1478                  and they mess up read/write invariance.  */
1479               if (c < 128)
1480                 goto invalid_code;
1481             }
1482           else
1483             {
1484               ONE_MORE_BYTE (c3);
1485               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1486                 goto invalid_code;
1487               if (UTF_8_3_OCTET_LEADING_P (c1))
1488                 {
1489                   c = (((c1 & 0xF) << 12)
1490                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1491                   if (c < 0x800
1492                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1493                     goto invalid_code;
1494                 }
1495               else
1496                 {
1497                   ONE_MORE_BYTE (c4);
1498                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1499                     goto invalid_code;
1500                   if (UTF_8_4_OCTET_LEADING_P (c1))
1501                     {
1502                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1503                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1504                     if (c < 0x10000)
1505                       goto invalid_code;
1506                     }
1507                   else
1508                     {
1509                       ONE_MORE_BYTE (c5);
1510                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1511                         goto invalid_code;
1512                       if (UTF_8_5_OCTET_LEADING_P (c1))
1513                         {
1514                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1515                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1516                                | (c5 & 0x3F));
1517                           if ((c > MAX_CHAR) || (c < 0x200000))
1518                             goto invalid_code;
1519                         }
1520                       else
1521                         goto invalid_code;
1522                     }
1523                 }
1524             }
1525         }
1526
1527       *charbuf++ = c;
1528       continue;
1529
1530     invalid_code:
1531       src = src_base;
1532       consumed_chars = consumed_chars_base;
1533       ONE_MORE_BYTE (c);
1534       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1535       coding->errors++;
1536     }
1537
1538  no_more_source:
1539   coding->consumed_char += consumed_chars_base;
1540   coding->consumed = src_base - coding->source;
1541   coding->charbuf_used = charbuf - coding->charbuf;
1542 }
1543
1544
1545 static int
1546 encode_coding_utf_8 (coding)
1547      struct coding_system *coding;
1548 {
1549   int multibytep = coding->dst_multibyte;
1550   int *charbuf = coding->charbuf;
1551   int *charbuf_end = charbuf + coding->charbuf_used;
1552   unsigned char *dst = coding->destination + coding->produced;
1553   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1554   int produced_chars = 0;
1555   int c;
1556
1557   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1558     {
1559       ASSURE_DESTINATION (3);
1560       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1561       CODING_UTF_8_BOM (coding) = utf_without_bom;
1562     }
1563
1564   if (multibytep)
1565     {
1566       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1567
1568       while (charbuf < charbuf_end)
1569         {
1570           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1571
1572           ASSURE_DESTINATION (safe_room);
1573           c = *charbuf++;
1574           if (CHAR_BYTE8_P (c))
1575             {
1576               c = CHAR_TO_BYTE8 (c);
1577               EMIT_ONE_BYTE (c);
1578             }
1579           else
1580             {
1581               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1582               for (p = str; p < pend; p++)
1583                 EMIT_ONE_BYTE (*p);
1584             }
1585         }
1586     }
1587   else
1588     {
1589       int safe_room = MAX_MULTIBYTE_LENGTH;
1590
1591       while (charbuf < charbuf_end)
1592         {
1593           ASSURE_DESTINATION (safe_room);
1594           c = *charbuf++;
1595           if (CHAR_BYTE8_P (c))
1596             *dst++ = CHAR_TO_BYTE8 (c);
1597           else
1598             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1599           produced_chars++;
1600         }
1601     }
1602   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1603   coding->produced_char += produced_chars;
1604   coding->produced = dst - coding->destination;
1605   return 0;
1606 }
1607
1608
1609 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1610    Check if a text is encoded in one of UTF-16 based coding systems.
1611    If it is, return 1, else return 0.  */
1612
/* Nonzero if bits 10..15 of VAL match those of 0xD800, i.e. VAL is a
   UTF-16 high (leading) surrogate code unit.  Only those six bits are
   examined.  */
#define UTF_16_HIGH_SURROGATE_P(val) (0xD800 == (0xFC00 & (val)))

/* Nonzero if bits 10..15 of VAL match those of 0xDC00, i.e. VAL is a
   UTF-16 low (trailing) surrogate code unit.  Only those six bits are
   examined.  */
#define UTF_16_LOW_SURROGATE_P(val) (0xDC00 == (0xFC00 & (val)))

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  VAL may be
   evaluated more than once.  */
#define UTF_16_INVALID_P(val)                                   \
  ((val) == 0xFFFE || (val) == 0xFFFF || UTF_16_LOW_SURROGATE_P (val))
1623
1624
1625 static int
1626 detect_coding_utf_16 (coding, detect_info)
1627      struct coding_system *coding;
1628      struct coding_detection_info *detect_info;
1629 {
1630   const unsigned char *src = coding->source, *src_base = src;
1631   const unsigned char *src_end = coding->source + coding->src_bytes;
1632   int multibytep = coding->src_multibyte;
1633   int consumed_chars = 0;
1634   int c1, c2;
1635
1636   detect_info->checked |= CATEGORY_MASK_UTF_16;
1637   if (coding->mode & CODING_MODE_LAST_BLOCK
1638       && (coding->src_chars & 1))
1639     {
1640       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1641       return 0;
1642     }
1643
1644   TWO_MORE_BYTES (c1, c2);
1645   if ((c1 == 0xFF) && (c2 == 0xFE))
1646     {
1647       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1648                              | CATEGORY_MASK_UTF_16_AUTO);
1649       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1650                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1651                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1652     }
1653   else if ((c1 == 0xFE) && (c2 == 0xFF))
1654     {
1655       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1656                              | CATEGORY_MASK_UTF_16_AUTO);
1657       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1658                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1659                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1660     }
1661   else if (c2 < 0)
1662     {
1663       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1664       return 0;
1665     }
1666   else
1667     {
1668       /* We check the dispersion of Eth and Oth bytes where E is even and
1669          O is odd.  If both are high, we assume binary data.*/
1670       unsigned char e[256], o[256];
1671       unsigned e_num = 1, o_num = 1;
1672
1673       memset (e, 0, 256);
1674       memset (o, 0, 256);
1675       e[c1] = 1;
1676       o[c2] = 1;
1677
1678       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1679                                 |CATEGORY_MASK_UTF_16_BE
1680                                 | CATEGORY_MASK_UTF_16_LE);
1681
1682       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1683              != CATEGORY_MASK_UTF_16)
1684         {
1685           TWO_MORE_BYTES (c1, c2);
1686           if (c2 < 0)
1687             break;
1688           if (! e[c1])
1689             {
1690               e[c1] = 1;
1691               e_num++;
1692               if (e_num >= 128)
1693                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1694             }
1695           if (! o[c2])
1696             {
1697               o[c2] = 1;
1698               o_num++;
1699               if (o_num >= 128)
1700                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1701             }
1702         }
1703       return 0;
1704     }
1705
1706  no_more_source:
1707   return 1;
1708 }
1709
1710 static void
1711 decode_coding_utf_16 (coding)
1712      struct coding_system *coding;
1713 {
1714   const unsigned char *src = coding->source + coding->consumed;
1715   const unsigned char *src_end = coding->source + coding->src_bytes;
1716   const unsigned char *src_base;
1717   int *charbuf = coding->charbuf + coding->charbuf_used;
1718   /* We may produces at most 3 chars in one loop.  */
1719   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1720   int consumed_chars = 0, consumed_chars_base = 0;
1721   int multibytep = coding->src_multibyte;
1722   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1723   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1724   int surrogate = CODING_UTF_16_SURROGATE (coding);
1725   Lisp_Object attr, charset_list;
1726   int eol_crlf =
1727     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1728   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1729
1730   CODING_GET_INFO (coding, attr, charset_list);
1731
1732   if (bom == utf_with_bom)
1733     {
1734       int c, c1, c2;
1735
1736       src_base = src;
1737       ONE_MORE_BYTE (c1);
1738       ONE_MORE_BYTE (c2);
1739       c = (c1 << 8) | c2;
1740
1741       if (endian == utf_16_big_endian
1742           ? c != 0xFEFF : c != 0xFFFE)
1743         {
1744           /* The first two bytes are not BOM.  Treat them as bytes
1745              for a normal character.  */
1746           src = src_base;
1747           coding->errors++;
1748         }
1749       CODING_UTF_16_BOM (coding) = utf_without_bom;
1750     }
1751   else if (bom == utf_detect_bom)
1752     {
1753       /* We have already tried to detect BOM and failed in
1754          detect_coding.  */
1755       CODING_UTF_16_BOM (coding) = utf_without_bom;
1756     }
1757
1758   while (1)
1759     {
1760       int c, c1, c2;
1761
1762       src_base = src;
1763       consumed_chars_base = consumed_chars;
1764
1765       if (charbuf >= charbuf_end)
1766         {
1767           if (byte_after_cr1 >= 0)
1768             src_base -= 2;
1769           break;
1770         }
1771
1772       if (byte_after_cr1 >= 0)
1773         c1 = byte_after_cr1, byte_after_cr1 = -1;
1774       else
1775         ONE_MORE_BYTE (c1);
1776       if (c1 < 0)
1777         {
1778           *charbuf++ = -c1;
1779           continue;
1780         }
1781       if (byte_after_cr2 >= 0)
1782         c2 = byte_after_cr2, byte_after_cr2 = -1;
1783       else
1784         ONE_MORE_BYTE (c2);
1785       if (c2 < 0)
1786         {
1787           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1788           *charbuf++ = -c2;
1789           continue;
1790         }
1791       c = (endian == utf_16_big_endian
1792            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1793
1794       if (surrogate)
1795         {
1796           if (! UTF_16_LOW_SURROGATE_P (c))
1797             {
1798               if (endian == utf_16_big_endian)
1799                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1800               else
1801                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1802               *charbuf++ = c1;
1803               *charbuf++ = c2;
1804               coding->errors++;
1805               if (UTF_16_HIGH_SURROGATE_P (c))
1806                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1807               else
1808                 *charbuf++ = c;
1809             }
1810           else
1811             {
1812               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1813               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1814               *charbuf++ = 0x10000 + c;
1815             }
1816         }
1817       else
1818         {
1819           if (UTF_16_HIGH_SURROGATE_P (c))
1820             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1821           else
1822             {
1823               if (eol_crlf && c == '\r')
1824                 {
1825                   ONE_MORE_BYTE (byte_after_cr1);
1826                   ONE_MORE_BYTE (byte_after_cr2);
1827                 }
1828               *charbuf++ = c;
1829             }
1830         }
1831     }
1832
1833  no_more_source:
1834   coding->consumed_char += consumed_chars_base;
1835   coding->consumed = src_base - coding->source;
1836   coding->charbuf_used = charbuf - coding->charbuf;
1837 }
1838
1839 static int
1840 encode_coding_utf_16 (coding)
1841      struct coding_system *coding;
1842 {
1843   int multibytep = coding->dst_multibyte;
1844   int *charbuf = coding->charbuf;
1845   int *charbuf_end = charbuf + coding->charbuf_used;
1846   unsigned char *dst = coding->destination + coding->produced;
1847   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1848   int safe_room = 8;
1849   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1850   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1851   int produced_chars = 0;
1852   Lisp_Object attrs, charset_list;
1853   int c;
1854
1855   CODING_GET_INFO (coding, attrs, charset_list);
1856
1857   if (bom != utf_without_bom)
1858     {
1859       ASSURE_DESTINATION (safe_room);
1860       if (big_endian)
1861         EMIT_TWO_BYTES (0xFE, 0xFF);
1862       else
1863         EMIT_TWO_BYTES (0xFF, 0xFE);
1864       CODING_UTF_16_BOM (coding) = utf_without_bom;
1865     }
1866
1867   while (charbuf < charbuf_end)
1868     {
1869       ASSURE_DESTINATION (safe_room);
1870       c = *charbuf++;
1871       if (c > MAX_UNICODE_CHAR)
1872         c = coding->default_char;
1873
1874       if (c < 0x10000)
1875         {
1876           if (big_endian)
1877             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1878           else
1879             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1880         }
1881       else
1882         {
1883           int c1, c2;
1884
1885           c -= 0x10000;
1886           c1 = (c >> 10) + 0xD800;
1887           c2 = (c & 0x3FF) + 0xDC00;
1888           if (big_endian)
1889             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1890           else
1891             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1892         }
1893     }
1894   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1895   coding->produced = dst - coding->destination;
1896   coding->produced_char += produced_chars;
1897   return 0;
1898 }
1899
1900 \f
1901 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1902
1903 /* Emacs' internal format for representation of multiple character
1904    sets is a kind of multi-byte encoding, i.e. characters are
1905    represented by variable-length sequences of one-byte codes.
1906
1907    ASCII characters and control characters (e.g. `tab', `newline') are
1908    represented by one-byte sequences which are their ASCII codes, in
1909    the range 0x00 through 0x7F.
1910
1911    8-bit characters of the range 0x80..0x9F are represented by
1912    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1913    code + 0x20).
1914
1915    8-bit characters of the range 0xA0..0xFF are represented by
1916    one-byte sequences which are their 8-bit code.
1917
1918    The other characters are represented by a sequence of `base
1919    leading-code', optional `extended leading-code', and one or two
1920    `position-code's.  The length of the sequence is determined by the
1921    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1922    whereas extended leading-code and position-code take the range 0xA0
1923    through 0xFF.  See `charset.h' for more details about leading-code
1924    and position-code.
1925
1926    --- CODE RANGE of Emacs' internal format ---
1927    character set        range
1928    -------------        -----
1929    ascii                0x00..0x7F
1930    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic    0xA0..0xFF
1932    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1933    ---------------------------------------------
1934
1935    As this is the internal character representation, the format is
1936    usually not used externally (i.e. in a file or in a data sent to a
1937    process).  But, it is possible to have a text externally in this
1938    format (i.e. by encoding by the coding system `emacs-mule').
1939
1940    In that case, a sequence of one-byte codes has a slightly different
1941    form.
1942
1943    At first, all characters in eight-bit-control are represented by
1944    one-byte sequences which are their 8-bit code.
1945
1946    Next, character composition data are represented by the byte
1947    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1948    where,
1949         METHOD is 0xF2 plus one of composition method (enum
1950         composition_method),
1951
1952         BYTES is 0xA0 plus a byte length of this composition data,
1953
1954         CHARS is 0xA0 plus a number of characters composed by this
1955         data,
1956
1957         COMPONENTs are characters of multibyte form or composition
1958         rules encoded by two-byte of ASCII codes.
1959
1960    In addition, for backward compatibility, the following formats are
1961    also recognized as composition data on decoding.
1962
1963    0x80 MSEQ ...
1964    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1965
1966    Here,
        MSEQ is a multibyte form, but in one of these special formats:
1968           ASCII: 0xA0 ASCII_CODE+0x80,
1969           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1970         RULE is a one byte code of the range 0xA0..0xF0 that
1971         represents a composition rule.
1972   */
1973
/* Table indexed by a byte value C.  The code in this file uses
   emacs_mule_bytes[C] as the total length in bytes (1 through 4) of
   the emacs-mule sequence that starts with the byte C.
   NOTE(review): the table is not initialized here; presumably it is
   filled in elsewhere before first use -- confirm.  */
char emacs_mule_bytes[256];
1975
1976
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in `emacs-mule'.  If it is, return 1,
   else return 0.

   CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
   Scanning starts after the first CODING->head_ascii bytes of the
   source.  When the text is rejected, CATEGORY_MASK_EMACS_MULE is
   added to DETECT_INFO->rejected.  When the source runs out without
   a rejection, the category is added to DETECT_INFO->found only if at
   least one complete multibyte or composition sequence was seen.

   NOTE(review): `no_more_source' is not the target of any explicit
   goto here; presumably ONE_MORE_BYTE jumps to it when the source is
   exhausted -- confirm against the macro.  */

static int
detect_coding_emacs_mule (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int c;
  /* Category mask to report if the source is accepted; stays 0 until
     a valid non-ASCII sequence has been seen.  */
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      /* Remember where this sequence starts, so that the code at
         `no_more_source' can tell whether a sequence was cut short.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      /* A negative value is not a byte that can be classified here;
         skip it.  */
      if (c < 0)
        continue;
      if (c == 0x80)
        {
          /* Perhaps the start of a composite character.  We simply
             skip it because analyzing it is too heavy for detecting.
             But, at least, we check that the composite character
             consists of more than 4 bytes.  */
          /* This declaration shadows the function-level SRC_BASE for
             the rest of this block.  */
          const unsigned char *src_base;

        repeat:
          src_base = src;
          do
            {
              ONE_MORE_BYTE (c);
            }
          while (c >= 0xA0);

          /* Too short to be a composition: reject.  */
          if (src - src_base <= 4)
            break;
          found = CATEGORY_MASK_EMACS_MULE;
          /* The byte that ended the run starts another composition.  */
          if (c == 0x80)
            goto repeat;
        }

      if (c < 0x80)
        {
          /* ESC, SI and SO cause the text to be rejected.  */
          if (c < 0x20
              && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
            break;
        }
      else
        {
          /* Number of bytes that must follow the leading byte C.  */
          int more_bytes = emacs_mule_bytes[c] - 1;

          while (more_bytes > 0)
            {
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                {
                  src--;        /* Unread the last byte.  */
                  break;
                }
              more_bytes--;
            }
          /* The sequence was cut short by a byte below 0xA0, or the
             table gave a length below 1: reject.  */
          if (more_bytes != 0)
            break;
          found = CATEGORY_MASK_EMACS_MULE;
        }
    }
  /* Reached only through the `break' statements above.  */
  detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
  return 0;

 no_more_source:
  /* The source ended in the middle of a sequence.  That is an error
     only if no further block will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
      return 0;
    }
  detect_info->found |= found;
  return 1;
}
2063
2064
/* Parse emacs-mule multibyte sequence at SRC and return the decoded
   character.  If CMP_STATUS indicates that we must expect MSEQ or
   RULE described above, decode it and return the negative value of
   the decoded character or rule.  If an invalid byte is found, return
   -1.  If SRC is too short, return -2.

   When a character or rule is returned, the number of source bytes
   parsed is stored in *NBYTES and the value of the local
   consumed_chars counter in *NCHARS.  When a character is returned
   and ID is non-null, the id of its charset is stored in *ID.  None
   of these are written when -1 or -2 is returned, and *ID is not
   written when a RULE is returned.

   NOTE(review): `no_more_source' is not the target of any explicit
   goto here, and consumed_chars is never modified directly;
   presumably ONE_MORE_BYTE does both -- confirm against the macro.  */

int
emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status)
     struct coding_system *coding;
     const unsigned char *src;
     int *nbytes, *nchars, *id;
     struct composition_status *cmp_status;
{
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base = src;
  int multibytep = coding->src_multibyte;
  int charset_id;
  unsigned code;
  int c;
  int consumed_chars = 0;
  /* Set when the bytes were read in the old-style MSEQ form; the
     result is then returned negated.  */
  int mseq_found = 0;

  ONE_MORE_BYTE (c);
  if (c < 0)
    {
      /* A negative value is returned negated, with the charset taken
         from entry 0 of emacs_mule_charset.  */
      c = -c;
      charset_id = emacs_mule_charset[0];
    }
  else
    {
      if (c >= 0xA0)
        {
          /* A first byte of 0xA0 or above is valid only inside an
             old-form composition.  */
          if (cmp_status->state != COMPOSING_NO
              && cmp_status->old_form)
            {
              if (cmp_status->state == COMPOSING_CHAR)
                {
                  /* MSEQ: map the byte back to an ordinary first
                     byte.  0xA0 introduces an ASCII character stored
                     as ASCII_CODE+0x80; any other byte is a leading
                     code stored as LEADING_CODE+0x20.  */
                  if (c == 0xA0)
                    {
                      ONE_MORE_BYTE (c);
                      c -= 0x80;
                      if (c < 0)
                        goto invalid_code;
                    }
                  else
                    c -= 0x20;
                  mseq_found = 1;
                }
              else
                {
                  /* A RULE byte: return it negated, undecoded.  */
                  *nbytes = src - src_base;
                  *nchars = consumed_chars;
                  return -c;
                }
            }
          else
            goto invalid_code;
        }

      /* Dispatch on the total byte length of the sequence that
         starts with C.  */
      switch (emacs_mule_bytes[c])
        {
        case 2:
          /* Leading code, then one position code.  */
          if ((charset_id = emacs_mule_charset[c]) < 0)
            goto invalid_code;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code = c & 0x7F;
          break;

        case 3:
          if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
              || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
            {
              /* Private leading code: the next byte selects the
                 charset, then one position code follows.  */
              ONE_MORE_BYTE (c);
              if (c < 0xA0 || (charset_id = emacs_mule_charset[c]) < 0)
                goto invalid_code;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code = c & 0x7F;
            }
          else
            {
              /* Leading code, then two position codes.  */
              if ((charset_id = emacs_mule_charset[c]) < 0)
                goto invalid_code;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code = (c & 0x7F) << 8;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code |= c & 0x7F;
            }
          break;

        case 4:
          /* The second byte selects the charset, then two position
             codes follow.  */
          ONE_MORE_BYTE (c);
          if (c < 0 || (charset_id = emacs_mule_charset[c]) < 0)
            goto invalid_code;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code = (c & 0x7F) << 8;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code |= c & 0x7F;
          break;

        case 1:
          /* Single byte: ASCII or a raw eight-bit byte.  */
          code = c;
          charset_id = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
          break;

        default:
          abort ();
        }
      /* Turn (charset, code) into a character stored in C; a negative
         result is treated as invalid.  */
      CODING_DECODE_CHAR (coding, src, src_base, src_end,
                          CHARSET_FROM_ID (charset_id), code, c);
      if (c < 0)
        goto invalid_code;
    }
  *nbytes = src - src_base;
  *nchars = consumed_chars;
  if (id)
    *id = charset_id;
  return (mseq_found ? -c : c);

 no_more_source:
  return -2;

 invalid_code:
  return -1;
}
2201
2202
2203 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2204
/* Handle these composition sequences ('|': the end of header elements,
2206    BYTES and CHARS >= 0xA0):
2207
2208    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2209    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2210    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2211
   and these old forms:
2213
2214    (4) relative composition: 0x80 | MSEQ ... MSEQ
2215    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2216
2217    When the starter 0x80 and the following header elements are found,
2218    this annotation header is produced.
2219
2220         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2221
2222    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2223    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2224
2225    Then, upon reading the following elements, these codes are produced
2226    until the composition end is found:
2227
2228    (1) CHAR ... CHAR
2229    (2) ALT ... ALT CHAR ... CHAR
2230    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2231    (4) CHAR ... CHAR
2232    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2233
   When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:
2236
2237    (1) LENGTH: unchanged, NCHARS: unchanged
2238    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2239    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2240    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2241    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2242
2243    If an error is found while composing, the annotation header is
2244    changed to the original composition header (plus filler -1s) as
2245    below:
2246
2247    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
   (5)          [ 0x80 0xFF -1 -1 -1 ]
2249
2250    and the sequence [ -2 DECODED-RULE ] is changed to the original
2251    byte sequence as below:
2252         o the original byte sequence is B: [ B -1 ]
2253         o the original byte sequence is B1 B2: [ B1 B2 ]
2254
2255    Most of the routines are implemented by macros because many
2256    variables and labels in the caller decode_coding_emacs_mule must be
   accessible, and they are usually called just once (thus they don't
   increase the size of the compiled object).  */
2259
/* Decode a composition rule represented by C as a component of
   composition sequence of Emacs 20 style.  Set RULE to the decoded
   rule.

   C must be a modifiable lvalue: 0xA0 is subtracted from it in place.
   If the result is outside 0..80, control jumps to the label
   `invalid_code', which the expanding function must provide.
   Otherwise the result is split into GREF = C / 9 and NREF = C % 9, a
   value of 4 in either is replaced by 10, and the pair is packed with
   COMPOSITION_ENCODE_RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (c < 0 || c >= 81)                               \
      goto invalid_code;                                \
    gref = c / 9, nref = c % 9;                         \
    if (gref == 4) gref = 10;                           \
    if (nref == 4) nref = 10;                           \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2276
2277
/* Decode a composition rule represented by C and the following byte
   at SRC as a component of composition sequence of Emacs 21 style.
   Set RULE to the decoded rule.

   GREF is C - 0x20.  The next byte is then read into C with
   ONE_MORE_BYTE, and NREF is that byte - 0x20.  If either value is
   outside 0..80, control jumps to the label `invalid_code', which the
   expanding function must provide.  C must be a modifiable lvalue and
   is overwritten with the second byte.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    gref = c - 0x20;                                    \
    if (gref < 0 || gref >= 81)                         \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (nref < 0 || nref >= 81)                         \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2295
2296
/* Start of Emacs 21 style format.  On entry the variable C holds the
   METHOD byte that was just read (0xF2 plus the composition method).
   The next two bytes at SRC are BYTES (0xA0 plus the byte length of
   this composition information) and CHARS (0xA0 plus the number of
   characters composed by this composition).

   The macro uses these names from the expanding function: `c',
   `charbuf', `cmp_status', whatever ONE_MORE_BYTE needs, and the
   label `invalid_code'.

   Control jumps to `invalid_code' if the BYTES byte is negative, if
   BYTES - 0xA0 is below 3, if the method is COMPOSITION_RELATIVE and
   BYTES - 0xA0 is not exactly 4, or if CHARS - 0xA0 is not in the
   range 1 .. MAX_COMPOSITION_COMPONENTS - 1.

   Otherwise CMP_STATUS is set up for a new-form composition: state is
   COMPOSING_CHAR for a relative composition and
   COMPOSING_COMPONENT_CHAR for any other method, length is
   MAX_ANNOTATION_LENGTH, nchars is CHARS - 0xA0, and ncomps is
   BYTES - 0xA0 - 4.  The annotation header is then added with
   ADD_COMPOSITION_DATA.

   The local `charbuf_base' is declared but not otherwise used in this
   macro.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int *charbuf_base = charbuf;                                        \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    if (method == COMPOSITION_RELATIVE)                                 \
      cmp_status->state = COMPOSING_CHAR;                               \
    else                                                                \
      cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2329
2330
/* Start of Emacs 20 style format for relative composition.

   Marks CMP_STATUS as an old-form composition using
   COMPOSITION_RELATIVE, in state COMPOSING_CHAR, with length
   MAX_ANNOTATION_LENGTH and both nchars and ncomps reset to 0.  The
   annotation header is then added with ADD_COMPOSITION_DATA, passing
   0 for both counts.  Uses `cmp_status' and `charbuf' from the
   expanding function.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->nchars = cmp_status->ncomps = 0;                \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2342
2343
/* Start of Emacs 20 style format for rule-base composition.

   Identical to DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION except that
   the method recorded in CMP_STATUS and in the annotation header is
   COMPOSITION_WITH_RULE.  Uses `cmp_status' and `charbuf' from the
   expanding function.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->nchars = cmp_status->ncomps = 0;                \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2355
2356
/* Read the next byte at SRC into C and start the composition format
   that it selects.  Presumably expanded just after the starter byte
   0x80 has been consumed -- confirm against the caller.

   - C - 0xF2 within COMPOSITION_RELATIVE ..
     COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 style header.
   - C in 0xA0..0xBF (and not matched above): Emacs 20 style relative
     composition.  C is already the first component, so SRC is moved
     back so that it is read again.
   - C equal to 0xFF: Emacs 20 style rule-base composition.
   - A negative value or anything else: jump to `invalid_code'.

   Uses `c', `src' and the label `invalid_code' from the expanding
   function, plus whatever the macros used here need.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *current_src = src;             \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c < 0xA0)                                  \
      goto invalid_code;                                \
    else if (c < 0xC0)                                  \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* Re-read C as a composition component.  */    \
        src = current_src;                              \
      }                                                 \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2380 /* End the composition described by CMP_STATUS in the normal way.
        IDX is -length, used as an index relative to `charbuf'.  For an
        old-form composition the number of characters seen so far is
        stored in charbuf[idx + 2].  Otherwise, when the method is
        greater than COMPOSITION_RELATIVE, charbuf[idx] is set to
        charbuf[idx + 2] - length.  In every case the state goes back
        to COMPOSING_NO.

        NOTE(review): what the slots at idx and idx + 2 mean depends on
        the layout written by ADD_COMPOSITION_DATA, which is defined
        before this macro -- confirm there.

        `cmp_status' and `charbuf' must be in scope where this macro is
        expanded.  */
2381 #define EMACS_MULE_COMPOSITION_END()                            \
2382   do {                                                          \
2383     int idx = - cmp_status->length;                             \
2384                                                                 \
2385     if (cmp_status->old_form)                                   \
2386       charbuf[idx + 2] = cmp_status->nchars;                    \
2387     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2388       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2389     cmp_status->state = COMPOSING_NO;                           \
2390   } while (0)
2391
2392 /* Terminate the composition recorded in CMP_STATUS before it has
        been completed, patching the integers that precede CHARBUF.

        CHARBUF points just past the data written so far for this
        composition; CMP_STATUS->length entries before it are examined
        and partly overwritten.

        If the composition is in the old form and has collected at
        least one character, its character count is stored in the
        annotation and the composition is kept.  Otherwise the start of
        the annotation is overwritten with values built by
        BYTE8_TO_CHAR from the bytes that introduced the composition.

        On return CMP_STATUS->state is COMPOSING_NO.  The value
        returned is the number the caller adds to its character offset
        (see EMACS_MULE_MAYBE_FINISH_COMPOSITION).  */
2393 static int
2394 emacs_mule_finish_composition (charbuf, cmp_status)
2395      int *charbuf;
2396      struct composition_status *cmp_status;
2397 {
2398   int idx = - cmp_status->length;
2399   int new_chars;
2400
2401   if (cmp_status->old_form && cmp_status->nchars > 0)
2402     {
2403       charbuf[idx + 2] = cmp_status->nchars;
2404       new_chars = 0;
2405       if (cmp_status->method == COMPOSITION_WITH_RULE
2406           && cmp_status->state == COMPOSING_CHAR)
2407         {
2408           /* The last rule was invalid.  */
               /* Turn the stored rule back into a byte value, keep it
                  as one character at charbuf[-2] and set charbuf[-1]
                  to -1.  */
2409           int rule = charbuf[-1] + 0xA0;
2410
2411           charbuf[-2] = BYTE8_TO_CHAR (rule);
2412           charbuf[-1] = -1;
2413           new_chars = 1;
2414         }
2415     }
2416   else
2417     {
           /* Nothing usable was composed: overwrite the annotation,
              beginning with the value for byte 0x80.  */
2418       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2419
2420       if (cmp_status->method == COMPOSITION_WITH_RULE)
2421         {
2422           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2423           charbuf[idx++] = -3;
2424           charbuf[idx++] = 0;
2425           new_chars = 1;
2426         }
2427       else
2428         {
               /* Both values are read before the slots at and after
                  IDX are overwritten below.  */
2429           int nchars = charbuf[idx + 1] + 0xA0;
2430           int nbytes = charbuf[idx + 2] + 0xA0;
2431
2432           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2433           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2434           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2435           charbuf[idx++] = -1;
2436           new_chars = 4;
2437         }
2438     }
2439   cmp_status->state = COMPOSING_NO;
2440   return new_chars;
2441 }
2442 /* If a composition is in progress (the state is not COMPOSING_NO),
        terminate it with emacs_mule_finish_composition and add the
        value it returns to `char_offset'.  Do nothing otherwise.

        `cmp_status', `charbuf' and `char_offset' must be in scope
        where this macro is expanded.  */
2443 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2444   do {                                                                    \
2445     if (cmp_status->state != COMPOSING_NO)                                \
2446       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2447   } while (0)
2448
2449 /* Decode the emacs-mule byte sequence in CODING->source, starting at
        offset CODING->consumed and ending at CODING->src_bytes, into
        integers appended to CODING->charbuf from index
        CODING->charbuf_used on.  Besides character codes, charset and
        composition annotations are written to the buffer.

        Decoding stops when the source is exhausted or when fewer than
        three maximum-length annotations' worth of room remains in the
        buffer.  A composition that is still open at that point is
        either finished (when CODING_MODE_LAST_BLOCK is set) or saved
        in the carryover area of the composition status so that the
        next call can resume it.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated; CODING->errors is incremented
        for each invalid byte, which is passed through as is.  */
2450 static void
2451 decode_coding_emacs_mule (coding)
2452      struct coding_system *coding;
2453 {
2454   const unsigned char *src = coding->source + coding->consumed;
2455   const unsigned char *src_end = coding->source + coding->src_bytes;
2456   const unsigned char *src_base;
2457   int *charbuf = coding->charbuf + coding->charbuf_used;
2458   /* We may produce two annotations (charset and composition) in one
2459      loop and one more charset annotation at the end.  */
2460   int *charbuf_end
2461     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2462   int consumed_chars = 0, consumed_chars_base;
2463   int multibytep = coding->src_multibyte;
2464   Lisp_Object attrs, charset_list;
2465   int char_offset = coding->produced_char;
2466   int last_offset = char_offset;
2467   int last_id = charset_ascii;
2468   int eol_crlf =
2469     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2470   int byte_after_cr = -1;
2471   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2472
2473   CODING_GET_INFO (coding, attrs, charset_list);
2474
       /* A composition was left open by the previous call: put the
          saved entries back at the front of the output before
          decoding any further.  */
2475   if (cmp_status->state != COMPOSING_NO)
2476     {
2477       int i;
2478
2479       for (i = 0; i < cmp_status->length; i++)
2480         *charbuf++ = cmp_status->carryover[i];
2481       coding->annotated = 1;
2482     }
2483
2484   while (1)
2485     {
2486       int c, id;
2487
           /* Remember where this iteration started so that it can be
              undone on an invalid sequence or at the end of input.  */
2488       src_base = src;
2489       consumed_chars_base = consumed_chars;
2490
2491       if (charbuf >= charbuf_end)
2492         {
               /* The byte read ahead after a CR has not been processed
                  yet, so do not count it as consumed.  */
2493           if (byte_after_cr >= 0)
2494             src_base--;
2495           break;
2496         }
2497
2498       if (byte_after_cr >= 0)
2499         c = byte_after_cr, byte_after_cr = -1;
2500       else
2501         ONE_MORE_BYTE (c);
2502
           /* 0x80 introduces a composition.  A negative C is stored
              negated as one character (NOTE(review): presumably the
              way ONE_MORE_BYTE reports a byte that is not part of a
              valid sequence -- confirm at its definition).  Either
              one ends a composition that is in progress.  */
2503       if (c < 0 || c == 0x80)
2504         {
2505           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2506           if (c < 0)
2507             {
2508               *charbuf++ = -c;
2509               char_offset++;
2510             }
2511           else
2512             DECODE_EMACS_MULE_COMPOSITION_START ();
2513           continue;
2514         }
2515
2516       if (c < 0x80)
2517         {
               /* With DOS end-of-line conversion, read the byte after
                  a CR ahead; it is handed back as C on the next
                  iteration.  */
2518           if (eol_crlf && c == '\r')
2519             ONE_MORE_BYTE (byte_after_cr);
2520           id = charset_ascii;
2521           if (cmp_status->state != COMPOSING_NO)
2522             {
2523               if (cmp_status->old_form)
2524                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2525               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2526                 cmp_status->ncomps--;
2527             }
2528         }
2529       else
2530         {
2531           int nchars, nbytes;
2532           /* emacs_mule_char can load a charset map from a file, which
2533              allocates a large structure and might cause buffer text
2534              to be relocated as result.  Thus, we need to remember the
2535              original pointer to buffer text, and fix up all related
2536              pointers after the call.  */
2537           const unsigned char *orig = coding->source;
2538           EMACS_INT offset;
2539
2540           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2541                                cmp_status);
2542           offset = coding->source - orig;
2543           if (offset)
2544             {
2545               src += offset;
2546               src_base += offset;
2547               src_end += offset;
2548             }
               /* -1 is treated as an invalid sequence and -2 stops
                  decoding; any other negative value falls through and
                  is handled by the composition code below.  */
2549           if (c < 0)
2550             {
2551               if (c == -1)
2552                 goto invalid_code;
2553               if (c == -2)
2554                 break;
2555             }
2556           src = src_base + nbytes;
2557           consumed_chars = consumed_chars_base + nchars;
2558           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2559             cmp_status->ncomps -= nchars;
2560         }
2561
2562       /* Now if C >= 0, we found a normally encoded character, if C <
2563          0, we found an old-style composition component character or
2564          rule.  */
2565
2566       if (cmp_status->state == COMPOSING_NO)
2567         {
               /* The charset changed: close the run of the previous
                  charset with an annotation, unless it was ASCII.  */
2568           if (last_id != id)
2569             {
2570               if (last_id != charset_ascii)
2571                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2572                                   last_id);
2573               last_id = id;
2574               last_offset = char_offset;
2575             }
2576           *charbuf++ = c;
2577           char_offset++;
2578         }
2579       else if (cmp_status->state == COMPOSING_CHAR)
2580         {
2581           if (cmp_status->old_form)
2582             {
2583               if (c >= 0)
2584                 {
2585                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2586                   *charbuf++ = c;
2587                   char_offset++;
2588                 }
2589               else
2590                 {
2591                   *charbuf++ = -c;
2592                   cmp_status->nchars++;
2593                   cmp_status->length++;
2594                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2595                     EMACS_MULE_COMPOSITION_END ();
2596                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2597                     cmp_status->state = COMPOSING_RULE;
2598                 }
2599             }
2600           else
2601             {
                   /* New form: nchars counts down the characters that
                      are still expected.  */
2602               *charbuf++ = c;
2603               cmp_status->length++;
2604               cmp_status->nchars--;
2605               if (cmp_status->nchars == 0)
2606                 EMACS_MULE_COMPOSITION_END ();
2607             }
2608         }
2609       else if (cmp_status->state == COMPOSING_RULE)
2610         {
2611           int rule;
2612
2613           if (c >= 0)
2614             {
2615               EMACS_MULE_COMPOSITION_END ();
2616               *charbuf++ = c;
2617               char_offset++;
2618             }
2619           else
2620             {
2621               c = -c;
2622               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2623               if (rule < 0)
2624                 goto invalid_code;
2625               *charbuf++ = -2;
2626               *charbuf++ = rule;
2627               cmp_status->length += 2;
2628               cmp_status->state = COMPOSING_CHAR;
2629             }
2630         }
2631       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2632         {
2633           *charbuf++ = c;
2634           cmp_status->length++;
2635           if (cmp_status->ncomps == 0)
2636             cmp_status->state = COMPOSING_CHAR;
2637           else if (cmp_status->ncomps > 0)
2638             {
2639               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2640                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2641             }
2642           else
2643             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2644         }
2645       else                      /* COMPOSING_COMPONENT_RULE */
2646         {
2647           int rule;
2648
2649           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2650           if (rule < 0)
2651             goto invalid_code;
2652           *charbuf++ = -2;
2653           *charbuf++ = rule;
2654           cmp_status->length += 2;
2655           cmp_status->ncomps--;
2656           if (cmp_status->ncomps > 0)
2657             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2658           else
2659             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2660         }
2661       continue;
2662
           /* NOTE(review): no `goto retry' appears in the text of
              this function; the label may be a target of one of the
              macros used above or may be unused -- confirm.  */
2663     retry:
2664       src = src_base;
2665       consumed_chars = consumed_chars_base;
2666       continue;
2667
           /* Give up on the current sequence: close any open
              composition, rewind to the start of the sequence and
              pass its first byte through unchanged.  */
2668     invalid_code:
2669       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2670       src = src_base;
2671       consumed_chars = consumed_chars_base;
2672       ONE_MORE_BYTE (c);
2673       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2674       char_offset++;
2675       coding->errors++;
2676     }
2677
2678  no_more_source:
2679   if (cmp_status->state != COMPOSING_NO)
2680     {
2681       if (coding->mode & CODING_MODE_LAST_BLOCK)
2682         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2683       else
2684         {
2685           int i;
2686
               /* Take the unfinished composition out of the output
                  and keep it for the next call.  */
2687           charbuf -= cmp_status->length;
2688           for (i = 0; i < cmp_status->length; i++)
2689             cmp_status->carryover[i] = charbuf[i];
2690         }
2691     }
2692   if (last_id != charset_ascii)
2693     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2694   coding->consumed_char += consumed_chars_base;
2695   coding->consumed = src_base - coding->source;
2696   coding->charbuf_used = charbuf - coding->charbuf;
2697 }
2698
2699
/* Store in CODES the leading byte(s) that the emacs-mule format uses
   for the charset whose emacs-mule id is ID.  An ID below 0xA0 is
   itself the only leading code, and CODES[1] is set to 0.  Otherwise
   CODES[0] is one of 0x9A, 0x9B, 0x9C or 0x9D, chosen by the range ID
   falls in, and CODES[1] is ID.

   ID and CODES are evaluated more than once, so they must be free of
   side effects.  The expansion deliberately does not end with a
   semicolon: the caller supplies it, which lets the macro be used as
   a single statement, e.g. as the body of an `if' that has an
   `else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2713
2714 /* Encode the integers in CODING->charbuf (CODING->charbuf_used of
        them) in the emacs-mule format, writing the bytes to
        CODING->destination from offset CODING->produced on.

        Negative entries start annotations.  A charset annotation
        selects a preferred charset for the characters that follow,
        provided that charset is in the coding system's charset list;
        composition annotations are skipped.  ASCII characters and
        eight-bit characters are emitted as single bytes; any other
        character is emitted as the leading code(s) of its charset
        followed by one or two code bytes with the high bit set.

        The coding system's charset list is forced to
        Vemacs_mule_charset_list.  On completion the result
        CODING_RESULT_SUCCESS is recorded, CODING->produced_char and
        CODING->produced are updated, and 0 is returned.  */
2715 static int
2716 encode_coding_emacs_mule (coding)
2717      struct coding_system *coding;
2718 {
2719   int multibytep = coding->dst_multibyte;
2720   int *charbuf = coding->charbuf;
2721   int *charbuf_end = charbuf + coding->charbuf_used;
2722   unsigned char *dst = coding->destination + coding->produced;
2723   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2724   int safe_room = 8;
2725   int produced_chars = 0;
2726   Lisp_Object attrs, charset_list;
2727   int c;
2728   int preferred_charset_id = -1;
2729
2730   CODING_GET_INFO (coding, attrs, charset_list);
2731   if (! EQ (charset_list, Vemacs_mule_charset_list))
2732     {
2733       CODING_ATTR_CHARSET_LIST (attrs)
2734         = charset_list = Vemacs_mule_charset_list;
2735     }
2736
2737   while (charbuf < charbuf_end)
2738     {
2739       ASSURE_DESTINATION (safe_room);
2740       c = *charbuf++;
2741
2742       if (c < 0)
2743         {
2744           /* Handle an annotation.  */
               /* C is the negated length of the annotation and
                  *CHARBUF its kind; the whole annotation is skipped
                  after the switch.  */
2745           switch (*charbuf)
2746             {
2747             case CODING_ANNOTATE_COMPOSITION_MASK:
2748               /* Not yet implemented.  */
2749               break;
2750             case CODING_ANNOTATE_CHARSET_MASK:
2751               preferred_charset_id = charbuf[3];
2752               if (preferred_charset_id >= 0
2753                   && NILP (Fmemq (make_number (preferred_charset_id),
2754                                   charset_list)))
2755                 preferred_charset_id = -1;
2756               break;
2757             default:
2758               abort ();
2759             }
2760           charbuf += -c - 1;
2761           continue;
2762         }
2763
2764       if (ASCII_CHAR_P (c))
2765         EMIT_ONE_ASCII_BYTE (c);
2766       else if (CHAR_BYTE8_P (c))
2767         {
2768           c = CHAR_TO_BYTE8 (c);
2769           EMIT_ONE_BYTE (c);
2770         }
2771       else
2772         {
2773           struct charset *charset;
2774           unsigned code;
2775           int dimension;
2776           int emacs_mule_id;
2777           unsigned char leading_codes[2];
2778
               /* Try the charset named by the last charset annotation
                  first, and fall back to searching the whole list.  */
2779           if (preferred_charset_id >= 0)
2780             {
2781               charset = CHARSET_FROM_ID (preferred_charset_id);
2782               if (CHAR_CHARSET_P (c, charset))
2783                 code = ENCODE_CHAR (charset, c);
2784               else
2785                 charset = char_charset (c, charset_list, &code);
2786             }
2787           else
2788             charset = char_charset (c, charset_list, &code);
2789           if (! charset)
2790             {
                   /* No charset in the list contains C; encode the
                      coding system's default character instead.  */
2791               c = coding->default_char;
2792               if (ASCII_CHAR_P (c))
2793                 {
2794                   EMIT_ONE_ASCII_BYTE (c);
2795                   continue;
2796                 }
2797               charset = char_charset (c, charset_list, &code);
                   /* NOTE(review): CHARSET is not checked again here;
                      this presumably relies on the default character
                      always being encodable -- confirm.  */
2798             }
2799           dimension = CHARSET_DIMENSION (charset);
2800           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2801           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2802           EMIT_ONE_BYTE (leading_codes[0]);
2803           if (leading_codes[1])
2804             EMIT_ONE_BYTE (leading_codes[1]);
2805           if (dimension == 1)
2806             EMIT_ONE_BYTE (code | 0x80);
2807           else
2808             {
2809               code |= 0x8080;
2810               EMIT_ONE_BYTE (code >> 8);
2811               EMIT_ONE_BYTE (code & 0xFF);
2812             }
2813         }
2814     }
2815   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2816   coding->produced_char += produced_chars;
2817   coding->produced = dst - coding->destination;
2818   return 0;
2819 }
2820
2821 \f
2822 /*** 7. ISO2022 handlers ***/
2823
2824 /* The following note describes the coding system ISO2022 briefly.
2825    Since the intention of this note is to help understand the
2826    functions in this file, some parts are NOT ACCURATE or are OVERLY
2827    SIMPLIFIED.  For thorough understanding, please refer to the
2828    original document of ISO2022.  This is equivalent to the standard
2829    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2830
2831    ISO2022 provides many mechanisms to encode several character sets
2832    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2833    is encoded using bytes less than 128.  This may make the encoded
2834    text a little bit longer, but the text passes more easily through
2835    several types of gateway, some of which strip off the MSB (Most
2836    Significant Bit).
2837
2838    There are two kinds of character sets: control character sets and
2839    graphic character sets.  The former contain control characters such
2840    as `newline' and `escape' to provide control functions (control
2841    functions are also provided by escape sequences).  The latter
2842    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2843    two control character sets and many graphic character sets.
2844
2845    Graphic character sets are classified into one of the following
2846    four classes, according to the number of bytes (DIMENSION) and
2847    number of characters in one dimension (CHARS) of the set:
2848    - DIMENSION1_CHARS94
2849    - DIMENSION1_CHARS96
2850    - DIMENSION2_CHARS94
2851    - DIMENSION2_CHARS96
2852
2853    In addition, each character set is assigned an identification tag,
2854    unique for each set, called the "final character" (denoted as <F>
2855    hereafter).  The <F> of each character set is decided by ECMA(*)
2856    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2857    (0x30..0x3F are for private use only).
2858
2859    Note (*): ECMA = European Computer Manufacturers Association
2860
2861    Here are examples of graphic character sets [NAME(<F>)]:
2862         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2863         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2864         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2865         o DIMENSION2_CHARS96 -- none for the moment
2866
2867    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2868         C0 [0x00..0x1F] -- control character plane 0
2869         GL [0x20..0x7F] -- graphic character plane 0
2870         C1 [0x80..0x9F] -- control character plane 1
2871         GR [0xA0..0xFF] -- graphic character plane 1
2872
2873    A control character set is directly designated and invoked to C0 or
2874    C1 by an escape sequence.  The most common case is that:
2875    - ISO646's  control character set is designated/invoked to C0, and
2876    - ISO6429's control character set is designated/invoked to C1,
2877    and usually these designations/invocations are omitted in encoded
2878    text.  In a 7-bit environment, only C0 can be used, and a control
2879    character for C1 is encoded by an appropriate escape sequence to
2880    fit into the environment.  All control characters for C1 are
2881    defined to have corresponding escape sequences.
2882
2883    A graphic character set is at first designated to one of four
2884    graphic registers (G0 through G3), then these graphic registers are
2885    invoked to GL or GR.  These designations and invocations can be
2886    done independently.  The most common case is that G0 is invoked to
2887    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2888    these invocations and designations are omitted in encoded text.
2889    In a 7-bit environment, only GL can be used.
2890
2891    When a graphic character set of CHARS94 is invoked to GL, codes
2892    0x20 and 0x7F of the GL area work as control characters SPACE and
2893    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2894    be used.
2895
2896    There are two ways of invocation: locking-shift and single-shift.
2897    With locking-shift, the invocation lasts until the next different
2898    invocation, whereas with single-shift, the invocation affects the
2899    following character only and doesn't affect the locking-shift
2900    state.  Invocations are done by the following control characters or
2901    escape sequences:
2902
2903    ----------------------------------------------------------------------
2904    abbrev  function                  cntrl escape seq   description
2905    ----------------------------------------------------------------------
2906    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2907    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2908    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2909    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2910    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2911    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2912    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2913    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2914    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2915    ----------------------------------------------------------------------
2916    (*) These are not used by any known coding system.
2917
2918    Control characters for these functions are defined by macros
2919    ISO_CODE_XXX in `coding.h'.
2920
2921    Designations are done by the following escape sequences:
2922    ----------------------------------------------------------------------
2923    escape sequence      description
2924    ----------------------------------------------------------------------
2925    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2926    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2927    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2928    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2929    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2930    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2931    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2932    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2933    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2934    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2935    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2936    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2937    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2938    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2939    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2940    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2941    ----------------------------------------------------------------------
2942
2943    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2944    of dimension 1, chars 94, and final character <F>, etc...
2945
2946    Note (*): Although these designations are not allowed in ISO2022,
2947    Emacs accepts them on decoding, and produces them on encoding
2948    CHARS96 character sets in a coding system which is characterized as
2949    7-bit environment, non-locking-shift, and non-single-shift.
2950
2951    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2952    '(' must be omitted.  We refer to this as "short-form" hereafter.
2953
2954    Now you may notice that there are a lot of ways of encoding the
2955    same multilingual text in ISO2022.  Actually, there exist many
2956    coding systems such as Compound Text (used in X11's inter-client
2957    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2958    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2959    localized platforms), and all of these are variants of ISO2022.
2960
2961    In addition to the above, Emacs handles two more kinds of escape
2962    sequences: ISO6429's direction specification and Emacs' private
2963    sequence for specifying character composition.
2964
2965    ISO6429's direction specification takes the following form:
2966         o CSI ']'      -- end of the current direction
2967         o CSI '0' ']'  -- end of the current direction
2968         o CSI '1' ']'  -- start of left-to-right text
2969         o CSI '2' ']'  -- start of right-to-left text
2970    The control character CSI (0x9B: control sequence introducer) is
2971    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2972
2973    Character composition specification takes the following form:
2974         o ESC '0' -- start relative composition
2975         o ESC '1' -- end composition
2976         o ESC '2' -- start rule-base composition (*)
2977         o ESC '3' -- start relative composition with alternate chars  (**)
2978         o ESC '4' -- start rule-base composition with alternate chars  (**)
2979   Since these are not standard escape sequences of any ISO standard,
2980   the use of them with these meanings is restricted to Emacs only.
2981
2982   (*) This form is used only in Emacs 20.7 and older versions,
2983   but newer versions can safely decode it.
2984   (**) This form is used only in Emacs 21.1 and newer versions,
2985   and older versions can't decode it.
2986
2987   Here's a list of example usages of these composition escape
2988   sequences (categorized by `enum composition_method').
2989
2990   COMPOSITION_RELATIVE:
2991         ESC 0 CHAR [ CHAR ] ESC 1
2992   COMPOSITION_WITH_RULE:
2993         ESC 2 CHAR [ RULE CHAR ] ESC 1
2994   COMPOSITION_WITH_ALTCHARS:
2995         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2996   COMPOSITION_WITH_RULE_ALTCHARS:
2997         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2998 /* Table of 256 `enum iso_code_class_type' values.
        NOTE(review): presumably indexed by byte value to classify each
        byte for the ISO2022 handlers -- confirm where it is filled.  */
2999 enum iso_code_class_type iso_code_class[256];
3000
/* Return nonzero if the charset ID is registered as safe for CODING,
   i.e. ID lies within [0, CODING->max_charset_id] and its entry in
   CODING->safe_charsets is not 255, the value that marks a charset
   as not safe.

   The lower bound is checked because a charset table lookup can yield
   a negative ID for an undefined charset, and such a value must not be
   used as an index.  ID is evaluated more than once, so it must be
   free of side effects.  */

#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
3004
3005 /* Return nonzero if CODING_ISO_INITIAL for index 1 of the coding
        system registered for CATEGORY in `coding_categories' is not
        negative.  NOTE(review): index 1 presumably denotes graphic
        register G1, the one that shift-out invokes -- confirm at the
        definition of CODING_ISO_INITIAL.  */
3006 #define SHIFT_OUT_OK(category)  \
3007   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
3008 /* Compute the safe-charsets string for the ISO2022 coding system
        attributes ATTRS and store it in the coding_attr_safe_charsets
        slot.

        The string has one byte per charset id, from 0 up to the
        largest id found in the charset list.  A byte is 255 unless the
        charset is in the list and a graphic register number was found
        for it: the one given for the charset in the
        coding_attr_iso_request alist, or else the default register
        for 96-character or 94-character sets taken from
        coding_attr_iso_usage, provided that default is below 4.

        If the flags include CODING_ISO_FLAG_FULL_SUPPORT and the
        charset list is not Viso_2022_charset_list, the list is first
        replaced by it and any existing string is discarded.  If a
        string is already present after that step, nothing is done.  */
3009 static void
3010 setup_iso_safe_charsets (attrs)
3011      Lisp_Object attrs;
3012 {
3013   Lisp_Object charset_list, safe_charsets;
3014   Lisp_Object request;
3015   Lisp_Object reg_usage;
3016   Lisp_Object tail;
3017   int reg94, reg96;
3018   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
3019   int max_charset_id;
3020
3021   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3022   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
3023       && ! EQ (charset_list, Viso_2022_charset_list))
3024     {
3025       CODING_ATTR_CHARSET_LIST (attrs)
3026         = charset_list = Viso_2022_charset_list;
3027       ASET (attrs, coding_attr_safe_charsets, Qnil);
3028     }
3029
       /* The string has already been built; keep it.  */
3030   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
3031     return;
3032
       /* The largest charset id in the list determines the length of
          the string.  */
3033   max_charset_id = 0;
3034   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3035     {
3036       int id = XINT (XCAR (tail));
3037       if (max_charset_id < id)
3038         max_charset_id = id;
3039     }
3040
       /* Start with every charset marked 255 (not safe).  */
3041   safe_charsets = make_uninit_string (max_charset_id + 1);
3042   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
3043   request = AREF (attrs, coding_attr_iso_request);
3044   reg_usage = AREF (attrs, coding_attr_iso_usage);
3045   reg94 = XINT (XCAR (reg_usage));
3046   reg96 = XINT (XCDR (reg_usage));
3047
3048   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3049     {
3050       Lisp_Object id;
3051       Lisp_Object reg;
3052       struct charset *charset;
3053
3054       id = XCAR (tail);
3055       charset = CHARSET_FROM_ID (XINT (id));
3056       reg = Fcdr (Fassq (id, request));
           /* An explicit request for this charset is stored as is;
              the defaults are used only when they are below 4.  */
3057       if (! NILP (reg))
3058         SSET (safe_charsets, XINT (id), XINT (reg));
3059       else if (charset->iso_chars_96)
3060         {
3061           if (reg96 < 4)
3062             SSET (safe_charsets, XINT (id), reg96);
3063         }
3064       else
3065         {
3066           if (reg94 < 4)
3067             SSET (safe_charsets, XINT (id), reg94);
3068         }
3069     }
3070   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3071 }
3072
3073
3074 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3075    Check if a text is encoded in one of ISO-2022 based coding systems.
3076    If it is, return 1, else return 0.  */
3077
3078 static int
3079 detect_coding_iso_2022 (coding, detect_info)
3080      struct coding_system *coding;
3081      struct coding_detection_info *detect_info;
3082 {
3083   const unsigned char *src = coding->source, *src_base = src;
3084   const unsigned char *src_end = coding->source + coding->src_bytes;
3085   int multibytep = coding->src_multibyte;
3086   int single_shifting = 0;
3087   int id;
3088   int c, c1;
3089   int consumed_chars = 0;
3090   int i;
3091   int rejected = 0;
3092   int found = 0;
3093   int composition_count = -1;
3094
3095   detect_info->checked |= CATEGORY_MASK_ISO;
3096
3097   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3098     {
3099       struct coding_system *this = &(coding_categories[i]);
3100       Lisp_Object attrs, val;
3101
3102       if (this->id < 0)
3103         continue;
3104       attrs = CODING_ID_ATTRS (this->id);
3105       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3106           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3107         setup_iso_safe_charsets (attrs);
3108       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3109       this->max_charset_id = SCHARS (val) - 1;
3110       this->safe_charsets = SDATA (val);
3111     }
3112
3113   /* A coding system of this category is always ASCII compatible.  */
3114   src += coding->head_ascii;
3115
3116   while (rejected != CATEGORY_MASK_ISO)
3117     {
3118       src_base = src;
3119       ONE_MORE_BYTE (c);
3120       switch (c)
3121         {
3122         case ISO_CODE_ESC:
3123           if (inhibit_iso_escape_detection)
3124             break;
3125           single_shifting = 0;
3126           ONE_MORE_BYTE (c);
3127           if (c >= '(' && c <= '/')
3128             {
3129               /* Designation sequence for a charset of dimension 1.  */
3130               ONE_MORE_BYTE (c1);
3131               if (c1 < ' ' || c1 >= 0x80
3132                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3133                 /* Invalid designation sequence.  Just ignore.  */
3134                 break;
3135             }
3136           else if (c == '$')
3137             {
3138               /* Designation sequence for a charset of dimension 2.  */
3139               ONE_MORE_BYTE (c);
3140               if (c >= '@' && c <= 'B')
3141                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3142                 id = iso_charset_table[1][0][c];
3143               else if (c >= '(' && c <= '/')
3144                 {
3145                   ONE_MORE_BYTE (c1);
3146                   if (c1 < ' ' || c1 >= 0x80
3147                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3148                     /* Invalid designation sequence.  Just ignore.  */
3149                     break;
3150                 }
3151               else
3152                 /* Invalid designation sequence.  Just ignore it.  */
3153                 break;
3154             }
3155           else if (c == 'N' || c == 'O')
3156             {
3157               /* ESC <Fe> for SS2 or SS3.  */
3158               single_shifting = 1;
3159               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3160               break;
3161             }
3162           else if (c == '1')
3163             {
3164               /* End of composition.  */
3165               if (composition_count < 0
3166                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3167                 /* Invalid */
3168                 break;
3169               composition_count = -1;
3170               found |= CATEGORY_MASK_ISO;
3171             }
3172           else if (c >= '0' && c <= '4')
3173             {
3174               /* ESC <Fp> for start/end composition.  */
3175               composition_count = 0;
3176               break;
3177             }
3178           else
3179             {
3180               /* Invalid escape sequence.  Just ignore it.  */
3181               break;
3182             }
3183
3184           /* We found a valid designation sequence for CHARSET.  */
3185           rejected |= CATEGORY_MASK_ISO_8BIT;
3186           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3187                               id))
3188             found |= CATEGORY_MASK_ISO_7;
3189           else
3190             rejected |= CATEGORY_MASK_ISO_7;
3191           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3192                               id))
3193             found |= CATEGORY_MASK_ISO_7_TIGHT;
3194           else
3195             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3196           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3197                               id))
3198             found |= CATEGORY_MASK_ISO_7_ELSE;
3199           else
3200             rejected |= CATEGORY_MASK_ISO_7_ELSE;
3201           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3202                               id))
3203             found |= CATEGORY_MASK_ISO_8_ELSE;
3204           else
3205             rejected |= CATEGORY_MASK_ISO_8_ELSE;
3206           break;
3207
3208         case ISO_CODE_SO:
3209         case ISO_CODE_SI:
3210           /* Locking shift out/in.  */
3211           if (inhibit_iso_escape_detection)
3212             break;
3213           single_shifting = 0;
3214           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3215           break;
3216
3217         case ISO_CODE_CSI:
3218           /* Control sequence introducer.  */
3219           single_shifting = 0;
3220           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3221           found |= CATEGORY_MASK_ISO_8_ELSE;
3222           goto check_extra_latin;
3223
3224         case ISO_CODE_SS2:
3225         case ISO_CODE_SS3:
3226           /* Single shift.   */
3227           if (inhibit_iso_escape_detection)
3228             break;
3229           single_shifting = 0;
3230           rejected |= CATEGORY_MASK_ISO_7BIT;
3231           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3232               & CODING_ISO_FLAG_SINGLE_SHIFT)
3233             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3234           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3235               & CODING_ISO_FLAG_SINGLE_SHIFT)
3236             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3237           if (single_shifting)
3238             break;
3239           goto check_extra_latin;
3240
3241         default:
3242           if (c < 0)
3243             continue;
3244           if (c < 0x80)
3245             {
3246               if (composition_count >= 0)
3247                 composition_count++;
3248               single_shifting = 0;
3249               break;
3250             }
3251           if (c >= 0xA0)
3252             {
3253               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3254               found |= CATEGORY_MASK_ISO_8_1;
              /* Check the length of succeeding codes of the range
                 0xA0..0xFF.  If the byte length is even, we include
3257                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3258                  only when we are not single shifting.  */
3259               if (! single_shifting
3260                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3261                 {
3262                   int i = 1;
3263                   while (src < src_end)
3264                     {
3265                       src_base = src;
3266                       ONE_MORE_BYTE (c);
3267                       if (c < 0xA0)
3268                         {
3269                           src = src_base;
3270                           break;
3271                         }
3272                       i++;
3273                     }
3274
3275                   if (i & 1 && src < src_end)
3276                     {
3277                       rejected |= CATEGORY_MASK_ISO_8_2;
3278                       if (composition_count >= 0)
3279                         composition_count += i;
3280                     }
3281                   else
3282                     {
3283                       found |= CATEGORY_MASK_ISO_8_2;
3284                       if (composition_count >= 0)
3285                         composition_count += i / 2;
3286                     }
3287                 }
3288               break;
3289             }
3290         check_extra_latin:
3291           single_shifting = 0;
3292           if (! VECTORP (Vlatin_extra_code_table)
3293               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3294             {
3295               rejected = CATEGORY_MASK_ISO;
3296               break;
3297             }
3298           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3299               & CODING_ISO_FLAG_LATIN_EXTRA)
3300             found |= CATEGORY_MASK_ISO_8_1;
3301           else
3302             rejected |= CATEGORY_MASK_ISO_8_1;
3303           rejected |= CATEGORY_MASK_ISO_8_2;
3304         }
3305     }
3306   detect_info->rejected |= CATEGORY_MASK_ISO;
3307   return 0;
3308
3309  no_more_source:
3310   detect_info->rejected |= rejected;
3311   detect_info->found |= (found & ~rejected);
3312   return 1;
3313 }
3314
3315
/* Designate to graphic register REG of CODING the charset that
   ISO_CHARSET_TABLE gives for DIM, CHARS_96 and the final byte FINAL.

   The designation is rejected when FINAL is outside '0'..127, when
   the table holds no charset (a negative id) for it, or when the
   charset fails SAFE_CHARSET_P for CODING.  A rejected designation
   marks REG with -2.

   CHARS_96 is set to -1 if the escape sequence should be kept: that
   is, when the designation is rejected, or when ASCII is designated
   to a register previously marked with -2.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int desig_id = -1;                                                  \
    int desig_ok                                                        \
      = ((final) >= '0' && (final) < 128                                \
         && (desig_id = ISO_CHARSET_TABLE (dim, chars_96, final)) >= 0  \
         && SAFE_CHARSET_P (coding, desig_id));                         \
                                                                        \
    if (! desig_ok)                                                     \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        (chars_96) = -1;                                                \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int desig_prev = CODING_ISO_DESIGNATION (coding, reg);          \
                                                                        \
        /* Substitute the charset when the coding flags ask for it.  */ \
        if (desig_id == charset_jisx0201_roman)                         \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              desig_id = charset_ascii;                                 \
          }                                                             \
        else if (desig_id == charset_jisx0208_1978)                     \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              desig_id = charset_jisx0208;                              \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = desig_id;                \
        /* ASCII right after a rejected designation to REG: keep it.  */ \
        if (desig_prev == -2 && desig_id == charset_ascii)              \
          (chars_96) = -1;                                              \
      }                                                                 \
  } while (0)
3348
3349
/* Handle these composition sequences (ALT: alternate char):
3351
3352    (1) relative composition: ESC 0 CHAR ... ESC 1
3353    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3354    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3355    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3356
3357    When the start sequence (ESC 0/2/3/4) is found, this annotation
3358    header is produced.
3359
3360         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3361
3362    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3363    produced until the end sequence (ESC 1) is found:
3364
3365    (1) CHAR ... CHAR
3366    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3367    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3368    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3369
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
3372
3373    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3374    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3375    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3376    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3377
3378    If an error is found while composing, the annotation header is
3379    changed to:
3380
        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3382
3383    and the sequence [ -2 DECODED-RULE ] is changed to the original
3384    byte sequence as below:
3385         o the original byte sequence is B: [ B -1 ]
3386         o the original byte sequence is B1 B2: [ B1 B2 ]
3387    and the sequence [ -1 -1 ] is changed to the original byte
3388    sequence:
3389         [ ESC '0' ]
3390 */
3391
/* Decode the composition rule whose first byte is in C1, reading one
   more byte from the source when the rule uses the two-byte (new)
   format.  Set RULE to the encoded composition rule and NBYTES to the
   number of source bytes the rule occupies (1 or 2).  If the rule is
   invalid, RULE is left negative; NBYTES is not set when C1 is below
   32.  Uses the caller's `c1' and whatever ONE_MORE_BYTE requires.  */

#define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
  do {                                                                  \
    (rule) = c1 - 32;                                                   \
    if ((rule) >= 81)           /* new format (after ver.21) */         \
      {                                                                 \
        int rule_byte2;                                                 \
                                                                        \
        ONE_MORE_BYTE (rule_byte2);                                     \
        (rule) = COMPOSITION_ENCODE_RULE ((rule) - 81, rule_byte2 - 32); \
        if ((rule) >= 0)                                                \
          (rule) += 0x100;  /* to distinguish it from the old format */ \
        (nbytes) = 2;                                                   \
      }                                                                 \
    else if ((rule) >= 0)       /* old format (before ver.21) */        \
      {                                                                 \
        int gref_code = (rule) / 9;                                     \
        int nref_code = (rule) % 9;                                     \
                                                                        \
        /* The old format writes reference code 10 as 4.  */            \
        if (gref_code == 4)                                             \
          gref_code = 10;                                               \
        if (nref_code == 4)                                             \
          nref_code = 10;                                               \
        (rule) = COMPOSITION_ENCODE_RULE (gref_code, nref_code);        \
        (nbytes) = 1;                                                   \
      }                                                                 \
  } while (0)
3422
/* Turn the encoded composition rule RULE back into source bytes,
   overwriting charbuf[idx] and charbuf[idx + 1].  A rule below 0x100
   (old format) becomes [ BYTE -1 ] and adds 1 to new_chars; any other
   rule (new format) becomes [ BYTE1 BYTE2 ] and adds 2.  Uses the
   caller's `charbuf', `idx' and `new_chars'.  */
#define ENCODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    int rule_code = (rule);                                             \
    int gref_part = (rule_code % 0x100) / 12;                           \
    int nref_part = (rule_code % 0x100) % 12;                           \
                                                                        \
    if (rule_code >= 0x100)     /* new format */                        \
      {                                                                 \
        charbuf[idx] = 32 + 81 + gref_part;                             \
        charbuf[idx + 1] = 32 + nref_part;                              \
        new_chars += 2;                                                 \
      }                                                                 \
    else                        /* old format */                        \
      {                                                                 \
        /* The old format writes reference code 10 as 4.  */            \
        charbuf[idx] = (32 + (gref_part == 10 ? 4 : gref_part) * 9      \
                        + (nref_part == 10 ? 4 : nref_part));           \
        charbuf[idx + 1] = -1;                                          \
        new_chars++;                                                    \
      }                                                                 \
  } while (0)
3442
3443 /* Finish the current composition as invalid.  */
3444
3445 static int finish_composition P_ ((int *, struct composition_status *));
3446
3447 static int
3448 finish_composition (charbuf, cmp_status)
3449      int *charbuf;
3450      struct composition_status *cmp_status;
3451 {
3452   int idx = - cmp_status->length;
3453   int new_chars;
3454
3455   /* Recover the original ESC sequence */
3456   charbuf[idx++] = ISO_CODE_ESC;
3457   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3458                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3459                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3460                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3461                     : '4');
3462   charbuf[idx++] = -2;
3463   charbuf[idx++] = 0;
3464   charbuf[idx++] = -1;
3465   new_chars = cmp_status->nchars;
3466   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3467     for (; idx < 0; idx++)
3468       {
3469         int elt = charbuf[idx];
3470
3471         if (elt == -2)
3472           {
3473             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3474             idx++;
3475           }
3476         else if (elt == -1)
3477           {
3478             charbuf[idx++] = ISO_CODE_ESC;
3479             charbuf[idx] = '0';
3480             new_chars += 2;
3481           }
3482       }
3483   cmp_status->state = COMPOSING_NO;
3484   return new_chars;
3485 }
3486
/* If characters are under composition (cmp_status->state is not
   COMPOSING_NO), finish the composition with finish_composition and
   advance char_offset by the count it returns.  Uses the caller's
   `cmp_status', `charbuf' and `char_offset'.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3493
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4;
   C1 is the byte that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 that arrives while the alternate chars of an ESC 3 or
   ESC 4 composition are being read only separates them from the
   composed chars: [ -1 -1 ] is stored and the state becomes
   COMPOSING_CHAR.

   Any other start sequence first finishes a composition still in
   progress, then begins a new one by producing this annotation
   header with ADD_COMPOSITION_DATA:

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
*/

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    int alts_finished                                                      \
      = ((c1) == '0'                                                       \
         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                \
              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)          \
             || (cmp_status->state == COMPOSING_COMPONENT_RULE             \
                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))); \
                                                                           \
    if (! alts_finished)                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        if ((c1) == '0')                                                   \
          cmp_status->method = COMPOSITION_RELATIVE;                       \
        else if ((c1) == '2')                                              \
          cmp_status->method = COMPOSITION_WITH_RULE;                      \
        else if ((c1) == '3')                                              \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS;                  \
        else                                                               \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;             \
        /* ESC 3 and ESC 4 start with the alternate chars.  */             \
        cmp_status->state                                                  \
          = ((c1) > '2' ? COMPOSING_COMPONENT_CHAR : COMPOSING_CHAR);      \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->nchars = 0;                                            \
        cmp_status->ncomps = 0;                                            \
        coding->annotated = 1;                                             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->length += 2;                                           \
        cmp_status->state = COMPOSING_CHAR;                                \
      }                                                                    \
  } while (0)
3534
3535
/* Handle composition end sequence ESC 1.

   The sequence is invalid when no composed char has been stored
   (cmp_status->nchars is 0), when the state is COMPOSING_CHAR under
   COMPOSITION_WITH_RULE, or when the state is not COMPOSING_CHAR
   under any other method.  In that case the composition is finished
   as invalid and control jumps to `invalid_code'.

   Otherwise the annotation header, which starts cmp_status->length
   elements before charbuf, is completed: its first element is
   decreased by ncomps + 2 (COMPOSITION_WITH_ALTCHARS) or ncomps * 3
   (COMPOSITION_WITH_RULE_ALTCHARS), its third element is set to
   nchars, char_offset advances by nchars, and the state returns to
   COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int at_char_state = (cmp_status->state == COMPOSING_CHAR);          \
    int rule_method = (cmp_status->method == COMPOSITION_WITH_RULE);    \
    int *cmp_header;                                                    \
                                                                        \
    if (cmp_status->nchars == 0 || at_char_state == rule_method)        \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    cmp_header = charbuf - cmp_status->length;                          \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      cmp_header[0] -= cmp_status->ncomps + 2;                          \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      cmp_header[0] -= cmp_status->ncomps * 3;                          \
    cmp_header[2] = cmp_status->nchars;                                 \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3555
/* Store a composition rule RULE in charbuf as the pair [ -2 RULE ],
   and update cmp_status: its length grows by 2 and its state is
   decremented by one.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = (rule);                \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    cmp_status->state -= 1;             \
  } while (0)
3565
/* Store a composed char or a component char C in charbuf, and update
   cmp_status: length grows by 1; nchars is incremented when the state
   is COMPOSING_CHAR, ncomps otherwise.  The state is then incremented
   by one under COMPOSITION_WITH_RULE, and under
   COMPOSITION_WITH_RULE_ALTCHARS while in COMPOSING_COMPONENT_CHAR.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf++ = (c);                                                   \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    if (cmp_status->method == COMPOSITION_WITH_RULE)                    \
      cmp_status->state += 1;                                           \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)          \
      cmp_status->state += 1;                                           \
  } while (0)
3582
3583
3584 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3585
3586 static void
3587 decode_coding_iso_2022 (coding)
3588      struct coding_system *coding;
3589 {
3590   const unsigned char *src = coding->source + coding->consumed;
3591   const unsigned char *src_end = coding->source + coding->src_bytes;
3592   const unsigned char *src_base;
3593   int *charbuf = coding->charbuf + coding->charbuf_used;
3594   /* We may produce two annotations (charset and composition) in one
3595      loop and one more charset annotation at the end.  */
3596   int *charbuf_end
3597     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3598   int consumed_chars = 0, consumed_chars_base;
3599   int multibytep = coding->src_multibyte;
3600   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3601   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3602   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3603   int charset_id_2, charset_id_3;
3604   struct charset *charset;
3605   int c;
3606   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3607   Lisp_Object attrs, charset_list;
3608   int char_offset = coding->produced_char;
3609   int last_offset = char_offset;
3610   int last_id = charset_ascii;
3611   int eol_crlf =
3612     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3613   int byte_after_cr = -1;
3614   int i;
3615
3616   CODING_GET_INFO (coding, attrs, charset_list);
3617   setup_iso_safe_charsets (attrs);
3618   /* Charset list may have been changed.  */
3619   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3620   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3621
3622   if (cmp_status->state != COMPOSING_NO)
3623     {
3624       for (i = 0; i < cmp_status->length; i++)
3625         *charbuf++ = cmp_status->carryover[i];
3626       coding->annotated = 1;
3627     }
3628
3629   while (1)
3630     {
3631       int c1, c2, c3;
3632
3633       src_base = src;
3634       consumed_chars_base = consumed_chars;
3635
3636       if (charbuf >= charbuf_end)
3637         {
3638           if (byte_after_cr >= 0)
3639             src_base--;
3640           break;
3641         }
3642
3643       if (byte_after_cr >= 0)
3644         c1 = byte_after_cr, byte_after_cr = -1;
3645       else
3646         ONE_MORE_BYTE (c1);
3647       if (c1 < 0)
3648         goto invalid_code;
3649
3650       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3651         {
3652           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3653           char_offset++;
3654           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3655           continue;
3656         }
3657
3658       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3659         {
3660           if (c1 == ISO_CODE_ESC)
3661             {
3662               if (src + 1 >= src_end)
3663                 goto no_more_source;
3664               *charbuf++ = ISO_CODE_ESC;
3665               char_offset++;
3666               if (src[0] == '%' && src[1] == '@')
3667                 {
3668                   src += 2;
3669                   consumed_chars += 2;
3670                   char_offset += 2;
3671                   /* We are sure charbuf can contain two more chars. */
3672                   *charbuf++ = '%';
3673                   *charbuf++ = '@';
3674                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3675                 }
3676             }
3677           else
3678             {
3679               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3680               char_offset++;
3681             }
3682           continue;
3683         }
3684
3685       if ((cmp_status->state == COMPOSING_RULE
3686            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3687           && c1 != ISO_CODE_ESC)
3688         {
3689           int rule, nbytes;
3690
3691           DECODE_COMPOSITION_RULE (rule, nbytes);
3692           if (rule < 0)
3693             goto invalid_code;
3694           STORE_COMPOSITION_RULE (rule);
3695           continue;
3696         }
3697
3698       /* We produce at most one character.  */
3699       switch (iso_code_class [c1])
3700         {
3701         case ISO_0x20_or_0x7F:
3702           if (charset_id_0 < 0
3703               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3704             /* This is SPACE or DEL.  */
3705             charset = CHARSET_FROM_ID (charset_ascii);
3706           else
3707             charset = CHARSET_FROM_ID (charset_id_0);
3708           break;
3709
3710         case ISO_graphic_plane_0:
3711           if (charset_id_0 < 0)
3712             charset = CHARSET_FROM_ID (charset_ascii);
3713           else
3714             charset = CHARSET_FROM_ID (charset_id_0);
3715           break;
3716
3717         case ISO_0xA0_or_0xFF:
3718           if (charset_id_1 < 0
3719               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3720               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3721             goto invalid_code;
3722           /* This is a graphic character, we fall down ... */
3723
3724         case ISO_graphic_plane_1:
3725           if (charset_id_1 < 0)
3726             goto invalid_code;
3727           charset = CHARSET_FROM_ID (charset_id_1);
3728           break;
3729
3730         case ISO_control_0:
3731           if (eol_crlf && c1 == '\r')
3732             ONE_MORE_BYTE (byte_after_cr);
3733           MAYBE_FINISH_COMPOSITION ();
3734           charset = CHARSET_FROM_ID (charset_ascii);
3735           break;
3736
3737         case ISO_control_1:
3738           goto invalid_code;
3739
3740         case ISO_shift_out:
3741           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3742               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3743             goto invalid_code;
3744           CODING_ISO_INVOCATION (coding, 0) = 1;
3745           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3746           continue;
3747
3748         case ISO_shift_in:
3749           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3750             goto invalid_code;
3751           CODING_ISO_INVOCATION (coding, 0) = 0;
3752           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3753           continue;
3754
3755         case ISO_single_shift_2_7:
3756           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3757             goto invalid_code;
3758         case ISO_single_shift_2:
3759           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3760             goto invalid_code;
3761           /* SS2 is handled as an escape sequence of ESC 'N' */
3762           c1 = 'N';
3763           goto label_escape_sequence;
3764
3765         case ISO_single_shift_3:
3766           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3767             goto invalid_code;
          /* SS3 is handled as an escape sequence of ESC 'O' */
3769           c1 = 'O';
3770           goto label_escape_sequence;
3771
3772         case ISO_control_sequence_introducer:
3773           /* CSI is handled as an escape sequence of ESC '[' ...  */
3774           c1 = '[';
3775           goto label_escape_sequence;
3776
3777         case ISO_escape:
3778           ONE_MORE_BYTE (c1);
3779         label_escape_sequence:
3780           /* Escape sequences handled here are invocation,
3781              designation, direction specification, and character
3782              composition specification.  */
3783           switch (c1)
3784             {
3785             case '&':           /* revision of following character set */
3786               ONE_MORE_BYTE (c1);
3787               if (!(c1 >= '@' && c1 <= '~'))
3788                 goto invalid_code;
3789               ONE_MORE_BYTE (c1);
3790               if (c1 != ISO_CODE_ESC)
3791                 goto invalid_code;
3792               ONE_MORE_BYTE (c1);
3793               goto label_escape_sequence;
3794
3795             case '$':           /* designation of 2-byte character set */
3796               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3797                 goto invalid_code;
3798               {
3799                 int reg, chars96;
3800
3801                 ONE_MORE_BYTE (c1);
3802                 if (c1 >= '@' && c1 <= 'B')
3803                   {     /* designation of JISX0208.1978, GB2312.1980,
3804                            or JISX0208.1980 */
3805                     reg = 0, chars96 = 0;
3806                   }
3807                 else if (c1 >= 0x28 && c1 <= 0x2B)
3808                   { /* designation of DIMENSION2_CHARS94 character set */
3809                     reg = c1 - 0x28, chars96 = 0;
3810                     ONE_MORE_BYTE (c1);
3811                   }
3812                 else if (c1 >= 0x2C && c1 <= 0x2F)
3813                   { /* designation of DIMENSION2_CHARS96 character set */
3814                     reg = c1 - 0x2C, chars96 = 1;
3815                     ONE_MORE_BYTE (c1);
3816                   }
3817                 else
3818                   goto invalid_code;
3819                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3820                 /* We must update these variables now.  */
3821                 if (reg == 0)
3822                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3823                 else if (reg == 1)
3824                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3825                 if (chars96 < 0)
3826                   goto invalid_code;
3827               }
3828               continue;
3829
3830             case 'n':           /* invocation of locking-shift-2 */
3831               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3832                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3833                 goto invalid_code;
3834               CODING_ISO_INVOCATION (coding, 0) = 2;
3835               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3836               continue;
3837
3838             case 'o':           /* invocation of locking-shift-3 */
3839               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3840                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3841                 goto invalid_code;
3842               CODING_ISO_INVOCATION (coding, 0) = 3;
3843               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3844               continue;
3845
3846             case 'N':           /* invocation of single-shift-2 */
3847               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3848                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3849                 goto invalid_code;
3850               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3851               if (charset_id_2 < 0)
3852                 charset = CHARSET_FROM_ID (charset_ascii);
3853               else
3854                 charset = CHARSET_FROM_ID (charset_id_2);
3855               ONE_MORE_BYTE (c1);
3856               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3857                 goto invalid_code;
3858               break;
3859
3860             case 'O':           /* invocation of single-shift-3 */
3861               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3862                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3863                 goto invalid_code;
3864               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3865               if (charset_id_3 < 0)
3866                 charset = CHARSET_FROM_ID (charset_ascii);
3867               else
3868                 charset = CHARSET_FROM_ID (charset_id_3);
3869               ONE_MORE_BYTE (c1);
3870               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3871                 goto invalid_code;
3872               break;
3873
3874             case '0': case '2': case '3': case '4': /* start composition */
3875               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3876                 goto invalid_code;
3877               if (last_id != charset_ascii)
3878                 {
3879                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3880                   last_id = charset_ascii;
3881                   last_offset = char_offset;
3882                 }
3883               DECODE_COMPOSITION_START (c1);
3884               continue;
3885
3886             case '1':           /* end composition */
3887               if (cmp_status->state == COMPOSING_NO)
3888                 goto invalid_code;
3889               DECODE_COMPOSITION_END ();
3890               continue;
3891
3892             case '[':           /* specification of direction */
3893               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3894                 goto invalid_code;
3895               /* For the moment, nested direction is not supported.
3896                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3897                  left-to-right, and nonzero means right-to-left.  */
3898               ONE_MORE_BYTE (c1);
3899               switch (c1)
3900                 {
3901                 case ']':       /* end of the current direction */
3902                   coding->mode &= ~CODING_MODE_DIRECTION;
3903
3904                 case '0':       /* end of the current direction */
3905                 case '1':       /* start of left-to-right direction */
3906                   ONE_MORE_BYTE (c1);
3907                   if (c1 == ']')
3908                     coding->mode &= ~CODING_MODE_DIRECTION;
3909                   else
3910                     goto invalid_code;
3911                   break;
3912
3913                 case '2':       /* start of right-to-left direction */
3914                   ONE_MORE_BYTE (c1);
3915                   if (c1 == ']')
3916                     coding->mode |= CODING_MODE_DIRECTION;
3917                   else
3918                     goto invalid_code;
3919                   break;
3920
3921                 default:
3922                   goto invalid_code;
3923                 }
3924               continue;
3925
3926             case '%':
3927               ONE_MORE_BYTE (c1);
3928               if (c1 == '/')
3929                 {
3930                   /* CTEXT extended segment:
3931                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3932                      We keep these bytes as is for the moment.
3933                      They may be decoded by post-read-conversion.  */
3934                   int dim, M, L;
3935                   int size;
3936
3937                   ONE_MORE_BYTE (dim);
3938                   if (dim < '0' || dim > '4')
3939                     goto invalid_code;
3940                   ONE_MORE_BYTE (M);
3941                   if (M < 128)
3942                     goto invalid_code;
3943                   ONE_MORE_BYTE (L);
3944                   if (L < 128)
3945                     goto invalid_code;
3946                   size = ((M - 128) * 128) + (L - 128);
3947                   if (charbuf + 6 > charbuf_end)
3948                     goto break_loop;
3949                   *charbuf++ = ISO_CODE_ESC;
3950                   *charbuf++ = '%';
3951                   *charbuf++ = '/';
3952                   *charbuf++ = dim;
3953                   *charbuf++ = BYTE8_TO_CHAR (M);
3954                   *charbuf++ = BYTE8_TO_CHAR (L);
3955                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3956                 }
3957               else if (c1 == 'G')
3958                 {
3959                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3960                      ESC % G --UTF-8-BYTES-- ESC % @
3961                      We keep these bytes as is for the moment.
3962                      They may be decoded by post-read-conversion.  */
3963                   if (charbuf + 3 > charbuf_end)
3964                     goto break_loop;
3965                   *charbuf++ = ISO_CODE_ESC;
3966                   *charbuf++ = '%';
3967                   *charbuf++ = 'G';
3968                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3969                 }
3970               else
3971                 goto invalid_code;
3972               continue;
3973               break;
3974
3975             default:
3976               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3977                 goto invalid_code;
3978               {
3979                 int reg, chars96;
3980
3981                 if (c1 >= 0x28 && c1 <= 0x2B)
3982                   { /* designation of DIMENSION1_CHARS94 character set */
3983                     reg = c1 - 0x28, chars96 = 0;
3984                     ONE_MORE_BYTE (c1);
3985                   }
3986                 else if (c1 >= 0x2C && c1 <= 0x2F)
3987                   { /* designation of DIMENSION1_CHARS96 character set */
3988                     reg = c1 - 0x2C, chars96 = 1;
3989                     ONE_MORE_BYTE (c1);
3990                   }
3991                 else
3992                   goto invalid_code;
3993                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3994                 /* We must update these variables now.  */
3995                 if (reg == 0)
3996                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3997                 else if (reg == 1)
3998                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3999                 if (chars96 < 0)
4000                   goto invalid_code;
4001               }
4002               continue;
4003             }
4004         }
4005
4006       if (cmp_status->state == COMPOSING_NO
4007           && charset->id != charset_ascii
4008           && last_id != charset->id)
4009         {
4010           if (last_id != charset_ascii)
4011             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4012           last_id = charset->id;
4013           last_offset = char_offset;
4014         }
4015
4016       /* Now we know CHARSET and 1st position code C1 of a character.
4017          Produce a decoded character while getting 2nd and 3rd
4018          position codes C2, C3 if necessary.  */
4019       if (CHARSET_DIMENSION (charset) > 1)
4020         {
4021           ONE_MORE_BYTE (c2);
4022           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
4023               || ((c1 & 0x80) != (c2 & 0x80)))
4024             /* C2 is not in a valid range.  */
4025             goto invalid_code;
4026           if (CHARSET_DIMENSION (charset) == 2)
4027             c1 = (c1 << 8) | c2;
4028           else
4029             {
4030               ONE_MORE_BYTE (c3);
4031               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
4032                   || ((c1 & 0x80) != (c3 & 0x80)))
4033                 /* C3 is not in a valid range.  */
4034                 goto invalid_code;
4035               c1 = (c1 << 16) | (c2 << 8) | c2;
4036             }
4037         }
4038       c1 &= 0x7F7F7F;
4039       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4040       if (c < 0)
4041         {
4042           MAYBE_FINISH_COMPOSITION ();
4043           for (; src_base < src; src_base++, char_offset++)
4044             {
4045               if (ASCII_BYTE_P (*src_base))
4046                 *charbuf++ = *src_base;
4047               else
4048                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4049             }
4050         }
4051       else if (cmp_status->state == COMPOSING_NO)
4052         {
4053           *charbuf++ = c;
4054           char_offset++;
4055         }
4056       else if ((cmp_status->state == COMPOSING_CHAR
4057                 ? cmp_status->nchars
4058                 : cmp_status->ncomps)
4059                >= MAX_COMPOSITION_COMPONENTS)
4060         {
4061           /* Too long composition.  */
4062           MAYBE_FINISH_COMPOSITION ();
4063           *charbuf++ = c;
4064           char_offset++;
4065         }
4066       else
4067         STORE_COMPOSITION_CHAR (c);
4068       continue;
4069
4070     invalid_code:
4071       MAYBE_FINISH_COMPOSITION ();
4072       src = src_base;
4073       consumed_chars = consumed_chars_base;
4074       ONE_MORE_BYTE (c);
4075       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4076       char_offset++;
4077       coding->errors++;
4078       continue;
4079
4080     break_loop:
4081       break;
4082     }
4083
4084  no_more_source:
4085   if (cmp_status->state != COMPOSING_NO)
4086     {
4087       if (coding->mode & CODING_MODE_LAST_BLOCK)
4088         MAYBE_FINISH_COMPOSITION ();
4089       else
4090         {
4091           charbuf -= cmp_status->length;
4092           for (i = 0; i < cmp_status->length; i++)
4093             cmp_status->carryover[i] = charbuf[i];
4094         }
4095     }
4096   else if (last_id != charset_ascii)
4097     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4098   coding->consumed_char += consumed_chars_base;
4099   coding->consumed = src_base - coding->source;
4100   coding->charbuf_used = charbuf - coding->charbuf;
4101 }
4102
4103
4104 /* ISO2022 encoding stuff.  */
4105
4106 /*
4107    Just saying "ISO2022" is not enough for encoding; we have to
4108    specify more details.  In Emacs, each coding system of the ISO2022
4109    variant has the following specifications:
4110         1. Initial designation to G0 thru G3.
4111         2. Allows short-form designation?
4112         3. ASCII should be designated to G0 before control characters?
4113         4. ASCII should be designated to G0 at end of line?
4114         5. 7-bit environment or 8-bit environment?
4115         6. Use locking-shift?
4116         7. Use Single-shift?
4117    And the following two are only for Japanese:
4118         8. Use JISX0201-Roman in place of ASCII?
4119         9. Use JISX0208-1978 in place of JISX0208-1983?
4120    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4121    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4122    details.
4123 */
4124
4125 /* Emit at DST (advancing DST) the escape sequence that designates
4126    CHARSET to graphic register REG, then record that designation in
4127    CODING.  A 94-char set of dimension other than 1 whose final byte
4128    is '@', 'A', or 'B' uses the short form when CODING allows it.  */
4129
4130 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4131   do {                                                                  \
4132     /* Final byte identifying CHARSET in the sequence.  */              \
4133     unsigned char final_byte = CHARSET_ISO_FINAL (charset);             \
4134                                                                         \
4135     /* Intermediate bytes for registers 0..3.  */                       \
4136     char *intermediates                                                 \
4137       = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
4138                                                                         \
4139     /* Revision to announce, or -1 for none.  */                        \
4140     int revision_no                                                     \
4141       = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
4142          ? (int) CHARSET_ISO_REVISION (charset) : -1);                  \
4143                                                                         \
4144     if (revision_no >= 0)                                               \
4145       {                                                                 \
4146         /* ESC & <'@' + revision> comes first.  */                      \
4147         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4148         EMIT_ONE_BYTE ('@' + revision_no);                              \
4149       }                                                                 \
4150                                                                         \
4151     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4152     /* Dimension other than 1 is introduced by '$'.  */                 \
4153     if (CHARSET_DIMENSION (charset) != 1)                               \
4154       EMIT_ONE_ASCII_BYTE ('$');                                        \
4155                                                                         \
4156     /* Only the short form omits the intermediate byte: a               \
4157        94-char set of dimension other than 1, with final byte           \
4158        '@'..'B', going to register 0, long form not forced.  */         \
4159     if (CHARSET_DIMENSION (charset) == 1                                \
4160         || CHARSET_ISO_CHARS_96 (charset)                               \
4161         || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)      \
4162         || reg != 0                                                     \
4163         || final_byte < '@'                                             \
4164         || final_byte > 'B')                                            \
4165       EMIT_ONE_ASCII_BYTE (intermediates[reg]);                         \
4166                                                                         \
4167     EMIT_ONE_ASCII_BYTE (final_byte);                                   \
4168                                                                         \
4169     /* Record what REG now holds.  */                                   \
4170     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4171   } while (0)
4172
4173
4174 /* The two macros below produce the code for an ISO2022 single-shift
4175    function (single-shift-2 or single-shift-3): the escape sequence
4176    ESC 'N' or ESC 'O' under CODING_ISO_FLAG_SEVEN_BITS, otherwise the
4177    byte ISO_CODE_SS2 or ISO_CODE_SS3.  Both mark CODING single-shifting.  */
4178 #define ENCODE_SINGLE_SHIFT_2                                           \
4179   do {                                                                  \
4180     if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
4181       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4182     else                                                                \
4183       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4184     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4185   } while (0)
4186
4187
4188 #define ENCODE_SINGLE_SHIFT_3                                           \
4189   do {                                                                  \
4190     if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
4191       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4192     else                                                                \
4193       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4194     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4195   } while (0)
4196
4197
4198 /* The following four macros produce codes (control character or
4199    escape sequence) for ISO2022 locking-shift functions (shift-in,
4200    shift-out, locking-shift-2, and locking-shift-3), updating CODING.  */
4201 /* Shift-in: record graphic register 0 as invoked to graphic plane 0.  */
4202 #define ENCODE_SHIFT_IN                                 \
4203   do {                                                  \
4204     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4205     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4206   } while (0)
4207
4208 /* Shift-out: record graphic register 1 as invoked to graphic plane 0.  */
4209 #define ENCODE_SHIFT_OUT                                \
4210   do {                                                  \
4211     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4212     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4213   } while (0)
4214
4215 /* Locking-shift-2 (ESC 'n'): likewise for graphic register 2.  */
4216 #define ENCODE_LOCKING_SHIFT_2                          \
4217   do {                                                  \
4218     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4219     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4220   } while (0)
4221
4222 /* Locking-shift-3 is ESC 'o' (ESC 'n' would be locking-shift-2).  */
4223 #define ENCODE_LOCKING_SHIFT_3                          \
4224   do {                                                  \
4225     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
4226     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4227   } while (0)
4228
4229
4230 /* Produce codes for a DIMENSION1 character whose character set is
4231    CHARSET and whose position-code is C1.  Designation and invocation
4232    sequences are also produced in advance if necessary.  CHARSET must
4233    be an lvalue: it is reassigned when JISX0201-Roman replaces ASCII.  */
4234 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4235   do {                                                                  \
4236     int id = CHARSET_ID (charset);                                      \
4237     /* With USE_ROMAN, JISX0201-Roman stands in for ASCII.  */          \
4238     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4239         && id == charset_ascii)                                         \
4240       {                                                                 \
4241         id = charset_jisx0201_roman;                                    \
4242         charset = CHARSET_FROM_ID (id);                                 \
4243       }                                                                 \
4244     /* A pending single shift applies to this character only.  */       \
4245     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4246       {                                                                 \
4247         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4248           EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
4249         else                                                            \
4250           EMIT_ONE_BYTE ((c1) | 0x80);                                  \
4251         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4252         break;                                                          \
4253       }                                                                 \
4254     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4255       {                                                                 \
4256         EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
4257         break;                                                          \
4258       }                                                                 \
4259     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4260       {                                                                 \
4261         EMIT_ONE_BYTE ((c1) | 0x80);                                    \
4262         break;                                                          \
4263       }                                                                 \
4264     else                                                                \
4265       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4266          must invoke it, or, at first, designate it to some graphic     \
4267          register.  Then repeat the loop to actually produce the        \
4268          character.  */                                                 \
4269       dst = encode_invocation_designation (charset, coding, dst,        \
4270                                            &produced_chars);            \
4271   } while (1)
4272
4273
4274 /* Produce the two bytes of a DIMENSION2 character of CHARSET whose
4275    position-codes are C1 and C2, preceded by whatever designation and
4276    invocation codes CHARSET still needs.  */
4277
4278 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4279   do {                                                                  \
4280     int id = CHARSET_ID (charset);                                      \
4281                                                                         \
4282     /* With USE_OLDJIS, JISX0208.1978 stands in for JISX0208.  */       \
4283     if (id == charset_jisx0208                                          \
4284         && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
4285       {                                                                 \
4286         id = charset_jisx0208_1978;                                     \
4287         charset = CHARSET_FROM_ID (id);                                 \
4288       }                                                                 \
4289                                                                         \
4290     if (! CODING_ISO_SINGLE_SHIFTING (coding)                           \
4291         && id != CODING_ISO_INVOKED_CHARSET (coding, 0)                 \
4292         && id != CODING_ISO_INVOKED_CHARSET (coding, 1))                \
4293       {                                                                 \
4294         /* CHARSET is not yet invoked to a graphic plane: designate     \
4295            and/or invoke it, then loop again to emit the character.  */ \
4296         dst = encode_invocation_designation (charset, coding, dst,      \
4297                                              &produced_chars);          \
4298         continue;                                                       \
4299       }                                                                 \
4300                                                                         \
4301     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4302       {                                                                 \
4303         /* A pending single shift covers this character only.  */       \
4304         if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
4305           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4306         else                                                            \
4307           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4308         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4309       }                                                                 \
4310     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4311       EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
4312     else                                                                \
4313       EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
4314     break;                                                              \
4315   } while (1)
4316
4317 /* Produce codes for character C of CHARSET according to its dimension.  */
4318 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4319   do {                                                                     \
4320     int position_code = ENCODE_CHAR ((charset),(c));                       \
4321     if (CHARSET_DIMENSION (charset) != 1)                                  \
4322       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), position_code >> 8,      \
4323                                        position_code & 0xFF);              \
4324     else                                                                   \
4325       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), position_code);          \
4326   } while (0)
4327
4328
4329 /* Produce at DST the designation and invocation codes needed before
4330    a character of CHARSET can be emitted.  The element `spec.iso_2022'
4331    of *CODING is updated.  Return new DST.  */
4332
4333 unsigned char *
4334 encode_invocation_designation (charset, coding, dst, p_nchars)
4335      struct charset *charset;
4336      struct coding_system *coding;
4337      unsigned char *dst;
4338      int *p_nchars;
4339 {
4340   /* PRODUCED_CHARS and MULTIBYTEP are not named in this body;
4341      presumably the EMIT_* macros used by the ENCODE_* macros refer
4342      to them (to be confirmed against their definitions).  */
4343   int produced_chars = *p_nchars;
4344   int multibytep = coding->dst_multibyte;
4345   int wanted_id = CHARSET_ID (charset);
4346   int gr = 0;                   /* graphic register number */
4347   int already_invoked;
4348
4349   /* Look for a graphic register CHARSET is already designated to.  */
4350   while (gr < 4 && CODING_ISO_DESIGNATION (coding, gr) != wanted_id)
4351     gr++;
4352
4353   if (gr == 4)
4354     {
4355       /* There is none.  Designate CHARSET to the register it
4356          requests, or to graphic register 0 if it requests no
4357          special designation.  */
4358       gr = CODING_ISO_REQUEST (coding, wanted_id);
4359       if (gr < 0)
4360         gr = 0;
4361
4362       ENCODE_DESIGNATION (charset, gr, coding);
4363     }
4364
4365   already_invoked = (CODING_ISO_INVOCATION (coding, 0) == gr
4366                      || CODING_ISO_INVOCATION (coding, 1) == gr);
4367   if (! already_invoked)
4368     {
4369       /* Register GR is not invoked to any graphic plane, so invoke
4370          it to graphic plane 0.  Registers 2 and 3 use a single
4371          shift when the coding system has CODING_ISO_FLAG_SINGLE_SHIFT
4372          and a locking shift otherwise.  */
4373       if (gr == 0)
4374         ENCODE_SHIFT_IN;
4375       else if (gr == 1)
4376         ENCODE_SHIFT_OUT;
4377       else if (gr == 2)
4378         {
4379           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4380             ENCODE_SINGLE_SHIFT_2;
4381           else
4382             ENCODE_LOCKING_SHIFT_2;
4383         }
4384       else if (gr == 3)
4385         {
4386           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4387             ENCODE_SINGLE_SHIFT_3;
4388           else
4389             ENCODE_LOCKING_SHIFT_3;
4390         }
4391     }
4392
4393   /* Hand the local count back to the caller.  */
4394   *p_nchars = produced_chars;
4395   return dst;
4396 }
4397
4398 /* The following three macros produce codes for indicating direction
4399    of text.  7-bit coding systems get ESC '[' in place of the CSI byte.  */
4400 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
4401   do {                                                                  \
4402     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4403       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
4404     else                                                                \
4405       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
4406   } while (0)
4407
4408 /* Start of right-to-left text: the introducer, then '2' ']'.  */
4409 #define ENCODE_DIRECTION_R2L()                  \
4410   do {                                          \
4411     ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
4412     EMIT_TWO_ASCII_BYTES ('2', ']');            \
4413   } while (0)
4414
4415 /* Left-to-right text: the introducer, then '0' ']'.  */
4416 #define ENCODE_DIRECTION_L2R()                  \
4417   do {                                          \
4418     ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
4419     EMIT_TWO_ASCII_BYTES ('0', ']');            \
4420   } while (0)
4421
4422
4423 /* Produce codes that put the graphic planes and registers back to
4424    their initial state: shift in, then redo changed designations.  */
4425 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4426   do {                                                                  \
4427     struct charset *init_set;                                           \
4428     int gr;                                                             \
4429     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4430       ENCODE_SHIFT_IN;                                                  \
4431     for (gr = 0; gr < 4; gr++)                                          \
4432       {                                                                 \
4433         if (CODING_ISO_INITIAL (coding, gr) < 0                         \
4434             || (CODING_ISO_INITIAL (coding, gr)                         \
4435                 == CODING_ISO_DESIGNATION (coding, gr)))                \
4436           continue;                                                     \
4437         init_set = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, gr));   \
4438         ENCODE_DESIGNATION (init_set, gr, coding);                      \
4439       }                                                                 \
4440   } while (0)
4441
4442
4443 /* Produce designation sequences of charsets in the line started from
4444    CHARBUF to a place pointed by DST, and return updated DST.  The scan
4445    ends at a newline, at CHARBUF_END, or once four registers are found.
4446    If the current block ends before any end-of-line, we may fail to
4447    find all the necessary designations.  */
4448
4449 static unsigned char *
4450 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
4451      struct coding_system *coding;
4452      int *charbuf, *charbuf_end;
4453      unsigned char *dst;
4454 {
4455   struct charset *charset;
4456   /* Table of charsets to be designated to each graphic register.  */
4457   int r[4];
4458   int c, found = 0, reg;
4459   int produced_chars = 0;
4460   int multibytep = coding->dst_multibyte;
4461   Lisp_Object attrs;
4462   Lisp_Object charset_list;
4463
4464   attrs = CODING_ID_ATTRS (coding->id);
4465   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4466   if (EQ (charset_list, Qiso_2022))
4467     charset_list = Viso_2022_charset_list;
4468
4469   for (reg = 0; reg < 4; reg++)
4470     r[reg] = -1;
4471   /* Stop at CHARBUF_END too: the block may contain no newline.  */
4472   while (charbuf < charbuf_end && found < 4)
4473     {
4474       int id;
4475       /* Record the first charset that requests each register.  */
4476       c = *charbuf++;
4477       if (c == '\n')
4478         break;
4479       charset = char_charset (c, charset_list, NULL);
4480       id = CHARSET_ID (charset);
4481       reg = CODING_ISO_REQUEST (coding, id);
4482       if (reg >= 0 && r[reg] < 0)
4483         {
4484           found++;
4485           r[reg] = id;
4486         }
4487     }
4488
4489   if (found)
4490     {
4491       for (reg = 0; reg < 4; reg++)
4492         if (r[reg] >= 0
4493             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4494           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4495     }
4496
4497   return dst;
4498 }
4499
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the characters in CODING->charbuf (CODING->charbuf_used
   elements) in an ISO-2022 based coding system, writing the bytes
   after the first CODING->produced bytes of CODING->destination.

   When the end of the function is reached, the conversion result is
   recorded as CODING_RESULT_SUCCESS, CODING->produced and
   CODING->produced_char are updated, the beginning-of-line
   designation state is stored back via CODING_ISO_BOL, and 0 is
   returned.  */

static int
encode_coding_iso_2022 (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Room requested from ASSURE_DESTINATION before handling each
     element of CHARBUF.  */
  int safe_room = 16;
  /* Nonzero if designation sequences must be produced before the next
     character, i.e. the coding system designates at beginning of line
     and we are at one.  */
  int bol_designation
    = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
       && CODING_ISO_BOL (coding));
  int produced_chars = 0;
  Lisp_Object attrs, eol_type, charset_list;
  int ascii_compatible;
  int c;
  /* Charset ID taken from the latest charset annotation, or -1 if
     there is none that belongs to CHARSET_LIST.  */
  int preferred_charset_id = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  /* A vector here presumably means the eol type is not yet decided;
     treat it as Unix.  */
  if (VECTORP (eol_type))
    eol_type = Qunix;

  setup_iso_safe_charsets (attrs);
  /* Charset list may have been changed.  */
  charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));

  /* ASCII characters may be emitted as is only if the coding system
     is ASCII compatible and uses neither designation nor locking
     shift.  */
  ascii_compatible
    = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
       && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
                                          | CODING_ISO_FLAG_LOCKING_SHIFT)));

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);

      if (bol_designation)
        {
          unsigned char *dst_prev = dst;

          /* We have to produce designation sequences if any now.  */
          dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
          bol_designation = 0;
          /* We are sure that designation sequences are all ASCII bytes.  */
          produced_chars += dst - dst_prev;
        }

      c = *charbuf++;

      if (c < 0)
        {
          /* Handle an annotation.  C is the negated number of
             elements the annotation occupies (including C itself);
             CHARBUF now points at the element that follows C, which
             holds the annotation type.  */
          switch (*charbuf)
            {
            case CODING_ANNOTATE_COMPOSITION_MASK:
              /* Not yet implemented.  */
              break;
            case CODING_ANNOTATE_CHARSET_MASK:
              preferred_charset_id = charbuf[2];
              /* Ignore a preferred charset this coding system does
                 not list.  */
              if (preferred_charset_id >= 0
                  && NILP (Fmemq (make_number (preferred_charset_id),
                                  charset_list)))
                preferred_charset_id = -1;
              break;
            default:
              abort ();
            }
          /* Skip the rest of the annotation; one element (C) has
             already been consumed.  */
          charbuf += -c - 1;
          continue;
        }

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
        {
          /* Control character.  A newline, or a carriage return when
             the eol type is Mac, ends the line.  */
          if (c == '\n'
              || (c == '\r' && EQ (eol_type, Qmac)))
            {
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
                ENCODE_RESET_PLANE_AND_REGISTER ();
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
                {
                  int i;

                  /* Restore the initial designation of every graphic
                     register for the next line.  */
                  for (i = 0; i < 4; i++)
                    CODING_ISO_DESIGNATION (coding, i)
                      = CODING_ISO_INITIAL (coding, i);
                }
              /* The next character starts a line.  */
              bol_designation
                = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
            }
          else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
            ENCODE_RESET_PLANE_AND_REGISTER ();
          EMIT_ONE_ASCII_BYTE (c);
        }
      else if (ASCII_CHAR_P (c))
        {
          if (ascii_compatible)
            EMIT_ONE_ASCII_BYTE (c);
          else
            {
              struct charset *charset = CHARSET_FROM_ID (charset_ascii);
              ENCODE_ISO_CHARACTER (charset, c);
            }
        }
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw 8-bit byte is emitted as that byte.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          struct charset *charset;

          /* Use the preferred charset if it contains C; otherwise
             look C up in CHARSET_LIST.  */
          if (preferred_charset_id >= 0)
            {
              charset = CHARSET_FROM_ID (preferred_charset_id);
              if (! CHAR_CHARSET_P (c, charset))
                charset = char_charset (c, charset_list, NULL);
            }
          else
            charset = char_charset (c, charset_list, NULL);
          if (!charset)
            {
              /* C is not encodable.  Substitute either the marker
                 used in safe-encoding mode or the coding system's
                 default character.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                {
                  c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
                  charset = CHARSET_FROM_ID (charset_ascii);
                }
              else
                {
                  c = coding->default_char;
                  charset = char_charset (c, charset_list, NULL);
                }
            }
          ENCODE_ISO_CHARACTER (charset, c);
        }
    }

  /* At the end of the last block, reset the state if the coding
     system resets at end of line.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
    {
      ASSURE_DESTINATION (safe_room);
      ENCODE_RESET_PLANE_AND_REGISTER ();
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  /* Remember whether the next block starts at a beginning of line.  */
  CODING_ISO_BOL (coding) = bol_designation;
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
4654
4655 \f
4656 /*** 8,9. SJIS and BIG5 handlers ***/
4657
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them in the bare
   C code.  But, in the future, they may be supported only by CCL.  */
4661
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
4668
4669    --- CODE RANGE of SJIS ---
4670    (character set)      (range)
4671    ASCII                0x00 .. 0x7F
4672    KATAKANA-JISX0201    0xA0 .. 0xDF
4673    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4674             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4675    -------------------------------
4676
4677 */
4678
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
4682
4683    --- CODE RANGE of BIG5 ---
4684    (character set)      (range)
4685    ASCII                0x00 .. 0x7F
4686    Big5 (1st byte)      0xA1 .. 0xFE
4687         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4688    --------------------------
4689
4690   */
4691
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in SJIS.

   CATEGORY_MASK_SJIS is always added to DETECT_INFO->checked.  If a
   byte that is invalid in SJIS is found, or if the source ends in the
   middle of a sequence in the last block, add CATEGORY_MASK_SJIS to
   DETECT_INFO->rejected and return 0.  Otherwise return 1; in that
   case DETECT_INFO->found gets CATEGORY_MASK_SJIS if at least one
   non-ASCII SJIS sequence was seen.  */

static int
detect_coding_sjis (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int found = 0;
  int c;
  Lisp_Object attrs, charset_list;
  int max_first_byte_of_2_byte_code;

  CODING_GET_INFO (coding, attrs, charset_list);
  /* With more than three charsets in the list, first bytes up to 0xFC
     are accepted; otherwise only up to 0xEF.  */
  max_first_byte_of_2_byte_code
    = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);

  detect_info->checked |= CATEGORY_MASK_SJIS;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      /* Remember where the current sequence starts.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0x80)
        continue;
      if ((c >= 0x81 && c <= 0x9F)
          || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
        {
          /* First byte of a two-byte code; validate the second.  */
          ONE_MORE_BYTE (c);
          if (c < 0x40 || c == 0x7F || c > 0xFC)
            break;
          found = CATEGORY_MASK_SJIS;
        }
      else if (c >= 0xA0 && c < 0xE0)
        /* Single-byte code in the katakana range.  */
        found = CATEGORY_MASK_SJIS;
      else
        break;
    }
  /* We get here only when an invalid byte was found.  */
  detect_info->rejected |= CATEGORY_MASK_SJIS;
  return 0;

  /* No explicit goto targets this label; presumably ONE_MORE_BYTE
     jumps here when the source is exhausted.  */
 no_more_source:
  /* SRC_BASE < SRC means part of a sequence was read before the
     source ran out; in the last block that is a truncated code.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_SJIS;
      return 0;
    }
  detect_info->found |= found;
  return 1;
}
4749
4750 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4751    Check if a text is encoded in BIG5.  If it is, return
4752    CATEGORY_MASK_BIG5, else return 0.  */
4753
4754 static int
4755 detect_coding_big5 (coding, detect_info)
4756      struct coding_system *coding;
4757      struct coding_detection_info *detect_info;
4758 {
4759   const unsigned char *src = coding->source, *src_base;
4760   const unsigned char *src_end = coding->source + coding->src_bytes;
4761   int multibytep = coding->src_multibyte;
4762   int consumed_chars = 0;
4763   int found = 0;
4764   int c;
4765
4766   detect_info->checked |= CATEGORY_MASK_BIG5;
4767   /* A coding system of this category is always ASCII compatible.  */
4768   src += coding->head_ascii;
4769
4770   while (1)
4771     {
4772       src_base = src;
4773       ONE_MORE_BYTE (c);
4774       if (c < 0x80)
4775         continue;
4776       if (c >= 0xA1)
4777         {
4778           ONE_MORE_BYTE (c);
4779           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4780             return 0;
4781           found = CATEGORY_MASK_BIG5;
4782         }
4783       else
4784         break;
4785     }
4786   detect_info->rejected |= CATEGORY_MASK_BIG5;
4787   return 0;
4788
4789  no_more_source:
4790   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4791     {
4792       detect_info->rejected |= CATEGORY_MASK_BIG5;
4793       return 0;
4794     }
4795   detect_info->found |= found;
4796   return 1;
4797 }
4798
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
   Decode SJIS text.

   Bytes are read from CODING->source starting at CODING->consumed and
   decoded characters are appended to CODING->charbuf.  On return
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated.  Each invalid byte is stored as a raw-byte character
   and counted in CODING->errors.  */

static void
decode_coding_sjis (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_kanji, *charset_kana;
  struct charset *charset_kanji2;
  Lisp_Object attrs, charset_list, val;
  /* CHAR_OFFSET counts decoded characters; LAST_OFFSET and LAST_ID
     describe the current run of characters of one non-ASCII
     charset.  */
  int char_offset = coding->produced_char;
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read right after a CR when EOL_CRLF, or -1 if none is
     pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);

  /* The charset list holds, in order, the roman, kana and kanji
     charsets, optionally followed by a second kanji charset.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      /* Remember the start of this sequence so that it can be
         re-read on error and reported as not consumed.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The pending byte after CR was already read from the
             source; step back so that it is not counted as
             consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else if (c == 0x80 || c == 0xA0)
        goto invalid_code;
      else if (c >= 0xA1 && c <= 0xDF)
        {
          /* SJIS -> JISX0201-Kana */
          c &= 0x7F;
          charset = charset_kana;
        }
      else if (c <= 0xEF)
        {
          /* SJIS -> JISX0208 */
          /* C is in 0x81..0x9F or 0xE0..0xEF here.  */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS (c);
          charset = charset_kanji;
        }
      else if (c <= 0xFC && charset_kanji2)
        {
          /* SJIS -> JISX0213-2 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS2 (c);
          charset = charset_kanji2;
        }
      else
        goto invalid_code;
      /* On switching to a different non-ASCII charset, close the
         previous run with a charset annotation and start a new
         run.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the sequence and produce just its
         first byte as a raw-byte character.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* Reached by falling out of the loop above; presumably also the
     target of a jump in ONE_MORE_BYTE when the source is
     exhausted.  */
 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  /* Only complete sequences are reported as consumed.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4920
/* Decode BIG5 text.

   Bytes are read from CODING->source starting at CODING->consumed and
   decoded characters are appended to CODING->charbuf.  On return
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated.  Each invalid byte is stored as a raw-byte character
   and counted in CODING->errors.  */

static void
decode_coding_big5 (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_big5;
  Lisp_Object attrs, charset_list, val;
  /* CHAR_OFFSET counts decoded characters; LAST_OFFSET and LAST_ID
     describe the current run of characters of one non-ASCII
     charset.  */
  int char_offset = coding->produced_char;
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read right after a CR when EOL_CRLF, or -1 if none is
     pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  /* The charset list holds the roman charset followed by the Big5
     charset.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      /* Remember the start of this sequence so that it can be
         re-read on error and reported as not consumed.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The pending byte after CR was already read from the
             source; step back so that it is not counted as
             consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);

      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else
        {
          /* BIG5 -> Big5 */
          if (c < 0xA1 || c > 0xFE)
            goto invalid_code;
          ONE_MORE_BYTE (c1);
          /* The second byte must be in 0x40..0x7E or 0xA1..0xFE.  */
          if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
            goto invalid_code;
          c = c << 8 | c1;
          charset = charset_big5;
        }
      /* On switching to a different non-ASCII charset, close the
         previous run with a charset annotation and start a new
         run.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the sequence and produce just its
         first byte as a raw-byte character.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* Reached by falling out of the loop above; presumably also the
     target of a jump in ONE_MORE_BYTE when the source is
     exhausted.  */
 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  /* Only complete sequences are reported as consumed.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5017
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
   Encode the characters in CODING->charbuf as SJIS text.

   The charsets handled are those in the coding system's charset
   list: a roman charset, a kana charset, a kanji charset and,
   optionally, a second kanji charset, in this order.  A character
   belonging to none of them is replaced either by
   CODING_INHIBIT_CHARACTER_SUBSTITUTION (in safe-encoding mode) or by
   CODING->default_char.

   When the end of the function is reached, the conversion result is
   recorded as CODING_RESULT_SUCCESS, CODING->produced and
   CODING->produced_char are updated, and 0 is returned.  */

static int
encode_coding_sjis (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Room requested from ASSURE_DESTINATION before each character.  */
  int safe_room = 4;
  int produced_chars = 0;
  Lisp_Object attrs, charset_list, val;
  int ascii_compatible;
  struct charset *charset_roman, *charset_kanji, *charset_kana;
  struct charset *charset_kanji2;
  int c;

  CODING_GET_INFO (coding, attrs, charset_list);
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));

  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);
      c = *charbuf++;
      /* Now encode the character C.  */
      if (ASCII_CHAR_P (c) && ascii_compatible)
        EMIT_ONE_ASCII_BYTE (c);
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw 8-bit byte is emitted as that byte.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          unsigned code;
          struct charset *charset = char_charset (c, charset_list, &code);

          if (!charset)
            {
              /* C is not encodable; substitute for it.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                {
                  code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
                  charset = CHARSET_FROM_ID (charset_ascii);
                }
              else
                {
                  c = coding->default_char;
                  charset = char_charset (c, charset_list, &code);
                }
            }
          if (code == CHARSET_INVALID_CODE (charset))
            abort ();
          if (charset == charset_kanji)
            {
              int c1, c2;
              JIS_TO_SJIS (code);
              c1 = code >> 8, c2 = code & 0xFF;
              EMIT_TWO_BYTES (c1, c2);
            }
          else if (charset == charset_kana)
            EMIT_ONE_BYTE (code | 0x80);
          else if (charset_kanji2 && charset == charset_kanji2)
            {
              int c1, c2;

              /* Only the rows tested below are converted with
                 JIS_TO_SJIS2; a code in any other row is emitted as
                 a single 7-bit byte.  */
              c1 = code >> 8;
              if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
                  || c1 == 0x28
                  || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
                {
                  JIS_TO_SJIS2 (code);
                  c1 = code >> 8, c2 = code & 0xFF;
                  EMIT_TWO_BYTES (c1, c2);
                }
              else
                EMIT_ONE_ASCII_BYTE (code & 0x7F);
            }
          else
            /* Any other charset: emit the low seven bits of CODE.  */
            EMIT_ONE_ASCII_BYTE (code & 0x7F);
        }
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5118
/* Encode the characters in CODING->charbuf as BIG5 text.

   The charsets handled are those in the coding system's charset
   list: a roman charset followed by the Big5 charset.  A character
   belonging to neither is replaced either by
   CODING_INHIBIT_CHARACTER_SUBSTITUTION (in safe-encoding mode) or by
   CODING->default_char.

   When the end of the function is reached, the conversion result is
   recorded as CODING_RESULT_SUCCESS, CODING->produced and
   CODING->produced_char are updated, and 0 is returned.  */

static int
encode_coding_big5 (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Room requested from ASSURE_DESTINATION before each character.  */
  int safe_room = 4;
  int produced_chars = 0;
  Lisp_Object attrs, charset_list, val;
  int ascii_compatible;
  struct charset *charset_roman, *charset_big5;
  int c;

  CODING_GET_INFO (coding, attrs, charset_list);
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);
      c = *charbuf++;
      /* Now encode the character C.  */
      if (ASCII_CHAR_P (c) && ascii_compatible)
        EMIT_ONE_ASCII_BYTE (c);
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw 8-bit byte is emitted as that byte.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          unsigned code;
          struct charset *charset = char_charset (c, charset_list, &code);

          if (! charset)
            {
              /* C is not encodable; substitute for it.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                {
                  code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
                  charset = CHARSET_FROM_ID (charset_ascii);
                }
              else
                {
                  c = coding->default_char;
                  charset = char_charset (c, charset_list, &code);
                }
            }
          if (code == CHARSET_INVALID_CODE (charset))
            abort ();
          if (charset == charset_big5)
            {
              int c1, c2;

              /* The high and low bytes of CODE are emitted as is.  */
              c1 = code >> 8, c2 = code & 0xFF;
              EMIT_TWO_BYTES (c1, c2);
            }
          else
            /* Any other charset: emit the low seven bits of CODE.  */
            EMIT_ONE_ASCII_BYTE (code & 0x7F);
        }
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5189
5190 \f
5191 /*** 10. CCL handlers ***/
5192
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in a coding system of which
   encoder/decoder are written in CCL program.

   CATEGORY_MASK_CCL is always added to DETECT_INFO->checked.  If a
   byte that the CCL coding category does not mark as valid is found,
   add CATEGORY_MASK_CCL to DETECT_INFO->rejected and return 0.
   Otherwise return 1; in that case DETECT_INFO->found gets
   CATEGORY_MASK_CCL if at least one byte whose validity value is
   greater than 1 was seen.  */

static int
detect_coding_ccl (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int found = 0;
  unsigned char *valids;
  /* Taken from the CODING passed in, before CODING is redirected
     below.  */
  int head_ascii = coding->head_ascii;
  Lisp_Object attrs;

  detect_info->checked |= CATEGORY_MASK_CCL;

  /* From here on CODING refers to the coding system registered for
     the CCL category, not to the argument; the source pointers above
     still refer to the argument's text.  */
  coding = &coding_categories[coding_category_ccl];
  valids = CODING_CCL_VALIDS (coding);
  attrs = CODING_ID_ATTRS (coding->id);
  /* The leading ASCII part can be skipped only if that coding system
     is ASCII compatible.  */
  if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
    src += head_ascii;

  while (1)
    {
      int c;

      src_base = src;
      ONE_MORE_BYTE (c);
      /* VALIDS is indexed by byte value; zero means invalid.  */
      if (c < 0 || ! valids[c])
        break;
      if ((valids[c] > 1))
        found = CATEGORY_MASK_CCL;
    }
  /* We get here only when an invalid byte was found.  */
  detect_info->rejected |= CATEGORY_MASK_CCL;
  return 0;

  /* No explicit goto targets this label; presumably ONE_MORE_BYTE
     jumps here when the source is exhausted.  */
 no_more_source:
  detect_info->found |= found;
  return 1;
}
5238
/* Decode the source text of CODING by running its CCL program.

   The source is fed to ccl_driver in chunks of at most 1024 elements
   and the output is appended to CODING->charbuf.  The final CCL
   status is translated into a conversion result, and
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated.  */

static void
decode_coding_ccl (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  int consumed_chars = 0;
  int multibytep = coding->src_multibyte;
  struct ccl_program *ccl = &coding->spec.ccl->ccl;
  /* One chunk of input for the CCL program.  */
  int source_charbuf[1024];
  /* For a multibyte source, SOURCE_BYTEIDX[I] is the byte offset from
     SRC of the Ith element of SOURCE_CHARBUF; one extra slot holds
     the offset just past the chunk.  */
  int source_byteidx[1025];
  Lisp_Object attrs, charset_list;

  CODING_GET_INFO (coding, attrs, charset_list);

  while (1)
    {
      const unsigned char *p = src;
      int i = 0;

      /* Fill SOURCE_CHARBUF with characters (multibyte source) or
         bytes (unibyte source).  */
      if (multibytep)
        {
          while (i < 1024 && p < src_end)
            {
              source_byteidx[i] = p - src;
              source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
            }
          source_byteidx[i] = p - src;
        }
      else
        while (i < 1024 && p < src_end)
          source_charbuf[i++] = *p++;

      /* Tell the CCL program when this chunk reaches the end of the
         last block.  */
      if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
        ccl->last_block = 1;
      ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
                  charset_list);
      charbuf += ccl->produced;
      /* Advance SRC by what the program consumed, converting the
         element count to bytes for a multibyte source.  */
      if (multibytep)
        src += source_byteidx[ccl->consumed];
      else
        src += ccl->consumed;
      consumed_chars += ccl->consumed;
      /* Go on only if there is more source and the program stopped
         for lack of it.  */
      if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
        break;
    }

  switch (ccl->status)
    {
    case CCL_STAT_SUSPEND_BY_SRC:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
      break;
    case CCL_STAT_SUSPEND_BY_DST:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
      break;
    case CCL_STAT_QUIT:
    case CCL_STAT_INVALID_CMD:
      record_conversion_result (coding, CODING_RESULT_INTERRUPT);
      break;
    default:
      record_conversion_result (coding, CODING_RESULT_SUCCESS);
      break;
    }
  coding->consumed_char += consumed_chars;
  coding->consumed = src - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5308
/* Encode the characters in CODING->charbuf by running the CCL program
   of CODING.

   The program writes into a temporary buffer of 1024 elements, and
   the low eight bits of each element are then emitted to
   CODING->destination.  The final CCL status is translated into a
   conversion result, CODING->produced and CODING->produced_char are
   updated, and 0 is returned.  */

static int
encode_coding_ccl (coding)
     struct coding_system *coding;
{
  struct ccl_program *ccl = &coding->spec.ccl->ccl;
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Output of one run of the CCL program.  */
  int destination_charbuf[1024];
  int i, produced_chars = 0;
  Lisp_Object attrs, charset_list;

  CODING_GET_INFO (coding, attrs, charset_list);
  /* Tell the CCL program when the whole source has been consumed and
     this is the last block.  */
  if (coding->consumed_char == coding->src_chars
      && coding->mode & CODING_MODE_LAST_BLOCK)
    ccl->last_block = 1;

  while (charbuf < charbuf_end)
    {
      ccl_driver (ccl, charbuf, destination_charbuf,
                  charbuf_end - charbuf, 1024, charset_list);
      if (multibytep)
        {
          /* Twice the produced count is reserved here, presumably
             because a byte may take two bytes in a multibyte
             destination.  */
          ASSURE_DESTINATION (ccl->produced * 2);
          for (i = 0; i < ccl->produced; i++)
            EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
        }
      else
        {
          ASSURE_DESTINATION (ccl->produced);
          for (i = 0; i < ccl->produced; i++)
            *dst++ = destination_charbuf[i] & 0xFF;
          produced_chars += ccl->produced;
        }
      charbuf += ccl->consumed;
      /* Stop on a quit or an invalid command.  */
      if (ccl->status == CCL_STAT_QUIT
          || ccl->status == CCL_STAT_INVALID_CMD)
        break;
    }

  switch (ccl->status)
    {
    case CCL_STAT_SUSPEND_BY_SRC:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
      break;
    case CCL_STAT_SUSPEND_BY_DST:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
      break;
    case CCL_STAT_QUIT:
    case CCL_STAT_INVALID_CMD:
      record_conversion_result (coding, CODING_RESULT_INTERRUPT);
      break;
    default:
      record_conversion_result (coding, CODING_RESULT_SUCCESS);
      break;
    }

  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5372
5373
5374 \f
5375 /*** 10, 11. no-conversion handlers ***/
5376
5377 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5378
5379 static void
5380 decode_coding_raw_text (coding)
5381      struct coding_system *coding;
5382 {
5383   int eol_crlf =
5384     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5385
5386   coding->chars_at_source = 1;
5387   coding->consumed_char = coding->src_chars;
5388   coding->consumed = coding->src_bytes;
5389   if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
5390     {
5391       coding->consumed_char--;
5392       coding->consumed--;
5393       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5394     }
5395   else
5396     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5397 }
5398
5399 static int
5400 encode_coding_raw_text (coding)
5401      struct coding_system *coding;
5402 {
5403   int multibytep = coding->dst_multibyte;
5404   int *charbuf = coding->charbuf;
5405   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5406   unsigned char *dst = coding->destination + coding->produced;
5407   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5408   int produced_chars = 0;
5409   int c;
5410
5411   if (multibytep)
5412     {
5413       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5414
5415       if (coding->src_multibyte)
5416         while (charbuf < charbuf_end)
5417           {
5418             ASSURE_DESTINATION (safe_room);
5419             c = *charbuf++;
5420             if (ASCII_CHAR_P (c))
5421               EMIT_ONE_ASCII_BYTE (c);
5422             else if (CHAR_BYTE8_P (c))
5423               {
5424                 c = CHAR_TO_BYTE8 (c);
5425                 EMIT_ONE_BYTE (c);
5426               }
5427             else
5428               {
5429                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5430
5431                 CHAR_STRING_ADVANCE (c, p1);
5432                 while (p0 < p1)
5433                   {
5434                     EMIT_ONE_BYTE (*p0);
5435                     p0++;
5436                   }
5437               }
5438           }
5439       else
5440         while (charbuf < charbuf_end)
5441           {
5442             ASSURE_DESTINATION (safe_room);
5443             c = *charbuf++;
5444             EMIT_ONE_BYTE (c);
5445           }
5446     }
5447   else
5448     {
5449       if (coding->src_multibyte)
5450         {
5451           int safe_room = MAX_MULTIBYTE_LENGTH;
5452
5453           while (charbuf < charbuf_end)
5454             {
5455               ASSURE_DESTINATION (safe_room);
5456               c = *charbuf++;
5457               if (ASCII_CHAR_P (c))
5458                 *dst++ = c;
5459               else if (CHAR_BYTE8_P (c))
5460                 *dst++ = CHAR_TO_BYTE8 (c);
5461               else
5462                 CHAR_STRING_ADVANCE (c, dst);
5463             }
5464         }
5465       else
5466         {
5467           ASSURE_DESTINATION (charbuf_end - charbuf);
5468           while (charbuf < charbuf_end && dst < dst_end)
5469             *dst++ = *charbuf++;
5470         }
5471       produced_chars = dst - (coding->destination + coding->produced);
5472     }
5473   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5474   coding->produced_char += produced_chars;
5475   coding->produced = dst - coding->destination;
5476   return 0;
5477 }
5478
5479 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5480    Check if a text is encoded in a charset-based coding system.  If it
5481    is, return 1, else return 0.  */
5482
5483 static int
5484 detect_coding_charset (coding, detect_info)
5485      struct coding_system *coding;
5486      struct coding_detection_info *detect_info;
5487 {
5488   const unsigned char *src = coding->source, *src_base;
5489   const unsigned char *src_end = coding->source + coding->src_bytes;
5490   int multibytep = coding->src_multibyte;
5491   int consumed_chars = 0;
5492   Lisp_Object attrs, valids, name;
5493   int found = 0;
5494   int head_ascii = coding->head_ascii;
5495   int check_latin_extra = 0;
5496
5497   detect_info->checked |= CATEGORY_MASK_CHARSET;
5498
5499   coding = &coding_categories[coding_category_charset];
5500   attrs = CODING_ID_ATTRS (coding->id);
5501   valids = AREF (attrs, coding_attr_charset_valids);
5502   name = CODING_ID_NAME (coding->id);
5503   if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5504                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5505       || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5506                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5507     check_latin_extra = 1;
5508
5509   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5510     src += head_ascii;
5511
5512   while (1)
5513     {
5514       int c;
5515       Lisp_Object val;
5516       struct charset *charset;
5517       int dim, idx;
5518
5519       src_base = src;
5520       ONE_MORE_BYTE (c);
5521       if (c < 0)
5522         continue;
5523       val = AREF (valids, c);
5524       if (NILP (val))
5525         break;
5526       if (c >= 0x80)
5527         {
5528           if (c < 0xA0
5529               && check_latin_extra
5530               && (!VECTORP (Vlatin_extra_code_table)
5531                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5532             break;
5533           found = CATEGORY_MASK_CHARSET;
5534         }
5535       if (INTEGERP (val))
5536         {
5537           charset = CHARSET_FROM_ID (XFASTINT (val));
5538           dim = CHARSET_DIMENSION (charset);
5539           for (idx = 1; idx < dim; idx++)
5540             {
5541               if (src == src_end)
5542                 goto too_short;
5543               ONE_MORE_BYTE (c);
5544               if (c < charset->code_space[(dim - 1 - idx) * 2]
5545                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5546                 break;
5547             }
5548           if (idx < dim)
5549             break;
5550         }
5551       else
5552         {
5553           idx = 1;
5554           for (; CONSP (val); val = XCDR (val))
5555             {
5556               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5557               dim = CHARSET_DIMENSION (charset);
5558               while (idx < dim)
5559                 {
5560                   if (src == src_end)
5561                     goto too_short;
5562                   ONE_MORE_BYTE (c);
5563                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5564                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5565                     break;
5566                   idx++;
5567                 }
5568               if (idx == dim)
5569                 {
5570                   val = Qnil;
5571                   break;
5572                 }
5573             }
5574           if (CONSP (val))
5575             break;
5576         }
5577     }
5578  too_short:
5579   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5580   return 0;
5581
5582  no_more_source:
5583   detect_info->found |= found;
5584   return 1;
5585 }
5586
5587 static void
5588 decode_coding_charset (coding)
5589      struct coding_system *coding;
5590 {
       /* Decode a charset-based byte sequence.  Bytes are read from
          CODING->source starting at offset CODING->consumed; each lead
          byte is looked up in the coding system's `valids' table
          (coding_attr_charset_valids) to find the charset(s) to try, and
          the decoded characters are appended to CODING->charbuf.  A byte
          sequence that cannot be decoded contributes its first byte as
          one character and bumps CODING->errors.  On exit
          CODING->consumed_char, CODING->consumed and CODING->charbuf_used
          are updated.  */
5591   const unsigned char *src = coding->source + coding->consumed;
5592   const unsigned char *src_end = coding->source + coding->src_bytes;
5593   const unsigned char *src_base;
5594   int *charbuf = coding->charbuf + coding->charbuf_used;
5595   /* We may produce one charset annotation in one loop and one more at
5596      the end.  */
5597   int *charbuf_end
5598     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5599   int consumed_chars = 0, consumed_chars_base;
5600   int multibytep = coding->src_multibyte;
5601   Lisp_Object attrs, charset_list, valids;
5602   int char_offset = coding->produced_char;
       /* LAST_ID / LAST_OFFSET describe the current run of characters
          that share one non-ASCII charset; charset_ascii means no run is
          open.  */
5603   int last_offset = char_offset;
5604   int last_id = charset_ascii;
5605   int eol_crlf =
5606     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_CRLF is set; -1 when none is
          pending.  */
5607   int byte_after_cr = -1;
5608
5609   CODING_GET_INFO (coding, attrs, charset_list);
5610   valids = AREF (attrs, coding_attr_charset_valids);
5611
5612   while (1)
5613     {
5614       int c;
5615       Lisp_Object val;
5616       struct charset *charset;
5617       int dim;
5618       int len = 1;
5619       unsigned code;
5620
           /* Remember where this character starts: the invalid_code path
              rewinds to it, and the bookkeeping after the loop reports
              consumption up to it.  */
5621       src_base = src;
5622       consumed_chars_base = consumed_chars;
5623
5624       if (charbuf >= charbuf_end)
5625         {
               /* Output buffer is full.  A pending look-ahead byte has
                  been read but not decoded, so step SRC_BASE back to
                  leave it unconsumed.  */
5626           if (byte_after_cr >= 0)
5627             src_base--;
5628           break;
5629         }
5630
5631       if (byte_after_cr >= 0)
5632         {
               /* Use the byte that was read ahead on the previous
                  iteration.  */
5633           c = byte_after_cr;
5634           byte_after_cr = -1;
5635         }
5636       else
5637         {
5638           ONE_MORE_BYTE (c);
               /* With DOS eol conversion, also fetch the byte following a
                  CR; it becomes C on the next iteration.  */
5639           if (eol_crlf && c == '\r')
5640             ONE_MORE_BYTE (byte_after_cr);
5641         }
5642       if (c < 0)
5643         goto invalid_code;
5644       code = c;
5645
           /* VAL is either one charset ID (an integer) or a list of
              charset IDs; anything else marks the lead byte as invalid.  */
5646       val = AREF (valids, c);
5647       if (! INTEGERP (val) && ! CONSP (val))
5648         goto invalid_code;
5649       if (INTEGERP (val))
5650         {
5651           charset = CHARSET_FROM_ID (XFASTINT (val));
5652           dim = CHARSET_DIMENSION (charset);
               /* Gather the remaining DIM - 1 bytes into CODE, earlier
                  bytes in the more significant positions.  */
5653           while (len < dim)
5654             {
5655               ONE_MORE_BYTE (c);
5656               code = (code << 8) | c;
5657               len++;
5658             }
5659           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5660                               charset, code, c);
5661         }
5662       else
5663         {
5664           /* VAL is a list of charset IDs.  It is assured that the
5665              list is sorted by charset dimensions (smaller one
5666              comes first).  */
               /* LEN and CODE carry over between candidates, so a later
                  charset only reads the extra bytes it needs.  The first
                  candidate that decodes to a non-negative C wins.  */
5667           while (CONSP (val))
5668             {
5669               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5670               dim = CHARSET_DIMENSION (charset);
5671               while (len < dim)
5672                 {
5673                   ONE_MORE_BYTE (c);
5674                   code = (code << 8) | c;
5675                   len++;
5676                 }
5677               CODING_DECODE_CHAR (coding, src, src_base,
5678                                   src_end, charset, code, c);
5679               if (c >= 0)
5680                 break;
5681               val = XCDR (val);
5682             }
5683         }
5684       if (c < 0)
5685         goto invalid_code;
           /* Switching to a different non-ASCII charset: record an
              annotation for the run that just ended (if one was open) and
              start a new run at CHAR_OFFSET.  */
5686       if (charset->id != charset_ascii
5687           && last_id != charset->id)
5688         {
5689           if (last_id != charset_ascii)
5690             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5691           last_id = charset->id;
5692           last_offset = char_offset;
5693         }
5694
5695       *charbuf++ = c;
5696       char_offset++;
5697       continue;
5698
5699     invalid_code:
           /* Rewind to the start of the sequence and emit only its first
              byte as one character: a negative C is stored negated, an
              ASCII byte as itself, any other byte via BYTE8_TO_CHAR.  */
5700       src = src_base;
5701       consumed_chars = consumed_chars_base;
5702       ONE_MORE_BYTE (c);
5703       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5704       char_offset++;
5705       coding->errors++;
5706     }
5707
       /* Reached by the `break' above.  The label is not referenced
          explicitly in this function; presumably ONE_MORE_BYTE jumps here
          when the source runs out (confirm against the macro).  */
5708  no_more_source:
       /* Close the charset run that is still open, then report
          consumption up to the start of the last unprocessed sequence.  */
5709   if (last_id != charset_ascii)
5710     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5711   coding->consumed_char += consumed_chars_base;
5712   coding->consumed = src_base - coding->source;
5713   coding->charbuf_used = charbuf - coding->charbuf;
5714 }
5715
5716 static int
5717 encode_coding_charset (coding)
5718      struct coding_system *coding;
5719 {
5720   int multibytep = coding->dst_multibyte;
5721   int *charbuf = coding->charbuf;
5722   int *charbuf_end = charbuf + coding->charbuf_used;
5723   unsigned char *dst = coding->destination + coding->produced;
5724   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5725   int safe_room = MAX_MULTIBYTE_LENGTH;
5726   int produced_chars = 0;
5727   Lisp_Object attrs, charset_list;
5728   int ascii_compatible;
5729   int c;
5730
5731   CODING_GET_INFO (coding, attrs, charset_list);
5732   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5733
5734   while (charbuf < charbuf_end)
5735     {
5736       struct charset *charset;
5737       unsigned code;
5738
5739       ASSURE_DESTINATION (safe_room);
5740       c = *charbuf++;
5741       if (ascii_compatible && ASCII_CHAR_P (c))
5742         EMIT_ONE_ASCII_BYTE (c);
5743       else if (CHAR_BYTE8_P (c))
5744         {
5745           c = CHAR_TO_BYTE8 (c);
5746           EMIT_ONE_BYTE (c);
5747         }
5748       else
5749         {
5750           charset = char_charset (c, charset_list, &code);
5751           if (charset)
5752             {
5753               if (CHARSET_DIMENSION (charset) == 1)
5754                 EMIT_ONE_BYTE (code);
5755               else if (CHARSET_DIMENSION (charset) == 2)
5756                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5757               else if (CHARSET_DIMENSION (charset) == 3)
5758                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5759               else
5760                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5761                                  (code >> 8) & 0xFF, code & 0xFF);
5762             }
5763           else
5764             {
5765               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5766                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5767               else
5768                 c = coding->default_char;
5769               EMIT_ONE_BYTE (c);
5770             }
5771         }
5772     }
5773
5774   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5775   coding->produced_char += produced_chars;
5776   coding->produced = dst - coding->destination;
5777   return 0;
5778 }
5779
5780 \f
5781 /*** 10. C library functions ***/
5782
5783 /* Setup coding context CODING from information about CODING_SYSTEM.
5784    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5785    CODING_SYSTEM is invalid, signal an error.  */
5786
5787 void
5788 setup_coding_system (coding_system, coding)
5789      Lisp_Object coding_system;
5790      struct coding_system *coding;
5791 {
5792   Lisp_Object attrs;
5793   Lisp_Object eol_type;
5794   Lisp_Object coding_type;
5795   Lisp_Object val;
5796
5797   if (NILP (coding_system))
5798     coding_system = Qundecided;
5799
5800   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5801
5802   attrs = CODING_ID_ATTRS (coding->id);
5803   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5804
5805   coding->mode = 0;
5806   coding->head_ascii = -1;
5807   if (VECTORP (eol_type))
5808     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5809                             | CODING_REQUIRE_DETECTION_MASK);
5810   else if (! EQ (eol_type, Qunix))
5811     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5812                             | CODING_REQUIRE_ENCODING_MASK);
5813   else
5814     coding->common_flags = 0;
5815   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5816     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5817   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5818     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5819   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5820     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5821
5822   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5823   coding->max_charset_id = SCHARS (val) - 1;
5824   coding->safe_charsets = SDATA (val);
5825   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5826   coding->carryover_bytes = 0;
5827
5828   coding_type = CODING_ATTR_TYPE (attrs);
5829   if (EQ (coding_type, Qundecided))
5830     {
5831       coding->detector = NULL;
5832       coding->decoder = decode_coding_raw_text;
5833       coding->encoder = encode_coding_raw_text;
5834       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5835     }
5836   else if (EQ (coding_type, Qiso_2022))
5837     {
5838       int i;
5839       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5840
5841       /* Invoke graphic register 0 to plane 0.  */
5842       CODING_ISO_INVOCATION (coding, 0) = 0;
5843       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5844       CODING_ISO_INVOCATION (coding, 1)
5845         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5846       /* Setup the initial status of designation.  */
5847       for (i = 0; i < 4; i++)
5848         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5849       /* Not single shifting initially.  */
5850       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5851       /* Beginning of buffer should also be regarded as bol. */
5852       CODING_ISO_BOL (coding) = 1;
5853       coding->detector = detect_coding_iso_2022;
5854       coding->decoder = decode_coding_iso_2022;
5855       coding->encoder = encode_coding_iso_2022;
5856       if (flags & CODING_ISO_FLAG_SAFE)
5857         coding->mode |= CODING_MODE_SAFE_ENCODING;
5858       coding->common_flags
5859         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5860             | CODING_REQUIRE_FLUSHING_MASK);
5861       if (flags & CODING_ISO_FLAG_COMPOSITION)
5862         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5863       if (flags & CODING_ISO_FLAG_DESIGNATION)
5864         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5865       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5866         {
5867           setup_iso_safe_charsets (attrs);
5868           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5869           coding->max_charset_id = SCHARS (val) - 1;
5870           coding->safe_charsets = SDATA (val);
5871         }
5872       CODING_ISO_FLAGS (coding) = flags;
5873       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5874       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5875       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5876       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5877     }
5878   else if (EQ (coding_type, Qcharset))
5879     {
5880       coding->detector = detect_coding_charset;
5881       coding->decoder = decode_coding_charset;
5882       coding->encoder = encode_coding_charset;
5883       coding->common_flags
5884         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5885     }
5886   else if (EQ (coding_type, Qutf_8))
5887     {
5888       val = AREF (attrs, coding_attr_utf_bom);
5889       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5890                                    : EQ (val, Qt) ? utf_with_bom
5891                                    : utf_without_bom);
5892       coding->detector = detect_coding_utf_8;
5893       coding->decoder = decode_coding_utf_8;
5894       coding->encoder = encode_coding_utf_8;
5895       coding->common_flags
5896         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5897       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5898         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5899     }
5900   else if (EQ (coding_type, Qutf_16))
5901     {
5902       val = AREF (attrs, coding_attr_utf_bom);
5903       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5904                                     : EQ (val, Qt) ? utf_with_bom
5905                                     : utf_without_bom);
5906       val = AREF (attrs, coding_attr_utf_16_endian);
5907       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5908                                        : utf_16_little_endian);
5909       CODING_UTF_16_SURROGATE (coding) = 0;
5910       coding->detector = detect_coding_utf_16;
5911       coding->decoder = decode_coding_utf_16;
5912       coding->encoder = encode_coding_utf_16;
5913       coding->common_flags
5914         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5915       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5916         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5917     }
5918   else if (EQ (coding_type, Qccl))
5919     {
5920       coding->detector = detect_coding_ccl;
5921       coding->decoder = decode_coding_ccl;
5922       coding->encoder = encode_coding_ccl;
5923       coding->common_flags
5924         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5925             | CODING_REQUIRE_FLUSHING_MASK);
5926     }
5927   else if (EQ (coding_type, Qemacs_mule))
5928     {
5929       coding->detector = detect_coding_emacs_mule;
5930       coding->decoder = decode_coding_emacs_mule;
5931       coding->encoder = encode_coding_emacs_mule;
5932       coding->common_flags
5933         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5934       coding->spec.emacs_mule.full_support = 1;
5935       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5936           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5937         {
5938           Lisp_Object tail, safe_charsets;
5939           int max_charset_id = 0;
5940
5941           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5942                tail = XCDR (tail))
5943             if (max_charset_id < XFASTINT (XCAR (tail)))
5944               max_charset_id = XFASTINT (XCAR (tail));
5945           safe_charsets = make_uninit_string (max_charset_id + 1);
5946           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5947           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5948                tail = XCDR (tail))
5949             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5950           coding->max_charset_id = max_charset_id;
5951           coding->safe_charsets = SDATA (safe_charsets);
5952           coding->spec.emacs_mule.full_support = 1;
5953         }
5954       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5955       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5956     }
5957   else if (EQ (coding_type, Qshift_jis))
5958     {
5959       coding->detector = detect_coding_sjis;
5960       coding->decoder = decode_coding_sjis;
5961       coding->encoder = encode_coding_sjis;
5962       coding->common_flags
5963         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5964     }
5965   else if (EQ (coding_type, Qbig5))
5966     {
5967       coding->detector = detect_coding_big5;
5968       coding->decoder = decode_coding_big5;
5969       coding->encoder = encode_coding_big5;
5970       coding->common_flags
5971         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5972     }
5973   else                          /* EQ (coding_type, Qraw_text) */
5974     {
5975       coding->detector = NULL;
5976       coding->decoder = decode_coding_raw_text;
5977       coding->encoder = encode_coding_raw_text;
5978       if (! EQ (eol_type, Qunix))
5979         {
5980           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5981           if (! VECTORP (eol_type))
5982             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5983         }
5984
5985     }
5986
5987   return;
5988 }
5989
5990 /* Return a list of charsets supported by CODING.  */
5991
5992 Lisp_Object
5993 coding_charset_list (coding)
5994      struct coding_system *coding;
5995 {
5996   Lisp_Object attrs, charset_list;
5997
5998   CODING_GET_INFO (coding, attrs, charset_list);
5999   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
6000     {
6001       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
6002
6003       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
6004         charset_list = Viso_2022_charset_list;
6005     }
6006   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6007     {
6008       charset_list = Vemacs_mule_charset_list;
6009     }
6010   return charset_list;
6011 }
6012
6013
6014 /* Return a list of charsets supported by CODING-SYSTEM.  */
6015
6016 Lisp_Object
6017 coding_system_charset_list (coding_system)
6018      Lisp_Object coding_system;
6019 {
6020   int id;
6021   Lisp_Object attrs, charset_list;
6022
6023   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
6024   attrs = CODING_ID_ATTRS (id);
6025
6026   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
6027     {
6028       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
6029
6030       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
6031         charset_list = Viso_2022_charset_list;
6032       else
6033         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6034     }
6035   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6036     {
6037       charset_list = Vemacs_mule_charset_list;
6038     }
6039   else
6040     {
6041       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6042     }
6043   return charset_list;
6044 }
6045
6046
6047 /* Return raw-text or one of its subsidiaries that has the same
6048    eol_type as CODING-SYSTEM.  */
6049
6050 Lisp_Object
6051 raw_text_coding_system (coding_system)
6052      Lisp_Object coding_system;
6053 {
6054   Lisp_Object spec, attrs;
6055   Lisp_Object eol_type, raw_text_eol_type;
6056
6057   if (NILP (coding_system))
6058     return Qraw_text;
6059   spec = CODING_SYSTEM_SPEC (coding_system);
6060   attrs = AREF (spec, 0);
6061
6062   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6063     return coding_system;
6064
6065   eol_type = AREF (spec, 2);
6066   if (VECTORP (eol_type))
6067     return Qraw_text;
6068   spec = CODING_SYSTEM_SPEC (Qraw_text);
6069   raw_text_eol_type = AREF (spec, 2);
6070   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6071           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6072           : AREF (raw_text_eol_type, 2));
6073 }
6074
6075
6076 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6077    the subsidiary that has the same eol-spec as PARENT (if it is not
6078    nil and specifies end-of-line format) or the system's setting
6079    (system_eol_type).  */
6080
6081 Lisp_Object
6082 coding_inherit_eol_type (coding_system, parent)
6083      Lisp_Object coding_system, parent;
6084 {
6085   Lisp_Object spec, eol_type;
6086
6087   if (NILP (coding_system))
6088     coding_system = Qraw_text;
6089   spec = CODING_SYSTEM_SPEC (coding_system);
6090   eol_type = AREF (spec, 2);
6091   if (VECTORP (eol_type))
6092     {
6093       Lisp_Object parent_eol_type;
6094
6095       if (! NILP (parent))
6096         {
6097           Lisp_Object parent_spec;
6098
6099           parent_spec = CODING_SYSTEM_SPEC (parent);
6100           parent_eol_type = AREF (parent_spec, 2);
6101           if (VECTORP (parent_eol_type))
6102             parent_eol_type = system_eol_type;
6103         }
6104       else
6105         parent_eol_type = system_eol_type;
6106       if (EQ (parent_eol_type, Qunix))
6107         coding_system = AREF (eol_type, 0);
6108       else if (EQ (parent_eol_type, Qdos))
6109         coding_system = AREF (eol_type, 1);
6110       else if (EQ (parent_eol_type, Qmac))
6111         coding_system = AREF (eol_type, 2);
6112     }
6113   return coding_system;
6114 }
6115
6116
6117 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6118    decided for writing to a process.  If not, complement them, and
6119    return a new coding system.  */
6120
6121 Lisp_Object
6122 complement_process_encoding_system (coding_system)
6123      Lisp_Object coding_system;
6124 {
6125   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6126   Lisp_Object spec, attrs;
6127   int i;
6128
6129   for (i = 0; i < 3; i++)
6130     {
6131       if (i == 1)
6132         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6133       else if (i == 2)
6134         coding_system = preferred_coding_system ();
6135       spec = CODING_SYSTEM_SPEC (coding_system);
6136       if (NILP (spec))
6137         continue;
6138       attrs = AREF (spec, 0);
6139       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6140         coding_base = CODING_ATTR_BASE_NAME (attrs);
6141       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6142         eol_base = coding_system;
6143       if (! NILP (coding_base) && ! NILP (eol_base))
6144         break;
6145     }
6146
6147   if (i > 0)
6148     /* The original CODING_SYSTEM didn't specify text-conversion or
6149        eol-conversion.  Be sure that we return a fully complemented
6150        coding system.  */
6151     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6152   return coding_system;
6153 }
6154
6155
6156 /* Emacs has a mechanism to automatically detect a coding system if it
6157    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6158    it's impossible to distinguish some coding systems accurately
6159    because they use the same range of codes.  So, at first, coding
6160    systems are categorized as follows:
6161
6162    o coding-category-emacs-mule
6163
6164         The category for a coding system which has the same code range
6165         as Emacs' internal format.  Assigned the coding-system (Lisp
6166         symbol) `emacs-mule' by default.
6167
6168    o coding-category-sjis
6169
6170         The category for a coding system which has the same code range
6171         as SJIS.  Assigned the coding-system (Lisp
6172         symbol) `japanese-shift-jis' by default.
6173
6174    o coding-category-iso-7
6175
6176         The category for a coding system which has the same code range
6177         as ISO2022 of 7-bit environment.  This doesn't use any locking
6178         shift and single shift functions.  This can encode/decode all
6179         charsets.  Assigned the coding-system (Lisp symbol)
6180         `iso-2022-7bit' by default.
6181
6182    o coding-category-iso-7-tight
6183
6184         Same as coding-category-iso-7 except that this can
6185         encode/decode only the specified charsets.
6186
6187    o coding-category-iso-8-1
6188
6189         The category for a coding system which has the same code range
6190         as ISO2022 of 8-bit environment and graphic plane 1 used only
6191         for DIMENSION1 charset.  This doesn't use any locking shift
6192         and single shift functions.  Assigned the coding-system (Lisp
6193         symbol) `iso-latin-1' by default.
6194
6195    o coding-category-iso-8-2
6196
6197         The category for a coding system which has the same code range
6198         as ISO2022 of 8-bit environment and graphic plane 1 used only
6199         for DIMENSION2 charset.  This doesn't use any locking shift
6200         and single shift functions.  Assigned the coding-system (Lisp
6201         symbol) `japanese-iso-8bit' by default.
6202
6203    o coding-category-iso-7-else
6204
6205         The category for a coding system which has the same code range
6206         as ISO2022 of 7-bit environment but uses locking shift or
6207         single shift functions.  Assigned the coding-system (Lisp
6208         symbol) `iso-2022-7bit-lock' by default.
6209
6210    o coding-category-iso-8-else
6211
6212         The category for a coding system which has the same code range
6213         as ISO2022 of 8-bit environment but uses locking shift or
6214         single shift functions.  Assigned the coding-system (Lisp
6215         symbol) `iso-2022-8bit-ss2' by default.
6216
6217    o coding-category-big5
6218
6219         The category for a coding system which has the same code range
6220         as BIG5.  Assigned the coding-system (Lisp symbol)
6221         `cn-big5' by default.
6222
6223    o coding-category-utf-8
6224
6225         The category for a coding system which has the same code range
6226         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6227         symbol) `utf-8' by default.
6228
6229    o coding-category-utf-16-be
6230
6231         The category for a coding system in which a text has an
6232         Unicode signature (cf. Unicode Standard) in the order of BIG
6233         endian at the head.  Assigned the coding-system (Lisp symbol)
6234         `utf-16-be' by default.
6235
6236    o coding-category-utf-16-le
6237
6238         The category for a coding system in which a text has an
6239         Unicode signature (cf. Unicode Standard) in the order of
6240         LITTLE endian at the head.  Assigned the coding-system (Lisp
6241         symbol) `utf-16-le' by default.
6242
6243    o coding-category-ccl
6244
6245         The category for a coding system of which encoder/decoder is
6246         written in CCL programs.  The default value is nil, i.e., no
6247         coding system is assigned.
6248
6249    o coding-category-binary
6250
6251         The category for a coding system not categorized in any of the
6252         above.  Assigned the coding-system (Lisp symbol)
6253         `no-conversion' by default.
6254
6255    Each of them is a Lisp symbol and the value is an actual
6256    `coding-system's (this is also a Lisp symbol) assigned by a user.
6257    What Emacs does actually is to detect a category of coding system.
6258    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6259    decide only one possible category, it selects a category of the
6260    highest priority.  Priorities of categories are also specified by a
6261    user in a Lisp variable `coding-category-list'.
6262
6263 */
6264
/* Flags describing which end-of-line formats were seen in a text.
   The non-zero values are distinct bits, so several of them can be
   accumulated with bitwise OR (as decode_eol does) and tested with
   bitwise AND (as adjust_coding_eol_type does).  */
#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     1
#define EOL_SEEN_CR     2
#define EOL_SEEN_CRLF   4
6269
6270 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6271    SOURCE is encoded.  If CATEGORY is one of
6272    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6273    two-byte, else they are encoded by one-byte.
6274
6275    Return one of EOL_SEEN_XXX.  */
6276
6277 #define MAX_EOL_CHECK_COUNT 3
6278
6279 static int
6280 detect_eol (source, src_bytes, category)
6281      const unsigned char *source;
6282      EMACS_INT src_bytes;
6283      enum coding_category category;
6284 {
6285   const unsigned char *src = source, *src_end = src + src_bytes;
6286   unsigned char c;
6287   int total  = 0;
6288   int eol_seen = EOL_SEEN_NONE;
6289
6290   if ((1 << category) & CATEGORY_MASK_UTF_16)
6291     {
6292       int msb, lsb;
6293
6294       msb = category == (coding_category_utf_16_le
6295                          | coding_category_utf_16_le_nosig);
6296       lsb = 1 - msb;
6297
6298       while (src + 1 < src_end)
6299         {
6300           c = src[lsb];
6301           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6302             {
6303               int this_eol;
6304
6305               if (c == '\n')
6306                 this_eol = EOL_SEEN_LF;
6307               else if (src + 3 >= src_end
6308                        || src[msb + 2] != 0
6309                        || src[lsb + 2] != '\n')
6310                 this_eol = EOL_SEEN_CR;
6311               else
6312                 {
6313                   this_eol = EOL_SEEN_CRLF;
6314                   src += 2;
6315                 }
6316
6317               if (eol_seen == EOL_SEEN_NONE)
6318                 /* This is the first end-of-line.  */
6319                 eol_seen = this_eol;
6320               else if (eol_seen != this_eol)
6321                 {
6322                   /* The found type is different from what found before.
6323                      Allow for stray ^M characters in DOS EOL files.  */
6324                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6325                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6326                     eol_seen = EOL_SEEN_CRLF;
6327                   else
6328                     {
6329                       eol_seen = EOL_SEEN_LF;
6330                       break;
6331                     }
6332                 }
6333               if (++total == MAX_EOL_CHECK_COUNT)
6334                 break;
6335             }
6336           src += 2;
6337         }
6338     }
6339   else
6340     {
6341       while (src < src_end)
6342         {
6343           c = *src++;
6344           if (c == '\n' || c == '\r')
6345             {
6346               int this_eol;
6347
6348               if (c == '\n')
6349                 this_eol = EOL_SEEN_LF;
6350               else if (src >= src_end || *src != '\n')
6351                 this_eol = EOL_SEEN_CR;
6352               else
6353                 this_eol = EOL_SEEN_CRLF, src++;
6354
6355               if (eol_seen == EOL_SEEN_NONE)
6356                 /* This is the first end-of-line.  */
6357                 eol_seen = this_eol;
6358               else if (eol_seen != this_eol)
6359                 {
6360                   /* The found type is different from what found before.
6361                      Allow for stray ^M characters in DOS EOL files.  */
6362                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6363                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6364                     eol_seen = EOL_SEEN_CRLF;
6365                   else
6366                     {
6367                       eol_seen = EOL_SEEN_LF;
6368                       break;
6369                     }
6370                 }
6371               if (++total == MAX_EOL_CHECK_COUNT)
6372                 break;
6373             }
6374         }
6375     }
6376   return eol_seen;
6377 }
6378
6379
6380 static Lisp_Object
6381 adjust_coding_eol_type (coding, eol_seen)
6382      struct coding_system *coding;
6383      int eol_seen;
6384 {
6385   Lisp_Object eol_type;
6386
6387   eol_type = CODING_ID_EOL_TYPE (coding->id);
6388   if (eol_seen & EOL_SEEN_LF)
6389     {
6390       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6391       eol_type = Qunix;
6392     }
6393   else if (eol_seen & EOL_SEEN_CRLF)
6394     {
6395       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6396       eol_type = Qdos;
6397     }
6398   else if (eol_seen & EOL_SEEN_CR)
6399     {
6400       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6401       eol_type = Qmac;
6402     }
6403   return eol_type;
6404 }
6405
/* Detect how a text specified in CODING is encoded.  If a coding
   system is detected, update fields of CODING by the detected coding
   system (by calling setup_coding_system).

   The consumed/produced counters of CODING are reset and
   CODING->head_ascii is set to 0 before detection.  CODING->mode is
   saved on entry and restored before returning.

   Three cases are handled, depending on the coding system currently
   set in CODING:

     o its type is `undecided': the source bytes are scanned and the
       coding categories are tried in the order of coding_priorities.

     o its category is coding_category_utf_8_auto: detect_coding_utf_8
       chooses between the two coding systems held in the
       coding_attr_utf_bom attribute.

     o its category is coding_category_utf_16_auto: likewise, using
       detect_coding_utf_16.

   In any other case CODING is left as it is.  */

void
detect_coding (coding)
     struct coding_system *coding;
{
  const unsigned char *src, *src_end;
  int saved_mode = coding->mode;

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding_set_source (coding);

  src_end = coding->source + coding->src_bytes;
  coding->head_ascii = 0;

  /* If we have not yet decided the text encoding type, detect it
     now.  */
  if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
    {
      int c, i;
      struct coding_detection_info detect_info;
      int null_byte_found = 0, eight_bit_found = 0;

      detect_info.checked = detect_info.found = detect_info.rejected = 0;
      /* Scan the source.  CODING->head_ascii counts the 7-bit bytes
         seen before the first 8-bit byte.  The scan stops early as
         soon as both a null byte and an 8-bit byte have been seen, or
         when detect_coding_iso_2022 returns nonzero.  */
      for (src = coding->source; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              /* Run the ISO-2022 detector at most once (it is skipped
                 when detect_info.checked is already nonzero), and only
                 if escape detection is not inhibited.  */
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_iso_escape_detection
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conform to
                             ISO-2022.  */
                          src = src_end;
                          coding->head_ascii = src - coding->source;
                        }
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_null_byte_detection)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding->head_ascii++;
            }
          else if (! eight_bit_found)
            coding->head_ascii++;
        }

      /* Go on only if the scan saw something other than an
         uninterrupted run of 7-bit bytes, or if a detector already
         reported a category as found.  */
      if (null_byte_found || eight_bit_found
          || coding->head_ascii < coding->src_bytes
          || detect_info.found)
        {
          enum coding_category category;
          struct coding_system *this;

          if (coding->head_ascii == coding->src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* Mark every category other than the UTF-16 ones as
                     already checked and rejected.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              /* Try the categories in priority order.  The loop is
                 left by `break' (with I below
                 coding_category_raw_text) as soon as one category is
                 found.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;
                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      /* Already checked; only look at the recorded
                         result.  */
                      if (detect_info.found & (1 << category))
                        break;
                    }
                  else if ((*(this->detector)) (coding, &detect_info)
                           && detect_info.found & (1 << category))
                    {
                      if (category == coding_category_utf_16_auto)
                        {
                          /* CATEGORY is not read again after this
                             assignment; THIS still refers to the
                             utf-16-auto category.  */
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }

          /* Set up CODING according to the outcome: the category found
             above; else `no-conversion' if a null byte was seen; else
             `raw-text' if all categories in CATEGORY_MASK_ANY were
             rejected; else, if some were rejected, the first category
             in priority order that was not rejected.  */
          if (i < coding_category_raw_text)
            setup_coding_system (CODING_ID_NAME (this->id), coding);
          else if (null_byte_found)
            setup_coding_system (Qno_conversion, coding);
          else if ((detect_info.rejected & CATEGORY_MASK_ANY)
                   == CATEGORY_MASK_ANY)
            setup_coding_system (Qraw_text, coding);
          else if (detect_info.rejected)
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  this = coding_categories + coding_priorities[i];
                  setup_coding_system (CODING_ID_NAME (this->id), coding);
                  break;
                }
        }
    }
  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
           == coding_category_utf_8_auto)
    {
      Lisp_Object coding_systems;
      struct coding_detection_info detect_info;

      /* The coding_attr_utf_bom attribute is used here as a cons: its
         car is selected when CATEGORY_MASK_UTF_8_SIG is found, its
         cdr otherwise.  */
      coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
      if (CONSP (coding_systems)
          && detect_coding_utf_8 (coding, &detect_info))
        {
          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            setup_coding_system (XCAR (coding_systems), coding);
          else
            setup_coding_system (XCDR (coding_systems), coding);
        }
    }
  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
           == coding_category_utf_16_auto)
    {
      Lisp_Object coding_systems;
      struct coding_detection_info detect_info;

      /* As above, but the car is selected for little endian and the
         cdr for big endian; if neither is found, CODING is left
         unchanged.  */
      coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
      if (CONSP (coding_systems)
          && detect_coding_utf_16 (coding, &detect_info))
        {
          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            setup_coding_system (XCAR (coding_systems), coding);
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            setup_coding_system (XCDR (coding_systems), coding);
        }
    }
  coding->mode = saved_mode;
}
6588
6589
6590 static void
6591 decode_eol (coding)
6592      struct coding_system *coding;
6593 {
6594   Lisp_Object eol_type;
6595   unsigned char *p, *pbeg, *pend;
6596
6597   eol_type = CODING_ID_EOL_TYPE (coding->id);
6598   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6599     return;
6600
6601   if (NILP (coding->dst_object))
6602     pbeg = coding->destination;
6603   else
6604     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6605   pend = pbeg + coding->produced;
6606
6607   if (VECTORP (eol_type))
6608     {
6609       int eol_seen = EOL_SEEN_NONE;
6610
6611       for (p = pbeg; p < pend; p++)
6612         {
6613           if (*p == '\n')
6614             eol_seen |= EOL_SEEN_LF;
6615           else if (*p == '\r')
6616             {
6617               if (p + 1 < pend && *(p + 1) == '\n')
6618                 {
6619                   eol_seen |= EOL_SEEN_CRLF;
6620                   p++;
6621                 }
6622               else
6623                 eol_seen |= EOL_SEEN_CR;
6624             }
6625         }
6626       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6627       if ((eol_seen & EOL_SEEN_CRLF) != 0
6628           && (eol_seen & EOL_SEEN_CR) != 0
6629           && (eol_seen & EOL_SEEN_LF) == 0)
6630         eol_seen = EOL_SEEN_CRLF;
6631       else if (eol_seen != EOL_SEEN_NONE
6632           && eol_seen != EOL_SEEN_LF
6633           && eol_seen != EOL_SEEN_CRLF
6634           && eol_seen != EOL_SEEN_CR)
6635         eol_seen = EOL_SEEN_LF;
6636       if (eol_seen != EOL_SEEN_NONE)
6637         eol_type = adjust_coding_eol_type (coding, eol_seen);
6638     }
6639
6640   if (EQ (eol_type, Qmac))
6641     {
6642       for (p = pbeg; p < pend; p++)
6643         if (*p == '\r')
6644           *p = '\n';
6645     }
6646   else if (EQ (eol_type, Qdos))
6647     {
6648       int n = 0;
6649
6650       if (NILP (coding->dst_object))
6651         {
6652           /* Start deleting '\r' from the tail to minimize the memory
6653              movement.  */
6654           for (p = pend - 2; p >= pbeg; p--)
6655             if (*p == '\r')
6656               {
6657                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6658                 n++;
6659               }
6660         }
6661       else
6662         {
6663           int pos_byte = coding->dst_pos_byte;
6664           int pos = coding->dst_pos;
6665           int pos_end = pos + coding->produced_char - 1;
6666
6667           while (pos < pos_end)
6668             {
6669               p = BYTE_POS_ADDR (pos_byte);
6670               if (*p == '\r' && p[1] == '\n')
6671                 {
6672                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6673                   n++;
6674                   pos_end--;
6675                 }
6676               pos++;
6677               if (coding->dst_multibyte)
6678                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6679               else
6680                 pos_byte++;
6681             }
6682         }
6683       coding->produced -= n;
6684       coding->produced_char -= n;
6685     }
6686 }
6687
6688
6689 /* Return a translation table (or list of them) from coding system
6690    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6691    decoding (ENCODEP is zero). */
6692
6693 static Lisp_Object
6694 get_translation_table (attrs, encodep, max_lookup)
6695      Lisp_Object attrs;
6696      int encodep, *max_lookup;
6697 {
6698   Lisp_Object standard, translation_table;
6699   Lisp_Object val;
6700
6701   if (NILP (Venable_character_translation))
6702     {
6703       if (max_lookup)
6704         *max_lookup = 0;
6705       return Qnil;
6706     }
6707   if (encodep)
6708     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6709       standard = Vstandard_translation_table_for_encode;
6710   else
6711     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6712       standard = Vstandard_translation_table_for_decode;
6713   if (NILP (translation_table))
6714     translation_table = standard;
6715   else
6716     {
6717       if (SYMBOLP (translation_table))
6718         translation_table = Fget (translation_table, Qtranslation_table);
6719       else if (CONSP (translation_table))
6720         {
6721           translation_table = Fcopy_sequence (translation_table);
6722           for (val = translation_table; CONSP (val); val = XCDR (val))
6723             if (SYMBOLP (XCAR (val)))
6724               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6725         }
6726       if (CHAR_TABLE_P (standard))
6727         {
6728           if (CONSP (translation_table))
6729             translation_table = nconc2 (translation_table,
6730                                         Fcons (standard, Qnil));
6731           else
6732             translation_table = Fcons (translation_table,
6733                                        Fcons (standard, Qnil));
6734         }
6735     }
6736
6737   if (max_lookup)
6738     {
6739       *max_lookup = 1;
6740       if (CHAR_TABLE_P (translation_table)
6741           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6742         {
6743           val = XCHAR_TABLE (translation_table)->extras[1];
6744           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6745             *max_lookup = XFASTINT (val);
6746         }
6747       else if (CONSP (translation_table))
6748         {
6749           Lisp_Object tail, val;
6750
6751           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6752             if (CHAR_TABLE_P (XCAR (tail))
6753                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6754               {
6755                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6756                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6757                   *max_lookup = XFASTINT (val);
6758               }
6759         }
6760     }
6761   return translation_table;
6762 }
6763
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and set TRANS to the result.

   If a char-table maps C to a character, C itself is replaced by that
   character and TRANS is set to Qnil; with a list of tables the lookup
   then continues in the next table with the new C.  If a table yields
   any other non-nil value, TRANS is set to that value (and, for a
   list, the remaining tables are skipped).  TRANS is Qnil if nothing
   of that kind was found or if TABLE is neither a char-table nor a
   list.

   C and TRANS must be lvalues; the arguments are evaluated more than
   once.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6788
6789
6790 /* Return a translation of character(s) at BUF according to TRANS.
6791    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6792    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6793    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6794    translation is found, and Qnil if not found..
6795    If BUF is too short to lookup characters in FROM, return Qt.  */
6796
6797 static Lisp_Object
6798 get_translation (trans, buf, buf_end)
6799      Lisp_Object trans;
6800      int *buf, *buf_end;
6801 {
6802
6803   if (INTEGERP (trans))
6804     return trans;
6805   for (; CONSP (trans); trans = XCDR (trans))
6806     {
6807       Lisp_Object val = XCAR (trans);
6808       Lisp_Object from = XCAR (val);
6809       int len = ASIZE (from);
6810       int i;
6811
6812       for (i = 0; i < len; i++)
6813         {
6814           if (buf + i == buf_end)
6815             return Qt;
6816           if (XINT (AREF (from, i)) != buf[i])
6817             break;
6818         }
6819       if (i == len)
6820         return val;
6821     }
6822   return Qnil;
6823 }
6824
6825
/* Write the characters decoded by CODING to its destination, starting
   at CODING->destination + CODING->produced.

   If CODING->chars_at_source is zero, the characters are taken from
   CODING->charbuf (CODING->charbuf_used elements) and translated by
   TRANSLATION_TABLE; negative elements are annotation data and are
   skipped here.  Otherwise the bytes at CODING->source
   (CODING->consumed of them) are copied, converting between unibyte
   and multibyte form if source and destination differ in that
   respect.

   LAST_BLOCK nonzero means no more source will follow, so a
   translation that would need more characters than are available is
   not deferred.

   CODING->produced and CODING->produced_char are increased by what was
   written; if the destination is a buffer, insert_from_gap is called
   for it.

   Return the number of elements left unprocessed at the end of
   CODING->charbuf (always 0 when CODING->chars_at_source is
   nonzero).  */

static int
produce_chars (coding, translation_table, last_block)
     struct coding_system *coding;
     Lisp_Object translation_table;
     int last_block;
{
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  EMACS_INT produced;
  EMACS_INT produced_chars = 0;
  int carryover = 0;

  if (! coding->chars_at_source)
    {
      /* Source characters are in coding->charbuf.  */
      int *buf = coding->charbuf;
      int *buf_end = buf + coding->charbuf_used;

      if (EQ (coding->src_object, coding->dst_object))
        {
          /* Source and destination are the same object: the
             destination is limited to the part of the source already
             consumed.  */
          coding_set_source (coding);
          dst_end = ((unsigned char *) coding->source) + coding->consumed;
        }

      while (buf < buf_end)
        {
          int c = *buf, i;

          if (c >= 0)
            {
              /* Number of source characters consumed, and of
                 characters produced, by this step.  */
              int from_nchars = 1, to_nchars = 1;
              Lisp_Object trans = Qnil;

              LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
              if (! NILP (trans))
                {
                  trans = get_translation (trans, buf, buf_end);
                  if (INTEGERP (trans))
                    c = XINT (trans);
                  else if (CONSP (trans))
                    {
                      /* TRANS is ([FROM-CHAR ...] . TO).  */
                      from_nchars = ASIZE (XCAR (trans));
                      trans = XCDR (trans);
                      if (INTEGERP (trans))
                        c = XINT (trans);
                      else
                        {
                          to_nchars = ASIZE (trans);
                          c = XINT (AREF (trans, 0));
                        }
                    }
                  else if (EQ (trans, Qt) && ! last_block)
                    /* More characters are needed to decide; leave the
                       rest of the buffer as carryover.  */
                    break;
                }

              if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
                {
                  dst = alloc_destination (coding,
                                           buf_end - buf
                                           + MAX_MULTIBYTE_LENGTH * to_nchars,
                                           dst);
                  /* Recompute DST_END for the new destination.  */
                  if (EQ (coding->src_object, coding->dst_object))
                    {
                      coding_set_source (coding);
                      dst_end = (((unsigned char *) coding->source)
                                 + coding->consumed);
                    }
                  else
                    dst_end = coding->destination + coding->dst_bytes;
                }

              for (i = 0; i < to_nchars; i++)
                {
                  /* C already holds the first character; the others
                     come from the vector TRANS.  */
                  if (i > 0)
                    c = XINT (AREF (trans, i));
                  if (coding->dst_multibyte
                      || ! CHAR_BYTE8_P (c))
                    CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
                  else
                    *dst++ = CHAR_TO_BYTE8 (c);
                }
              produced_chars += to_nchars;
              buf += from_nchars;
            }
          else
            /* This is an annotation datum.  (-C) is the length.  */
            buf += -c;
        }
      carryover = buf_end - buf;
    }
  else
    {
      /* Source characters are at coding->source.  */
      const unsigned char *src = coding->source;
      const unsigned char *src_end = src + coding->consumed;

      if (EQ (coding->dst_object, coding->src_object))
        dst_end = (unsigned char *) src;
      if (coding->src_multibyte != coding->dst_multibyte)
        {
          if (coding->src_multibyte)
            {
              /* NOTE(review): ONE_MORE_BYTE is defined elsewhere in
                 this file; it presumably uses the locals MULTIBYTEP
                 and CONSUMED_CHARS and jumps to the label
                 `no_more_source' when the source is exhausted --
                 confirm against its definition.  */
              int multibytep = 1;
              EMACS_INT consumed_chars = 0;

              while (1)
                {
                  const unsigned char *src_base = src;
                  int c;

                  ONE_MORE_BYTE (c);
                  if (dst == dst_end)
                    {
                      /* Out of room.  If converting in place, first
                         try to extend the destination up to the
                         current source position.  */
                      if (EQ (coding->src_object, coding->dst_object))
                        dst_end = (unsigned char *) src;
                      if (dst == dst_end)
                        {
                          EMACS_INT offset = src - coding->source;

                          dst = alloc_destination (coding, src_end - src + 1,
                                                   dst);
                          dst_end = coding->destination + coding->dst_bytes;
                          /* Re-establish SRC and SRC_END from
                             CODING->source.  */
                          coding_set_source (coding);
                          src = coding->source + offset;
                          src_end = coding->source + coding->src_bytes;
                          if (EQ (coding->src_object, coding->dst_object))
                            dst_end = (unsigned char *) src;
                        }
                    }
                  *dst++ = c;
                  produced_chars++;
                }
            no_more_source:
              ;
            }
          else
            while (src < src_end)
              {
                /* NOTE(review): MULTIBYTEP is presumably read by
                   EMIT_ONE_BYTE, which is defined elsewhere --
                   confirm.  */
                int multibytep = 1;
                int c = *src++;

                if (dst >= dst_end - 1)
                  {
                    if (EQ (coding->src_object, coding->dst_object))
                      dst_end = (unsigned char *) src;
                    if (dst >= dst_end - 1)
                      {
                        EMACS_INT offset = src - coding->source;
                        EMACS_INT more_bytes;

                        if (EQ (coding->src_object, coding->dst_object))
                          more_bytes = ((src_end - src) / 2) + 2;
                        else
                          more_bytes = src_end - src + 2;
                        dst = alloc_destination (coding, more_bytes, dst);
                        dst_end = coding->destination + coding->dst_bytes;
                        coding_set_source (coding);
                        src = coding->source + offset;
                        src_end = coding->source + coding->src_bytes;
                        if (EQ (coding->src_object, coding->dst_object))
                          dst_end = (unsigned char *) src;
                      }
                  }
                EMIT_ONE_BYTE (c);
              }
        }
      else
        {
          /* Source and destination have the same representation; a
             plain byte copy is enough.  */
          if (!EQ (coding->src_object, coding->dst_object))
            {
              EMACS_INT require = coding->src_bytes - coding->dst_bytes;

              if (require > 0)
                {
                  EMACS_INT offset = src - coding->source;

                  dst = alloc_destination (coding, require, dst);
                  coding_set_source (coding);
                  src = coding->source + offset;
                  src_end = coding->source + coding->src_bytes;
                }
            }
          produced_chars = coding->consumed_char;
          while (src < src_end)
            *dst++ = *src++;
        }
    }

  /* Number of bytes written by this call.  */
  produced = dst - (coding->destination + coding->produced);
  if (BUFFERP (coding->dst_object) && produced_chars > 0)
    insert_from_gap (produced_chars, produced);
  coding->produced += produced;
  coding->produced_char += produced_chars;
  return carryover;
}
7021
7022 /* Compose text in CODING->object according to the annotation data at
7023    CHARBUF.  CHARBUF is an array:
7024      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7025  */
7026
7027 static INLINE void
7028 produce_composition (coding, charbuf, pos)
7029      struct coding_system *coding;
7030      int *charbuf;
7031      EMACS_INT pos;
7032 {
7033   int len;
7034   EMACS_INT to;
7035   enum composition_method method;
7036   Lisp_Object components;
7037
7038   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7039   to = pos + charbuf[2];
7040   method = (enum composition_method) (charbuf[4]);
7041
7042   if (method == COMPOSITION_RELATIVE)
7043     components = Qnil;
7044   else
7045     {
7046       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7047       int i, j;
7048
7049       if (method == COMPOSITION_WITH_RULE)
7050         len = charbuf[2] * 3 - 2;
7051       charbuf += MAX_ANNOTATION_LENGTH;
7052       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7053       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7054         {
7055           if (charbuf[i] >= 0)
7056             args[j] = make_number (charbuf[i]);
7057           else
7058             {
7059               i++;
7060               args[j] = make_number (charbuf[i] % 0x100);
7061             }
7062         }
7063       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7064     }
7065   compose_text (pos, to, components, Qnil, coding->dst_object);
7066 }
7067
7068
7069 /* Put `charset' property on text in CODING->object according to
7070    the annotation data at CHARBUF.  CHARBUF is an array:
7071      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7072  */
7073
7074 static INLINE void
7075 produce_charset (coding, charbuf, pos)
7076      struct coding_system *coding;
7077      int *charbuf;
7078      EMACS_INT pos;
7079 {
7080   EMACS_INT from = pos - charbuf[2];
7081   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7082
7083   Fput_text_property (make_number (from), make_number (pos),
7084                       Qcharset, CHARSET_NAME (charset),
7085                       coding->dst_object);
7086 }
7087
7088
/* Number of elements (ints) first tried for the work area allocated
   by ALLOC_CONVERSION_WORK_AREA.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca and record the number of its
   elements in CODING->charbuf_size.  CHARBUF_SIZE elements are tried
   first, and the size is halved on failure as long as it stays above
   1024.

   If no buffer could be obtained, CODING_RESULT_INSUFFICIENT_MEM is
   recorded and the macro executes `return coding->result;', so it can
   be used only in a function whose return type accepts that value.
   Since alloca is used, the buffer is valid only until the function
   that used the macro returns.

   NOTE(review): alloca usually does not report failure by returning
   NULL, so the retry loop and the failure path are probably never
   taken -- confirm.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    coding->charbuf = NULL;                                             \
    while (size > 1024)                                                 \
      {                                                                 \
        coding->charbuf = (int *) alloca (sizeof (int) * size);         \
        if (coding->charbuf)                                            \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result;                                          \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
7110
7111
7112 static void
7113 produce_annotation (coding, pos)
7114      struct coding_system *coding;
7115      EMACS_INT pos;
7116 {
7117   int *charbuf = coding->charbuf;
7118   int *charbuf_end = charbuf + coding->charbuf_used;
7119
7120   if (NILP (coding->dst_object))
7121     return;
7122
7123   while (charbuf < charbuf_end)
7124     {
7125       if (*charbuf >= 0)
7126         pos++, charbuf++;
7127       else
7128         {
7129           int len = -*charbuf;
7130
7131           if (len > 2)
7132             switch (charbuf[1])
7133               {
7134               case CODING_ANNOTATE_COMPOSITION_MASK:
7135                 produce_composition (coding, charbuf, pos);
7136                 break;
7137               case CODING_ANNOTATE_CHARSET_MASK:
7138                 produce_charset (coding, charbuf, pos);
7139                 break;
7140               }
7141           charbuf += len;
7142         }
7143     }
7144 }
7145
7146 /* Decode the data at CODING->src_object into CODING->dst_object.
7147    CODING->src_object is a buffer, a string, or nil.
7148    CODING->dst_object is a buffer.
7149
7150    If CODING->src_object is a buffer, it must be the current buffer.
7151    In this case, if CODING->src_pos is positive, it is a position of
7152    the source text in the buffer, otherwise, the source text is in the
7153    gap area of the buffer, and CODING->src_pos specifies the offset of
7154    the text from GPT (which must be the same as PT).  If this is the
7155    same buffer as CODING->dst_object, CODING->src_pos must be
7156    negative.
7157
7158    If CODING->src_object is a string, CODING->src_pos is an index to
7159    that string.
7160
7161    If CODING->src_object is nil, CODING->source must already point to
7162    the non-relocatable memory area.  In this case, CODING->src_pos is
7163    an offset from CODING->source.
7164
7165    The decoded data is inserted at the current point of the buffer
7166    CODING->dst_object.

        The return value is CODING->result.  The work area
        CODING->charbuf is set up by ALLOC_CONVERSION_WORK_AREA, which
        uses alloca, so it is only usable while this function is
        running.  Source bytes that the decoder leaves unconsumed are
        either emitted as raw bytes (when CODING_MODE_LAST_BLOCK is set
        in CODING->mode) or saved in CODING->carryover.
7167 */
7168
7169 static int
7170 decode_coding (coding)
7171      struct coding_system *coding;
7172 {
7173   Lisp_Object attrs;
7174   Lisp_Object undo_list;
7175   Lisp_Object translation_table;
7176   struct ccl_spec cclspec;
7177   int carryover;
7178   int i;
7179
       /* If the source text is in a buffer at a positive position and
          it straddles the gap, move the gap to the start of the source
          text.  */
7180   if (BUFFERP (coding->src_object)
7181       && coding->src_pos > 0
7182       && coding->src_pos < GPT
7183       && coding->src_pos + coding->src_chars > GPT)
7184     move_gap_both (coding->src_pos, coding->src_pos_byte);
7185
       /* When decoding into a buffer, make it current, put the gap at
          point, and set its undo list to t for the duration of the
          conversion.  The saved list is put back at the end of this
          function.  */
7186   undo_list = Qt;
7187   if (BUFFERP (coding->dst_object))
7188     {
7189       if (current_buffer != XBUFFER (coding->dst_object))
7190         set_buffer_internal (XBUFFER (coding->dst_object));
7191       if (GPT != PT)
7192         move_gap_both (PT, PT_BYTE);
7193       undo_list = current_buffer->undo_list;
7194       current_buffer->undo_list = Qt;
7195     }
7196
       /* Reset the conversion counters and the result status.  */
7197   coding->consumed = coding->consumed_char = 0;
7198   coding->produced = coding->produced_char = 0;
7199   coding->chars_at_source = 0;
7200   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7201   coding->errors = 0;
7202
       /* This macro contains a `return coding->result;' that is taken
          when the work area cannot be allocated.  */
7203   ALLOC_CONVERSION_WORK_AREA (coding);
7204
7205   attrs = CODING_ID_ATTRS (coding->id);
7206   translation_table = get_translation_table (attrs, 0, NULL);
7207
7208   carryover = 0;
       /* The CCL decoder keeps its program state in CCLSPEC, which
          lives on this function's stack.  */
7209   if (coding->decoder == decode_coding_ccl)
7210     {
7211       coding->spec.ccl = &cclspec;
7212       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7213     }
       /* Main loop.  Each pass lets the decoder fill the work area
          after the CARRYOVER elements kept from the previous pass,
          then hands the work area to produce_chars.  POS is the
          destination position at which this pass starts producing; it
          is what produce_annotation needs.  */
7214   do
7215     {
7216       EMACS_INT pos = coding->dst_pos + coding->produced_char;
7217
7218       coding_set_source (coding);
7219       coding->annotated = 0;
7220       coding->charbuf_used = carryover;
7221       (*(coding->decoder)) (coding);
7222       coding_set_destination (coding);
7223       carryover = produce_chars (coding, translation_table, 0);
7224       if (coding->annotated)
7225         produce_annotation (coding, pos);
           /* Move the last CARRYOVER elements of the work area to its
              front so that the next pass starts with them.  */
7226       for (i = 0; i < carryover; i++)
7227         coding->charbuf[i]
7228           = coding->charbuf[coding->charbuf_used - carryover + i];
7229     }
       /* Repeat while the destination was too small, or while source
          bytes remain and the last pass ended with success or with an
          invalid-source result.  */
7230   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7231          || (coding->consumed < coding->src_bytes
7232              && (coding->result == CODING_RESULT_SUCCESS
7233                  || coding->result == CODING_RESULT_INVALID_SRC)));
7234
       /* Hand the elements still carried over to produce_chars once
          more.  NOTE(review): the last argument 1 presumably marks
          this as the final call -- confirm in produce_chars.  */
7235   if (carryover > 0)
7236     {
7237       coding_set_destination (coding);
7238       coding->charbuf_used = carryover;
7239       produce_chars (coding, translation_table, 1);
7240     }
7241
       /* Deal with source bytes that the decoder did not consume.  */
7242   coding->carryover_bytes = 0;
7243   if (coding->consumed < coding->src_bytes)
7244     {
7245       int nbytes = coding->src_bytes - coding->consumed;
7246       const unsigned char *src;
7247
7248       coding_set_source (coding);
7249       coding_set_destination (coding);
7250       src = coding->source + coding->consumed;
7251
7252       if (coding->mode & CODING_MODE_LAST_BLOCK)
7253         {
7254           /* Flush out unprocessed data as binary chars.  We are sure
7255              that the number of data is less than the size of
7256              coding->charbuf.  */
7257           coding->charbuf_used = 0;
7258           coding->chars_at_source = 0;
7259
7260           while (nbytes-- > 0)
7261             {
7262               int c = *src++;
7263
                   /* Bytes with the high bit set are converted with
                      BYTE8_TO_CHAR; other bytes are stored as is.  */
7264               if (c & 0x80)
7265                 c = BYTE8_TO_CHAR (c);
7266               coding->charbuf[coding->charbuf_used++] = c;
7267             }
7268           produce_chars (coding, Qnil, 1);
7269         }
7270       else
7271         {
7272           /* Record unprocessed bytes in coding->carryover.  We are
7273              sure that the number of data is less than the size of
7274              coding->carryover.  */
7275           unsigned char *p = coding->carryover;
7276
               /* Even so, never copy more than coding->carryover can
                  hold.  */
7277           if (nbytes > sizeof coding->carryover)
7278             nbytes = sizeof coding->carryover;
7279           coding->carryover_bytes = nbytes;
7280           while (nbytes-- > 0)
7281             *p++ = *src++;
7282         }
           /* Either way, all source bytes now count as consumed.  */
7283       coding->consumed = coding->src_bytes;
7284     }
7285
       /* Decode end-of-line sequences unless the eol type is `unix' or
          eol conversion is inhibited.  */
7286   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7287       && !inhibit_eol_conversion)
7288     decode_eol (coding);
       /* Put the saved undo list back and record the decoded text as
          one insertion.  */
7289   if (BUFFERP (coding->dst_object))
7290     {
7291       current_buffer->undo_list = undo_list;
7292       record_insert (coding->dst_pos, coding->produced_char);
7293     }
7294   return coding->result;
7295 }
7296
7297
7298 /* Extract an annotation datum from a composition starting at POS and
7299    ending before LIMIT of CODING->src_object (buffer or string), store
7300    the data in BUF, set *STOP to a starting position of the next
7301    composition (if any) or to LIMIT, and return the address of the
7302    next element of BUF.
7303
7304    If such an annotation is not found, set *STOP to a starting
7305    position of a composition after POS (if any) or to LIMIT, and
7306    return BUF.  */
7307
7308 static INLINE int *
7309 handle_composition_annotation (pos, limit, coding, buf, stop)
7310      EMACS_INT pos, limit;
7311      struct coding_system *coding;
7312      int *buf;
7313      EMACS_INT *stop;
7314 {
7315   EMACS_INT start, end;
7316   Lisp_Object prop;
7317
7318   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7319       || end > limit)
7320     *stop = limit;
7321   else if (start > pos)
7322     *stop = start;
7323   else
7324     {
7325       if (start == pos)
7326         {
7327           /* We found a composition.  Store the corresponding
7328              annotation data in BUF.  */
7329           int *head = buf;
7330           enum composition_method method = COMPOSITION_METHOD (prop);
7331           int nchars = COMPOSITION_LENGTH (prop);
7332
7333           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7334           if (method != COMPOSITION_RELATIVE)
7335             {
7336               Lisp_Object components;
7337               int len, i, i_byte;
7338
7339               components = COMPOSITION_COMPONENTS (prop);
7340               if (VECTORP (components))
7341                 {
7342                   len = XVECTOR_SIZE (components);
7343                   for (i = 0; i < len; i++)
7344                     *buf++ = XINT (AREF (components, i));
7345                 }
7346               else if (STRINGP (components))
7347                 {
7348                   len = SCHARS (components);
7349                   i = i_byte = 0;
7350                   while (i < len)
7351                     {
7352                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7353                       buf++;
7354                     }
7355                 }
7356               else if (INTEGERP (components))
7357                 {
7358                   len = 1;
7359                   *buf++ = XINT (components);
7360                 }
7361               else if (CONSP (components))
7362                 {
7363                   for (len = 0; CONSP (components);
7364                        len++, components = XCDR (components))
7365                     *buf++ = XINT (XCAR (components));
7366                 }
7367               else
7368                 abort ();
7369               *head -= len;
7370             }
7371         }
7372
7373       if (find_composition (end, limit, &start, &end, &prop,
7374                             coding->src_object)
7375           && end <= limit)
7376         *stop = start;
7377       else
7378         *stop = limit;
7379     }
7380   return buf;
7381 }
7382
7383
7384 /* Extract an annotation datum from a text property `charset' at POS of
7385    CODING->src_object (buffer of string), store the data in BUF, set
7386    *STOP to the position where the value of `charset' property changes
7387    (limiting by LIMIT), and return the address of the next element of
7388    BUF.
7389
7390    If the property value is nil, set *STOP to the position where the
7391    property value is non-nil (limiting by LIMIT), and return BUF.  */
7392
7393 static INLINE int *
7394 handle_charset_annotation (pos, limit, coding, buf, stop)
7395      EMACS_INT pos, limit;
7396      struct coding_system *coding;
7397      int *buf;
7398      EMACS_INT *stop;
7399 {
7400   Lisp_Object val, next;
7401   int id;
7402
7403   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7404   if (! NILP (val) && CHARSETP (val))
7405     id = XINT (CHARSET_SYMBOL_ID (val));
7406   else
7407     id = -1;
7408   ADD_CHARSET_DATA (buf, 0, id);
7409   next = Fnext_single_property_change (make_number (pos), Qcharset,
7410                                        coding->src_object,
7411                                        make_number (limit));
7412   *stop = XINT (next);
7413   return buf;
7414 }
7415
7416
     /* Read characters from the source of CODING and store them as ints
        in the work area CODING->charbuf, for the encoder to process.

        Reading starts CODING->consumed bytes into CODING->source and
        stops when the source position reaches the end of the source
        text, when the work area is nearly full (1 +
        MAX_ANNOTATION_LENGTH elements are kept in reserve), or when a
        translation result cannot be handled (see the two `break's in
        the translation code below).

        Each character goes through end-of-line conversion and, if
        TRANSLATION_TABLE is non-nil, through that table.  MAX_LOOKUP is
        the number of elements allocated for the look-ahead buffer used
        with the table.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used describe what was read and stored, and
        CODING->chars_at_source is 0.  */
7417 static void
7418 consume_chars (coding, translation_table, max_lookup)
7419      struct coding_system *coding;
7420      Lisp_Object translation_table;
7421      int max_lookup;
7422 {
7423   int *buf = coding->charbuf;
7424   int *buf_end = coding->charbuf + coding->charbuf_size;
7425   const unsigned char *src = coding->source + coding->consumed;
7426   const unsigned char *src_end = coding->source + coding->src_bytes;
7427   EMACS_INT pos = coding->src_pos + coding->consumed_char;
7428   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7429   int multibytep = coding->src_multibyte;
7430   Lisp_Object eol_type;
7431   int c;
7432   EMACS_INT stop, stop_composition, stop_charset;
7433   int *lookup_buf = NULL;
7434
       /* The look-ahead buffer is needed only when translating.  */
7435   if (! NILP (translation_table))
7436     lookup_buf = alloca (sizeof (int) * max_lookup);
7437
       /* An eol type that is still a vector is treated like `unix',
          i.e. no end-of-line conversion is done.  */
7438   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7439   if (VECTORP (eol_type))
7440     eol_type = Qunix;
7441
7442   /* Note: composition handling is not yet implemented.  */
7443   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7444
       /* STOP is the next position at which an annotation has to be
          looked for.  Because the composition flag was just cleared,
          STOP_COMPOSITION always starts out as END_POS here.  */
7445   if (NILP (coding->src_object))
7446     stop = stop_composition = stop_charset = end_pos;
7447   else
7448     {
7449       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7450         stop = stop_composition = pos;
7451       else
7452         stop = stop_composition = end_pos;
7453       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7454         stop = stop_charset = pos;
7455       else
7456         stop_charset = end_pos;
7457     }
7458
7459   /* Compensate for CRLF and conversion.  */
7460   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7461   while (buf < buf_end)
7462     {
7463       Lisp_Object trans;
7464
           /* At a stop position, either finish (end of source) or
              collect the annotations that start here and compute the
              next stop position.  */
7465       if (pos == stop)
7466         {
7467           if (pos == end_pos)
7468             break;
7469           if (pos == stop_composition)
7470             buf = handle_composition_annotation (pos, end_pos, coding,
7471                                                  buf, &stop_composition);
7472           if (pos == stop_charset)
7473             buf = handle_charset_annotation (pos, end_pos, coding,
7474                                              buf, &stop_charset);
7475           stop = (stop_composition < stop_charset
7476                   ? stop_composition : stop_charset);
7477         }
7478
           /* Fetch the next character C and advance SRC and POS.  */
7479       if (! multibytep)
7480         {
7481           EMACS_INT bytes;
7482
               /* For the raw-text and CCL encoders every byte is taken
                  as is.  Otherwise a valid multibyte sequence is read
                  as one character, with POS advanced by the number of
                  bytes in the sequence, and any other byte is
                  converted with BYTE8_TO_CHAR.  */
7483           if (coding->encoder == encode_coding_raw_text
7484               || coding->encoder == encode_coding_ccl)
7485             c = *src++, pos++;
7486           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7487             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7488           else
7489             c = BYTE8_TO_CHAR (*src), src++, pos++;
7490         }
7491       else
7492         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
           /* In selective-display mode a CR is handled as a newline.  */
7493       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7494         c = '\n';
           /* End-of-line conversion: for the `dos' type a CR is stored
              in front of the newline; for any other non-unix type the
              newline itself becomes a CR.  */
7495       if (! EQ (eol_type, Qunix))
7496         {
7497           if (c == '\n')
7498             {
7499               if (EQ (eol_type, Qdos))
7500                 *buf++ = '\r';
7501               else
7502                 c = '\r';
7503             }
7504         }
7505
7506       trans = Qnil;
7507       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7508       if (NILP (trans))
7509         *buf++ = c;
7510       else
7511         {
               /* C has a translation.  Collect up to MAX_LOOKUP
                  characters (C plus look-ahead read through P, which
                  does not advance SRC) and let get_translation decide.
                  FROM_NCHARS source characters are replaced by
                  TO_NCHARS stored characters.  */
7512           int from_nchars = 1, to_nchars = 1;
7513           int *lookup_buf_end;
7514           const unsigned char *p = src;
7515           int i;
7516
7517           lookup_buf[0] = c;
7518           for (i = 1; i < max_lookup && p < src_end; i++)
7519             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7520           lookup_buf_end = lookup_buf + i;
7521           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7522           if (INTEGERP (trans))
7523             c = XINT (trans);
7524           else if (CONSP (trans))
7525             {
7526               from_nchars = ASIZE (XCAR (trans));
7527               trans = XCDR (trans);
7528               if (INTEGERP (trans))
7529                 c = XINT (trans);
7530               else
7531                 {
7532                   to_nchars = ASIZE (trans);
                       /* NOTE(review): at this break, and at the one
                          below, SRC and POS have already been advanced
                          past C although C has not been stored, so C
                          looks like it is dropped -- confirm.  */
7533                   if (buf + to_nchars > buf_end)
7534                     break;
7535                   c = XINT (AREF (trans, 0));
7536                 }
7537             }
7538           else
7539             break;
7540           *buf++ = c;
7541           for (i = 1; i < to_nchars; i++)
7542             *buf++ = XINT (AREF (trans, i));
               /* Skip the remaining source characters covered by the
                  translation.  */
7543           for (i = 1; i < from_nchars; i++, pos++)
7544             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7545         }
7546     }
7547
       /* Publish how far we got.  */
7548   coding->consumed = src - coding->source;
7549   coding->consumed_char = pos - coding->src_pos;
7550   coding->charbuf_used = buf - coding->charbuf;
7551   coding->chars_at_source = 0;
7552 }
7553
7554
7555 /* Encode the text at CODING->src_object into CODING->dst_object.
7556    CODING->src_object is a buffer or a string.
7557    CODING->dst_object is a buffer or nil.
7558
7559    If CODING->src_object is a buffer, it must be the current buffer.
7560    In this case, if CODING->src_pos is positive, it is a position of
7561    the source text in the buffer, otherwise. the source text is in the
7562    gap area of the buffer, and coding->src_pos specifies the offset of
7563    the text from GPT (which must be the same as PT).  If this is the
7564    same buffer as CODING->dst_object, CODING->src_pos must be
7565    negative and CODING should not have `pre-write-conversion'.
7566
7567    If CODING->src_object is a string, CODING should not have
7568    `pre-write-conversion'.
7569
7570    If CODING->dst_object is a buffer, the encoded data is inserted at
7571    the current point of that buffer.
7572
7573    If CODING->dst_object is nil, the encoded data is placed at the
7574    memory area specified by CODING->destination.  */
7575
7576 static int
7577 encode_coding (coding)
7578      struct coding_system *coding;
7579 {
7580   Lisp_Object attrs;
7581   Lisp_Object translation_table;
7582   int max_lookup;
7583   struct ccl_spec cclspec;
7584
7585   attrs = CODING_ID_ATTRS (coding->id);
7586   if (coding->encoder == encode_coding_raw_text)
7587     translation_table = Qnil, max_lookup = 0;
7588   else
7589     translation_table = get_translation_table (attrs, 1, &max_lookup);
7590
7591   if (BUFFERP (coding->dst_object))
7592     {
7593       set_buffer_internal (XBUFFER (coding->dst_object));
7594       coding->dst_multibyte
7595         = ! NILP (current_buffer->enable_multibyte_characters);
7596     }
7597
7598   coding->consumed = coding->consumed_char = 0;
7599   coding->produced = coding->produced_char = 0;
7600   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7601   coding->errors = 0;
7602
7603   ALLOC_CONVERSION_WORK_AREA (coding);
7604
7605   if (coding->encoder == encode_coding_ccl)
7606     {
7607       coding->spec.ccl = &cclspec;
7608       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7609     }
7610   do {
7611     coding_set_source (coding);
7612     consume_chars (coding, translation_table, max_lookup);
7613     coding_set_destination (coding);
7614     (*(coding->encoder)) (coding);
7615   } while (coding->consumed_char < coding->src_chars);
7616
7617   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7618     insert_from_gap (coding->produced_char, coding->produced);
7619
7620   return (coding->result);
7621 }
7622
7623
7624 /* Name (or base name) of work buffer for code conversion.  */
7625 static Lisp_Object Vcode_conversion_workbuf_name;
7626
7627 /* A working buffer used by the top level conversion.  Once it is
7628    created, it is never destroyed by the code here (if it is found
        not to be live, make_conversion_work_buffer creates it again).
        It has the name
7629    Vcode_conversion_workbuf_name.  The other working buffers are
7630    destroyed after the use is finished, and their names are modified
7631    versions of Vcode_conversion_workbuf_name.  */
7632 static Lisp_Object Vcode_conversion_reused_workbuf;
7633
7634 /* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
        make_conversion_work_buffer increments it on every call, and
        code_conversion_restore resets it to 0 when the reused buffer
        is released.  */
7635 static int reused_workbuf_in_use;
7636
7637
7638 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7639    multibyteness of returning buffer.  */
7640
7641 static Lisp_Object
7642 make_conversion_work_buffer (multibyte)
7643      int multibyte;
7644 {
7645   Lisp_Object name, workbuf;
7646   struct buffer *current;
7647
7648   if (reused_workbuf_in_use++)
7649     {
7650       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7651       workbuf = Fget_buffer_create (name);
7652     }
7653   else
7654     {
7655       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7656         Vcode_conversion_reused_workbuf
7657           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7658       workbuf = Vcode_conversion_reused_workbuf;
7659     }
7660   current = current_buffer;
7661   set_buffer_internal (XBUFFER (workbuf));
7662   /* We can't allow modification hooks to run in the work buffer.  For
7663      instance, directory_files_internal assumes that file decoding
7664      doesn't compile new regexps.  */
7665   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7666   Ferase_buffer ();
7667   current_buffer->undo_list = Qt;
7668   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7669   set_buffer_internal (current);
7670   return workbuf;
7671 }
7672
7673
7674 static Lisp_Object
7675 code_conversion_restore (arg)
7676      Lisp_Object arg;
7677 {
7678   Lisp_Object current, workbuf;
7679   struct gcpro gcpro1;
7680
7681   GCPRO1 (arg);
7682   current = XCAR (arg);
7683   workbuf = XCDR (arg);
7684   if (! NILP (workbuf))
7685     {
7686       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7687         reused_workbuf_in_use = 0;
7688       else if (! NILP (Fbuffer_live_p (workbuf)))
7689         Fkill_buffer (workbuf);
7690     }
7691   set_buffer_internal (XBUFFER (current));
7692   UNGCPRO;
7693   return Qnil;
7694 }
7695
7696 Lisp_Object
7697 code_conversion_save (with_work_buf, multibyte)
7698      int with_work_buf, multibyte;
7699 {
7700   Lisp_Object workbuf = Qnil;
7701
7702   if (with_work_buf)
7703     workbuf = make_conversion_work_buffer (multibyte);
7704   record_unwind_protect (code_conversion_restore,
7705                          Fcons (Fcurrent_buffer (), workbuf));
7706   return workbuf;
7707 }
7708
7709 int
7710 decode_coding_gap (coding, chars, bytes)
7711      struct coding_system *coding;
7712      EMACS_INT chars, bytes;
7713 {
7714   int count = specpdl_ptr - specpdl;
7715   Lisp_Object attrs;
7716
7717   code_conversion_save (0, 0);
7718
7719   coding->src_object = Fcurrent_buffer ();
7720   coding->src_chars = chars;
7721   coding->src_bytes = bytes;
7722   coding->src_pos = -chars;
7723   coding->src_pos_byte = -bytes;
7724   coding->src_multibyte = chars < bytes;
7725   coding->dst_object = coding->src_object;
7726   coding->dst_pos = PT;
7727   coding->dst_pos_byte = PT_BYTE;
7728   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7729
7730   if (CODING_REQUIRE_DETECTION (coding))
7731     detect_coding (coding);
7732
7733   coding->mode |= CODING_MODE_LAST_BLOCK;
7734   current_buffer->text->inhibit_shrinking = 1;
7735   decode_coding (coding);
7736   current_buffer->text->inhibit_shrinking = 0;
7737
7738   attrs = CODING_ID_ATTRS (coding->id);
7739   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7740     {
7741       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7742       Lisp_Object val;
7743
7744       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7745       val = call1 (CODING_ATTR_POST_READ (attrs),
7746                    make_number (coding->produced_char));
7747       CHECK_NATNUM (val);
7748       coding->produced_char += Z - prev_Z;
7749       coding->produced += Z_BYTE - prev_Z_BYTE;
7750     }
7751
7752   unbind_to (count, Qnil);
7753   return coding->result;
7754 }
7755
7756 int
7757 encode_coding_gap (coding, chars, bytes)
7758      struct coding_system *coding;
7759      EMACS_INT chars, bytes;
7760 {
7761   int count = specpdl_ptr - specpdl;
7762
7763   code_conversion_save (0, 0);
7764
7765   coding->src_object = Fcurrent_buffer ();
7766   coding->src_chars = chars;
7767   coding->src_bytes = bytes;
7768   coding->src_pos = -chars;
7769   coding->src_pos_byte = -bytes;
7770   coding->src_multibyte = chars < bytes;
7771   coding->dst_object = coding->src_object;
7772   coding->dst_pos = PT;
7773   coding->dst_pos_byte = PT_BYTE;
7774
7775   encode_coding (coding);
7776
7777   unbind_to (count, Qnil);
7778   return coding->result;
7779 }
7780
7781
7782 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7783    SRC_OBJECT into DST_OBJECT by coding context CODING.
7784
7785    SRC_OBJECT is a buffer, a string, or Qnil.
7786
7787    If it is a buffer, the text is at point of the buffer.  FROM and TO
7788    are positions in the buffer.
7789
7790    If it is a string, the text is at the beginning of the string.
7791    FROM and TO are indices to the string.
7792
7793    If it is nil, the text is at coding->source.  FROM and TO are
7794    indices to coding->source.
7795
7796    DST_OBJECT is a buffer, Qt, or Qnil.
7797
7798    If it is a buffer, the decoded text is inserted at point of the
7799    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7800    is deleted.
7801
7802    If it is Qt, a string is made from the decoded text, and
7803    set in CODING->dst_object.
7804
7805    If it is Qnil, the decoded text is stored at CODING->destination.
7806    The caller must allocate CODING->dst_bytes bytes at
7807    CODING->destination by xmalloc.  If the decoded text is longer than
7808    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7809  */
7810
7811 void
7812 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7813                       dst_object)
7814      struct coding_system *coding;
7815      Lisp_Object src_object;
7816      EMACS_INT from, from_byte, to, to_byte;
7817      Lisp_Object dst_object;
7818 {
7819   int count = specpdl_ptr - specpdl;
7820   unsigned char *destination;
7821   EMACS_INT dst_bytes;
7822   EMACS_INT chars = to - from;
7823   EMACS_INT bytes = to_byte - from_byte;
7824   Lisp_Object attrs;
7825   int saved_pt = -1, saved_pt_byte;
7826   int need_marker_adjustment = 0;
7827   Lisp_Object old_deactivate_mark;
7828
7829   old_deactivate_mark = Vdeactivate_mark;
7830
7831   if (NILP (dst_object))
7832     {
7833       destination = coding->destination;
7834       dst_bytes = coding->dst_bytes;
7835     }
7836
7837   coding->src_object = src_object;
7838   coding->src_chars = chars;
7839   coding->src_bytes = bytes;
7840   coding->src_multibyte = chars < bytes;
7841
7842   if (STRINGP (src_object))
7843     {
7844       coding->src_pos = from;
7845       coding->src_pos_byte = from_byte;
7846     }
7847   else if (BUFFERP (src_object))
7848     {
7849       set_buffer_internal (XBUFFER (src_object));
7850       if (from != GPT)
7851         move_gap_both (from, from_byte);
7852       if (EQ (src_object, dst_object))
7853         {
7854           struct Lisp_Marker *tail;
7855
7856           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7857             {
7858               tail->need_adjustment
7859                 = tail->charpos == (tail->insertion_type ? from : to);
7860               need_marker_adjustment |= tail->need_adjustment;
7861             }
7862           saved_pt = PT, saved_pt_byte = PT_BYTE;
7863           TEMP_SET_PT_BOTH (from, from_byte);
7864           current_buffer->text->inhibit_shrinking = 1;
7865           del_range_both (from, from_byte, to, to_byte, 1);
7866           coding->src_pos = -chars;
7867           coding->src_pos_byte = -bytes;
7868         }
7869       else
7870         {
7871           coding->src_pos = from;
7872           coding->src_pos_byte = from_byte;
7873         }
7874     }
7875
7876   if (CODING_REQUIRE_DETECTION (coding))
7877     detect_coding (coding);
7878   attrs = CODING_ID_ATTRS (coding->id);
7879
7880   if (EQ (dst_object, Qt)
7881       || (! NILP (CODING_ATTR_POST_READ (attrs))
7882           && NILP (dst_object)))
7883     {
7884       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7885       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7886       coding->dst_pos = BEG;
7887       coding->dst_pos_byte = BEG_BYTE;
7888     }
7889   else if (BUFFERP (dst_object))
7890     {
7891       code_conversion_save (0, 0);
7892       coding->dst_object = dst_object;
7893       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7894       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7895       coding->dst_multibyte
7896         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7897     }
7898   else
7899     {
7900       code_conversion_save (0, 0);
7901       coding->dst_object = Qnil;
7902       /* Most callers presume this will return a multibyte result, and they
7903          won't use `binary' or `raw-text' anyway, so let's not worry about
7904          CODING_FOR_UNIBYTE.  */
7905       coding->dst_multibyte = 1;
7906     }
7907
7908   decode_coding (coding);
7909
7910   if (BUFFERP (coding->dst_object))
7911     set_buffer_internal (XBUFFER (coding->dst_object));
7912
7913   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7914     {
7915       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7916       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7917       Lisp_Object val;
7918
7919       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7920       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7921               old_deactivate_mark);
7922       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7923                         make_number (coding->produced_char));
7924       UNGCPRO;
7925       CHECK_NATNUM (val);
7926       coding->produced_char += Z - prev_Z;
7927       coding->produced += Z_BYTE - prev_Z_BYTE;
7928     }
7929
7930   if (EQ (dst_object, Qt))
7931     {
7932       coding->dst_object = Fbuffer_string ();
7933     }
7934   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7935     {
7936       set_buffer_internal (XBUFFER (coding->dst_object));
7937       if (dst_bytes < coding->produced)
7938         {
7939           destination = xrealloc (destination, coding->produced);
7940           if (! destination)
7941             {
7942               record_conversion_result (coding,
7943                                         CODING_RESULT_INSUFFICIENT_MEM);
7944               unbind_to (count, Qnil);
7945               return;
7946             }
7947           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7948             move_gap_both (BEGV, BEGV_BYTE);
7949           bcopy (BEGV_ADDR, destination, coding->produced);
7950           coding->destination = destination;
7951         }
7952     }
7953
7954   if (saved_pt >= 0)
7955     {
7956       /* This is the case of:
7957          (BUFFERP (src_object) && EQ (src_object, dst_object))
7958          As we have moved PT while replacing the original buffer
7959          contents, we must recover it now.  */
7960       set_buffer_internal (XBUFFER (src_object));
7961       current_buffer->text->inhibit_shrinking = 0;
7962       if (saved_pt < from)
7963         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7964       else if (saved_pt < from + chars)
7965         TEMP_SET_PT_BOTH (from, from_byte);
7966       else if (! NILP (current_buffer->enable_multibyte_characters))
7967         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7968                           saved_pt_byte + (coding->produced - bytes));
7969       else
7970         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7971                           saved_pt_byte + (coding->produced - bytes));
7972
7973       if (need_marker_adjustment)
7974         {
7975           struct Lisp_Marker *tail;
7976
7977           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7978             if (tail->need_adjustment)
7979               {
7980                 tail->need_adjustment = 0;
7981                 if (tail->insertion_type)
7982                   {
7983                     tail->bytepos = from_byte;
7984                     tail->charpos = from;
7985                   }
7986                 else
7987                   {
7988                     tail->bytepos = from_byte + coding->produced;
7989                     tail->charpos
7990                       = (NILP (current_buffer->enable_multibyte_characters)
7991                          ? tail->bytepos : from + coding->produced_char);
7992                   }
7993               }
7994         }
7995     }
7996
7997   Vdeactivate_mark = old_deactivate_mark;
7998   unbind_to (count, coding->dst_object);
7999 }
8000
8001
8002 void
8003 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
8004                       dst_object)
8005      struct coding_system *coding;
8006      Lisp_Object src_object;
8007      EMACS_INT from, from_byte, to, to_byte;
8008      Lisp_Object dst_object;
8009 {
       /* Encode the text of SRC_OBJECT between FROM (FROM_BYTE) and TO
          (TO_BYTE) with CODING, by setting up CODING's source and
          destination fields and calling encode_coding.

          SRC_OBJECT is a string, a buffer, or anything else; in the last
          case the text is taken from CODING->source when a pre-write
          function has to be run.

          If DST_OBJECT is a buffer, the destination position is its point,
          or FROM when it is the same buffer as SRC_OBJECT (the source text
          is then deleted first, and point and the markers at the region
          boundaries are restored afterwards).  If DST_OBJECT is t, the
          result is stored in CODING->dst_object as a unibyte string (or as
          the current buffer's string if encode_coding switched the
          destination to a buffer).  Otherwise CODING->dst_object is set to
          nil before encode_coding is called.  */
8010   int count = SPECPDL_INDEX ();
8011   EMACS_INT chars = to - from;
8012   EMACS_INT bytes = to_byte - from_byte;
8013   Lisp_Object attrs;
       /* Point to restore after an in-place replacement; -1 means "nothing
          to restore".  Declared EMACS_INT like FROM and TO so that a saved
          position is never narrowed to int.  */
8014   EMACS_INT saved_pt = -1, saved_pt_byte;
8015   int need_marker_adjustment = 0;
8016   int kill_src_buffer = 0;
8017   Lisp_Object old_deactivate_mark;
8018
       /* The conversion must not change Vdeactivate_mark; it is put back
          just before returning.  */
8019   old_deactivate_mark = Vdeactivate_mark;
8020
8021   coding->src_object = src_object;
8022   coding->src_chars = chars;
8023   coding->src_bytes = bytes;
8024   coding->src_multibyte = chars < bytes;
8025
8026   attrs = CODING_ID_ATTRS (coding->id);
8027
8028   if (EQ (src_object, dst_object))
8029     {
           /* Replacing the region in place: flag the markers that sit
              exactly on its boundary (FROM for markers with insertion_type
              set, TO for the others) so that they can be repositioned once
              the new text is in place.  */
8030       struct Lisp_Marker *tail;
8031
8032       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8033         {
8034           tail->need_adjustment
8035             = tail->charpos == (tail->insertion_type ? from : to);
8036           need_marker_adjustment |= tail->need_adjustment;
8037         }
8038     }
8039
8040   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8041     {
           /* The coding system has a pre-write function: copy the source
              text into the buffer returned by code_conversion_save, let the
              function work on it, and encode whatever buffer is current
              afterwards.  */
8042       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8043       set_buffer_internal (XBUFFER (coding->src_object));
8044       if (STRINGP (src_object))
8045         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8046       else if (BUFFERP (src_object))
8047         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8048       else
8049         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
8050
8051       if (EQ (src_object, dst_object))
8052         {
               /* In-place replacement: remember point and delete the
                  original text now that it has been copied.  */
8053           set_buffer_internal (XBUFFER (src_object));
8054           saved_pt = PT, saved_pt_byte = PT_BYTE;
8055           del_range_both (from, from_byte, to, to_byte, 1);
8056           set_buffer_internal (XBUFFER (coding->src_object));
8057         }
8058
8059       {
8060         Lisp_Object args[3];
8061         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8062
8063         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8064                 old_deactivate_mark);
8065         args[0] = CODING_ATTR_PRE_WRITE (attrs);
8066         args[1] = make_number (BEG);
8067         args[2] = make_number (Z);
8068         safe_call (3, args);
8069         UNGCPRO;
8070       }
           /* If the pre-write function left another buffer current, that
              buffer becomes the source and is killed once encoding is
              done.  */
8071       if (XBUFFER (coding->src_object) != current_buffer)
8072         kill_src_buffer = 1;
8073       coding->src_object = Fcurrent_buffer ();
8074       if (BEG != GPT)
8075         move_gap_both (BEG, BEG_BYTE);
           /* The whole buffer is the source from here on.  */
8076       coding->src_chars = Z - BEG;
8077       coding->src_bytes = Z_BYTE - BEG_BYTE;
8078       coding->src_pos = BEG;
8079       coding->src_pos_byte = BEG_BYTE;
8080       coding->src_multibyte = Z < Z_BYTE;
8081     }
8082   else if (STRINGP (src_object))
8083     {
8084       code_conversion_save (0, 0);
8085       coding->src_pos = from;
8086       coding->src_pos_byte = from_byte;
8087     }
8088   else if (BUFFERP (src_object))
8089     {
8090       code_conversion_save (0, 0);
8091       set_buffer_internal (XBUFFER (src_object));
8092       if (EQ (src_object, dst_object))
8093         {
               /* In-place replacement without a pre-write function: the
                  value returned by del_range_1 (presumably the deleted
                  text as a string -- confirm against del_range_1) becomes
                  the source, read from position 0.  */
8094           saved_pt = PT, saved_pt_byte = PT_BYTE;
8095           coding->src_object = del_range_1 (from, to, 1, 1);
8096           coding->src_pos = 0;
8097           coding->src_pos_byte = 0;
8098         }
8099       else
8100         {
               /* Move the gap out of the source region when it lies
                  inside it.  */
8101           if (from < GPT && to >= GPT)
8102             move_gap_both (from, from_byte);
8103           coding->src_pos = from;
8104           coding->src_pos_byte = from_byte;
8105         }
8106     }
8107   else
8108     code_conversion_save (0, 0);
8109
8110   if (BUFFERP (dst_object))
8111     {
8112       coding->dst_object = dst_object;
8113       if (EQ (src_object, dst_object))
8114         {
8115           coding->dst_pos = from;
8116           coding->dst_pos_byte = from_byte;
8117         }
8118       else
8119         {
               /* Produce at point of the destination buffer, with its gap
                  moved there.  */
8120           struct buffer *current = current_buffer;
8121
8122           set_buffer_temp (XBUFFER (dst_object));
8123           coding->dst_pos = PT;
8124           coding->dst_pos_byte = PT_BYTE;
8125           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8126           set_buffer_temp (current);
8127         }
8128       coding->dst_multibyte
8129         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
8130     }
8131   else if (EQ (dst_object, Qt))
8132     {
           /* The result goes to a malloc'ed area; its initial size is one
              byte per source character, and at least one byte.  */
8133       coding->dst_object = Qnil;
8134       coding->dst_bytes = coding->src_chars;
8135       if (coding->dst_bytes == 0)
8136         coding->dst_bytes = 1;
8137       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
8138       coding->dst_multibyte = 0;
8139     }
8140   else
8141     {
8142       coding->dst_object = Qnil;
8143       coding->dst_multibyte = 0;
8144     }
8145
8146   encode_coding (coding);
8147
8148   if (EQ (dst_object, Qt))
8149     {
           /* Turn the produced bytes into the string result.  If
              CODING->dst_object has become a buffer during encode_coding,
              take the current buffer's text; otherwise copy the malloc'ed
              area and release it.  */
8150       if (BUFFERP (coding->dst_object))
8151         coding->dst_object = Fbuffer_string ();
8152       else
8153         {
8154           coding->dst_object
8155             = make_unibyte_string ((char *) coding->destination,
8156                                    coding->produced);
8157           xfree (coding->destination);
8158         }
8159     }
8160
8161   if (saved_pt >= 0)
8162     {
8163       /* This is the case of:
8164          (BUFFERP (src_object) && EQ (src_object, dst_object))
8165          As we have moved PT while replacing the original buffer
8166          contents, we must recover it now.  */
8167       set_buffer_internal (XBUFFER (src_object));
           /* Point before the region is unchanged; point inside the region
              goes to FROM; point after it is shifted by the change in size
              (counted in bytes for a unibyte buffer).  */
8168       if (saved_pt < from)
8169         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8170       else if (saved_pt < from + chars)
8171         TEMP_SET_PT_BOTH (from, from_byte);
8172       else if (! NILP (current_buffer->enable_multibyte_characters))
8173         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8174                           saved_pt_byte + (coding->produced - bytes));
8175       else
8176         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8177                           saved_pt_byte + (coding->produced - bytes));
8178
8179       if (need_marker_adjustment)
8180         {
               /* Put the boundary markers flagged at the start back: those
                  with insertion_type set at the start of the new text, the
                  others at its end.  */
8181           struct Lisp_Marker *tail;
8182
8183           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8184             if (tail->need_adjustment)
8185               {
8186                 tail->need_adjustment = 0;
8187                 if (tail->insertion_type)
8188                   {
8189                     tail->bytepos = from_byte;
8190                     tail->charpos = from;
8191                   }
8192                 else
8193                   {
8194                     tail->bytepos = from_byte + coding->produced;
8195                     tail->charpos
8196                       = (NILP (current_buffer->enable_multibyte_characters)
8197                          ? tail->bytepos : from + coding->produced_char);
8198                   }
8199               }
8200         }
8201     }
8202
8203   if (kill_src_buffer)
8204     Fkill_buffer (coding->src_object);
8205
8206   Vdeactivate_mark = old_deactivate_mark;
8207   unbind_to (count, Qnil);
8208 }
8209
8210
8211 Lisp_Object
8212 preferred_coding_system ()
8213 {
       /* Return the name of the coding system whose id is recorded for the
          category listed first in coding_priorities.  */
8214   enum coding_category top = coding_priorities[0];
8215
8216   return CODING_ID_NAME (coding_categories[top].id);
8217 }
8218
8219 \f
8220 #ifdef emacs
8221 /*** 11. Emacs Lisp library functions ***/
8222
8223 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8224        doc: /* Return t if OBJECT is nil or a coding-system.
8225 See the documentation of `define-coding-system' for information
8226 about coding-system objects.  */)
8227      (object)
8228      Lisp_Object object;
8229 {
8230   int ok = NILP (object) || CODING_SYSTEM_ID (object) >= 0;
8231
8232   /* A symbol with a non-nil Qcoding_system_define_form property counts too.  */
8233   if (! ok && SYMBOLP (object))
8234     ok = ! NILP (Fget (object, Qcoding_system_define_form));
8235
8236   return ok ? Qt : Qnil;
8237 }
8238
8239 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8240        Sread_non_nil_coding_system, 1, 1, 0,
8241        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8242      (prompt)
8243      Lisp_Object prompt;
8244 {
8245   Lisp_Object name;
8246
8247   /* Ask again for as long as the user submits empty input.  */
8248   do
8249     name = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
8250                              Qcoding_system_history, Qnil, Qnil);
8251   while (SCHARS (name) == 0);
8252   return Fintern (name, Qnil);
8253 }
8254
8255 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8256        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8257 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8258 Ignores case when completing coding systems (all Emacs coding systems
8259 are lower-case).  */)
8260      (prompt, default_coding_system)
8261      Lisp_Object prompt, default_coding_system;
8262 {
8263   int depth = SPECPDL_INDEX ();
8264   Lisp_Object input;
8265
       /* A default given as a symbol is handed to Fcompleting_read by
          name.  */
8266   if (SYMBOLP (default_coding_system))
8267     default_coding_system = SYMBOL_NAME (default_coding_system);
       /* Qcompletion_ignore_case is bound to t only around this read.  */
8268   specbind (Qcompletion_ignore_case, Qt);
8269   input = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
8270                             Qcoding_system_history, default_coding_system,
8271                             Qnil);
8272   unbind_to (depth, Qnil);
8273   return SCHARS (input) != 0 ? Fintern (input, Qnil) : Qnil;
8274 }
8275
8276 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8277        1, 1, 0,
8278        doc: /* Check validity of CODING-SYSTEM.
8279 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8280 It is valid if it is nil or a symbol defined as a coding system by the
8281 function `define-coding-system'.  */)
8282   (coding_system)
8283      Lisp_Object coding_system;
8284 {
8285   Lisp_Object pending = Fget (coding_system, Qcoding_system_define_form);
8286
8287   /* Evaluate a pending definition form, clearing the property first.  */
8288   if (! NILP (pending))
8289     {
8290       Fput (coding_system, Qcoding_system_define_form, Qnil);
8291       safe_eval (pending);
8292     }
8293   if (NILP (Fcoding_system_p (coding_system)))
8294     xsignal1 (Qcoding_system_error, coding_system);
8295   return coding_system;
8296 }
8297
8298 \f
8299 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8300    HIGHEST is nonzero, return the coding system of the highest
8301    priority among the detected coding systems.  Otherwise return a
8302    list of detected coding systems sorted by their priorities.  If
8303    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8304    multibyte form but contains only ASCII and eight-bit chars.
8305    Otherwise, the bytes are raw bytes.
8306
8307    CODING-SYSTEM controls the detection as below:
8308
8309    If it is nil, detect both text-format and eol-format.  If the
8310    text-format part of CODING-SYSTEM is already specified
8311    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8312    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8313    detect only text-format.
        
        Each element of the result is a coding system name: the entry of
        the eol-type vector matching the detected end-of-line format, or
        the plain name when no end-of-line format applies.  With HIGHEST
        nonzero the value is nil if no candidate is left.  */
8314
8315 Lisp_Object
8316 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
8317                       coding_system)
8318      const unsigned char *src;
8319      EMACS_INT src_chars, src_bytes;
8320      int highest;
8321      int multibytep;
8322      Lisp_Object coding_system;
8323 {
8324   const unsigned char *src_end = src + src_bytes;
8325   Lisp_Object attrs, eol_type;
8326   Lisp_Object val = Qnil;
8327   struct coding_system coding;
8328   int id;
8329   struct coding_detection_info detect_info;
8330   enum coding_category base_category;
8331   int null_byte_found = 0, eight_bit_found = 0;
8332
       /* A nil CODING_SYSTEM is treated as `undecided'.  */
8333   if (NILP (coding_system))
8334     coding_system = Qundecided;
8335   setup_coding_system (coding_system, &coding);
8336   attrs = CODING_ID_ATTRS (coding.id);
8337   eol_type = CODING_ID_EOL_TYPE (coding.id);
8338   coding_system = CODING_ATTR_BASE_NAME (attrs);
8339
       /* Describe the whole input to the detectors through CODING.  */
8340   coding.source = src;
8341   coding.src_chars = src_chars;
8342   coding.src_bytes = src_bytes;
8343   coding.src_multibyte = multibytep;
8344   coding.consumed = 0;
8345   coding.mode |= CODING_MODE_LAST_BLOCK;
8346   coding.head_ascii = 0;
8347
8348   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8349
8350   /* At first, detect text-format if necessary.  */
8351   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8352   if (base_category == coding_category_undecided)
8353     {
8354       enum coding_category category;
8355       struct coding_system *this;
8356       int c, i;
8357
8358       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8359       for (; src < src_end; src++)
8360         {
8361           c = *src;
8362           if (c & 0x80)
8363             {
                   /* Stop scanning once both an 8-bit byte and a null byte
                      have been seen.  */
8364               eight_bit_found = 1;
8365               if (null_byte_found)
8366                 break;
8367             }
8368           else if (c < 0x20)
8369             {
                   /* An ISO-2022 control (ESC, SI or SO): run the ISO-2022
                      detector, but only while no detector has run yet and
                      escape detection is not inhibited.  */
8370               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8371                   && ! inhibit_iso_escape_detection
8372                   && ! detect_info.checked)
8373                 {
8374                   if (detect_coding_iso_2022 (&coding, &detect_info))
8375                     {
8376                       /* We have scanned the whole data.  */
8377                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8378                         {
8379                           /* We didn't find an 8-bit code.  We may
8380                              have found a null-byte, but it's very
8381                              rare that a binary file conform to
8382                              ISO-2022.  */
8383                           src = src_end;
8384                           coding.head_ascii = src - coding.source;
8385                         }
                           /* Only the ISO-2022 escape categories remain
                              candidates.  */
8386                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8387                       break;
8388                     }
8389                 }
8390               else if (! c && !inhibit_null_byte_detection)
8391                 {
8392                   null_byte_found = 1;
8393                   if (eight_bit_found)
8394                     break;
8395                 }
                   /* head_ascii counts the bytes that precede the first
                      8-bit byte.  */
8396               if (! eight_bit_found)
8397                 coding.head_ascii++;
8398             }
8399           else if (! eight_bit_found)
8400             coding.head_ascii++;
8401         }
8402
           /* Look at the detectors' results unless the scan saw only
              7-bit, non-null bytes up to the end and nothing was found.  */
8403       if (null_byte_found || eight_bit_found
8404           || coding.head_ascii < coding.src_bytes
8405           || detect_info.found)
8406         {
8407           if (coding.head_ascii == coding.src_bytes)
8408             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8409             for (i = 0; i < coding_category_raw_text; i++)
8410               {
8411                 category = coding_priorities[i];
8412                 this = coding_categories + category;
8413                 if (detect_info.found & (1 << category))
8414                   break;
8415               }
8416           else
8417             {
8418               if (null_byte_found)
8419                 {
                       /* With a null byte present only UTF-16 is still
                          tried: every other category is marked as checked
                          and rejected.  */
8420                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8421                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8422                 }
                   /* Try the categories in priority order.  */
8423               for (i = 0; i < coding_category_raw_text; i++)
8424                 {
8425                   category = coding_priorities[i];
8426                   this = coding_categories + category;
8427
8428                   if (this->id < 0)
8429                     {
8430                       /* No coding system of this category is defined.  */
8431                       detect_info.rejected |= (1 << category);
8432                     }
8433                   else if (category >= coding_category_raw_text)
8434                     continue;
8435                   else if (detect_info.checked & (1 << category))
8436                     {
                           /* Already examined by an earlier detector
                              call; for HIGHEST the first found category
                              ends the search.  */
8437                       if (highest
8438                           && (detect_info.found & (1 << category)))
8439                         break;
8440                     }
8441                   else if ((*(this->detector)) (&coding, &detect_info)
8442                            && highest
8443                            && (detect_info.found & (1 << category)))
8444                     {
                           /* Replace the automatic UTF-16 category by the
                              byte order that was actually found.  */
8445                       if (category == coding_category_utf_16_auto)
8446                         {
8447                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8448                             category = coding_category_utf_16_le;
8449                           else
8450                             category = coding_category_utf_16_be;
8451                         }
8452                       break;
8453                     }
8454                 }
8455             }
8456         }
8457
           /* Every category was rejected, or a null byte was seen: the
              answer is no-conversion.  */
8458       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8459           || null_byte_found)
8460         {
8461           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8462           id = CODING_SYSTEM_ID (Qno_conversion);
8463           val = Fcons (make_number (id), Qnil);
8464         }
           /* Nothing rejected and nothing found: stay undecided.  */
8465       else if (! detect_info.rejected && ! detect_info.found)
8466         {
8467           detect_info.found = CATEGORY_MASK_ANY;
8468           id = coding_categories[coding_category_undecided].id;
8469           val = Fcons (make_number (id), Qnil);
8470         }
8471       else if (highest)
8472         {
8473           if (detect_info.found)
8474             {
                   /* NOTE(review): CATEGORY and THIS are the values left
                      over from the detection loops above -- confirm they
                      are always set when detect_info.found is nonzero.  */
8475               detect_info.found = 1 << category;
8476               val = Fcons (make_number (this->id), Qnil);
8477             }
8478           else
                 /* Nothing was positively found: take the first category
                    in priority order that was not rejected.  */
8479             for (i = 0; i < coding_category_raw_text; i++)
8480               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8481                 {
8482                   detect_info.found = 1 << coding_priorities[i];
8483                   id = coding_categories[coding_priorities[i]].id;
8484                   val = Fcons (make_number (id), Qnil);
8485                   break;
8486                 }
8487         }
8488       else
8489         {
               /* Build the full list.  Both loops walk the priorities
                  backwards and cons onto the front of VAL, so VAL ends up
                  in priority order with the found categories ahead of
                  those that were neither found nor rejected.  */
8490           int mask = detect_info.rejected | detect_info.found;
8491           int found = 0;
8492
8493           for (i = coding_category_raw_text - 1; i >= 0; i--)
8494             {
8495               category = coding_priorities[i];
8496               if (! (mask & (1 << category)))
8497                 {
8498                   found |= 1 << category;
8499                   id = coding_categories[category].id;
8500                   if (id >= 0)
8501                     val = Fcons (make_number (id), val);
8502                 }
8503             }
8504           for (i = coding_category_raw_text - 1; i >= 0; i--)
8505             {
8506               category = coding_priorities[i];
8507               if (detect_info.found & (1 << category))
8508                 {
8509                   id = coding_categories[category].id;
8510                   val = Fcons (make_number (id), val);
8511                 }
8512             }
8513           detect_info.found |= found;
8514         }
8515     }
8516   else if (base_category == coding_category_utf_8_auto)
8517     {
           /* Only decide between UTF-8 with and without signature.  */
8518       if (detect_coding_utf_8 (&coding, &detect_info))
8519         {
8520           struct coding_system *this;
8521
8522           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8523             this = coding_categories + coding_category_utf_8_sig;
8524           else
8525             this = coding_categories + coding_category_utf_8_nosig;
8526           val = Fcons (make_number (this->id), Qnil);
8527         }
8528     }
8529   else if (base_category == coding_category_utf_16_auto)
8530     {
           /* Only decide which UTF-16 variant the data is in.  */
8531       if (detect_coding_utf_16 (&coding, &detect_info))
8532         {
8533           struct coding_system *this;
8534
8535           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8536             this = coding_categories + coding_category_utf_16_le;
8537           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8538             this = coding_categories + coding_category_utf_16_be;
8539           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8540             this = coding_categories + coding_category_utf_16_be_nosig;
8541           else
8542             this = coding_categories + coding_category_utf_16_le_nosig;
8543           val = Fcons (make_number (this->id), Qnil);
8544         }
8545     }
8546   else
8547     {
           /* The text-format is already given by CODING_SYSTEM; it is the
              only candidate.  */
8548       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8549       val = Fcons (make_number (coding.id), Qnil);
8550     }
8551
8552   /* Then, detect eol-format if necessary.  */
8553   {
         /* -1 means that the eol format of that family was not examined.  */
8554     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8555     Lisp_Object tail;
8556
8557     if (VECTORP (eol_type))
8558       {
             /* The eol-format is not fixed by CODING_SYSTEM: detect it for
                each family of categories that was found.  */
8559         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8560           {
                 /* Data containing a null byte is not scanned; LF is
                    used.  */
8561             if (null_byte_found)
8562               normal_eol = EOL_SEEN_LF;
8563             else
8564               normal_eol = detect_eol (coding.source, src_bytes,
8565                                        coding_category_raw_text);
8566           }
8567         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8568                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8569           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8570                                       coding_category_utf_16_be);
8571         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8572                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8573           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8574                                       coding_category_utf_16_le);
8575       }
8576     else
8577       {
             /* CODING_SYSTEM fixes the eol-format; use it for every
                family.  */
8578         if (EQ (eol_type, Qunix))
8579           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8580         else if (EQ (eol_type, Qdos))
8581           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8582         else
8583           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8584       }
8585
         /* Replace each coding system id in VAL by a name: the element of
            that system's eol-type vector selected by the eol format of its
            family (0 for LF, 1 for CRLF, 2 for CR), or the plain name when
            the format is unknown or the system has no such vector.  */
8586     for (tail = val; CONSP (tail); tail = XCDR (tail))
8587       {
8588         enum coding_category category;
8589         int this_eol;
8590
8591         id = XINT (XCAR (tail));
8592         attrs = CODING_ID_ATTRS (id);
8593         category = XINT (CODING_ATTR_CATEGORY (attrs));
8594         eol_type = CODING_ID_EOL_TYPE (id);
8595         if (VECTORP (eol_type))
8596           {
8597             if (category == coding_category_utf_16_be
8598                 || category == coding_category_utf_16_be_nosig)
8599               this_eol = utf_16_be_eol;
8600             else if (category == coding_category_utf_16_le
8601                      || category == coding_category_utf_16_le_nosig)
8602               this_eol = utf_16_le_eol;
8603             else
8604               this_eol = normal_eol;
8605
8606             if (this_eol == EOL_SEEN_LF)
8607               XSETCAR (tail, AREF (eol_type, 0));
8608             else if (this_eol == EOL_SEEN_CRLF)
8609               XSETCAR (tail, AREF (eol_type, 1));
8610             else if (this_eol == EOL_SEEN_CR)
8611               XSETCAR (tail, AREF (eol_type, 2));
8612             else
8613               XSETCAR (tail, CODING_ID_NAME (id));
8614           }
8615         else
8616           XSETCAR (tail, CODING_ID_NAME (id));
8617       }
8618   }
8619
       /* For HIGHEST return just the first element (nil if there is
          none), otherwise the whole list.  */
8620   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8621 }
8622
8623
8624 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8625        2, 3, 0,
8626        doc: /* Detect coding system of the text in the region between START and END.
8627 Return a list of possible coding systems ordered by priority.
8628 The coding systems to try and their priorities follows what
8629 the function `coding-system-priority-list' (which see) returns.
8630
8631 If only ASCII characters are found (except for such ISO-2022 control
8632 characters as ESC), it returns a list of single element `undecided'
8633 or its subsidiary coding system according to a detected end-of-line
8634 format.
8635
8636 If optional argument HIGHEST is non-nil, return the coding system of
8637 highest priority.  */)
8638      (start, end, highest)
8639      Lisp_Object start, end, highest;
8640 {
       /* Buffer positions are kept in EMACS_INT, the type of the length
          arguments of detect_coding_system, so that large positions are
          not narrowed to int.  */
8641   EMACS_INT from, to;
8642   EMACS_INT from_byte, to_byte;
8643
8644   CHECK_NUMBER_COERCE_MARKER (start);
8645   CHECK_NUMBER_COERCE_MARKER (end);
8646
8647   validate_region (&start, &end);
8648   from = XINT (start), to = XINT (end);
8649   from_byte = CHAR_TO_BYTE (from);
8650   to_byte = CHAR_TO_BYTE (to);
8651
       /* If the gap lies inside the region, move it to the end of the
          region so that the text can be read as one block starting at
          BYTE_POS_ADDR (from_byte).  */
8652   if (from < GPT && to >= GPT)
8653     move_gap_both (to, to_byte);
8654
       /* Passing Qnil as the coding system asks for detection of both
          text-format and eol-format.  */
8655   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8656                                to - from, to_byte - from_byte,
8657                                !NILP (highest),
8658                                !NILP (current_buffer
8659                                       ->enable_multibyte_characters),
8660                                Qnil);
8661 }
8662
8663 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8664        1, 2, 0,
8665        doc: /* Detect coding system of the text in STRING.
8666 Return a list of possible coding systems ordered by priority.
8667 The coding systems to try and their priorities follows what
8668 the function `coding-system-priority-list' (which see) returns.
8669
8670 If only ASCII characters are found (except for such ISO-2022 control
8671 characters as ESC), it returns a list of single element `undecided'
8672 or its subsidiary coding system according to a detected end-of-line
8673 format.
8674
8675 If optional argument HIGHEST is non-nil, return the coding system of
8676 highest priority.  */)
8677      (string, highest)
8678      Lisp_Object string, highest;
8679 {
8680   int want_highest = ! NILP (highest);
8681
8682   CHECK_STRING (string);
8683   /* Run the detection over the whole of STRING's data.  */
8684   return detect_coding_system (SDATA (string), SCHARS (string),
8685                                SBYTES (string), want_highest,
                                    STRING_MULTIBYTE (string), Qnil);
8686 }
8687
8688
8689 static INLINE int
8690 char_encodable_p (c, attrs)
8691      int c;
8692      Lisp_Object attrs;
8693 {
       /* Return nonzero if character C, after translation by the table in
          ATTRS (if any), belongs to one of the charsets listed in
          ATTRS.  */
8694   Lisp_Object table = CODING_ATTR_TRANS_TBL (attrs);
8695   Lisp_Object rest;
8696
8697   if (! NILP (table))
8698     c = translate_char (table, c);
8699   /* C is encodable as soon as one listed charset contains it.  */
8700   rest = CODING_ATTR_CHARSET_LIST (attrs);
8701   while (CONSP (rest))
8702     {
8703       struct charset *cs = CHARSET_FROM_ID (XINT (XCAR (rest)));
8704       if (CHAR_CHARSET_P (c, cs))
8705         return 1;
8706       rest = XCDR (rest);
8707     }
8708   return ! NILP (rest);
8709 }
8710
8711
8712 /* Return a list of coding systems that safely encode the text between
8713    START and END.  If EXCLUDE is non-nil, it is a list of coding
8714    systems not to check.  The returned list doesn't contain any such
8715    coding systems.  In any case, if the text contains only ASCII or is
8716    unibyte, return t.  */
8717
8718 DEFUN ("find-coding-systems-region-internal",
8719        Ffind_coding_systems_region_internal,
8720        Sfind_coding_systems_region_internal, 2, 3, 0,
8721        doc: /* Internal use only.  */)
8722      (start, end, exclude)
8723      Lisp_Object start, end, exclude;
8724 {
8725   Lisp_Object coding_attrs_list, safe_codings;
8726   EMACS_INT start_byte, end_byte;
8727   const unsigned char *p, *pbeg, *pend;
8728   int c;
8729   Lisp_Object tail, elt, work_table;
8730
8731   if (STRINGP (start))
8732     {
           /* START is the string to check and END is not used.  A unibyte
              string, or one with as many bytes as characters, needs no
              further checking.  */
8733       if (!STRING_MULTIBYTE (start)
8734           || SCHARS (start) == SBYTES (start))
8735         return Qt;
8736       start_byte = 0;
8737       end_byte = SBYTES (start);
8738     }
8739   else
8740     {
           /* START and END are positions in the current buffer.  */
8741       CHECK_NUMBER_COERCE_MARKER (start);
8742       CHECK_NUMBER_COERCE_MARKER (end);
8743       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8744         args_out_of_range (start, end);
8745       if (NILP (current_buffer->enable_multibyte_characters))
8746         return Qt;
8747       start_byte = CHAR_TO_BYTE (XINT (start));
8748       end_byte = CHAR_TO_BYTE (XINT (end));
           /* As many bytes as characters: nothing to check.  */
8749       if (XINT (end) - XINT (start) == end_byte - start_byte)
8750         return Qt;
8751
           /* If the gap is strictly inside the region, move it to
              whichever end of the region is nearer.  */
8752       if (XINT (start) < GPT && XINT (end) > GPT)
8753         {
8754           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8755             move_gap_both (XINT (start), start_byte);
8756           else
8757             move_gap_both (XINT (end), end_byte);
8758         }
8759     }
8760
       /* Collect the attribute vectors of the candidates: every coding
          system in Vcoding_system_list that is not in EXCLUDE, whose name
          is its own base name, and whose type is not `undecided'.  */
8761   coding_attrs_list = Qnil;
8762   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8763     if (NILP (exclude)
8764         || NILP (Fmemq (XCAR (tail), exclude)))
8765       {
8766         Lisp_Object attrs;
8767
8768         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8769         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8770             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8771           {
                 /* Store the table returned by get_translation_table in
                    the attribute vector (presumably the slot that
                    char_encodable_p reads through CODING_ATTR_TRANS_TBL
                    -- confirm).  */
8772             ASET (attrs, coding_attr_trans_tbl,
8773                   get_translation_table (attrs, 1, NULL));
8774             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8775           }
8776       }
8777
8778   if (STRINGP (start))
8779     p = pbeg = SDATA (start);
8780   else
8781     p = pbeg = BYTE_POS_ADDR (start_byte);
8782   pend = p + (end_byte - start_byte);
8783
       /* Trim the ASCII bytes at both ends of the text.  */
8784   while (p < pend && ASCII_BYTE_P (*p)) p++;
8785   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8786
       /* WORK_TABLE records the characters that were already checked.  */
8787   work_table = Fmake_char_table (Qnil, Qnil);
8788   while (p < pend)
8789     {
8790       if (ASCII_BYTE_P (*p))
8791         p++;
8792       else
8793         {
8794           c = STRING_CHAR_ADVANCE (p);
8795           if (!NILP (char_table_ref (work_table, c)))
8796             /* This character was already checked.  Ignore it.  */
8797             continue;
8798
               /* Remove from coding_attrs_list every candidate that cannot
                  encode C.  A cell is removed by copying its successor
                  into it, and TAIL is then not advanced so that the
                  copied-in element is checked as well.  The last cell has
                  no successor, so its car is set to nil instead; nil
                  entries are skipped.  */
8799           charset_map_loaded = 0;
8800           for (tail = coding_attrs_list; CONSP (tail);)
8801             {
8802               elt = XCAR (tail);
8803               if (NILP (elt))
8804                 tail = XCDR (tail);
8805               else if (char_encodable_p (c, elt))
8806                 tail = XCDR (tail);
8807               else if (CONSP (XCDR (tail)))
8808                 {
8809                   XSETCAR (tail, XCAR (XCDR (tail)));
8810                   XSETCDR (tail, XCDR (XCDR (tail)));
8811                 }
8812               else
8813                 {
8814                   XSETCAR (tail, Qnil);
8815                   tail = XCDR (tail);
8816                 }
8817             }
8818           if (charset_map_loaded)
8819             {
                   /* charset_map_loaded became nonzero during the checks
                      above: recompute P and PEND from the current address
                      of the text, keeping their offsets (presumably
                      because loading a charset map can relocate string or
                      buffer text -- confirm).  */
8820               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8821
8822               if (STRINGP (start))
8823                 pbeg = SDATA (start);
8824               else
8825                 pbeg = BYTE_POS_ADDR (start_byte);
8826               p = pbeg + p_offset;
8827               pend = pbeg + pend_offset;
8828             }
8829           char_table_set (work_table, c, Qt);
8830         }
8831     }
8832
       /* raw-text and no-conversion are always part of the result; the
          base names of the surviving candidates are added in front.  */
8833   safe_codings = list2 (Qraw_text, Qno_conversion);
8834   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8835     if (! NILP (XCAR (tail)))
8836       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8837
8838   return safe_codings;
8839 }
8840
8841
8842 DEFUN ("unencodable-char-position", Funencodable_char_position,
8843        Sunencodable_char_position, 3, 5, 0,
8844        doc: /*
8845 Return position of first un-encodable character in a region.
8846 START and END specify the region and CODING-SYSTEM specifies the
8847 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8848
8849 If optional 4th argument COUNT is non-nil, it specifies at most how
8850 many un-encodable characters to search.  In this case, the value is a
8851 list of positions.
8852
8853 If optional 5th argument STRING is non-nil, it is a string to search
8854 for un-encodable characters.  In that case, START and END are indexes
8855 to the string.  */)
8856      (start, end, coding_system, count, string)
8857      Lisp_Object start, end, coding_system, count, string;
8858 {
8859   int n;
8860   struct coding_system coding;
8861   Lisp_Object attrs, charset_list, translation_table;
8862   Lisp_Object positions;
8863   int from, to;
8864   const unsigned char *p, *stop, *pend;
8865   int ascii_compatible;
8866
8867   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8868   attrs = CODING_ID_ATTRS (coding.id);
8869   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8870     return Qnil;
8871   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8872   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8873   translation_table = get_translation_table (attrs, 1, NULL);
8874
8875   if (NILP (string))
8876     {
8877       validate_region (&start, &end);
8878       from = XINT (start);
8879       to = XINT (end);
8880       if (NILP (current_buffer->enable_multibyte_characters)
8881           || (ascii_compatible
8882               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8883         return Qnil;
8884       p = CHAR_POS_ADDR (from);
8885       pend = CHAR_POS_ADDR (to);
8886       if (from < GPT && to >= GPT)
8887         stop = GPT_ADDR;
8888       else
8889         stop = pend;
8890     }
8891   else
8892     {
8893       CHECK_STRING (string);
8894       CHECK_NATNUM (start);
8895       CHECK_NATNUM (end);
8896       from = XINT (start);
8897       to = XINT (end);
8898       if (from > to
8899           || to > SCHARS (string))
8900         args_out_of_range_3 (string, start, end);
8901       if (! STRING_MULTIBYTE (string))
8902         return Qnil;
8903       p = SDATA (string) + string_char_to_byte (string, from);
8904       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8905       if (ascii_compatible && (to - from) == (pend - p))
8906         return Qnil;
8907     }
8908
8909   if (NILP (count))
8910     n = 1;
8911   else
8912     {
8913       CHECK_NATNUM (count);
8914       n = XINT (count);
8915     }
8916
8917   positions = Qnil;
8918   while (1)
8919     {
8920       int c;
8921
8922       if (ascii_compatible)
8923         while (p < stop && ASCII_BYTE_P (*p))
8924           p++, from++;
8925       if (p >= stop)
8926         {
8927           if (p >= pend)
8928             break;
8929           stop = pend;
8930           p = GAP_END_ADDR;
8931         }
8932
8933       c = STRING_CHAR_ADVANCE (p);
8934       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8935           && ! char_charset (translate_char (translation_table, c),
8936                              charset_list, NULL))
8937         {
8938           positions = Fcons (make_number (from), positions);
8939           n--;
8940           if (n == 0)
8941             break;
8942         }
8943
8944       from++;
8945     }
8946
8947   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8948 }
8949
8950
8951 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8952        Scheck_coding_systems_region, 3, 3, 0,
8953        doc: /* Check if the region is encodable by coding systems.
8954
8955 START and END are buffer positions specifying the region.
8956 CODING-SYSTEM-LIST is a list of coding systems to check.
8957
8958 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8959 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8960 whole region, POS0, POS1, ... are buffer positions where non-encodable
8961 characters are found.
8962
8963 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8964 value is nil.
8965
8966 START may be a string.  In that case, check if the string is
8967 encodable, and the value contains indices to the string instead of
8968 buffer positions.  END is ignored.
8969
8970 If the current buffer (or START if it is a string) is unibyte, the value
8971 is nil.  */)
8972      (start, end, coding_system_list)
8973      Lisp_Object start, end, coding_system_list;
8974 {
8975   Lisp_Object list;
8976   EMACS_INT start_byte, end_byte;
8977   int pos;
8978   const unsigned char *p, *pbeg, *pend;
8979   int c;
8980   Lisp_Object tail, elt, attrs;
8981
8982   if (STRINGP (start))
8983     {
8984       if (!STRING_MULTIBYTE (start)
8985           || SCHARS (start) == SBYTES (start))
8986         return Qnil;
8987       start_byte = 0;
8988       end_byte = SBYTES (start);
8989       pos = 0;
8990     }
8991   else
8992     {
8993       CHECK_NUMBER_COERCE_MARKER (start);
8994       CHECK_NUMBER_COERCE_MARKER (end);
8995       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8996         args_out_of_range (start, end);
8997       if (NILP (current_buffer->enable_multibyte_characters))
8998         return Qnil;
8999       start_byte = CHAR_TO_BYTE (XINT (start));
9000       end_byte = CHAR_TO_BYTE (XINT (end));
9001       if (XINT (end) - XINT (start) == end_byte - start_byte)
9002         return Qnil;
9003
9004       if (XINT (start) < GPT && XINT (end) > GPT)
9005         {
9006           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9007             move_gap_both (XINT (start), start_byte);
9008           else
9009             move_gap_both (XINT (end), end_byte);
9010         }
9011       pos = XINT (start);
9012     }
9013
9014   list = Qnil;
9015   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9016     {
9017       elt = XCAR (tail);
9018       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9019       ASET (attrs, coding_attr_trans_tbl,
9020             get_translation_table (attrs, 1, NULL));
9021       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
9022     }
9023
9024   if (STRINGP (start))
9025     p = pbeg = SDATA (start);
9026   else
9027     p = pbeg = BYTE_POS_ADDR (start_byte);
9028   pend = p + (end_byte - start_byte);
9029
9030   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9031   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9032
9033   while (p < pend)
9034     {
9035       if (ASCII_BYTE_P (*p))
9036         p++;
9037       else
9038         {
9039           c = STRING_CHAR_ADVANCE (p);
9040
9041           charset_map_loaded = 0;
9042           for (tail = list; CONSP (tail); tail = XCDR (tail))
9043             {
9044               elt = XCDR (XCAR (tail));
9045               if (! char_encodable_p (c, XCAR (elt)))
9046                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9047             }
9048           if (charset_map_loaded)
9049             {
9050               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
9051
9052               if (STRINGP (start))
9053                 pbeg = SDATA (start);
9054               else
9055                 pbeg = BYTE_POS_ADDR (start_byte);
9056               p = pbeg + p_offset;
9057               pend = pbeg + pend_offset;
9058             }
9059         }
9060       pos++;
9061     }
9062
9063   tail = list;
9064   list = Qnil;
9065   for (; CONSP (tail); tail = XCDR (tail))
9066     {
9067       elt = XCAR (tail);
9068       if (CONSP (XCDR (XCDR (elt))))
9069         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9070                       list);
9071     }
9072
9073   return list;
9074 }
9075
9076
9077 Lisp_Object
9078 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
9079      Lisp_Object start, end, coding_system, dst_object;
9080      int encodep, norecord;
9081 {
9082   struct coding_system coding;
9083   EMACS_INT from, from_byte, to, to_byte;
9084   Lisp_Object src_object;
9085
9086   CHECK_NUMBER_COERCE_MARKER (start);
9087   CHECK_NUMBER_COERCE_MARKER (end);
9088   if (NILP (coding_system))
9089     coding_system = Qno_conversion;
9090   else
9091     CHECK_CODING_SYSTEM (coding_system);
9092   src_object = Fcurrent_buffer ();
9093   if (NILP (dst_object))
9094     dst_object = src_object;
9095   else if (! EQ (dst_object, Qt))
9096     CHECK_BUFFER (dst_object);
9097
9098   validate_region (&start, &end);
9099   from = XFASTINT (start);
9100   from_byte = CHAR_TO_BYTE (from);
9101   to = XFASTINT (end);
9102   to_byte = CHAR_TO_BYTE (to);
9103
9104   setup_coding_system (coding_system, &coding);
9105   coding.mode |= CODING_MODE_LAST_BLOCK;
9106
9107   if (encodep)
9108     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9109                           dst_object);
9110   else
9111     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9112                           dst_object);
9113   if (! norecord)
9114     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9115
9116   return (BUFFERP (dst_object)
9117           ? make_number (coding.produced_char)
9118           : coding.dst_object);
9119 }
9120
9121
9122 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9123        3, 4, "r\nzCoding system: ",
9124        doc: /* Decode the current region from the specified coding system.
9125 When called from a program, takes four arguments:
9126         START, END, CODING-SYSTEM, and DESTINATION.
9127 START and END are buffer positions.
9128
9129 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9130 If nil, the region between START and END is replaced by the decoded text.
9131 If buffer, the decoded text is inserted in that buffer after point (point
9132 does not move).
9133 In those cases, the length of the decoded text is returned.
9134 If DESTINATION is t, the decoded text is returned.
9135
9136 This function sets `last-coding-system-used' to the precise coding system
9137 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9138 not fully specified.)  */)
9139      (start, end, coding_system, destination)
9140      Lisp_Object start, end, coding_system, destination;
9141 {
9142   return code_convert_region (start, end, coding_system, destination, 0, 0);
9143 }
9144
9145 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9146        3, 4, "r\nzCoding system: ",
9147        doc: /* Encode the current region by specified coding system.
9148 When called from a program, takes four arguments:
9149         START, END, CODING-SYSTEM and DESTINATION.
9150 START and END are buffer positions.
9151
9152 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9153 If nil, the region between START and END is replace by the encoded text.
9154 If buffer, the encoded text is inserted in that buffer after point (point
9155 does not move).
9156 In those cases, the length of the encoded text is returned.
9157 If DESTINATION is t, the encoded text is returned.
9158
9159 This function sets `last-coding-system-used' to the precise coding system
9160 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9161 not fully specified.)  */)
9162   (start, end, coding_system, destination)
9163      Lisp_Object start, end, coding_system, destination;
9164 {
9165   return code_convert_region (start, end, coding_system, destination, 1, 0);
9166 }
9167
9168 Lisp_Object
9169 code_convert_string (string, coding_system, dst_object,
9170                      encodep, nocopy, norecord)
9171      Lisp_Object string, coding_system, dst_object;
9172      int encodep, nocopy, norecord;
9173 {
9174   struct coding_system coding;
9175   EMACS_INT chars, bytes;
9176
9177   CHECK_STRING (string);
9178   if (NILP (coding_system))
9179     {
9180       if (! norecord)
9181         Vlast_coding_system_used = Qno_conversion;
9182       if (NILP (dst_object))
9183         return (nocopy ? Fcopy_sequence (string) : string);
9184     }
9185
9186   if (NILP (coding_system))
9187     coding_system = Qno_conversion;
9188   else
9189     CHECK_CODING_SYSTEM (coding_system);
9190   if (NILP (dst_object))
9191     dst_object = Qt;
9192   else if (! EQ (dst_object, Qt))
9193     CHECK_BUFFER (dst_object);
9194
9195   setup_coding_system (coding_system, &coding);
9196   coding.mode |= CODING_MODE_LAST_BLOCK;
9197   chars = SCHARS (string);
9198   bytes = SBYTES (string);
9199   if (encodep)
9200     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9201   else
9202     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9203   if (! norecord)
9204     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9205
9206   return (BUFFERP (dst_object)
9207           ? make_number (coding.produced_char)
9208           : coding.dst_object);
9209 }
9210
9211
9212 /* Encode or decode STRING according to CODING_SYSTEM.
9213    Do not set Vlast_coding_system_used.
9214
9215    This function is called only from macros DECODE_FILE and
9216    ENCODE_FILE, thus we ignore character composition.  */
9217
9218 Lisp_Object
9219 code_convert_string_norecord (string, coding_system, encodep)
9220      Lisp_Object string, coding_system;
9221      int encodep;
9222 {
9223   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9224 }
9225
9226
9227 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9228        2, 4, 0,
9229        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9230
9231 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9232 if the decoding operation is trivial.
9233
9234 Optional fourth arg BUFFER non-nil means that the decoded text is
9235 inserted in that buffer after point (point does not move).  In this
9236 case, the return value is the length of the decoded text.
9237
9238 This function sets `last-coding-system-used' to the precise coding system
9239 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9240 not fully specified.)  */)
9241   (string, coding_system, nocopy, buffer)
9242      Lisp_Object string, coding_system, nocopy, buffer;
9243 {
9244   return code_convert_string (string, coding_system, buffer,
9245                               0, ! NILP (nocopy), 0);
9246 }
9247
9248 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9249        2, 4, 0,
9250        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9251
9252 Optional third arg NOCOPY non-nil means it is OK to return STRING
9253 itself if the encoding operation is trivial.
9254
9255 Optional fourth arg BUFFER non-nil means that the encoded text is
9256 inserted in that buffer after point (point does not move).  In this
9257 case, the return value is the length of the encoded text.
9258
9259 This function sets `last-coding-system-used' to the precise coding system
9260 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9261 not fully specified.)  */)
9262      (string, coding_system, nocopy, buffer)
9263      Lisp_Object string, coding_system, nocopy, buffer;
9264 {
9265   return code_convert_string (string, coding_system, buffer,
9266                               1, ! NILP (nocopy), 1);
9267 }
9268
9269 \f
9270 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9271        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9272 Return the corresponding character.  */)
9273      (code)
9274      Lisp_Object code;
9275 {
9276   Lisp_Object spec, attrs, val;
9277   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9278   int c;
9279
9280   CHECK_NATNUM (code);
9281   c = XFASTINT (code);
9282   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9283   attrs = AREF (spec, 0);
9284
9285   if (ASCII_BYTE_P (c)
9286       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9287     return code;
9288
9289   val = CODING_ATTR_CHARSET_LIST (attrs);
9290   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9291   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9292   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9293
9294   if (c <= 0x7F)
9295     charset = charset_roman;
9296   else if (c >= 0xA0 && c < 0xDF)
9297     {
9298       charset = charset_kana;
9299       c -= 0x80;
9300     }
9301   else
9302     {
9303       int s1 = c >> 8, s2 = c & 0xFF;
9304
9305       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9306           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9307         error ("Invalid code: %d", code);
9308       SJIS_TO_JIS (c);
9309       charset = charset_kanji;
9310     }
9311   c = DECODE_CHAR (charset, c);
9312   if (c < 0)
9313     error ("Invalid code: %d", code);
9314   return make_number (c);
9315 }
9316
9317
9318 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9319        doc: /* Encode a Japanese character CH to shift_jis encoding.
9320 Return the corresponding code in SJIS.  */)
9321      (ch)
9322     Lisp_Object ch;
9323 {
9324   Lisp_Object spec, attrs, charset_list;
9325   int c;
9326   struct charset *charset;
9327   unsigned code;
9328
9329   CHECK_CHARACTER (ch);
9330   c = XFASTINT (ch);
9331   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9332   attrs = AREF (spec, 0);
9333
9334   if (ASCII_CHAR_P (c)
9335       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9336     return ch;
9337
9338   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9339   charset = char_charset (c, charset_list, &code);
9340   if (code == CHARSET_INVALID_CODE (charset))
9341     error ("Can't encode by shift_jis encoding: %d", c);
9342   JIS_TO_SJIS (code);
9343
9344   return make_number (code);
9345 }
9346
9347 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9348        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9349 Return the corresponding character.  */)
9350      (code)
9351      Lisp_Object code;
9352 {
9353   Lisp_Object spec, attrs, val;
9354   struct charset *charset_roman, *charset_big5, *charset;
9355   int c;
9356
9357   CHECK_NATNUM (code);
9358   c = XFASTINT (code);
9359   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9360   attrs = AREF (spec, 0);
9361
9362   if (ASCII_BYTE_P (c)
9363       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9364     return code;
9365
9366   val = CODING_ATTR_CHARSET_LIST (attrs);
9367   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9368   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9369
9370   if (c <= 0x7F)
9371     charset = charset_roman;
9372   else
9373     {
9374       int b1 = c >> 8, b2 = c & 0x7F;
9375       if (b1 < 0xA1 || b1 > 0xFE
9376           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9377         error ("Invalid code: %d", code);
9378       charset = charset_big5;
9379     }
9380   c = DECODE_CHAR (charset, (unsigned )c);
9381   if (c < 0)
9382     error ("Invalid code: %d", code);
9383   return make_number (c);
9384 }
9385
9386 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9387        doc: /* Encode the Big5 character CH to BIG5 coding system.
9388 Return the corresponding character code in Big5.  */)
9389      (ch)
9390      Lisp_Object ch;
9391 {
9392   Lisp_Object spec, attrs, charset_list;
9393   struct charset *charset;
9394   int c;
9395   unsigned code;
9396
9397   CHECK_CHARACTER (ch);
9398   c = XFASTINT (ch);
9399   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9400   attrs = AREF (spec, 0);
9401   if (ASCII_CHAR_P (c)
9402       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9403     return ch;
9404
9405   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9406   charset = char_charset (c, charset_list, &code);
9407   if (code == CHARSET_INVALID_CODE (charset))
9408     error ("Can't encode by Big5 encoding: %d", c);
9409
9410   return make_number (code);
9411 }
9412
9413 \f
9414 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9415        Sset_terminal_coding_system_internal, 1, 2, 0,
9416        doc: /* Internal use only.  */)
9417      (coding_system, terminal)
9418      Lisp_Object coding_system;
9419      Lisp_Object terminal;
9420 {
9421   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9422   CHECK_SYMBOL (coding_system);
9423   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9424   /* We had better not send unsafe characters to terminal.  */
9425   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9426   /* Character composition should be disabled.  */
9427   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9428   terminal_coding->src_multibyte = 1;
9429   terminal_coding->dst_multibyte = 0;
9430   return Qnil;
9431 }
9432
9433 DEFUN ("set-safe-terminal-coding-system-internal",
9434        Fset_safe_terminal_coding_system_internal,
9435        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9436        doc: /* Internal use only.  */)
9437      (coding_system)
9438      Lisp_Object coding_system;
9439 {
9440   CHECK_SYMBOL (coding_system);
9441   setup_coding_system (Fcheck_coding_system (coding_system),
9442                        &safe_terminal_coding);
9443   /* Character composition should be disabled.  */
9444   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9445   safe_terminal_coding.src_multibyte = 1;
9446   safe_terminal_coding.dst_multibyte = 0;
9447   return Qnil;
9448 }
9449
9450 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9451        Sterminal_coding_system, 0, 1, 0,
9452        doc: /* Return coding system specified for terminal output on the given terminal.
9453 TERMINAL may be a terminal object, a frame, or nil for the selected
9454 frame's terminal device.  */)
9455      (terminal)
9456      Lisp_Object terminal;
9457 {
9458   struct coding_system *terminal_coding
9459     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9460   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9461
9462   /* For backward compatibility, return nil if it is `undecided'. */
9463   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9464 }
9465
9466 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9467        Sset_keyboard_coding_system_internal, 1, 2, 0,
9468        doc: /* Internal use only.  */)
9469      (coding_system, terminal)
9470      Lisp_Object coding_system;
9471      Lisp_Object terminal;
9472 {
9473   struct terminal *t = get_terminal (terminal, 1);
9474   CHECK_SYMBOL (coding_system);
9475   if (NILP (coding_system))
9476     coding_system = Qno_conversion;
9477   else
9478     Fcheck_coding_system (coding_system);
9479   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9480   /* Character composition should be disabled.  */
9481   TERMINAL_KEYBOARD_CODING (t)->common_flags
9482     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9483   return Qnil;
9484 }
9485
9486 DEFUN ("keyboard-coding-system",
9487        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9488        doc: /* Return coding system specified for decoding keyboard input.  */)
9489      (terminal)
9490      Lisp_Object terminal;
9491 {
9492   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9493                          (get_terminal (terminal, 1))->id);
9494 }
9495
9496 \f
9497 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9498        Sfind_operation_coding_system,  1, MANY, 0,
9499        doc: /* Choose a coding system for an operation based on the target name.
9500 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9501 DECODING-SYSTEM is the coding system to use for decoding
9502 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9503 for encoding (in case OPERATION does encoding).
9504
9505 The first argument OPERATION specifies an I/O primitive:
9506   For file I/O, `insert-file-contents' or `write-region'.
9507   For process I/O, `call-process', `call-process-region', or `start-process'.
9508   For network I/O, `open-network-stream'.
9509
9510 The remaining arguments should be the same arguments that were passed
9511 to the primitive.  Depending on which primitive, one of those arguments
9512 is selected as the TARGET.  For example, if OPERATION does file I/O,
9513 whichever argument specifies the file name is TARGET.
9514
9515 TARGET has a meaning which depends on OPERATION:
9516   For file I/O, TARGET is a file name (except for the special case below).
9517   For process I/O, TARGET is a process name.
9518   For network I/O, TARGET is a service name or a port number.
9519
9520 This function looks up what is specified for TARGET in
9521 `file-coding-system-alist', `process-coding-system-alist',
9522 or `network-coding-system-alist' depending on OPERATION.
9523 They may specify a coding system, a cons of coding systems,
9524 or a function symbol to call.
9525 In the last case, we call the function with one argument,
9526 which is a list of all the arguments given to this function.
9527 If the function can't decide a coding system, it can return
9528 `undecided' so that the normal code-detection is performed.
9529
9530 If OPERATION is `insert-file-contents', the argument corresponding to
9531 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9532 file name to look up, and BUFFER is a buffer that contains the file's
9533 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9534 function to call for FILENAME, that function should examine the
9535 contents of BUFFER instead of reading the file.
9536
9537 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9538      (nargs, args)
9539      int nargs;
9540      Lisp_Object *args;
9541 {
9542   Lisp_Object operation, target_idx, target, val;
9543   register Lisp_Object chain;
9544
9545   if (nargs < 2)
9546     error ("Too few arguments");
9547   operation = args[0];
9548   if (!SYMBOLP (operation)
9549       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9550     error ("Invalid first argument");
9551   if (nargs < 1 + XINT (target_idx))
9552     error ("Too few arguments for operation: %s",
9553            SDATA (SYMBOL_NAME (operation)));
9554   target = args[XINT (target_idx) + 1];
9555   if (!(STRINGP (target)
9556         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9557             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9558         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9559     error ("Invalid %dth argument", XINT (target_idx) + 1);
9560   if (CONSP (target))
9561     target = XCAR (target);
9562
9563   chain = ((EQ (operation, Qinsert_file_contents)
9564             || EQ (operation, Qwrite_region))
9565            ? Vfile_coding_system_alist
9566            : (EQ (operation, Qopen_network_stream)
9567               ? Vnetwork_coding_system_alist
9568               : Vprocess_coding_system_alist));
9569   if (NILP (chain))
9570     return Qnil;
9571
9572   for (; CONSP (chain); chain = XCDR (chain))
9573     {
9574       Lisp_Object elt;
9575
9576       elt = XCAR (chain);
9577       if (CONSP (elt)
9578           && ((STRINGP (target)
9579                && STRINGP (XCAR (elt))
9580                && fast_string_match (XCAR (elt), target) >= 0)
9581               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9582         {
9583           val = XCDR (elt);
9584           /* Here, if VAL is both a valid coding system and a valid
9585              function symbol, we return VAL as a coding system.  */
9586           if (CONSP (val))
9587             return val;
9588           if (! SYMBOLP (val))
9589             return Qnil;
9590           if (! NILP (Fcoding_system_p (val)))
9591             return Fcons (val, val);
9592           if (! NILP (Ffboundp (val)))
9593             {
9594               /* We use call1 rather than safe_call1
9595                  so as to get bug reports about functions called here
9596                  which don't handle the current interface.  */
9597               val = call1 (val, Flist (nargs, args));
9598               if (CONSP (val))
9599                 return val;
9600               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9601                 return Fcons (val, val);
9602             }
9603           return Qnil;
9604         }
9605     }
9606   return Qnil;
9607 }
9608
9609 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9610        Sset_coding_system_priority, 0, MANY, 0,
9611        doc: /* Assign higher priority to the coding systems given as arguments.
9612 If multiple coding systems belong to the same category,
9613 all but the first one are ignored.
9614
9615 usage: (set-coding-system-priority &rest coding-systems)  */)
9616      (nargs, args)
9617      int nargs;
9618      Lisp_Object *args;
9619 {
9620   int i, j;
9621   int changed[coding_category_max];
9622   enum coding_category priorities[coding_category_max];
9623
9624   bzero (changed, sizeof changed);
9625
9626   for (i = j = 0; i < nargs; i++)
9627     {
9628       enum coding_category category;
9629       Lisp_Object spec, attrs;
9630
9631       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9632       attrs = AREF (spec, 0);
9633       category = XINT (CODING_ATTR_CATEGORY (attrs));
9634       if (changed[category])
9635         /* Ignore this coding system because a coding system of the
9636            same category already had a higher priority.  */
9637         continue;
9638       changed[category] = 1;
9639       priorities[j++] = category;
9640       if (coding_categories[category].id >= 0
9641           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9642         setup_coding_system (args[i], &coding_categories[category]);
9643       Fset (AREF (Vcoding_category_table, category), args[i]);
9644     }
9645
9646   /* Now we have decided top J priorities.  Reflect the order of the
9647      original priorities to the remaining priorities.  */
9648
9649   for (i = j, j = 0; i < coding_category_max; i++, j++)
9650     {
9651       while (j < coding_category_max
9652              && changed[coding_priorities[j]])
9653         j++;
9654       if (j == coding_category_max)
9655         abort ();
9656       priorities[i] = coding_priorities[j];
9657     }
9658
9659   bcopy (priorities, coding_priorities, sizeof priorities);
9660
9661   /* Update `coding-category-list'.  */
9662   Vcoding_category_list = Qnil;
9663   for (i = coding_category_max - 1; i >= 0; i--)
9664     Vcoding_category_list
9665       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9666                Vcoding_category_list);
9667
9668   return Qnil;
9669 }
9670
9671 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9672        Scoding_system_priority_list, 0, 1, 0,
9673        doc: /* Return a list of coding systems ordered by their priorities.
9674 The list contains a subset of coding systems; i.e. coding systems
9675 assigned to each coding category (see `coding-category-list').
9676
9677 HIGHESTP non-nil means just return the highest priority one.  */)
9678      (highestp)
9679      Lisp_Object highestp;
9680 {
9681   Lisp_Object result = Qnil;
9682   int rank, only_first = ! NILP (highestp);
9683
9684   /* Visit the categories in the order given by coding_priorities.  */
9685   for (rank = 0; rank < coding_category_max; rank++)
9686     {
9687       int coding_id = coding_categories[coding_priorities[rank]].id;
9688       Lisp_Object base;
9689
9690       if (coding_id < 0)
9691         continue;       /* Categories with a negative id are skipped.  */
9692       base = CODING_ATTR_BASE_NAME (CODING_ID_ATTRS (coding_id));
9693       if (only_first)
9694         return base;
9695       /* Collected in reverse; put back in order on return.  */
9696       result = Fcons (base, result);
9697     }
9698   return Fnreverse (result);
9699 }
9699
9700 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9701
9702 /* Return a vector of the three symbols BASE-unix, BASE-dos, BASE-mac.  */
9703 static Lisp_Object
9704 make_subsidiaries (base)
9705      Lisp_Object base;
9706 {
9707   Lisp_Object stem = SYMBOL_NAME (base), vec;
9708   int stem_bytes = SBYTES (stem), n;
9709   char *name = (char *) alloca (stem_bytes + 6); /* "-unix" + NUL fits.  */
9710
9711   bcopy (SDATA (stem), name, stem_bytes);
9712   vec = Fmake_vector (make_number (3), Qnil);
9713   for (n = 0; n < 3; n++)
9714     {
9715       bcopy (suffixes[n], name + stem_bytes, strlen (suffixes[n]) + 1);
9716       ASET (vec, n, intern (name));
9717     }
9718   return vec;
9719 }
9720
9721
9722 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9723        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9724        doc: /* For internal use only.
9725 usage: (define-coding-system-internal ...)  */)
9726      (nargs, args)
9727      int nargs;
9728      Lisp_Object *args;
9729 {
9730   Lisp_Object name;
9731   Lisp_Object spec_vec;         /* [ ATTRS ALIASES EOL_TYPE ] */
9732   Lisp_Object attrs;            /* Vector of attributes.  */
9733   Lisp_Object eol_type;
9734   Lisp_Object aliases;
9735   Lisp_Object coding_type, charset_list, safe_charsets;
9736   enum coding_category category;
9737   Lisp_Object tail, val;
9738   int max_charset_id = 0;       /* Largest charset ID in CHARSET_LIST.  */
9739   int i;
9740   /* Every coding type takes at least coding_arg_max arguments.  */
9741   if (nargs < coding_arg_max)
9742     goto short_args;
9743
9744   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9745
9746   name = args[coding_arg_name];
9747   CHECK_SYMBOL (name);
9748   CODING_ATTR_BASE_NAME (attrs) = name;
9749
9750   val = args[coding_arg_mnemonic];
9751   if (! STRINGP (val))
9752     CHECK_CHARACTER (val);
9753   CODING_ATTR_MNEMONIC (attrs) = val;
9754
9755   coding_type = args[coding_arg_coding_type];
9756   CHECK_SYMBOL (coding_type);
9757   CODING_ATTR_TYPE (attrs) = coding_type;
9758   /* Resolve the charset list and record the largest charset ID in it.  */
9759   charset_list = args[coding_arg_charset_list];
9760   if (SYMBOLP (charset_list))
9761     {
9762       if (EQ (charset_list, Qiso_2022))
9763         {
9764           if (! EQ (coding_type, Qiso_2022))
9765             error ("Invalid charset-list");
9766           charset_list = Viso_2022_charset_list;
9767         }
9768       else if (EQ (charset_list, Qemacs_mule))
9769         {
9770           if (! EQ (coding_type, Qemacs_mule))
9771             error ("Invalid charset-list");
9772           charset_list = Vemacs_mule_charset_list;
9773         }
9774       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9775         if (max_charset_id < XFASTINT (XCAR (tail)))
9776           max_charset_id = XFASTINT (XCAR (tail));
9777     }
9778   else
9779     {
9780       charset_list = Fcopy_sequence (charset_list);
9781       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9782         {
9783           struct charset *charset;
9784
9785           val = XCAR (tail);
9786           CHECK_CHARSET_GET_CHARSET (val, charset);
9787           if (EQ (coding_type, Qiso_2022)
9788               ? CHARSET_ISO_FINAL (charset) < 0
9789               : EQ (coding_type, Qemacs_mule)
9790               ? CHARSET_EMACS_MULE_ID (charset) < 0
9791               : 0)
9792             error ("Can't handle charset `%s'",
9793                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9794           /* Replace the charset in the copied list by its ID.  */
9795           XSETCAR (tail, make_number (charset->id));
9796           if (max_charset_id < charset->id)
9797             max_charset_id = charset->id;
9798         }
9799     }
9800   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9801   /* One byte per charset ID up to the largest: 0 if listed, else 255.  */
9802   safe_charsets = make_uninit_string (max_charset_id + 1);
9803   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9804   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9805     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9806   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9807
9808   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9809
9810   val = args[coding_arg_decode_translation_table];
9811   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9812     CHECK_SYMBOL (val);
9813   CODING_ATTR_DECODE_TBL (attrs) = val;
9814
9815   val = args[coding_arg_encode_translation_table];
9816   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9817     CHECK_SYMBOL (val);
9818   CODING_ATTR_ENCODE_TBL (attrs) = val;
9819
9820   val = args[coding_arg_post_read_conversion];
9821   CHECK_SYMBOL (val);
9822   CODING_ATTR_POST_READ (attrs) = val;
9823
9824   val = args[coding_arg_pre_write_conversion];
9825   CHECK_SYMBOL (val);
9826   CODING_ATTR_PRE_WRITE (attrs) = val;
9827
9828   val = args[coding_arg_default_char];
9829   if (NILP (val))
9830     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9831   else
9832     {
9833       CHECK_CHARACTER (val);
9834       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9835     }
9836
9837   val = args[coding_arg_for_unibyte];
9838   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9839
9840   val = args[coding_arg_plist];
9841   CHECK_LIST (val);
9842   CODING_ATTR_PLIST (attrs) = val;
9843   /* Set the type-specific attributes and determine CATEGORY.  */
9844   if (EQ (coding_type, Qcharset))
9845     {
9846       /* Generate a lisp vector of 256 elements.  Each element is nil,
9847          integer, or a list of charset IDs.
9848
9849          If Nth element is nil, the byte code N is invalid in this
9850          coding system.
9851
9852          If Nth element is a number NUM, N is the first byte of a
9853          charset whose ID is NUM.
9854
9855          If Nth element is a list of charset IDs, N is the first byte
9856          of one of them.  The list is sorted by dimensions of the
9857          charsets.  A charset of smaller dimension comes first. */
9858       val = Fmake_vector (make_number (256), Qnil);
9859
9860       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9861         {
9862           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9863           int dim = CHARSET_DIMENSION (charset);
9864           int idx = (dim - 1) * 4;
9865
9866           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9867             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9868
9869           for (i = charset->code_space[idx];
9870                i <= charset->code_space[idx + 1]; i++)
9871             {
9872               Lisp_Object tmp, tmp2;
9873               int dim2;
9874
9875               tmp = AREF (val, i);
9876               if (NILP (tmp))
9877                 tmp = XCAR (tail);
9878               else if (NUMBERP (tmp))
9879                 {
9880                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9881                   if (dim < dim2)
9882                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9883                   else
9884                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9885                 }
9886               else
9887                 {
9888                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9889                     {
9890                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9891                       if (dim < dim2)
9892                         break;
9893                     }
9894                   if (NILP (tmp2))
9895                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9896                   else
9897                     {
9898                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9899                       XSETCAR (tmp2, XCAR (tail));
9900                     }
9901                 }
9902               ASET (val, i, tmp);
9903             }
9904         }
9905       ASET (attrs, coding_attr_charset_valids, val);
9906       category = coding_category_charset;
9907     }
9908   else if (EQ (coding_type, Qccl))
9909     {
9910       Lisp_Object valids;
9911
9912       if (nargs < coding_arg_ccl_max)
9913         goto short_args;
9914
9915       val = args[coding_arg_ccl_decoder];
9916       CHECK_CCL_PROGRAM (val);
9917       if (VECTORP (val))
9918         val = Fcopy_sequence (val);
9919       ASET (attrs, coding_attr_ccl_decoder, val);
9920
9921       val = args[coding_arg_ccl_encoder];
9922       CHECK_CCL_PROGRAM (val);
9923       if (VECTORP (val))
9924         val = Fcopy_sequence (val);
9925       ASET (attrs, coding_attr_ccl_encoder, val);
9926       /* VALIDS is a 256-byte string; byte N is 1 if N is in the argument.  */
9927       val = args[coding_arg_ccl_valids];
9928       valids = Fmake_string (make_number (256), make_number (0));
9929       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9930         {
9931           int from, to;
9932
9933           val = Fcar (tail);
9934           if (INTEGERP (val))
9935             {
9936               from = to = XINT (val);
9937               if (from < 0 || from > 255)
9938                 args_out_of_range_3 (val, make_number (0), make_number (255));
9939             }
9940           else
9941             {
9942               CHECK_CONS (val);
9943               CHECK_NATNUM_CAR (val);
9944               CHECK_NATNUM_CDR (val);
9945               from = XINT (XCAR (val));
9946               if (from > 255)
9947                 args_out_of_range_3 (XCAR (val),
9948                                      make_number (0), make_number (255));
9949               to = XINT (XCDR (val));
9950               if (to < from || to > 255)
9951                 args_out_of_range_3 (XCDR (val),
9952                                      XCAR (val), make_number (255));
9953             }
9954           for (i = from; i <= to; i++)
9955             SSET (valids, i, 1);
9956         }
9957       ASET (attrs, coding_attr_ccl_valids, valids);
9958
9959       category = coding_category_ccl;
9960     }
9961   else if (EQ (coding_type, Qutf_16))
9962     {
9963       Lisp_Object bom, endian;
9964
9965       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9966
9967       if (nargs < coding_arg_utf16_max)
9968         goto short_args;
9969       /* BOM is nil, t, or a cons of two coding systems.  */
9970       bom = args[coding_arg_utf16_bom];
9971       if (! NILP (bom) && ! EQ (bom, Qt))
9972         {
9973           CHECK_CONS (bom);
9974           val = XCAR (bom);
9975           CHECK_CODING_SYSTEM (val);
9976           val = XCDR (bom);
9977           CHECK_CODING_SYSTEM (val);
9978         }
9979       ASET (attrs, coding_attr_utf_bom, bom);
9980
9981       endian = args[coding_arg_utf16_endian];
9982       CHECK_SYMBOL (endian);
9983       if (NILP (endian))
9984         endian = Qbig;
9985       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9986         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9987       ASET (attrs, coding_attr_utf_16_endian, endian);
9988
9989       category = (CONSP (bom)
9990                   ? coding_category_utf_16_auto
9991                   : NILP (bom)
9992                   ? (EQ (endian, Qbig)
9993                      ? coding_category_utf_16_be_nosig
9994                      : coding_category_utf_16_le_nosig)
9995                   : (EQ (endian, Qbig)
9996                      ? coding_category_utf_16_be
9997                      : coding_category_utf_16_le));
9998     }
9999   else if (EQ (coding_type, Qiso_2022))
10000     {
10001       Lisp_Object initial, reg_usage, request, flags;
10002       int i;                    /* Shadows the function-level I.  */
10003
10004       if (nargs < coding_arg_iso2022_max)
10005         goto short_args;
10006       /* In the copy of INITIAL, charsets become IDs and nil becomes -1.  */
10007       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10008       CHECK_VECTOR (initial);
10009       for (i = 0; i < 4; i++)
10010         {
10011           val = Faref (initial, make_number (i));
10012           if (! NILP (val))
10013             {
10014               struct charset *charset;
10015
10016               CHECK_CHARSET_GET_CHARSET (val, charset);
10017               ASET (initial, i, make_number (CHARSET_ID (charset)));
10018               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10019                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10020             }
10021           else
10022             ASET (initial, i, make_number (-1));
10023         }
10024
10025       reg_usage = args[coding_arg_iso2022_reg_usage];
10026       CHECK_CONS (reg_usage);
10027       CHECK_NUMBER_CAR (reg_usage);
10028       CHECK_NUMBER_CDR (reg_usage);
10029       /* Each element of REQUEST is (CHARSET . REGISTER), REGISTER below 4.  */
10030       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10031       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
10032         {
10033           int id;
10034           Lisp_Object tmp;
10035
10036           val = Fcar (tail);
10037           CHECK_CONS (val);
10038           tmp = XCAR (val);
10039           CHECK_CHARSET_GET_ID (tmp, id);
10040           CHECK_NATNUM_CDR (val);
10041           if (XINT (XCDR (val)) >= 4)
10042             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
10043           XSETCAR (val, make_number (id));
10044         }
10045
10046       flags = args[coding_arg_iso2022_flags];
10047       CHECK_NATNUM (flags);
10048       i = XINT (flags);         /* I keeps the flags as passed in.  */
10049       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10050         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
10051
10052       ASET (attrs, coding_attr_iso_initial, initial);
10053       ASET (attrs, coding_attr_iso_usage, reg_usage);
10054       ASET (attrs, coding_attr_iso_request, request);
10055       ASET (attrs, coding_attr_iso_flags, flags);
10056       setup_iso_safe_charsets (attrs);
10057
10058       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10059         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10060                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10061                     ? coding_category_iso_7_else
10062                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10063                     ? coding_category_iso_7
10064                     : coding_category_iso_7_tight);
10065       else
10066         {
10067           int id = XINT (AREF (initial, 1));
10068
10069           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10070                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10071                        || id < 0)
10072                       ? coding_category_iso_8_else
10073                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10074                       ? coding_category_iso_8_1
10075                       : coding_category_iso_8_2);
10076         }
10077       if (category != coding_category_iso_8_1
10078           && category != coding_category_iso_8_2)
10079         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
10080     }
10081   else if (EQ (coding_type, Qemacs_mule))
10082     {
10083       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10084         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10085       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10086       category = coding_category_emacs_mule;
10087     }
10088   else if (EQ (coding_type, Qshift_jis))
10089     {
10090       /* Two charsets of dimension 1, then one or two of dimension 2.  */
10091       struct charset *charset;
10092
10093       if (XINT (Flength (charset_list)) != 3
10094           && XINT (Flength (charset_list)) != 4)
10095         error ("There should be three or four charsets");
10096
10097       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10098       if (CHARSET_DIMENSION (charset) != 1)
10099         error ("Dimension of charset %s is not one",
10100                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10101       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10102         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10103       /* Only the local CHARSET_LIST advances; ATTRS keeps the whole list.  */
10104       charset_list = XCDR (charset_list);
10105       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10106       if (CHARSET_DIMENSION (charset) != 1)
10107         error ("Dimension of charset %s is not one",
10108                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10109
10110       charset_list = XCDR (charset_list);
10111       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10112       if (CHARSET_DIMENSION (charset) != 2)
10113         error ("Dimension of charset %s is not two",
10114                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10115
10116       charset_list = XCDR (charset_list);
10117       if (! NILP (charset_list))
10118         {
10119           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10120           if (CHARSET_DIMENSION (charset) != 2)
10121             error ("Dimension of charset %s is not two",
10122                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10123         }
10124
10125       category = coding_category_sjis;
10126       Vsjis_coding_system = name;
10127     }
10128   else if (EQ (coding_type, Qbig5))
10129     {
10130       struct charset *charset;
10131       /* One charset of dimension 1 followed by one of dimension 2.  */
10132       if (XINT (Flength (charset_list)) != 2)
10133         error ("There should be just two charsets");
10134
10135       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10136       if (CHARSET_DIMENSION (charset) != 1)
10137         error ("Dimension of charset %s is not one",
10138                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10139       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10140         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10141
10142       charset_list = XCDR (charset_list);
10143       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10144       if (CHARSET_DIMENSION (charset) != 2)
10145         error ("Dimension of charset %s is not two",
10146                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10147
10148       category = coding_category_big5;
10149       Vbig5_coding_system = name;
10150     }
10151   else if (EQ (coding_type, Qraw_text))
10152     {
10153       category = coding_category_raw_text;
10154       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10155     }
10156   else if (EQ (coding_type, Qutf_8))
10157     {
10158       Lisp_Object bom;
10159
10160       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10161
10162       if (nargs < coding_arg_utf8_max)
10163         goto short_args;
10164       /* BOM is nil, t, or a cons of two coding systems.  */
10165       bom = args[coding_arg_utf8_bom];
10166       if (! NILP (bom) && ! EQ (bom, Qt))
10167         {
10168           CHECK_CONS (bom);
10169           val = XCAR (bom);
10170           CHECK_CODING_SYSTEM (val);
10171           val = XCDR (bom);
10172           CHECK_CODING_SYSTEM (val);
10173         }
10174       ASET (attrs, coding_attr_utf_bom, bom);
10175
10176       category = (CONSP (bom) ? coding_category_utf_8_auto
10177                   : NILP (bom) ? coding_category_utf_8_nosig
10178                   : coding_category_utf_8_sig);
10179     }
10180   else if (EQ (coding_type, Qundecided))
10181     category = coding_category_undecided;
10182   else
10183     error ("Invalid coding system type: %s",
10184            SDATA (SYMBOL_NAME (coding_type)));
10185   /* Record the category and ASCII compatibility in ATTRS and its plist.  */
10186   CODING_ATTR_CATEGORY (attrs) = make_number (category);
10187   CODING_ATTR_PLIST (attrs)
10188     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10189                                 CODING_ATTR_PLIST (attrs)));
10190   CODING_ATTR_PLIST (attrs)
10191     = Fcons (QCascii_compatible_p,
10192              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10193                     CODING_ATTR_PLIST (attrs)));
10194   /* The eol-type argument must be nil, unix, dos or mac.  */
10195   eol_type = args[coding_arg_eol_type];
10196   if (! NILP (eol_type)
10197       && ! EQ (eol_type, Qunix)
10198       && ! EQ (eol_type, Qdos)
10199       && ! EQ (eol_type, Qmac))
10200     error ("Invalid eol-type");
10201
10202   aliases = Fcons (name, Qnil);
10203   /* A nil eol-type becomes a vector of three subsidiaries, each registered.  */
10204   if (NILP (eol_type))
10205     {
10206       eol_type = make_subsidiaries (name);
10207       for (i = 0; i < 3; i++)
10208         {
10209           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10210
10211           this_name = AREF (eol_type, i);
10212           this_aliases = Fcons (this_name, Qnil);
10213           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10214           this_spec = Fmake_vector (make_number (3), attrs);
10215           ASET (this_spec, 1, this_aliases);
10216           ASET (this_spec, 2, this_eol_type);
10217           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10218           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10219           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10220           if (NILP (val))
10221             Vcoding_system_alist
10222               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10223                        Vcoding_system_alist);
10224         }
10225     }
10226   /* Register the coding system itself.  */
10227   spec_vec = Fmake_vector (make_number (3), attrs);
10228   ASET (spec_vec, 1, aliases);
10229   ASET (spec_vec, 2, eol_type);
10230
10231   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10232   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10233   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10234   if (NILP (val))
10235     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10236                                   Vcoding_system_alist);
10237   /* Set up the category's entry unless it holds another coding system.  */
10238   {
10239     int id = coding_categories[category].id;
10240
10241     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10242       setup_coding_system (name, &coding_categories[category]);
10243   }
10244
10245   return Qnil;
10246
10247  short_args:                    /* Too few arguments for the coding type.  */
10248   return Fsignal (Qwrong_number_of_arguments,
10249                   Fcons (intern ("define-coding-system-internal"),
10250                          make_number (nargs)));
10251 }
10252
10253
10254 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10255        3, 3, 0,
10256        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10257   (coding_system, prop, val)
10258      Lisp_Object coding_system, prop, val;
10259 {
10260   Lisp_Object spec_vec, attr_vec;
10261
10262   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec_vec);
10263   attr_vec = AREF (spec_vec, 0);
10264
10265   /* Properties that have a slot in the attribute vector are checked
10266      and stored there first; every property then goes into the plist.  */
10267   if (EQ (prop, QCmnemonic))
10268     {
10269       if (! STRINGP (val))
10270         CHECK_CHARACTER (val);
10271       CODING_ATTR_MNEMONIC (attr_vec) = val;
10272     }
10273   else if (EQ (prop, QCdefault_char))
10274     {
10275       /* A nil default character is replaced by a space.  */
10276       if (NILP (val))
10277         val = make_number (' ');
10278       else
10279         CHECK_CHARACTER (val);
10280       CODING_ATTR_DEFAULT_CHAR (attr_vec) = val;
10281     }
10282   else if (EQ (prop, QCdecode_translation_table)
10283            || EQ (prop, QCencode_translation_table))
10284     {
10285       /* VAL must be a char-table, a cons, or a symbol.  */
10286       if (! (CHAR_TABLE_P (val) || CONSP (val)))
10287         CHECK_SYMBOL (val);
10288       if (EQ (prop, QCdecode_translation_table))
10289         CODING_ATTR_DECODE_TBL (attr_vec) = val;
10290       else
10291         CODING_ATTR_ENCODE_TBL (attr_vec) = val;
10292     }
10293   else if (EQ (prop, QCpost_read_conversion)
10294            || EQ (prop, QCpre_write_conversion))
10295     {
10296       CHECK_SYMBOL (val);
10297       if (EQ (prop, QCpost_read_conversion))
10298         CODING_ATTR_POST_READ (attr_vec) = val;
10299       else
10300         CODING_ATTR_PRE_WRITE (attr_vec) = val;
10301     }
10302   else if (EQ (prop, QCascii_compatible_p))
10303     CODING_ATTR_ASCII_COMPAT (attr_vec) = val;
10304
10305   CODING_ATTR_PLIST (attr_vec)
10306     = Fplist_put (CODING_ATTR_PLIST (attr_vec), prop, val);
10307   return val;
10308 }
10309
10310
10311 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10312        Sdefine_coding_system_alias, 2, 2, 0,
10313        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10314      (alias, coding_system)
10315      Lisp_Object alias, coding_system;
10316 {
10317   Lisp_Object spec, last, eol, alias_name;
10318   int k;
10319
10320   CHECK_SYMBOL (alias);
10321   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10322
10323   /* The alias list is never empty (its head is the base coding
10324      system), so walk to its last cons and hang ALIAS there.  */
10325   for (last = AREF (spec, 1); ! NILP (XCDR (last)); last = XCDR (last))
10326     ;
10327   XSETCDR (last, Fcons (alias, Qnil));
10328
10329   /* If the eol-type is a vector of subsidiaries, define the matching
10330      subsidiary of ALIAS as an alias of each of them too.  */
10331   eol = AREF (spec, 2);
10332   if (VECTORP (eol))
10333     {
10334       Lisp_Object subs = make_subsidiaries (alias);
10335
10336       for (k = 0; k < 3; k++)
10337         Fdefine_coding_system_alias (AREF (subs, k), AREF (eol, k));
10338     }
10339
10340   /* Map ALIAS to the same spec and add it to the coding system lists.  */
10341   Fputhash (alias, spec, Vcoding_system_hash_table);
10342   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10343   alias_name = Fsymbol_name (alias);
10344   if (NILP (Fassoc (alias_name, Vcoding_system_alist)))
10345     Vcoding_system_alist
10346       = Fcons (Fcons (alias_name, Qnil), Vcoding_system_alist);
10347
10348   return Qnil;
10349 }
10350
10351 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10352        1, 1, 0,
10353        doc: /* Return the base of CODING-SYSTEM.
10354 Any alias or subsidiary coding system is not a base coding system.  */)
10355   (coding_system)
10356      Lisp_Object coding_system;
10357 {
10358   Lisp_Object spec_vec;
10359
10360   if (NILP (coding_system))
10361     return Qno_conversion;      /* nil is answered with no-conversion.  */
10362   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec_vec);
10363   /* The base name is read from the attribute vector, slot 0 of the spec.  */
10364   return CODING_ATTR_BASE_NAME (AREF (spec_vec, 0));
10365 }
10366
10367 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10368        1, 1, 0,
10369        doc: /* Return the property list of CODING-SYSTEM.  */)
10370      (coding_system)
10371      Lisp_Object coding_system;
10372 {
10373   Lisp_Object spec, attrs;
10374
10375   if (NILP (coding_system))
10376     coding_system = Qno_conversion;     /* nil is looked up as no-conversion.  */
10377   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10378   attrs = AREF (spec, 0);               /* Slot 0 is the attribute vector.  */
10379   return CODING_ATTR_PLIST (attrs);
10380 }
10381
10382
10383 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10384        1, 1, 0,
10385        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10386      (coding_system)
10387      Lisp_Object coding_system;
10388 {
10389   Lisp_Object spec_vec;
10390   /* A nil argument is looked up as no-conversion.  */
10391   Lisp_Object target = NILP (coding_system) ? Qno_conversion : coding_system;
10392
10393   CHECK_CODING_SYSTEM_GET_SPEC (target, spec_vec);
10394   return AREF (spec_vec, 1);    /* Slot 1 of the spec is the alias list.  */
10395 }
10396
10397 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10398        Scoding_system_eol_type, 1, 1, 0,
10399        doc: /* Return eol-type of CODING-SYSTEM.
10400 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10401
10402 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10403 and CR respectively.
10404
10405 A vector value indicates that a format of end-of-line should be
10406 detected automatically.  Nth element of the vector is the subsidiary
10407 coding system whose eol-type is N.  */)
10408      (coding_system)
10409      Lisp_Object coding_system;
10410 {
10411   Lisp_Object eol;
10412
10413   if (NILP (coding_system))
10414     coding_system = Qno_conversion;
10415   /* Anything that is not a coding system yields nil, not an error.  */
10416   if (! CODING_SYSTEM_P (coding_system))
10417     return Qnil;
10418   eol = AREF (CODING_SYSTEM_SPEC (coding_system), 2);
10419   if (VECTORP (eol))
10420     return Fcopy_sequence (eol);        /* A copy, not the stored vector.  */
10421   if (EQ (eol, Qunix))
10422     return make_number (0);
10423   return make_number (EQ (eol, Qdos) ? 1 : 2);
10424 }
10425
10426 #endif /* emacs */
10427
10428 \f
10429 /*** 12. Postamble ***/
10430
10431 void
10432 init_coding_once ()
10433 {
10434   int c;
10435
10436   /* No category has a coding system yet; priorities start in enum order.  */
10437   for (c = 0; c < coding_category_max; c++)
10438     {
10439       coding_priorities[c] = c;
10440       coding_categories[c].id = -1;
10441     }
10442
10443   /* Per-byte defaults: ISO2022 class by range, emacs-mule length 1.  */
10444   for (c = 0; c < 256; c++)
10445     {
10446       if (c == 0x20 || c == 0x7F)
10447         iso_code_class[c] = ISO_0x20_or_0x7F;
10448       else if (c == 0xA0 || c == 0xFF)
10449         iso_code_class[c] = ISO_0xA0_or_0xFF;
10450       else if (c < 0x80)
10451         iso_code_class[c] = c < 0x20 ? ISO_control_0 : ISO_graphic_plane_0;
10452       else
10453         iso_code_class[c] = c < 0xA0 ? ISO_control_1 : ISO_graphic_plane_1;
10454       emacs_mule_bytes[c] = 1;
10455     }
10456
10457   /* Then the entries for specific codes.  */
10458   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10459   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10460   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10461   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10462   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10463   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10464   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10465   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10466   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10467   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10468   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10469 }
10470
10471 #ifdef emacs
10472
10473 void
10474 syms_of_coding ()
10475 {
10476   staticpro (&Vcoding_system_hash_table);
10477   {
10478     Lisp_Object args[2];
10479     args[0] = QCtest;
10480     args[1] = Qeq;
10481     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10482   }
10483
10484   staticpro (&Vsjis_coding_system);
10485   Vsjis_coding_system = Qnil;
10486
10487   staticpro (&Vbig5_coding_system);
10488   Vbig5_coding_system = Qnil;
10489
10490   staticpro (&Vcode_conversion_reused_workbuf);
10491   Vcode_conversion_reused_workbuf = Qnil;
10492
10493   staticpro (&Vcode_conversion_workbuf_name);
10494   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10495
10496   reused_workbuf_in_use = 0;
10497
10498   DEFSYM (Qcharset, "charset");
10499   DEFSYM (Qtarget_idx, "target-idx");
10500   DEFSYM (Qcoding_system_history, "coding-system-history");
10501   Fset (Qcoding_system_history, Qnil);
10502
10503   /* Target FILENAME is the first argument.  */
10504   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10505   /* Target FILENAME is the third argument.  */
10506   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10507
10508   DEFSYM (Qcall_process, "call-process");
10509   /* Target PROGRAM is the first argument.  */
10510   Fput (Qcall_process, Qtarget_idx, make_number (0));
10511
10512   DEFSYM (Qcall_process_region, "call-process-region");
10513   /* Target PROGRAM is the third argument.  */
10514   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10515
10516   DEFSYM (Qstart_process, "start-process");
10517   /* Target PROGRAM is the third argument.  */
10518   Fput (Qstart_process, Qtarget_idx, make_number (2));
10519
10520   DEFSYM (Qopen_network_stream, "open-network-stream");
10521   /* Target SERVICE is the fourth argument.  */
10522   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10523
10524   DEFSYM (Qcoding_system, "coding-system");
10525   DEFSYM (Qcoding_aliases, "coding-aliases");
10526
10527   DEFSYM (Qeol_type, "eol-type");
10528   DEFSYM (Qunix, "unix");
10529   DEFSYM (Qdos, "dos");
10530
10531   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10532   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10533   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10534   DEFSYM (Qdefault_char, "default-char");
10535   DEFSYM (Qundecided, "undecided");
10536   DEFSYM (Qno_conversion, "no-conversion");
10537   DEFSYM (Qraw_text, "raw-text");
10538
10539   DEFSYM (Qiso_2022, "iso-2022");
10540
10541   DEFSYM (Qutf_8, "utf-8");
10542   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10543
10544   DEFSYM (Qutf_16, "utf-16");
10545   DEFSYM (Qbig, "big");
10546   DEFSYM (Qlittle, "little");
10547
10548   DEFSYM (Qshift_jis, "shift-jis");
10549   DEFSYM (Qbig5, "big5");
10550
10551   DEFSYM (Qcoding_system_p, "coding-system-p");
10552
10553   DEFSYM (Qcoding_system_error, "coding-system-error");
10554   Fput (Qcoding_system_error, Qerror_conditions,
10555         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10556   Fput (Qcoding_system_error, Qerror_message,
10557         make_pure_c_string ("Invalid coding system"));
10558
10559   /* Intern this now in case it isn't already done.
10560      Setting this variable twice is harmless.
10561      But don't staticpro it here--that is done in alloc.c.  */
10562   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10563
10564   DEFSYM (Qtranslation_table, "translation-table");
10565   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10566   DEFSYM (Qtranslation_table_id, "translation-table-id");
10567   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10568   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10569
10570   DEFSYM (Qvalid_codes, "valid-codes");
10571
10572   DEFSYM (Qemacs_mule, "emacs-mule");
10573
10574   DEFSYM (QCcategory, ":category");
10575   DEFSYM (QCmnemonic, ":mnemonic");
10576   DEFSYM (QCdefault_char, ":default-char");
10577   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10578   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10579   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10580   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10581   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10582
10583   Vcoding_category_table
10584     = Fmake_vector (make_number (coding_category_max), Qnil);
10585   staticpro (&Vcoding_category_table);
10586   /* Followings are target of code detection.  */
10587   ASET (Vcoding_category_table, coding_category_iso_7,
10588         intern_c_string ("coding-category-iso-7"));
10589   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10590         intern_c_string ("coding-category-iso-7-tight"));
10591   ASET (Vcoding_category_table, coding_category_iso_8_1,
10592         intern_c_string ("coding-category-iso-8-1"));
10593   ASET (Vcoding_category_table, coding_category_iso_8_2,
10594         intern_c_string ("coding-category-iso-8-2"));
10595   ASET (Vcoding_category_table, coding_category_iso_7_else,
10596         intern_c_string ("coding-category-iso-7-else"));
10597   ASET (Vcoding_category_table, coding_category_iso_8_else,
10598         intern_c_string ("coding-category-iso-8-else"));
10599   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10600         intern_c_string ("coding-category-utf-8-auto"));
10601   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10602         intern_c_string ("coding-category-utf-8"));
10603   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10604         intern_c_string ("coding-category-utf-8-sig"));
10605   ASET (Vcoding_category_table, coding_category_utf_16_be,
10606         intern_c_string ("coding-category-utf-16-be"));
10607   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10608         intern_c_string ("coding-category-utf-16-auto"));
10609   ASET (Vcoding_category_table, coding_category_utf_16_le,
10610         intern_c_string ("coding-category-utf-16-le"));
10611   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10612         intern_c_string ("coding-category-utf-16-be-nosig"));
10613   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10614         intern_c_string ("coding-category-utf-16-le-nosig"));
10615   ASET (Vcoding_category_table, coding_category_charset,
10616         intern_c_string ("coding-category-charset"));
10617   ASET (Vcoding_category_table, coding_category_sjis,
10618         intern_c_string ("coding-category-sjis"));
10619   ASET (Vcoding_category_table, coding_category_big5,
10620         intern_c_string ("coding-category-big5"));
10621   ASET (Vcoding_category_table, coding_category_ccl,
10622         intern_c_string ("coding-category-ccl"));
10623   ASET (Vcoding_category_table, coding_category_emacs_mule,
10624         intern_c_string ("coding-category-emacs-mule"));
10625   /* Followings are NOT target of code detection.  */
10626   ASET (Vcoding_category_table, coding_category_raw_text,
10627         intern_c_string ("coding-category-raw-text"));
10628   ASET (Vcoding_category_table, coding_category_undecided,
10629         intern_c_string ("coding-category-undecided"));
10630
10631   DEFSYM (Qinsufficient_source, "insufficient-source");
10632   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10633   DEFSYM (Qinvalid_source, "invalid-source");
10634   DEFSYM (Qinterrupted, "interrupted");
10635   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10636   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10637
10638   defsubr (&Scoding_system_p);
10639   defsubr (&Sread_coding_system);
10640   defsubr (&Sread_non_nil_coding_system);
10641   defsubr (&Scheck_coding_system);
10642   defsubr (&Sdetect_coding_region);
10643   defsubr (&Sdetect_coding_string);
10644   defsubr (&Sfind_coding_systems_region_internal);
10645   defsubr (&Sunencodable_char_position);
10646   defsubr (&Scheck_coding_systems_region);
10647   defsubr (&Sdecode_coding_region);
10648   defsubr (&Sencode_coding_region);
10649   defsubr (&Sdecode_coding_string);
10650   defsubr (&Sencode_coding_string);
10651   defsubr (&Sdecode_sjis_char);
10652   defsubr (&Sencode_sjis_char);
10653   defsubr (&Sdecode_big5_char);
10654   defsubr (&Sencode_big5_char);
10655   defsubr (&Sset_terminal_coding_system_internal);
10656   defsubr (&Sset_safe_terminal_coding_system_internal);
10657   defsubr (&Sterminal_coding_system);
10658   defsubr (&Sset_keyboard_coding_system_internal);
10659   defsubr (&Skeyboard_coding_system);
10660   defsubr (&Sfind_operation_coding_system);
10661   defsubr (&Sset_coding_system_priority);
10662   defsubr (&Sdefine_coding_system_internal);
10663   defsubr (&Sdefine_coding_system_alias);
10664   defsubr (&Scoding_system_put);
10665   defsubr (&Scoding_system_base);
10666   defsubr (&Scoding_system_plist);
10667   defsubr (&Scoding_system_aliases);
10668   defsubr (&Scoding_system_eol_type);
10669   defsubr (&Scoding_system_priority_list);
10670
10671   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10672                doc: /* List of coding systems.
10673
10674 Do not alter the value of this variable manually.  This variable should be
10675 updated by the functions `define-coding-system' and
10676 `define-coding-system-alias'.  */);
10677   Vcoding_system_list = Qnil;
10678
10679   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10680                doc: /* Alist of coding system names.
10681 Each element is one element list of coding system name.
10682 This variable is given to `completing-read' as COLLECTION argument.
10683
10684 Do not alter the value of this variable manually.  This variable should be
10685 updated by the functions `make-coding-system' and
10686 `define-coding-system-alias'.  */);
10687   Vcoding_system_alist = Qnil;
10688
10689   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10690                doc: /* List of coding-categories (symbols) ordered by priority.
10691
10692 On detecting a coding system, Emacs tries code detection algorithms
10693 associated with each coding-category one by one in this order.  When
10694 one algorithm agrees with a byte sequence of source text, the coding
10695 system bound to the corresponding coding-category is selected.
10696
10697 Don't modify this variable directly, but use `set-coding-priority'.  */);
10698   {
10699     int i;
10700
10701     Vcoding_category_list = Qnil;
10702     for (i = coding_category_max - 1; i >= 0; i--)
10703       Vcoding_category_list
10704         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10705                  Vcoding_category_list);
10706   }
10707
10708   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10709                doc: /* Specify the coding system for read operations.
10710 It is useful to bind this variable with `let', but do not set it globally.
10711 If the value is a coding system, it is used for decoding on read operation.
10712 If not, an appropriate element is used from one of the coding system alists.
10713 There are three such tables: `file-coding-system-alist',
10714 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10715   Vcoding_system_for_read = Qnil;
10716
10717   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10718                doc: /* Specify the coding system for write operations.
10719 Programs bind this variable with `let', but you should not set it globally.
10720 If the value is a coding system, it is used for encoding of output,
10721 when writing it to a file and when sending it to a file or subprocess.
10722
10723 If this does not specify a coding system, an appropriate element
10724 is used from one of the coding system alists.
10725 There are three such tables: `file-coding-system-alist',
10726 `process-coding-system-alist', and `network-coding-system-alist'.
10727 For output to files, if the above procedure does not specify a coding system,
10728 the value of `buffer-file-coding-system' is used.  */);
10729   Vcoding_system_for_write = Qnil;
10730
10731   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10732                doc: /*
10733 Coding system used in the latest file or process I/O.  */);
10734   Vlast_coding_system_used = Qnil;
10735
10736   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10737                doc: /*
10738 Error status of the last code conversion.
10739
10740 When an error was detected in the last code conversion, this variable
10741 is set to one of the following symbols.
10742   `insufficient-source'
10743   `inconsistent-eol'
10744   `invalid-source'
10745   `interrupted'
10746   `insufficient-memory'
10747 When no error was detected, the value doesn't change.  So, to check
10748 the error status of a code conversion by this variable, you must
10749 explicitly set this variable to nil before performing code
10750 conversion.  */);
10751   Vlast_code_conversion_error = Qnil;
10752
10753   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10754                doc: /*
10755 *Non-nil means always inhibit code conversion of end-of-line format.
10756 See info node `Coding Systems' and info node `Text and Binary' concerning
10757 such conversion.  */);
10758   inhibit_eol_conversion = 0;
10759
10760   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10761                doc: /*
10762 Non-nil means process buffer inherits coding system of process output.
10763 Bind it to t if the process output is to be treated as if it were a file
10764 read from some filesystem.  */);
10765   inherit_process_coding_system = 0;
10766
10767   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10768                doc: /*
10769 Alist to decide a coding system to use for a file I/O operation.
10770 The format is ((PATTERN . VAL) ...),
10771 where PATTERN is a regular expression matching a file name,
10772 VAL is a coding system, a cons of coding systems, or a function symbol.
10773 If VAL is a coding system, it is used for both decoding and encoding
10774 the file contents.
10775 If VAL is a cons of coding systems, the car part is used for decoding,
10776 and the cdr part is used for encoding.
10777 If VAL is a function symbol, the function must return a coding system
10778 or a cons of coding systems which are used as above.  The function is
10779 called with an argument that is a list of the arguments with which
10780 `find-operation-coding-system' was called.  If the function can't decide
10781 a coding system, it can return `undecided' so that the normal
10782 code-detection is performed.
10783
10784 See also the function `find-operation-coding-system'
10785 and the variable `auto-coding-alist'.  */);
10786   Vfile_coding_system_alist = Qnil;
10787
10788   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10789                doc: /*
10790 Alist to decide a coding system to use for a process I/O operation.
10791 The format is ((PATTERN . VAL) ...),
10792 where PATTERN is a regular expression matching a program name,
10793 VAL is a coding system, a cons of coding systems, or a function symbol.
10794 If VAL is a coding system, it is used for both decoding what received
10795 from the program and encoding what sent to the program.
10796 If VAL is a cons of coding systems, the car part is used for decoding,
10797 and the cdr part is used for encoding.
10798 If VAL is a function symbol, the function must return a coding system
10799 or a cons of coding systems which are used as above.
10800
10801 See also the function `find-operation-coding-system'.  */);
10802   Vprocess_coding_system_alist = Qnil;
10803
10804   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10805                doc: /*
10806 Alist to decide a coding system to use for a network I/O operation.
10807 The format is ((PATTERN . VAL) ...),
10808 where PATTERN is a regular expression matching a network service name
10809 or is a port number to connect to,
10810 VAL is a coding system, a cons of coding systems, or a function symbol.
10811 If VAL is a coding system, it is used for both decoding what received
10812 from the network stream and encoding what sent to the network stream.
10813 If VAL is a cons of coding systems, the car part is used for decoding,
10814 and the cdr part is used for encoding.
10815 If VAL is a function symbol, the function must return a coding system
10816 or a cons of coding systems which are used as above.
10817
10818 See also the function `find-operation-coding-system'.  */);
10819   Vnetwork_coding_system_alist = Qnil;
10820
10821   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10822                doc: /* Coding system to use with system messages.
10823 Also used for decoding keyboard input on X Window system.  */);
10824   Vlocale_coding_system = Qnil;
10825
10826   /* The eol mnemonics are reset in startup.el system-dependently.  */
10827   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10828                doc: /*
10829 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10830   eol_mnemonic_unix = make_pure_c_string (":");
10831
10832   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10833                doc: /*
10834 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10835   eol_mnemonic_dos = make_pure_c_string ("\\");
10836
10837   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10838                doc: /*
10839 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10840   eol_mnemonic_mac = make_pure_c_string ("/");
10841
10842   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10843                doc: /*
10844 *String displayed in mode line when end-of-line format is not yet determined.  */);
10845   eol_mnemonic_undecided = make_pure_c_string (":");
10846
10847   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10848                doc: /*
10849 *Non-nil enables character translation while encoding and decoding.  */);
10850   Venable_character_translation = Qt;
10851
10852   DEFVAR_LISP ("standard-translation-table-for-decode",
10853                &Vstandard_translation_table_for_decode,
10854                doc: /* Table for translating characters while decoding.  */);
10855   Vstandard_translation_table_for_decode = Qnil;
10856
10857   DEFVAR_LISP ("standard-translation-table-for-encode",
10858                &Vstandard_translation_table_for_encode,
10859                doc: /* Table for translating characters while encoding.  */);
10860   Vstandard_translation_table_for_encode = Qnil;
10861
10862   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10863                doc: /* Alist of charsets vs revision numbers.
10864 While encoding, if a charset (car part of an element) is found,
10865 designate it with the escape sequence identifying revision (cdr part
10866 of the element).  */);
10867   Vcharset_revision_table = Qnil;
10868
10869   DEFVAR_LISP ("default-process-coding-system",
10870                &Vdefault_process_coding_system,
10871                doc: /* Cons of coding systems used for process I/O by default.
10872 The car part is used for decoding a process output,
10873 the cdr part is used for encoding a text to be sent to a process.  */);
10874   Vdefault_process_coding_system = Qnil;
10875
10876   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10877                doc: /*
10878 Table of extra Latin codes in the range 128..159 (inclusive).
10879 This is a vector of length 256.
10880 If Nth element is non-nil, the existence of code N in a file
10881 \(or output of subprocess) doesn't prevent it to be detected as
10882 a coding system of ISO 2022 variant which has a flag
10883 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10884 or reading output of a subprocess.
10885 Only 128th through 159th elements have a meaning.  */);
10886   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10887
10888   DEFVAR_LISP ("select-safe-coding-system-function",
10889                &Vselect_safe_coding_system_function,
10890                doc: /*
10891 Function to call to select safe coding system for encoding a text.
10892
10893 If set, this function is called to force a user to select a proper
10894 coding system which can encode the text in the case that a default
10895 coding system used in each operation can't encode the text.  The
10896 function should take care that the buffer is not modified while
10897 the coding system is being selected.
10898
10899 The default value is `select-safe-coding-system' (which see).  */);
10900   Vselect_safe_coding_system_function = Qnil;
10901
10902   DEFVAR_BOOL ("coding-system-require-warning",
10903                &coding_system_require_warning,
10904                doc: /* Internal use only.
10905 If non-nil, on writing a file, `select-safe-coding-system-function' is
10906 called even if `coding-system-for-write' is non-nil.  The command
10907 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10908   coding_system_require_warning = 0;
10909
10910
10911   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10912                &inhibit_iso_escape_detection,
10913                doc: /*
10914 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10915
10916 When Emacs reads text, it tries to detect how the text is encoded.
10917 This code detection is sensitive to escape sequences.  If Emacs sees
10918 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10919 of the ISO2022 encodings, and decodes text by the corresponding coding
10920 system (e.g. `iso-2022-7bit').
10921
10922 However, there may be a case that you want to read escape sequences in
10923 a file as is.  In such a case, you can set this variable to non-nil.
10924 Then the code detection will ignore any escape sequences, and no text is
10925 detected as encoded in some ISO-2022 encoding.  The result is that all
10926 escape sequences become visible in a buffer.
10927
10928 The default value is nil, and it is strongly recommended not to change
10929 it.  That is because many Emacs Lisp source files that contain
10930 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10931 in Emacs's distribution, and they won't be decoded correctly on
10932 reading if you suppress escape sequence detection.
10933
10934 The other way to read escape sequences in a file without decoding is
10935 to explicitly specify some coding system that doesn't use ISO-2022
10936 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10937   inhibit_iso_escape_detection = 0;
10938
10939   DEFVAR_BOOL ("inhibit-null-byte-detection",
10940                &inhibit_null_byte_detection,
10941                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10942 By default, Emacs treats it as binary data, and does not attempt to
10943 decode it.  The effect is as if you specified `no-conversion' for
10944 reading that text.
10945
10946 Set this to non-nil when a regular text happens to include null bytes.
10947 Examples are Index nodes of Info files and null-byte delimited output
10948 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10949 decode text as usual.  */);
10950   inhibit_null_byte_detection = 0;
10951
10952   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10953                doc: /* Char table for translating self-inserting characters.
10954 This is applied to the result of input methods, not their input.
10955 See also `keyboard-translate-table'.
10956
10957 Use of this variable for character code unification was rendered
10958 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10959 internal character representation.  */);
10960     Vtranslation_table_for_input = Qnil;
10961
10962   {
10963     Lisp_Object args[coding_arg_max];
10964     Lisp_Object plist[16];
10965     int i;
10966
10967     for (i = 0; i < coding_arg_max; i++)
10968       args[i] = Qnil;
10969
10970     plist[0] = intern_c_string (":name");
10971     plist[1] = args[coding_arg_name] = Qno_conversion;
10972     plist[2] = intern_c_string (":mnemonic");
10973     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10974     plist[4] = intern_c_string (":coding-type");
10975     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10976     plist[6] = intern_c_string (":ascii-compatible-p");
10977     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10978     plist[8] = intern_c_string (":default-char");
10979     plist[9] = args[coding_arg_default_char] = make_number (0);
10980     plist[10] = intern_c_string (":for-unibyte");
10981     plist[11] = args[coding_arg_for_unibyte] = Qt;
10982     plist[12] = intern_c_string (":docstring");
10983     plist[13] = make_pure_c_string ("Do no conversion.\n\
10984 \n\
10985 When you visit a file with this coding, the file is read into a\n\
10986 unibyte buffer as is, thus each byte of a file is treated as a\n\
10987 character.");
10988     plist[14] = intern_c_string (":eol-type");
10989     plist[15] = args[coding_arg_eol_type] = Qunix;
10990     args[coding_arg_plist] = Flist (16, plist);
10991     Fdefine_coding_system_internal (coding_arg_max, args);
10992
10993     plist[1] = args[coding_arg_name] = Qundecided;
10994     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10995     plist[5] = args[coding_arg_coding_type] = Qundecided;
10996     /* This is already set.
10997        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10998     plist[8] = intern_c_string (":charset-list");
10999     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11000     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11001     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11002     plist[15] = args[coding_arg_eol_type] = Qnil;
11003     args[coding_arg_plist] = Flist (16, plist);
11004     Fdefine_coding_system_internal (coding_arg_max, args);
11005   }
11006
11007   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11008
11009   {
11010     int i;
11011
11012     for (i = 0; i < coding_category_max; i++)
11013       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11014   }
11015 #if defined (MSDOS) || defined (WINDOWSNT)
11016   system_eol_type = Qdos;
11017 #else
11018   system_eol_type = Qunix;
11019 #endif
11020   staticpro (&system_eol_type);
11021 }
11022
11023 char *
11024 emacs_strerror (error_number)
11025      int error_number;
11026 {
11027   char *str;
11028
11029   synchronize_system_messages_locale ();
11030   str = strerror (error_number);
11031
11032   if (! NILP (Vlocale_coding_system))
11033     {
11034       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11035                                                       Vlocale_coding_system,
11036                                                       0);
11037       str = (char *) SDATA (dec);
11038     }
11039
11040   return str;
11041 }
11042
11043 #endif /* emacs */
11044
11045 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
11046    (do not change this comment) */