src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2013 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce a encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding system emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* Coding-systems are handed between Emacs Lisp programs and C internal
 355    routines by the following three variables.  */
 356 /* Coding system to be used to encode text for terminal display when
 357    terminal coding system is nil.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372
 373 #define CODING_ISO_INITIAL(coding, reg)                 \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 375                      coding_attr_iso_initial),          \
 376                reg)))
 377
 378
 379 #define CODING_ISO_REQUEST(coding, charset_id)          \
 380   (((charset_id) <= (coding)->max_charset_id            \
 381     ? ((coding)->safe_charsets[charset_id] != 255       \
 382        ? (coding)->safe_charsets[charset_id]            \
 383        : -1)                                            \
 384     : -1))
 385
 386
 387 #define CODING_ISO_FLAGS(coding)        \
 388   ((coding)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(coding, reg)     \
 390   ((coding)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(coding, plane)    \
 392   ((coding)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 394   ((coding)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(coding)  \
 396   ((coding)->spec.iso_2022.bol)
 397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 398   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 399 #define CODING_ISO_CMP_STATUS(coding)   \
 400   (&(coding)->spec.iso_2022.cmp_status)
 401 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 402   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 403 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 404   ((coding)->spec.iso_2022.embedded_utf_8)
 405
 406 /* Control characters of ISO2022.  */
 407                         /* code */      /* function */
 408 #define ISO_CODE_SO     0x0E            /* shift-out */
 409 #define ISO_CODE_SI     0x0F            /* shift-in */
 410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 411 #define ISO_CODE_ESC    0x1B            /* escape */
 412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415
 416 /* All code (1-byte) of ISO2022 is classified into one of the
 417    followings.  */
 418 enum iso_code_class_type
 419   {
 420     ISO_control_0,              /* Control codes in the range
 421                                    0x00..0x1F and 0x7F, except for the
 422                                    following 5 codes.  */
 423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 427     ISO_control_1,              /* Control codes in the range
 428                                    0x80..0x9F, except for the
 429                                    following 3 codes.  */
 430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 437   };
 438
 439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 440     `iso-flags' attribute of an iso2022 coding system.  */
 441
 442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 443    instead of the correct short-form sequence (e.g. ESC $ A).  */
 444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 445
 446 /* If set, reset graphic planes and registers at end-of-line to the
 447    initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 449
 450 /* If set, reset graphic planes and registers before any control
 451    characters to the initial state.  */
 452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 453
 454 /* If set, encode by 7-bit environment.  */
 455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 456
 457 /* If set, use locking-shift function.  */
 458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 459
 460 /* If set, use single-shift function.  Overwrite
 461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 463
 464 /* If set, use designation escape sequence.  */
 465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 466
 467 /* If set, produce revision number sequence.  */
 468 #define CODING_ISO_FLAG_REVISION        0x0080
 469
 470 /* If set, produce ISO6429's direction specifying sequence.  */
 471 #define CODING_ISO_FLAG_DIRECTION       0x0100
 472
 473 /* If set, assume designation states are reset at beginning of line on
 474    output.  */
 475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 476
 477 /* If set, designation sequence should be placed at beginning of line
 478    on output.  */
 479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 480
 481 /* If set, do not encode unsafe characters on output.  */
 482 #define CODING_ISO_FLAG_SAFE            0x0800
 483
 484 /* If set, extra latin codes (128..159) are accepted as a valid code
 485    on input.  */
 486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 487
 488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 489
 490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 491
 492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 493
 494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 495
 496 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 497
 498 /* A character to be produced on output if encoding of the original
 499    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 500 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 501
 502 /* UTF-8 section */
 503 #define CODING_UTF_8_BOM(coding)        \
 504   ((coding)->spec.utf_8_bom)
 505
 506 /* UTF-16 section */
 507 #define CODING_UTF_16_BOM(coding)       \
 508   ((coding)->spec.utf_16.bom)
 509
 510 #define CODING_UTF_16_ENDIAN(coding)    \
 511   ((coding)->spec.utf_16.endian)
 512
 513 #define CODING_UTF_16_SURROGATE(coding) \
 514   ((coding)->spec.utf_16.surrogate)
 515
 516
 517 /* CCL section */
 518 #define CODING_CCL_DECODER(coding)      \
 519   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 520 #define CODING_CCL_ENCODER(coding)      \
 521   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 522 #define CODING_CCL_VALIDS(coding)                                          \
 523   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 524
 525 /* Index for each coding category in `coding_categories' */
 526
 527 enum coding_category
 528   {
 529     coding_category_iso_7,
 530     coding_category_iso_7_tight,
 531     coding_category_iso_8_1,
 532     coding_category_iso_8_2,
 533     coding_category_iso_7_else,
 534     coding_category_iso_8_else,
 535     coding_category_utf_8_auto,
 536     coding_category_utf_8_nosig,
 537     coding_category_utf_8_sig,
 538     coding_category_utf_16_auto,
 539     coding_category_utf_16_be,
 540     coding_category_utf_16_le,
 541     coding_category_utf_16_be_nosig,
 542     coding_category_utf_16_le_nosig,
 543     coding_category_charset,
 544     coding_category_sjis,
 545     coding_category_big5,
 546     coding_category_ccl,
 547     coding_category_emacs_mule,
 548     /* All above are targets of code detection.  */
 549     coding_category_raw_text,
 550     coding_category_undecided,
 551     coding_category_max
 552   };
 553
 554 /* Definitions of flag bits used in detect_coding_XXXX.  */
 555 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 556 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 557 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 558 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 559 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 560 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 561 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 562 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 563 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 564 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 565 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 566 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 567 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 568 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 569 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 570 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 571 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 572 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 573 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 574 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 575
 576 /* This value is returned if detect_coding_mask () find nothing other
 577    than ASCII characters.  */
 578 #define CATEGORY_MASK_ANY               \
 579   (CATEGORY_MASK_ISO_7                  \
 580    | CATEGORY_MASK_ISO_7_TIGHT          \
 581    | CATEGORY_MASK_ISO_8_1              \
 582    | CATEGORY_MASK_ISO_8_2              \
 583    | CATEGORY_MASK_ISO_7_ELSE           \
 584    | CATEGORY_MASK_ISO_8_ELSE           \
 585    | CATEGORY_MASK_UTF_8_AUTO           \
 586    | CATEGORY_MASK_UTF_8_NOSIG          \
 587    | CATEGORY_MASK_UTF_8_SIG            \
 588    | CATEGORY_MASK_UTF_16_AUTO          \
 589    | CATEGORY_MASK_UTF_16_BE            \
 590    | CATEGORY_MASK_UTF_16_LE            \
 591    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 592    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 593    | CATEGORY_MASK_CHARSET              \
 594    | CATEGORY_MASK_SJIS                 \
 595    | CATEGORY_MASK_BIG5                 \
 596    | CATEGORY_MASK_CCL                  \
 597    | CATEGORY_MASK_EMACS_MULE)
 598
 599
 600 #define CATEGORY_MASK_ISO_7BIT \
 601   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 602
 603 #define CATEGORY_MASK_ISO_8BIT \
 604   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 605
 606 #define CATEGORY_MASK_ISO_ELSE \
 607   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 608
 609 #define CATEGORY_MASK_ISO_ESCAPE        \
 610   (CATEGORY_MASK_ISO_7                  \
 611    | CATEGORY_MASK_ISO_7_TIGHT          \
 612    | CATEGORY_MASK_ISO_7_ELSE           \
 613    | CATEGORY_MASK_ISO_8_ELSE)
 614
 615 #define CATEGORY_MASK_ISO       \
 616   (  CATEGORY_MASK_ISO_7BIT     \
 617      | CATEGORY_MASK_ISO_8BIT   \
 618      | CATEGORY_MASK_ISO_ELSE)
 619
 620 #define CATEGORY_MASK_UTF_16            \
 621   (CATEGORY_MASK_UTF_16_AUTO            \
 622    | CATEGORY_MASK_UTF_16_BE            \
 623    | CATEGORY_MASK_UTF_16_LE            \
 624    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 625    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 626
 627 #define CATEGORY_MASK_UTF_8     \
 628   (CATEGORY_MASK_UTF_8_AUTO     \
 629    | CATEGORY_MASK_UTF_8_NOSIG  \
 630    | CATEGORY_MASK_UTF_8_SIG)
 631
 632 /* Table of coding categories (Lisp symbols).  This variable is for
 633    internal use only.  */
 634 static Lisp_Object Vcoding_category_table;
 635
 636 /* Table of coding-categories ordered by priority.  */
 637 static enum coding_category coding_priorities[coding_category_max];
 638
 639 /* Nth element is a coding context for the coding system bound to the
 640    Nth coding category.  */
 641 static struct coding_system coding_categories[coding_category_max];
 642
 643 /*** Commonly used macros and functions ***/
 644
 645 #ifndef min
 646 #define min(a, b) ((a) < (b) ? (a) : (b))
 647 #endif
 648 #ifndef max
 649 #define max(a, b) ((a) > (b) ? (a) : (b))
 650 #endif
 651
 652 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 653   do {                                                  \
 654     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 655     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 656   } while (0)
 657
 658
 659 /* Safely get one byte from the source text pointed by SRC which ends
 660    at SRC_END, and set C to that byte.  If there are not enough bytes
 661    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
 662    and a multibyte character is found at SRC, set C to the
 663    negative value of the character code.  The caller should declare
 664    and set these variables appropriately in advance:
 665         src, src_end, multibytep */
 666
 667 #define ONE_MORE_BYTE(c)                                \
 668   do {                                                  \
 669     if (src == src_end)                                 \
 670       {                                                 \
 671         if (src_base < src)                             \
 672           record_conversion_result                      \
 673             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 674         goto no_more_source;                            \
 675       }                                                 \
 676     c = *src++;                                         \
 677     if (multibytep && (c & 0x80))                       \
 678       {                                                 \
 679         if ((c & 0xFE) == 0xC0)                         \
 680           c = ((c & 1) << 6) | *src++;                  \
 681         else                                            \
 682           {                                             \
 683             src--;                                      \
 684             c = - string_char (src, &src, NULL);        \
 685             record_conversion_result                    \
 686               (coding, CODING_RESULT_INVALID_SRC);      \
 687           }                                             \
 688       }                                                 \
 689     consumed_chars++;                                   \
 690   } while (0)
 691
 692 /* Safely get two bytes from the source text pointed by SRC which ends
 693    at SRC_END, and set C1 and C2 to those bytes while skipping the
 694    heading multibyte characters.  If there are not enough bytes in the
 695    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 696    a multibyte character is found for C2, set C2 to the negative value
 697    of the character code.  The caller should declare and set these
 698    variables appropriately in advance:
 699         src, src_end, multibytep
 700    It is intended that this macro is used in detect_coding_utf_16.  */
 701
 702 #define TWO_MORE_BYTES(c1, c2)                          \
 703   do {                                                  \
 704     do {                                                \
 705       if (src == src_end)                               \
 706         goto no_more_source;                            \
 707       c1 = *src++;                                      \
 708       if (multibytep && (c1 & 0x80))                    \
 709         {                                               \
 710           if ((c1 & 0xFE) == 0xC0)                      \
 711             c1 = ((c1 & 1) << 6) | *src++;              \
 712           else                                          \
 713             {                                           \
 714               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 715               c1 = -1;                                  \
 716             }                                           \
 717         }                                               \
 718     } while (c1 < 0);                                   \
 719     if (src == src_end)                                 \
 720       goto no_more_source;                              \
 721     c2 = *src++;                                        \
 722     if (multibytep && (c2 & 0x80))                      \
 723       {                                                 \
 724         if ((c2 & 0xFE) == 0xC0)                        \
 725           c2 = ((c2 & 1) << 6) | *src++;                \
 726         else                                            \
 727           c2 = -1;                                      \
 728       }                                                 \
 729   } while (0)
 730
 731
 732 /* Store a byte C in the place pointed by DST and increment DST to the
 733    next free point, and increment PRODUCED_CHARS.  The caller should
 734    assure that C is 0..127, and declare and set the variable `dst'
 735    appropriately in advance.
 736 */
 737
 738
 739 #define EMIT_ONE_ASCII_BYTE(c)  \
 740   do {                          \
 741     produced_chars++;           \
 742     *dst++ = (c);               \
 743   } while (0)
 744
 745
 746 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 747
 748 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 749   do {                                  \
 750     produced_chars += 2;                \
 751     *dst++ = (c1), *dst++ = (c2);       \
 752   } while (0)
 753
 754
 755 /* Store a byte C in the place pointed by DST and increment DST to the
 756    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 757    store in an appropriate multibyte form.  The caller should
 758    declare and set the variables `dst' and `multibytep' appropriately
 759    in advance.  */
 760
 761 #define EMIT_ONE_BYTE(c)                \
 762   do {                                  \
 763     produced_chars++;                   \
 764     if (multibytep)                     \
 765       {                                 \
 766         unsigned ch = (c);              \
 767         if (ch >= 0x80)                 \
 768           ch = BYTE8_TO_CHAR (ch);      \
 769         CHAR_STRING_ADVANCE (ch, dst);  \
 770       }                                 \
 771     else                                \
 772       *dst++ = (c);                     \
 773   } while (0)
 774
 775
 776 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 777
 778 #define EMIT_TWO_BYTES(c1, c2)          \
 779   do {                                  \
 780     produced_chars += 2;                \
 781     if (multibytep)                     \
 782       {                                 \
 783         unsigned ch;                    \
 784                                         \
 785         ch = (c1);                      \
 786         if (ch >= 0x80)                 \
 787           ch = BYTE8_TO_CHAR (ch);      \
 788         CHAR_STRING_ADVANCE (ch, dst);  \
 789         ch = (c2);                      \
 790         if (ch >= 0x80)                 \
 791           ch = BYTE8_TO_CHAR (ch);      \
 792         CHAR_STRING_ADVANCE (ch, dst);  \
 793       }                                 \
 794     else                                \
 795       {                                 \
 796         *dst++ = (c1);                  \
 797         *dst++ = (c2);                  \
 798       }                                 \
 799   } while (0)
 800
 801
 802 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 803   do {                                  \
 804     EMIT_ONE_BYTE (c1);                 \
 805     EMIT_TWO_BYTES (c2, c3);            \
 806   } while (0)
 807
 808
 809 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 810   do {                                          \
 811     EMIT_TWO_BYTES (c1, c2);                    \
 812     EMIT_TWO_BYTES (c3, c4);                    \
 813   } while (0)
 814
 815
 816 static void
 817 record_conversion_result (struct coding_system *coding,
 818                           enum coding_result_code result)
 819 {
 820   coding->result = result;
 821   switch (result)
 822     {
 823     case CODING_RESULT_INSUFFICIENT_SRC:
 824       Vlast_code_conversion_error = Qinsufficient_source;
 825       break;
 826     case CODING_RESULT_INVALID_SRC:
 827       Vlast_code_conversion_error = Qinvalid_source;
 828       break;
 829     case CODING_RESULT_INTERRUPT:
 830       Vlast_code_conversion_error = Qinterrupted;
 831       break;
 832     case CODING_RESULT_INSUFFICIENT_DST:
 833       /* Don't record this error in Vlast_code_conversion_error
 834          because it happens just temporarily and is resolved when the
 835          whole conversion is finished.  */
 836       break;
 837     case CODING_RESULT_SUCCESS:
 838       break;
 839     default:
 840       Vlast_code_conversion_error = intern ("Unknown error");
 841     }
 842 }
 843
 844 /* These wrapper macros are used to preserve validity of pointers into
 845    buffer text across calls to decode_char, encode_char, etc, which
 846    could cause relocation of buffers if it loads a charset map,
 847    because loading a charset map allocates large structures.  */
 848
 849 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 850   do {                                                                       \
 851     ptrdiff_t offset;                                                        \
 852                                                                              \
 853     charset_map_loaded = 0;                                                  \
 854     c = DECODE_CHAR (charset, code);                                         \
 855     if (charset_map_loaded                                                   \
 856         && (offset = coding_change_source (coding)))                         \
 857       {                                                                      \
 858         src += offset;                                                       \
 859         src_base += offset;                                                  \
 860         src_end += offset;                                                   \
 861       }                                                                      \
 862   } while (0)
 863
 864 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 865   do {                                                                  \
 866     ptrdiff_t offset;                                                   \
 867                                                                         \
 868     charset_map_loaded = 0;                                             \
 869     code = ENCODE_CHAR (charset, c);                                    \
 870     if (charset_map_loaded                                              \
 871         && (offset = coding_change_destination (coding)))               \
 872       {                                                                 \
 873         dst += offset;                                                  \
 874         dst_end += offset;                                              \
 875       }                                                                 \
 876   } while (0)
 877
 878 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 879   do {                                                                  \
 880     ptrdiff_t offset;                                                   \
 881                                                                         \
 882     charset_map_loaded = 0;                                             \
 883     charset = char_charset (c, charset_list, code_return);              \
 884     if (charset_map_loaded                                              \
 885         && (offset = coding_change_destination (coding)))               \
 886       {                                                                 \
 887         dst += offset;                                                  \
 888         dst_end += offset;                                              \
 889       }                                                                 \
 890   } while (0)
 891
 892 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 893   do {                                                                  \
 894     ptrdiff_t offset;                                                   \
 895                                                                         \
 896     charset_map_loaded = 0;                                             \
 897     result = CHAR_CHARSET_P (c, charset);                               \
 898     if (charset_map_loaded                                              \
 899         && (offset = coding_change_destination (coding)))               \
 900       {                                                                 \
 901         dst += offset;                                                  \
 902         dst_end += offset;                                              \
 903       }                                                                 \
 904   } while (0)
 905
 906
 907 /* If there are at least BYTES length of room at dst, allocate memory
 908    for coding->destination and update dst and dst_end.  We don't have
 909    to take care of coding->source which will be relocated.  It is
 910    handled by calling coding_set_source in encode_coding.  */
 911
 912 #define ASSURE_DESTINATION(bytes)                               \
 913   do {                                                          \
 914     if (dst + (bytes) >= dst_end)                               \
 915       {                                                         \
 916         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 917                                                                 \
 918         dst = alloc_destination (coding, more_bytes, dst);      \
 919         dst_end = coding->destination + coding->dst_bytes;      \
 920       }                                                         \
 921   } while (0)
 922
 923
 924 /* Store multibyte form of the character C in P, and advance P to the
 925    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 926    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 927    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 928
 929 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 930
 931 /* Return the character code of character whose multibyte form is at
 932    P, and advance P to the end of the multibyte form.  This used to be
 933    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 934    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 935
 936 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 937
 938 /* Set coding->source from coding->src_object.  */
 939
 940 static void
 941 coding_set_source (struct coding_system *coding)
 942 {
 943   if (BUFFERP (coding->src_object))
 944     {
 945       struct buffer *buf = XBUFFER (coding->src_object);
 946
 947       if (coding->src_pos < 0)
 948         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 949       else
 950         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 951     }
 952   else if (STRINGP (coding->src_object))
 953     {
 954       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 955     }
 956   else
 957     {
 958       /* Otherwise, the source is C string and is never relocated
 959          automatically.  Thus we don't have to update anything.  */
 960     }
 961 }
 962
 963
 964 /* Set coding->source from coding->src_object, and return how many
 965    bytes coding->source was changed.  */
 966
 967 static ptrdiff_t
 968 coding_change_source (struct coding_system *coding)
 969 {
 970   const unsigned char *orig = coding->source;
 971   coding_set_source (coding);
 972   return coding->source - orig;
 973 }
 974
 975
 976 /* Set coding->destination from coding->dst_object.  */
 977
 978 static void
 979 coding_set_destination (struct coding_system *coding)
 980 {
 981   if (BUFFERP (coding->dst_object))
 982     {
 983       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 984         {
 985           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 986           coding->dst_bytes = (GAP_END_ADDR
 987                                - (coding->src_bytes - coding->consumed)
 988                                - coding->destination);
 989         }
 990       else
 991         {
 992           /* We are sure that coding->dst_pos_byte is before the gap
 993              of the buffer. */
 994           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 995                                  + coding->dst_pos_byte - BEG_BYTE);
 996           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 997                                - coding->destination);
 998         }
 999     }
1000   else
1001     {
1002       /* Otherwise, the destination is C string and is never relocated
1003          automatically.  Thus we don't have to update anything.  */
1004     }
1005 }
1006
1007
1008 /* Set coding->destination from coding->dst_object, and return how
1009    many bytes coding->destination was changed.  */
1010
1011 static ptrdiff_t
1012 coding_change_destination (struct coding_system *coding)
1013 {
1014   const unsigned char *orig = coding->destination;
1015   coding_set_destination (coding);
1016   return coding->destination - orig;
1017 }
1018
1019
1020 static void
1021 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1022 {
1023   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1024     string_overflow ();
1025   coding->destination = xrealloc (coding->destination,
1026                                   coding->dst_bytes + bytes);
1027   coding->dst_bytes += bytes;
1028 }
1029
1030 static void
1031 coding_alloc_by_making_gap (struct coding_system *coding,
1032                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1033 {
1034   if (EQ (coding->src_object, coding->dst_object))
1035     {
1036       /* The gap may contain the produced data at the head and not-yet
1037          consumed data at the tail.  To preserve those data, we at
1038          first make the gap size to zero, then increase the gap
1039          size.  */
1040       ptrdiff_t add = GAP_SIZE;
1041
1042       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1043       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1044       make_gap (bytes);
1045       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1046       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1047     }
1048   else
1049     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1050 }
1051
1052
1053 static unsigned char *
1054 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1055                    unsigned char *dst)
1056 {
1057   ptrdiff_t offset = dst - coding->destination;
1058
1059   if (BUFFERP (coding->dst_object))
1060     {
1061       struct buffer *buf = XBUFFER (coding->dst_object);
1062
1063       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1064     }
1065   else
1066     coding_alloc_by_realloc (coding, nbytes);
1067   coding_set_destination (coding);
1068   dst = coding->destination + offset;
1069   return dst;
1070 }
1071
1072 /** Macros for annotations.  */
1073
1074 /* An annotation data is stored in the array coding->charbuf in this
1075    format:
1076      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1077    LENGTH is the number of elements in the annotation.
1078    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1079    NCHARS is the number of characters in the text annotated.
1080
1081    The format of the following elements depend on ANNOTATION_MASK.
1082
1083    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1084    follows:
1085      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1086
1087    NBYTES is the number of bytes specified in the header part of
1088    old-style emacs-mule encoding, or 0 for the other kind of
1089    composition.
1090
1091    METHOD is one of enum composition_method.
1092
1093    Optional COMPOSITION-COMPONENTS are characters and composition
1094    rules.
1095
1096    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1097    follows.
1098
1099    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1100    recover from an invalid annotation, and should be skipped by
1101    produce_annotation.  */
1102
1103 /* Maximum length of the header of annotation data.  */
1104 #define MAX_ANNOTATION_LENGTH 5
1105
1106 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1107   do {                                                  \
1108     *(buf)++ = -(len);                                  \
1109     *(buf)++ = (mask);                                  \
1110     *(buf)++ = (nchars);                                \
1111     coding->annotated = 1;                              \
1112   } while (0);
1113
1114 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1115   do {                                                                      \
1116     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1117     *buf++ = nbytes;                                                        \
1118     *buf++ = method;                                                        \
1119   } while (0)
1120
1121
1122 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1123   do {                                                                  \
1124     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1125     *buf++ = id;                                                        \
1126   } while (0)
1127
1128 \f
1129 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1130
1131
1132
1133 \f
1134 /*** 3. UTF-8 ***/
1135
1136 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1137    Return true if a text is encoded in UTF-8.  */
1138
1139 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1140 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1141 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1142 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1143 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1144 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1145
1146 #define UTF_8_BOM_1 0xEF
1147 #define UTF_8_BOM_2 0xBB
1148 #define UTF_8_BOM_3 0xBF
1149
1150 static bool
1151 detect_coding_utf_8 (struct coding_system *coding,
1152                      struct coding_detection_info *detect_info)
1153 {
1154   const unsigned char *src = coding->source, *src_base;
1155   const unsigned char *src_end = coding->source + coding->src_bytes;
1156   bool multibytep = coding->src_multibyte;
1157   ptrdiff_t consumed_chars = 0;
1158   bool bom_found = 0;
1159   bool found = 0;
1160
1161   detect_info->checked |= CATEGORY_MASK_UTF_8;
1162   /* A coding system of this category is always ASCII compatible.  */
1163   src += coding->head_ascii;
1164
1165   while (1)
1166     {
1167       int c, c1, c2, c3, c4;
1168
1169       src_base = src;
1170       ONE_MORE_BYTE (c);
1171       if (c < 0 || UTF_8_1_OCTET_P (c))
1172         continue;
1173       ONE_MORE_BYTE (c1);
1174       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1175         break;
1176       if (UTF_8_2_OCTET_LEADING_P (c))
1177         {
1178           found = 1;
1179           continue;
1180         }
1181       ONE_MORE_BYTE (c2);
1182       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1183         break;
1184       if (UTF_8_3_OCTET_LEADING_P (c))
1185         {
1186           found = 1;
1187           if (src_base == coding->source
1188               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1189             bom_found = 1;
1190           continue;
1191         }
1192       ONE_MORE_BYTE (c3);
1193       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1194         break;
1195       if (UTF_8_4_OCTET_LEADING_P (c))
1196         {
1197           found = 1;
1198           continue;
1199         }
1200       ONE_MORE_BYTE (c4);
1201       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1202         break;
1203       if (UTF_8_5_OCTET_LEADING_P (c))
1204         {
1205           found = 1;
1206           continue;
1207         }
1208       break;
1209     }
1210   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1211   return 0;
1212
1213  no_more_source:
1214   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1215     {
1216       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1217       return 0;
1218     }
1219   if (bom_found)
1220     {
1221       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1222       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1223     }
1224   else
1225     {
1226       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1227       if (found)
1228         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1229     }
1230   return 1;
1231 }
1232
1233
1234 static void
1235 decode_coding_utf_8 (struct coding_system *coding)
1236 {
1237   const unsigned char *src = coding->source + coding->consumed;
1238   const unsigned char *src_end = coding->source + coding->src_bytes;
1239   const unsigned char *src_base;
1240   int *charbuf = coding->charbuf + coding->charbuf_used;
1241   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1242   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1243   bool multibytep = coding->src_multibyte;
1244   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1245   bool eol_dos
1246     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1247   int byte_after_cr = -1;
1248
1249   if (bom != utf_without_bom)
1250     {
1251       int c1, c2, c3;
1252
1253       src_base = src;
1254       ONE_MORE_BYTE (c1);
1255       if (! UTF_8_3_OCTET_LEADING_P (c1))
1256         src = src_base;
1257       else
1258         {
1259           ONE_MORE_BYTE (c2);
1260           if (! UTF_8_EXTRA_OCTET_P (c2))
1261             src = src_base;
1262           else
1263             {
1264               ONE_MORE_BYTE (c3);
1265               if (! UTF_8_EXTRA_OCTET_P (c3))
1266                 src = src_base;
1267               else
1268                 {
1269                   if ((c1 != UTF_8_BOM_1)
1270                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1271                     src = src_base;
1272                   else
1273                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1274                 }
1275             }
1276         }
1277     }
1278   CODING_UTF_8_BOM (coding) = utf_without_bom;
1279
1280   while (1)
1281     {
1282       int c, c1, c2, c3, c4, c5;
1283
1284       src_base = src;
1285       consumed_chars_base = consumed_chars;
1286
1287       if (charbuf >= charbuf_end)
1288         {
1289           if (byte_after_cr >= 0)
1290             src_base--;
1291           break;
1292         }
1293
1294       if (byte_after_cr >= 0)
1295         c1 = byte_after_cr, byte_after_cr = -1;
1296       else
1297         ONE_MORE_BYTE (c1);
1298       if (c1 < 0)
1299         {
1300           c = - c1;
1301         }
1302       else if (UTF_8_1_OCTET_P (c1))
1303         {
1304           if (eol_dos && c1 == '\r')
1305             ONE_MORE_BYTE (byte_after_cr);
1306           c = c1;
1307         }
1308       else
1309         {
1310           ONE_MORE_BYTE (c2);
1311           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1312             goto invalid_code;
1313           if (UTF_8_2_OCTET_LEADING_P (c1))
1314             {
1315               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1316               /* Reject overlong sequences here and below.  Encoders
1317                  producing them are incorrect, they can be misleading,
1318                  and they mess up read/write invariance.  */
1319               if (c < 128)
1320                 goto invalid_code;
1321             }
1322           else
1323             {
1324               ONE_MORE_BYTE (c3);
1325               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1326                 goto invalid_code;
1327               if (UTF_8_3_OCTET_LEADING_P (c1))
1328                 {
1329                   c = (((c1 & 0xF) << 12)
1330                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1331                   if (c < 0x800
1332                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1333                     goto invalid_code;
1334                 }
1335               else
1336                 {
1337                   ONE_MORE_BYTE (c4);
1338                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1339                     goto invalid_code;
1340                   if (UTF_8_4_OCTET_LEADING_P (c1))
1341                     {
1342                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1343                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1344                     if (c < 0x10000)
1345                       goto invalid_code;
1346                     }
1347                   else
1348                     {
1349                       ONE_MORE_BYTE (c5);
1350                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1351                         goto invalid_code;
1352                       if (UTF_8_5_OCTET_LEADING_P (c1))
1353                         {
1354                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1355                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1356                                | (c5 & 0x3F));
1357                           if ((c > MAX_CHAR) || (c < 0x200000))
1358                             goto invalid_code;
1359                         }
1360                       else
1361                         goto invalid_code;
1362                     }
1363                 }
1364             }
1365         }
1366
1367       *charbuf++ = c;
1368       continue;
1369
1370     invalid_code:
1371       src = src_base;
1372       consumed_chars = consumed_chars_base;
1373       ONE_MORE_BYTE (c);
1374       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1375       coding->errors++;
1376     }
1377
1378  no_more_source:
1379   coding->consumed_char += consumed_chars_base;
1380   coding->consumed = src_base - coding->source;
1381   coding->charbuf_used = charbuf - coding->charbuf;
1382 }
1383
1384
1385 static bool
1386 encode_coding_utf_8 (struct coding_system *coding)
1387 {
1388   bool multibytep = coding->dst_multibyte;
1389   int *charbuf = coding->charbuf;
1390   int *charbuf_end = charbuf + coding->charbuf_used;
1391   unsigned char *dst = coding->destination + coding->produced;
1392   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1393   ptrdiff_t produced_chars = 0;
1394   int c;
1395
1396   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1397     {
1398       ASSURE_DESTINATION (3);
1399       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1400       CODING_UTF_8_BOM (coding) = utf_without_bom;
1401     }
1402
1403   if (multibytep)
1404     {
1405       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1406
1407       while (charbuf < charbuf_end)
1408         {
1409           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1410
1411           ASSURE_DESTINATION (safe_room);
1412           c = *charbuf++;
1413           if (CHAR_BYTE8_P (c))
1414             {
1415               c = CHAR_TO_BYTE8 (c);
1416               EMIT_ONE_BYTE (c);
1417             }
1418           else
1419             {
1420               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1421               for (p = str; p < pend; p++)
1422                 EMIT_ONE_BYTE (*p);
1423             }
1424         }
1425     }
1426   else
1427     {
1428       int safe_room = MAX_MULTIBYTE_LENGTH;
1429
1430       while (charbuf < charbuf_end)
1431         {
1432           ASSURE_DESTINATION (safe_room);
1433           c = *charbuf++;
1434           if (CHAR_BYTE8_P (c))
1435             *dst++ = CHAR_TO_BYTE8 (c);
1436           else
1437             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1438           produced_chars++;
1439         }
1440     }
1441   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1442   coding->produced_char += produced_chars;
1443   coding->produced = dst - coding->destination;
1444   return 0;
1445 }
1446
1447
1448 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1449    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1450
1451 #define UTF_16_HIGH_SURROGATE_P(val) \
1452   (((val) & 0xFC00) == 0xD800)
1453
1454 #define UTF_16_LOW_SURROGATE_P(val) \
1455   (((val) & 0xFC00) == 0xDC00)
1456
1457
1458 static bool
1459 detect_coding_utf_16 (struct coding_system *coding,
1460                       struct coding_detection_info *detect_info)
1461 {
1462   const unsigned char *src = coding->source;
1463   const unsigned char *src_end = coding->source + coding->src_bytes;
1464   bool multibytep = coding->src_multibyte;
1465   int c1, c2;
1466
1467   detect_info->checked |= CATEGORY_MASK_UTF_16;
1468   if (coding->mode & CODING_MODE_LAST_BLOCK
1469       && (coding->src_chars & 1))
1470     {
1471       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1472       return 0;
1473     }
1474
1475   TWO_MORE_BYTES (c1, c2);
1476   if ((c1 == 0xFF) && (c2 == 0xFE))
1477     {
1478       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1479                              | CATEGORY_MASK_UTF_16_AUTO);
1480       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1481                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1482                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1483     }
1484   else if ((c1 == 0xFE) && (c2 == 0xFF))
1485     {
1486       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1487                              | CATEGORY_MASK_UTF_16_AUTO);
1488       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1489                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1490                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1491     }
1492   else if (c2 < 0)
1493     {
1494       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1495       return 0;
1496     }
1497   else
1498     {
1499       /* We check the dispersion of Eth and Oth bytes where E is even and
1500          O is odd.  If both are high, we assume binary data.*/
1501       unsigned char e[256], o[256];
1502       unsigned e_num = 1, o_num = 1;
1503
1504       memset (e, 0, 256);
1505       memset (o, 0, 256);
1506       e[c1] = 1;
1507       o[c2] = 1;
1508
1509       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1510                                 |CATEGORY_MASK_UTF_16_BE
1511                                 | CATEGORY_MASK_UTF_16_LE);
1512
1513       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1514              != CATEGORY_MASK_UTF_16)
1515         {
1516           TWO_MORE_BYTES (c1, c2);
1517           if (c2 < 0)
1518             break;
1519           if (! e[c1])
1520             {
1521               e[c1] = 1;
1522               e_num++;
1523               if (e_num >= 128)
1524                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1525             }
1526           if (! o[c2])
1527             {
1528               o[c2] = 1;
1529               o_num++;
1530               if (o_num >= 128)
1531                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1532             }
1533         }
1534       return 0;
1535     }
1536
1537  no_more_source:
1538   return 1;
1539 }
1540
1541 static void
1542 decode_coding_utf_16 (struct coding_system *coding)
1543 {
1544   const unsigned char *src = coding->source + coding->consumed;
1545   const unsigned char *src_end = coding->source + coding->src_bytes;
1546   const unsigned char *src_base;
1547   int *charbuf = coding->charbuf + coding->charbuf_used;
1548   /* We may produces at most 3 chars in one loop.  */
1549   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1550   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1551   bool multibytep = coding->src_multibyte;
1552   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1553   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1554   int surrogate = CODING_UTF_16_SURROGATE (coding);
1555   bool eol_dos
1556     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1557   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1558
1559   if (bom == utf_with_bom)
1560     {
1561       int c, c1, c2;
1562
1563       src_base = src;
1564       ONE_MORE_BYTE (c1);
1565       ONE_MORE_BYTE (c2);
1566       c = (c1 << 8) | c2;
1567
1568       if (endian == utf_16_big_endian
1569           ? c != 0xFEFF : c != 0xFFFE)
1570         {
1571           /* The first two bytes are not BOM.  Treat them as bytes
1572              for a normal character.  */
1573           src = src_base;
1574           coding->errors++;
1575         }
1576       CODING_UTF_16_BOM (coding) = utf_without_bom;
1577     }
1578   else if (bom == utf_detect_bom)
1579     {
1580       /* We have already tried to detect BOM and failed in
1581          detect_coding.  */
1582       CODING_UTF_16_BOM (coding) = utf_without_bom;
1583     }
1584
1585   while (1)
1586     {
1587       int c, c1, c2;
1588
1589       src_base = src;
1590       consumed_chars_base = consumed_chars;
1591
1592       if (charbuf >= charbuf_end)
1593         {
1594           if (byte_after_cr1 >= 0)
1595             src_base -= 2;
1596           break;
1597         }
1598
1599       if (byte_after_cr1 >= 0)
1600         c1 = byte_after_cr1, byte_after_cr1 = -1;
1601       else
1602         ONE_MORE_BYTE (c1);
1603       if (c1 < 0)
1604         {
1605           *charbuf++ = -c1;
1606           continue;
1607         }
1608       if (byte_after_cr2 >= 0)
1609         c2 = byte_after_cr2, byte_after_cr2 = -1;
1610       else
1611         ONE_MORE_BYTE (c2);
1612       if (c2 < 0)
1613         {
1614           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1615           *charbuf++ = -c2;
1616           continue;
1617         }
1618       c = (endian == utf_16_big_endian
1619            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1620
1621       if (surrogate)
1622         {
1623           if (! UTF_16_LOW_SURROGATE_P (c))
1624             {
1625               if (endian == utf_16_big_endian)
1626                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1627               else
1628                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1629               *charbuf++ = c1;
1630               *charbuf++ = c2;
1631               coding->errors++;
1632               if (UTF_16_HIGH_SURROGATE_P (c))
1633                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1634               else
1635                 *charbuf++ = c;
1636             }
1637           else
1638             {
1639               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1640               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1641               *charbuf++ = 0x10000 + c;
1642             }
1643         }
1644       else
1645         {
1646           if (UTF_16_HIGH_SURROGATE_P (c))
1647             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1648           else
1649             {
1650               if (eol_dos && c == '\r')
1651                 {
1652                   ONE_MORE_BYTE (byte_after_cr1);
1653                   ONE_MORE_BYTE (byte_after_cr2);
1654                 }
1655               *charbuf++ = c;
1656             }
1657         }
1658     }
1659
1660  no_more_source:
1661   coding->consumed_char += consumed_chars_base;
1662   coding->consumed = src_base - coding->source;
1663   coding->charbuf_used = charbuf - coding->charbuf;
1664 }
1665
1666 static bool
1667 encode_coding_utf_16 (struct coding_system *coding)
1668 {
1669   bool multibytep = coding->dst_multibyte;
1670   int *charbuf = coding->charbuf;
1671   int *charbuf_end = charbuf + coding->charbuf_used;
1672   unsigned char *dst = coding->destination + coding->produced;
1673   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1674   int safe_room = 8;
1675   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1676   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1677   ptrdiff_t produced_chars = 0;
1678   int c;
1679
1680   if (bom != utf_without_bom)
1681     {
1682       ASSURE_DESTINATION (safe_room);
1683       if (big_endian)
1684         EMIT_TWO_BYTES (0xFE, 0xFF);
1685       else
1686         EMIT_TWO_BYTES (0xFF, 0xFE);
1687       CODING_UTF_16_BOM (coding) = utf_without_bom;
1688     }
1689
1690   while (charbuf < charbuf_end)
1691     {
1692       ASSURE_DESTINATION (safe_room);
1693       c = *charbuf++;
1694       if (c > MAX_UNICODE_CHAR)
1695         c = coding->default_char;
1696
1697       if (c < 0x10000)
1698         {
1699           if (big_endian)
1700             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1701           else
1702             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1703         }
1704       else
1705         {
1706           int c1, c2;
1707
1708           c -= 0x10000;
1709           c1 = (c >> 10) + 0xD800;
1710           c2 = (c & 0x3FF) + 0xDC00;
1711           if (big_endian)
1712             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1713           else
1714             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1715         }
1716     }
1717   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1718   coding->produced = dst - coding->destination;
1719   coding->produced_char += produced_chars;
1720   return 0;
1721 }
1722
1723 \f
1724 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1725
1726 /* Emacs' internal format for representation of multiple character
1727    sets is a kind of multi-byte encoding, i.e. characters are
1728    represented by variable-length sequences of one-byte codes.
1729
1730    ASCII characters and control characters (e.g. `tab', `newline') are
1731    represented by one-byte sequences which are their ASCII codes, in
1732    the range 0x00 through 0x7F.
1733
1734    8-bit characters of the range 0x80..0x9F are represented by
1735    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1736    code + 0x20).
1737
1738    8-bit characters of the range 0xA0..0xFF are represented by
1739    one-byte sequences which are their 8-bit code.
1740
1741    The other characters are represented by a sequence of `base
1742    leading-code', optional `extended leading-code', and one or two
1743    `position-code's.  The length of the sequence is determined by the
1744    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1745    whereas extended leading-code and position-code take the range 0xA0
1746    through 0xFF.  See `charset.h' for more details about leading-code
1747    and position-code.
1748
1749    --- CODE RANGE of Emacs' internal format ---
1750    character set        range
1751    -------------        -----
1752    ascii                0x00..0x7F
1753    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1754    eight-bit-graphic    0xA0..0xBF
1755    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1756    ---------------------------------------------
1757
1758    As this is the internal character representation, the format is
1759    usually not used externally (i.e. in a file or in a data sent to a
1760    process).  But, it is possible to have a text externally in this
1761    format (i.e. by encoding by the coding system `emacs-mule').
1762
1763    In that case, a sequence of one-byte codes has a slightly different
1764    form.
1765
1766    At first, all characters in eight-bit-control are represented by
1767    one-byte sequences which are their 8-bit code.
1768
1769    Next, character composition data are represented by the byte
1770    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1771    where,
1772         METHOD is 0xF2 plus one of composition method (enum
1773         composition_method),
1774
1775         BYTES is 0xA0 plus a byte length of this composition data,
1776
1777         CHARS is 0xA0 plus a number of characters composed by this
1778         data,
1779
1780         COMPONENTs are characters of multibyte form or composition
1781         rules encoded by two-byte of ASCII codes.
1782
1783    In addition, for backward compatibility, the following formats are
1784    also recognized as composition data on decoding.
1785
1786    0x80 MSEQ ...
1787    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1788
1789    Here,
1790         MSEQ is a multibyte form but in these special format:
1791           ASCII: 0xA0 ASCII_CODE+0x80,
1792           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1793         RULE is a one byte code of the range 0xA0..0xF0 that
1794         represents a composition rule.
1795   */
1796
1797 char emacs_mule_bytes[256];
1798
1799
1800 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1801    Return true if a text is encoded in 'emacs-mule'.  */
1802
1803 static bool
1804 detect_coding_emacs_mule (struct coding_system *coding,
1805                           struct coding_detection_info *detect_info)
1806 {
1807   const unsigned char *src = coding->source, *src_base;
1808   const unsigned char *src_end = coding->source + coding->src_bytes;
1809   bool multibytep = coding->src_multibyte;
1810   ptrdiff_t consumed_chars = 0;
1811   int c;
1812   int found = 0;
1813
1814   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1815   /* A coding system of this category is always ASCII compatible.  */
1816   src += coding->head_ascii;
1817
1818   while (1)
1819     {
1820       src_base = src;
1821       ONE_MORE_BYTE (c);
1822       if (c < 0)
1823         continue;
1824       if (c == 0x80)
1825         {
1826           /* Perhaps the start of composite character.  We simply skip
1827              it because analyzing it is too heavy for detecting.  But,
1828              at least, we check that the composite character
1829              constitutes of more than 4 bytes.  */
1830           const unsigned char *src_start;
1831
1832         repeat:
1833           src_start = src;
1834           do
1835             {
1836               ONE_MORE_BYTE (c);
1837             }
1838           while (c >= 0xA0);
1839
1840           if (src - src_start <= 4)
1841             break;
1842           found = CATEGORY_MASK_EMACS_MULE;
1843           if (c == 0x80)
1844             goto repeat;
1845         }
1846
1847       if (c < 0x80)
1848         {
1849           if (c < 0x20
1850               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1851             break;
1852         }
1853       else
1854         {
1855           int more_bytes = emacs_mule_bytes[c] - 1;
1856
1857           while (more_bytes > 0)
1858             {
1859               ONE_MORE_BYTE (c);
1860               if (c < 0xA0)
1861                 {
1862                   src--;        /* Unread the last byte.  */
1863                   break;
1864                 }
1865               more_bytes--;
1866             }
1867           if (more_bytes != 0)
1868             break;
1869           found = CATEGORY_MASK_EMACS_MULE;
1870         }
1871     }
1872   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1873   return 0;
1874
1875  no_more_source:
1876   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1877     {
1878       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1879       return 0;
1880     }
1881   detect_info->found |= found;
1882   return 1;
1883 }
1884
1885
1886 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1887    character.  If CMP_STATUS indicates that we must expect MSEQ or
1888    RULE described above, decode it and return the negative value of
1889    the decoded character or rule.  If an invalid byte is found, return
1890    -1.  If SRC is too short, return -2.  */
1891
1892 static int
1893 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1894                  int *nbytes, int *nchars, int *id,
1895                  struct composition_status *cmp_status)
1896 {
1897   const unsigned char *src_end = coding->source + coding->src_bytes;
1898   const unsigned char *src_base = src;
1899   bool multibytep = coding->src_multibyte;
1900   int charset_ID;
1901   unsigned code;
1902   int c;
1903   int consumed_chars = 0;
1904   bool mseq_found = 0;
1905
1906   ONE_MORE_BYTE (c);
1907   if (c < 0)
1908     {
1909       c = -c;
1910       charset_ID = emacs_mule_charset[0];
1911     }
1912   else
1913     {
1914       if (c >= 0xA0)
1915         {
1916           if (cmp_status->state != COMPOSING_NO
1917               && cmp_status->old_form)
1918             {
1919               if (cmp_status->state == COMPOSING_CHAR)
1920                 {
1921                   if (c == 0xA0)
1922                     {
1923                       ONE_MORE_BYTE (c);
1924                       c -= 0x80;
1925                       if (c < 0)
1926                         goto invalid_code;
1927                     }
1928                   else
1929                     c -= 0x20;
1930                   mseq_found = 1;
1931                 }
1932               else
1933                 {
1934                   *nbytes = src - src_base;
1935                   *nchars = consumed_chars;
1936                   return -c;
1937                 }
1938             }
1939           else
1940             goto invalid_code;
1941         }
1942
1943       switch (emacs_mule_bytes[c])
1944         {
1945         case 2:
1946           if ((charset_ID = emacs_mule_charset[c]) < 0)
1947             goto invalid_code;
1948           ONE_MORE_BYTE (c);
1949           if (c < 0xA0)
1950             goto invalid_code;
1951           code = c & 0x7F;
1952           break;
1953
1954         case 3:
1955           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1956               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1957             {
1958               ONE_MORE_BYTE (c);
1959               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1960                 goto invalid_code;
1961               ONE_MORE_BYTE (c);
1962               if (c < 0xA0)
1963                 goto invalid_code;
1964               code = c & 0x7F;
1965             }
1966           else
1967             {
1968               if ((charset_ID = emacs_mule_charset[c]) < 0)
1969                 goto invalid_code;
1970               ONE_MORE_BYTE (c);
1971               if (c < 0xA0)
1972                 goto invalid_code;
1973               code = (c & 0x7F) << 8;
1974               ONE_MORE_BYTE (c);
1975               if (c < 0xA0)
1976                 goto invalid_code;
1977               code |= c & 0x7F;
1978             }
1979           break;
1980
1981         case 4:
1982           ONE_MORE_BYTE (c);
1983           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1984             goto invalid_code;
1985           ONE_MORE_BYTE (c);
1986           if (c < 0xA0)
1987             goto invalid_code;
1988           code = (c & 0x7F) << 8;
1989           ONE_MORE_BYTE (c);
1990           if (c < 0xA0)
1991             goto invalid_code;
1992           code |= c & 0x7F;
1993           break;
1994
1995         case 1:
1996           code = c;
1997           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
1998           break;
1999
2000         default:
2001           emacs_abort ();
2002         }
2003       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2004                           CHARSET_FROM_ID (charset_ID), code, c);
2005       if (c < 0)
2006         goto invalid_code;
2007     }
2008   *nbytes = src - src_base;
2009   *nchars = consumed_chars;
2010   if (id)
2011     *id = charset_ID;
2012   return (mseq_found ? -c : c);
2013
2014  no_more_source:
2015   return -2;
2016
2017  invalid_code:
2018   return -1;
2019 }
2020
2021
2022 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2023
2024 /* Handle these composition sequence ('|': the end of header elements,
2025    BYTES and CHARS >= 0xA0):
2026
2027    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2028    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2029    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2030
2031    and these old form:
2032
2033    (4) relative composition: 0x80 | MSEQ ... MSEQ
2034    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2035
2036    When the starter 0x80 and the following header elements are found,
2037    this annotation header is produced.
2038
2039         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2040
2041    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2042    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2043
2044    Then, upon reading the following elements, these codes are produced
2045    until the composition end is found:
2046
2047    (1) CHAR ... CHAR
2048    (2) ALT ... ALT CHAR ... CHAR
2049    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2050    (4) CHAR ... CHAR
2051    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2052
2053    When the composition end is found, LENGTH and NCHARS in the
2054    annotation header is updated as below:
2055
2056    (1) LENGTH: unchanged, NCHARS: unchanged
2057    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2058    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2059    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2060    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2061
2062    If an error is found while composing, the annotation header is
2063    changed to the original composition header (plus filler -1s) as
2064    below:
2065
2066    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2067    (5)          [ 0x80 0xFF -1 -1- -1 ]
2068
2069    and the sequence [ -2 DECODED-RULE ] is changed to the original
2070    byte sequence as below:
2071         o the original byte sequence is B: [ B -1 ]
2072         o the original byte sequence is B1 B2: [ B1 B2 ]
2073
2074    Most of the routines are implemented by macros because many
2075    variables and labels in the caller decode_coding_emacs_mule must be
2076    accessible, and they are usually called just once (thus doesn't
2077    increase the size of compiled object).  */
2078
2079 /* Decode a composition rule represented by C as a component of
2080    composition sequence of Emacs 20 style.  Set RULE to the decoded
2081    rule. */
2082
2083 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2084   do {                                                  \
2085     int gref, nref;                                     \
2086                                                         \
2087     c -= 0xA0;                                          \
2088     if (c < 0 || c >= 81)                               \
2089       goto invalid_code;                                \
2090     gref = c / 9, nref = c % 9;                         \
2091     if (gref == 4) gref = 10;                           \
2092     if (nref == 4) nref = 10;                           \
2093     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2094   } while (0)
2095
2096
2097 /* Decode a composition rule represented by C and the following byte
2098    at SRC as a component of composition sequence of Emacs 21 style.
2099    Set RULE to the decoded rule.  */
2100
2101 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2102   do {                                                  \
2103     int gref, nref;                                     \
2104                                                         \
2105     gref = c - 0x20;                                    \
2106     if (gref < 0 || gref >= 81)                         \
2107       goto invalid_code;                                \
2108     ONE_MORE_BYTE (c);                                  \
2109     nref = c - 0x20;                                    \
2110     if (nref < 0 || nref >= 81)                         \
2111       goto invalid_code;                                \
2112     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2113   } while (0)
2114
2115
2116 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2117    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2118    byte length of this composition information, CHARS is the number of
2119    characters composed by this composition.  */
2120
2121 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2122   do {                                                                  \
2123     enum composition_method method = c - 0xF2;                          \
2124     int nbytes, nchars;                                                 \
2125                                                                         \
2126     ONE_MORE_BYTE (c);                                                  \
2127     if (c < 0)                                                          \
2128       goto invalid_code;                                                \
2129     nbytes = c - 0xA0;                                                  \
2130     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2131       goto invalid_code;                                                \
2132     ONE_MORE_BYTE (c);                                                  \
2133     nchars = c - 0xA0;                                                  \
2134     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2135       goto invalid_code;                                                \
2136     cmp_status->old_form = 0;                                           \
2137     cmp_status->method = method;                                        \
2138     if (method == COMPOSITION_RELATIVE)                                 \
2139       cmp_status->state = COMPOSING_CHAR;                               \
2140     else                                                                \
2141       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2142     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2143     cmp_status->nchars = nchars;                                        \
2144     cmp_status->ncomps = nbytes - 4;                                    \
2145     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2146   } while (0)
2147
2148
2149 /* Start of Emacs 20 style format for relative composition.  */
2150
2151 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2152   do {                                                          \
2153     cmp_status->old_form = 1;                                   \
2154     cmp_status->method = COMPOSITION_RELATIVE;                  \
2155     cmp_status->state = COMPOSING_CHAR;                         \
2156     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2157     cmp_status->nchars = cmp_status->ncomps = 0;                \
2158     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2159   } while (0)
2160
2161
2162 /* Start of Emacs 20 style format for rule-base composition.  */
2163
2164 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2165   do {                                                          \
2166     cmp_status->old_form = 1;                                   \
2167     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2168     cmp_status->state = COMPOSING_CHAR;                         \
2169     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2170     cmp_status->nchars = cmp_status->ncomps = 0;                \
2171     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2172   } while (0)
2173
2174
2175 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2176   do {                                                  \
2177     const unsigned char *current_src = src;             \
2178                                                         \
2179     ONE_MORE_BYTE (c);                                  \
2180     if (c < 0)                                          \
2181       goto invalid_code;                                \
2182     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2183         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2184       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2185     else if (c < 0xA0)                                  \
2186       goto invalid_code;                                \
2187     else if (c < 0xC0)                                  \
2188       {                                                 \
2189         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2190         /* Re-read C as a composition component.  */    \
2191         src = current_src;                              \
2192       }                                                 \
2193     else if (c == 0xFF)                                 \
2194       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2195     else                                                \
2196       goto invalid_code;                                \
2197   } while (0)
2198
2199 #define EMACS_MULE_COMPOSITION_END()                            \
2200   do {                                                          \
2201     int idx = - cmp_status->length;                             \
2202                                                                 \
2203     if (cmp_status->old_form)                                   \
2204       charbuf[idx + 2] = cmp_status->nchars;                    \
2205     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2206       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2207     cmp_status->state = COMPOSING_NO;                           \
2208   } while (0)
2209
2210
2211 static int
2212 emacs_mule_finish_composition (int *charbuf,
2213                                struct composition_status *cmp_status)
2214 {
2215   int idx = - cmp_status->length;
2216   int new_chars;
2217
2218   if (cmp_status->old_form && cmp_status->nchars > 0)
2219     {
2220       charbuf[idx + 2] = cmp_status->nchars;
2221       new_chars = 0;
2222       if (cmp_status->method == COMPOSITION_WITH_RULE
2223           && cmp_status->state == COMPOSING_CHAR)
2224         {
2225           /* The last rule was invalid.  */
2226           int rule = charbuf[-1] + 0xA0;
2227
2228           charbuf[-2] = BYTE8_TO_CHAR (rule);
2229           charbuf[-1] = -1;
2230           new_chars = 1;
2231         }
2232     }
2233   else
2234     {
2235       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2236
2237       if (cmp_status->method == COMPOSITION_WITH_RULE)
2238         {
2239           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2240           charbuf[idx++] = -3;
2241           charbuf[idx++] = 0;
2242           new_chars = 1;
2243         }
2244       else
2245         {
2246           int nchars = charbuf[idx + 1] + 0xA0;
2247           int nbytes = charbuf[idx + 2] + 0xA0;
2248
2249           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2250           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2251           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2252           charbuf[idx++] = -1;
2253           new_chars = 4;
2254         }
2255     }
2256   cmp_status->state = COMPOSING_NO;
2257   return new_chars;
2258 }
2259
2260 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2261   do {                                                                    \
2262     if (cmp_status->state != COMPOSING_NO)                                \
2263       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2264   } while (0)
2265
2266
2267 static void
2268 decode_coding_emacs_mule (struct coding_system *coding)
2269 {
2270   const unsigned char *src = coding->source + coding->consumed;
2271   const unsigned char *src_end = coding->source + coding->src_bytes;
2272   const unsigned char *src_base;
2273   int *charbuf = coding->charbuf + coding->charbuf_used;
2274   /* We may produce two annotations (charset and composition) in one
2275      loop and one more charset annotation at the end.  */
2276   int *charbuf_end
2277     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2278       /* We can produce up to 2 characters in a loop.  */
2279       - 1;
2280   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2281   bool multibytep = coding->src_multibyte;
2282   ptrdiff_t char_offset = coding->produced_char;
2283   ptrdiff_t last_offset = char_offset;
2284   int last_id = charset_ascii;
2285   bool eol_dos
2286     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2287   int byte_after_cr = -1;
2288   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2289
2290   if (cmp_status->state != COMPOSING_NO)
2291     {
2292       int i;
2293
2294       if (charbuf_end - charbuf < cmp_status->length)
2295         emacs_abort ();
2296       for (i = 0; i < cmp_status->length; i++)
2297         *charbuf++ = cmp_status->carryover[i];
2298       coding->annotated = 1;
2299     }
2300
2301   while (1)
2302     {
2303       int c, id IF_LINT (= 0);
2304
2305       src_base = src;
2306       consumed_chars_base = consumed_chars;
2307
2308       if (charbuf >= charbuf_end)
2309         {
2310           if (byte_after_cr >= 0)
2311             src_base--;
2312           break;
2313         }
2314
2315       if (byte_after_cr >= 0)
2316         c = byte_after_cr, byte_after_cr = -1;
2317       else
2318         ONE_MORE_BYTE (c);
2319
2320       if (c < 0 || c == 0x80)
2321         {
2322           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2323           if (c < 0)
2324             {
2325               *charbuf++ = -c;
2326               char_offset++;
2327             }
2328           else
2329             DECODE_EMACS_MULE_COMPOSITION_START ();
2330           continue;
2331         }
2332
2333       if (c < 0x80)
2334         {
2335           if (eol_dos && c == '\r')
2336             ONE_MORE_BYTE (byte_after_cr);
2337           id = charset_ascii;
2338           if (cmp_status->state != COMPOSING_NO)
2339             {
2340               if (cmp_status->old_form)
2341                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2342               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2343                 cmp_status->ncomps--;
2344             }
2345         }
2346       else
2347         {
2348           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2349           /* emacs_mule_char can load a charset map from a file, which
2350              allocates a large structure and might cause buffer text
2351              to be relocated as result.  Thus, we need to remember the
2352              original pointer to buffer text, and fix up all related
2353              pointers after the call.  */
2354           const unsigned char *orig = coding->source;
2355           ptrdiff_t offset;
2356
2357           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2358                                cmp_status);
2359           offset = coding->source - orig;
2360           if (offset)
2361             {
2362               src += offset;
2363               src_base += offset;
2364               src_end += offset;
2365             }
2366           if (c < 0)
2367             {
2368               if (c == -1)
2369                 goto invalid_code;
2370               if (c == -2)
2371                 break;
2372             }
2373           src = src_base + nbytes;
2374           consumed_chars = consumed_chars_base + nchars;
2375           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2376             cmp_status->ncomps -= nchars;
2377         }
2378
2379       /* Now if C >= 0, we found a normally encoded character, if C <
2380          0, we found an old-style composition component character or
2381          rule.  */
2382
2383       if (cmp_status->state == COMPOSING_NO)
2384         {
2385           if (last_id != id)
2386             {
2387               if (last_id != charset_ascii)
2388                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2389                                   last_id);
2390               last_id = id;
2391               last_offset = char_offset;
2392             }
2393           *charbuf++ = c;
2394           char_offset++;
2395         }
2396       else if (cmp_status->state == COMPOSING_CHAR)
2397         {
2398           if (cmp_status->old_form)
2399             {
2400               if (c >= 0)
2401                 {
2402                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2403                   *charbuf++ = c;
2404                   char_offset++;
2405                 }
2406               else
2407                 {
2408                   *charbuf++ = -c;
2409                   cmp_status->nchars++;
2410                   cmp_status->length++;
2411                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2412                     EMACS_MULE_COMPOSITION_END ();
2413                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2414                     cmp_status->state = COMPOSING_RULE;
2415                 }
2416             }
2417           else
2418             {
2419               *charbuf++ = c;
2420               cmp_status->length++;
2421               cmp_status->nchars--;
2422               if (cmp_status->nchars == 0)
2423                 EMACS_MULE_COMPOSITION_END ();
2424             }
2425         }
2426       else if (cmp_status->state == COMPOSING_RULE)
2427         {
2428           int rule;
2429
2430           if (c >= 0)
2431             {
2432               EMACS_MULE_COMPOSITION_END ();
2433               *charbuf++ = c;
2434               char_offset++;
2435             }
2436           else
2437             {
2438               c = -c;
2439               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2440               if (rule < 0)
2441                 goto invalid_code;
2442               *charbuf++ = -2;
2443               *charbuf++ = rule;
2444               cmp_status->length += 2;
2445               cmp_status->state = COMPOSING_CHAR;
2446             }
2447         }
2448       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2449         {
2450           *charbuf++ = c;
2451           cmp_status->length++;
2452           if (cmp_status->ncomps == 0)
2453             cmp_status->state = COMPOSING_CHAR;
2454           else if (cmp_status->ncomps > 0)
2455             {
2456               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2457                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2458             }
2459           else
2460             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2461         }
2462       else                      /* COMPOSING_COMPONENT_RULE */
2463         {
2464           int rule;
2465
2466           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2467           if (rule < 0)
2468             goto invalid_code;
2469           *charbuf++ = -2;
2470           *charbuf++ = rule;
2471           cmp_status->length += 2;
2472           cmp_status->ncomps--;
2473           if (cmp_status->ncomps > 0)
2474             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2475           else
2476             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2477         }
2478       continue;
2479
2480     invalid_code:
2481       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2482       src = src_base;
2483       consumed_chars = consumed_chars_base;
2484       ONE_MORE_BYTE (c);
2485       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2486       char_offset++;
2487       coding->errors++;
2488     }
2489
2490  no_more_source:
2491   if (cmp_status->state != COMPOSING_NO)
2492     {
2493       if (coding->mode & CODING_MODE_LAST_BLOCK)
2494         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2495       else
2496         {
2497           int i;
2498
2499           charbuf -= cmp_status->length;
2500           for (i = 0; i < cmp_status->length; i++)
2501             cmp_status->carryover[i] = charbuf[i];
2502         }
2503     }
2504   if (last_id != charset_ascii)
2505     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2506   coding->consumed_char += consumed_chars_base;
2507   coding->consumed = src_base - coding->source;
2508   coding->charbuf_used = charbuf - coding->charbuf;
2509 }
2510
2511
2512 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2513   do {                                          \
2514     if (id < 0xA0)                              \
2515       codes[0] = id, codes[1] = 0;              \
2516     else if (id < 0xE0)                         \
2517       codes[0] = 0x9A, codes[1] = id;           \
2518     else if (id < 0xF0)                         \
2519       codes[0] = 0x9B, codes[1] = id;           \
2520     else if (id < 0xF5)                         \
2521       codes[0] = 0x9C, codes[1] = id;           \
2522     else                                        \
2523       codes[0] = 0x9D, codes[1] = id;           \
2524   } while (0);
2525
2526
2527 static bool
2528 encode_coding_emacs_mule (struct coding_system *coding)
2529 {
2530   bool multibytep = coding->dst_multibyte;
2531   int *charbuf = coding->charbuf;
2532   int *charbuf_end = charbuf + coding->charbuf_used;
2533   unsigned char *dst = coding->destination + coding->produced;
2534   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2535   int safe_room = 8;
2536   ptrdiff_t produced_chars = 0;
2537   Lisp_Object attrs, charset_list;
2538   int c;
2539   int preferred_charset_id = -1;
2540
2541   CODING_GET_INFO (coding, attrs, charset_list);
2542   if (! EQ (charset_list, Vemacs_mule_charset_list))
2543     {
2544       charset_list = Vemacs_mule_charset_list;
2545       ASET (attrs, coding_attr_charset_list, charset_list);
2546     }
2547
2548   while (charbuf < charbuf_end)
2549     {
2550       ASSURE_DESTINATION (safe_room);
2551       c = *charbuf++;
2552
2553       if (c < 0)
2554         {
2555           /* Handle an annotation.  */
2556           switch (*charbuf)
2557             {
2558             case CODING_ANNOTATE_COMPOSITION_MASK:
2559               /* Not yet implemented.  */
2560               break;
2561             case CODING_ANNOTATE_CHARSET_MASK:
2562               preferred_charset_id = charbuf[3];
2563               if (preferred_charset_id >= 0
2564                   && NILP (Fmemq (make_number (preferred_charset_id),
2565                                   charset_list)))
2566                 preferred_charset_id = -1;
2567               break;
2568             default:
2569               emacs_abort ();
2570             }
2571           charbuf += -c - 1;
2572           continue;
2573         }
2574
2575       if (ASCII_CHAR_P (c))
2576         EMIT_ONE_ASCII_BYTE (c);
2577       else if (CHAR_BYTE8_P (c))
2578         {
2579           c = CHAR_TO_BYTE8 (c);
2580           EMIT_ONE_BYTE (c);
2581         }
2582       else
2583         {
2584           struct charset *charset;
2585           unsigned code;
2586           int dimension;
2587           int emacs_mule_id;
2588           unsigned char leading_codes[2];
2589
2590           if (preferred_charset_id >= 0)
2591             {
2592               bool result;
2593
2594               charset = CHARSET_FROM_ID (preferred_charset_id);
2595               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2596               if (result)
2597                 code = ENCODE_CHAR (charset, c);
2598               else
2599                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2600                                      &code, charset);
2601             }
2602           else
2603             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2604                                  &code, charset);
2605           if (! charset)
2606             {
2607               c = coding->default_char;
2608               if (ASCII_CHAR_P (c))
2609                 {
2610                   EMIT_ONE_ASCII_BYTE (c);
2611                   continue;
2612                 }
2613               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2614                                    &code, charset);
2615             }
2616           dimension = CHARSET_DIMENSION (charset);
2617           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2618           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2619           EMIT_ONE_BYTE (leading_codes[0]);
2620           if (leading_codes[1])
2621             EMIT_ONE_BYTE (leading_codes[1]);
2622           if (dimension == 1)
2623             EMIT_ONE_BYTE (code | 0x80);
2624           else
2625             {
2626               code |= 0x8080;
2627               EMIT_ONE_BYTE (code >> 8);
2628               EMIT_ONE_BYTE (code & 0xFF);
2629             }
2630         }
2631     }
2632   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2633   coding->produced_char += produced_chars;
2634   coding->produced = dst - coding->destination;
2635   return 0;
2636 }
2637
2638 \f
2639 /*** 7. ISO2022 handlers ***/
2640
2641 /* The following note describes the coding system ISO2022 briefly.
2642    Since the intention of this note is to help understand the
2643    functions in this file, some parts are NOT ACCURATE or are OVERLY
2644    SIMPLIFIED.  For thorough understanding, please refer to the
2645    original document of ISO2022.  This is equivalent to the standard
2646    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2647
2648    ISO2022 provides many mechanisms to encode several character sets
2649    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2650    is encoded using bytes less than 128.  This may make the encoded
2651    text a little bit longer, but the text passes more easily through
2652    several types of gateway, some of which strip off the MSB (Most
2653    Significant Bit).
2654
2655    There are two kinds of character sets: control character sets and
2656    graphic character sets.  The former contain control characters such
2657    as `newline' and `escape' to provide control functions (control
2658    functions are also provided by escape sequences).  The latter
2659    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2660    two control character sets and many graphic character sets.
2661
2662    Graphic character sets are classified into one of the following
2663    four classes, according to the number of bytes (DIMENSION) and
2664    number of characters in one dimension (CHARS) of the set:
2665    - DIMENSION1_CHARS94
2666    - DIMENSION1_CHARS96
2667    - DIMENSION2_CHARS94
2668    - DIMENSION2_CHARS96
2669
2670    In addition, each character set is assigned an identification tag,
2671    unique for each set, called the "final character" (denoted as <F>
2672    hereafter).  The <F> of each character set is decided by ECMA(*)
2673    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2674    (0x30..0x3F are for private use only).
2675
2676    Note (*): ECMA = European Computer Manufacturers Association
2677
2678    Here are examples of graphic character sets [NAME(<F>)]:
2679         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2680         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2681         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2682         o DIMENSION2_CHARS96 -- none for the moment
2683
2684    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2685         C0 [0x00..0x1F] -- control character plane 0
2686         GL [0x20..0x7F] -- graphic character plane 0
2687         C1 [0x80..0x9F] -- control character plane 1
2688         GR [0xA0..0xFF] -- graphic character plane 1
2689
2690    A control character set is directly designated and invoked to C0 or
2691    C1 by an escape sequence.  The most common case is that:
2692    - ISO646's  control character set is designated/invoked to C0, and
2693    - ISO6429's control character set is designated/invoked to C1,
2694    and usually these designations/invocations are omitted in encoded
2695    text.  In a 7-bit environment, only C0 can be used, and a control
2696    character for C1 is encoded by an appropriate escape sequence to
2697    fit into the environment.  All control characters for C1 are
2698    defined to have corresponding escape sequences.
2699
2700    A graphic character set is at first designated to one of four
2701    graphic registers (G0 through G3), then these graphic registers are
2702    invoked to GL or GR.  These designations and invocations can be
2703    done independently.  The most common case is that G0 is invoked to
2704    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2705    these invocations and designations are omitted in encoded text.
2706    In a 7-bit environment, only GL can be used.
2707
2708    When a graphic character set of CHARS94 is invoked to GL, codes
2709    0x20 and 0x7F of the GL area work as control characters SPACE and
2710    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2711    be used.
2712
2713    There are two ways of invocation: locking-shift and single-shift.
2714    With locking-shift, the invocation lasts until the next different
2715    invocation, whereas with single-shift, the invocation affects the
2716    following character only and doesn't affect the locking-shift
2717    state.  Invocations are done by the following control characters or
2718    escape sequences:
2719
2720    ----------------------------------------------------------------------
2721    abbrev  function                  cntrl escape seq   description
2722    ----------------------------------------------------------------------
2723    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2724    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2725    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2726    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2727    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2728    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2729    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2730    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2731    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2732    ----------------------------------------------------------------------
2733    (*) These are not used by any known coding system.
2734
2735    Control characters for these functions are defined by macros
2736    ISO_CODE_XXX in `coding.h'.
2737
2738    Designations are done by the following escape sequences:
2739    ----------------------------------------------------------------------
2740    escape sequence      description
2741    ----------------------------------------------------------------------
2742    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2743    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2744    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2745    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2746    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2747    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2748    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2749    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2750    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2751    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2752    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2753    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2754    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2755    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2756    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2757    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2758    ----------------------------------------------------------------------
2759
2760    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2761    of dimension 1, chars 94, and final character <F>, etc...
2762
2763    Note (*): Although these designations are not allowed in ISO2022,
2764    Emacs accepts them on decoding, and produces them on encoding
2765    CHARS96 character sets in a coding system which is characterized as
2766    7-bit environment, non-locking-shift, and non-single-shift.
2767
2768    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2769    '(' must be omitted.  We refer to this as "short-form" hereafter.
2770
2771    Now you may notice that there are a lot of ways of encoding the
2772    same multilingual text in ISO2022.  Actually, there exist many
2773    coding systems such as Compound Text (used in X11's inter client
2774    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2775    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2776    localized platforms), and all of these are variants of ISO2022.
2777
2778    In addition to the above, Emacs handles two more kinds of escape
2779    sequences: ISO6429's direction specification and Emacs' private
2780    sequence for specifying character composition.
2781
2782    ISO6429's direction specification takes the following form:
2783         o CSI ']'      -- end of the current direction
2784         o CSI '0' ']'  -- end of the current direction
2785         o CSI '1' ']'  -- start of left-to-right text
2786         o CSI '2' ']'  -- start of right-to-left text
2787    The control character CSI (0x9B: control sequence introducer) is
2788    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2789
2790    Character composition specification takes the following form:
2791         o ESC '0' -- start relative composition
2792         o ESC '1' -- end composition
2793         o ESC '2' -- start rule-base composition (*)
2794         o ESC '3' -- start relative composition with alternate chars  (**)
2795         o ESC '4' -- start rule-base composition with alternate chars  (**)
2796   Since these are not standard escape sequences of any ISO standard,
2797   the use of them with these meanings is restricted to Emacs only.
2798
2799   (*) This form is used only in Emacs 20.7 and older versions,
2800   but newer versions can safely decode it.
2801   (**) This form is used only in Emacs 21.1 and newer versions,
2802   and older versions can't decode it.
2803
2804   Here's a list of example usages of these composition escape
2805   sequences (categorized by `enum composition_method').
2806
2807   COMPOSITION_RELATIVE:
2808         ESC 0 CHAR [ CHAR ] ESC 1
2809   COMPOSITION_WITH_RULE:
2810         ESC 2 CHAR [ RULE CHAR ] ESC 1
2811   COMPOSITION_WITH_ALTCHARS:
2812         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2813   COMPOSITION_WITH_RULE_ALTCHARS:
2814         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2815
2816 static enum iso_code_class_type iso_code_class[256];
2817
2818 #define SAFE_CHARSET_P(coding, id)      \
2819   ((id) <= (coding)->max_charset_id     \
2820    && (coding)->safe_charsets[id] != 255)
2821
2822 static void
2823 setup_iso_safe_charsets (Lisp_Object attrs)
2824 {
2825   Lisp_Object charset_list, safe_charsets;
2826   Lisp_Object request;
2827   Lisp_Object reg_usage;
2828   Lisp_Object tail;
2829   EMACS_INT reg94, reg96;
2830   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2831   int max_charset_id;
2832
2833   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2834   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2835       && ! EQ (charset_list, Viso_2022_charset_list))
2836     {
2837       charset_list = Viso_2022_charset_list;
2838       ASET (attrs, coding_attr_charset_list, charset_list);
2839       ASET (attrs, coding_attr_safe_charsets, Qnil);
2840     }
2841
2842   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2843     return;
2844
2845   max_charset_id = 0;
2846   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2847     {
2848       int id = XINT (XCAR (tail));
2849       if (max_charset_id < id)
2850         max_charset_id = id;
2851     }
2852
2853   safe_charsets = make_uninit_string (max_charset_id + 1);
2854   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2855   request = AREF (attrs, coding_attr_iso_request);
2856   reg_usage = AREF (attrs, coding_attr_iso_usage);
2857   reg94 = XINT (XCAR (reg_usage));
2858   reg96 = XINT (XCDR (reg_usage));
2859
2860   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2861     {
2862       Lisp_Object id;
2863       Lisp_Object reg;
2864       struct charset *charset;
2865
2866       id = XCAR (tail);
2867       charset = CHARSET_FROM_ID (XINT (id));
2868       reg = Fcdr (Fassq (id, request));
2869       if (! NILP (reg))
2870         SSET (safe_charsets, XINT (id), XINT (reg));
2871       else if (charset->iso_chars_96)
2872         {
2873           if (reg96 < 4)
2874             SSET (safe_charsets, XINT (id), reg96);
2875         }
2876       else
2877         {
2878           if (reg94 < 4)
2879             SSET (safe_charsets, XINT (id), reg94);
2880         }
2881     }
2882   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2883 }
2884
2885
2886 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2887    Return true if a text is encoded in one of ISO-2022 based coding
2888    systems.  */
2889
2890 static bool
2891 detect_coding_iso_2022 (struct coding_system *coding,
2892                         struct coding_detection_info *detect_info)
2893 {
2894   const unsigned char *src = coding->source, *src_base = src;
2895   const unsigned char *src_end = coding->source + coding->src_bytes;
2896   bool multibytep = coding->src_multibyte;
2897   bool single_shifting = 0;
2898   int id;
2899   int c, c1;
2900   ptrdiff_t consumed_chars = 0;
2901   int i;
2902   int rejected = 0;
2903   int found = 0;
2904   int composition_count = -1;
2905
2906   detect_info->checked |= CATEGORY_MASK_ISO;
2907
2908   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2909     {
2910       struct coding_system *this = &(coding_categories[i]);
2911       Lisp_Object attrs, val;
2912
2913       if (this->id < 0)
2914         continue;
2915       attrs = CODING_ID_ATTRS (this->id);
2916       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2917           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2918         setup_iso_safe_charsets (attrs);
2919       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2920       this->max_charset_id = SCHARS (val) - 1;
2921       this->safe_charsets = SDATA (val);
2922     }
2923
2924   /* A coding system of this category is always ASCII compatible.  */
2925   src += coding->head_ascii;
2926
2927   while (rejected != CATEGORY_MASK_ISO)
2928     {
2929       src_base = src;
2930       ONE_MORE_BYTE (c);
2931       switch (c)
2932         {
2933         case ISO_CODE_ESC:
2934           if (inhibit_iso_escape_detection)
2935             break;
2936           single_shifting = 0;
2937           ONE_MORE_BYTE (c);
2938           if (c == 'N' || c == 'O')
2939             {
2940               /* ESC <Fe> for SS2 or SS3.  */
2941               single_shifting = 1;
2942               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2943             }
2944           else if (c == '1')
2945             {
2946               /* End of composition.  */
2947               if (composition_count < 0
2948                   || composition_count > MAX_COMPOSITION_COMPONENTS)
2949                 /* Invalid */
2950                 break;
2951               composition_count = -1;
2952               found |= CATEGORY_MASK_ISO;
2953             }
2954           else if (c >= '0' && c <= '4')
2955             {
2956               /* ESC <Fp> for start/end composition.  */
2957               composition_count = 0;
2958             }
2959           else
2960             {
2961               if (c >= '(' && c <= '/')
2962                 {
2963                   /* Designation sequence for a charset of dimension 1.  */
2964                   ONE_MORE_BYTE (c1);
2965                   if (c1 < ' ' || c1 >= 0x80
2966                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2967                     /* Invalid designation sequence.  Just ignore.  */
2968                     break;
2969                 }
2970               else if (c == '$')
2971                 {
2972                   /* Designation sequence for a charset of dimension 2.  */
2973                   ONE_MORE_BYTE (c);
2974                   if (c >= '@' && c <= 'B')
2975                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2976                     id = iso_charset_table[1][0][c];
2977                   else if (c >= '(' && c <= '/')
2978                     {
2979                       ONE_MORE_BYTE (c1);
2980                       if (c1 < ' ' || c1 >= 0x80
2981                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2982                         /* Invalid designation sequence.  Just ignore.  */
2983                         break;
2984                     }
2985                   else
2986                     /* Invalid designation sequence.  Just ignore it.  */
2987                     break;
2988                 }
2989               else
2990                 {
2991                   /* Invalid escape sequence.  Just ignore it.  */
2992                   break;
2993                 }
2994
2995               /* We found a valid designation sequence for CHARSET.  */
2996               rejected |= CATEGORY_MASK_ISO_8BIT;
2997               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2998                                   id))
2999                 found |= CATEGORY_MASK_ISO_7;
3000               else
3001                 rejected |= CATEGORY_MASK_ISO_7;
3002               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3003                                   id))
3004                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3005               else
3006                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3007               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3008                                   id))
3009                 found |= CATEGORY_MASK_ISO_7_ELSE;
3010               else
3011                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3012               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3013                                   id))
3014                 found |= CATEGORY_MASK_ISO_8_ELSE;
3015               else
3016                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3017             }
3018           break;
3019
3020         case ISO_CODE_SO:
3021         case ISO_CODE_SI:
3022           /* Locking shift out/in.  */
3023           if (inhibit_iso_escape_detection)
3024             break;
3025           single_shifting = 0;
3026           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3027           break;
3028
3029         case ISO_CODE_CSI:
3030           /* Control sequence introducer.  */
3031           single_shifting = 0;
3032           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3033           found |= CATEGORY_MASK_ISO_8_ELSE;
3034           goto check_extra_latin;
3035
3036         case ISO_CODE_SS2:
3037         case ISO_CODE_SS3:
3038           /* Single shift.   */
3039           if (inhibit_iso_escape_detection)
3040             break;
3041           single_shifting = 0;
3042           rejected |= CATEGORY_MASK_ISO_7BIT;
3043           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3044               & CODING_ISO_FLAG_SINGLE_SHIFT)
3045             {
3046               found |= CATEGORY_MASK_ISO_8_1;
3047               single_shifting = 1;
3048             }
3049           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3050               & CODING_ISO_FLAG_SINGLE_SHIFT)
3051             {
3052               found |= CATEGORY_MASK_ISO_8_2;
3053               single_shifting = 1;
3054             }
3055           if (single_shifting)
3056             break;
3057           goto check_extra_latin;
3058
3059         default:
3060           if (c < 0)
3061             continue;
3062           if (c < 0x80)
3063             {
3064               if (composition_count >= 0)
3065                 composition_count++;
3066               single_shifting = 0;
3067               break;
3068             }
3069           if (c >= 0xA0)
3070             {
3071               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3072               found |= CATEGORY_MASK_ISO_8_1;
3073               /* Check the length of succeeding codes of the range
3074                  0xA0..0FF.  If the byte length is even, we include
3075                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3076                  only when we are not single shifting.  */
3077               if (! single_shifting
3078                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3079                 {
3080                   int len = 1;
3081                   while (src < src_end)
3082                     {
3083                       src_base = src;
3084                       ONE_MORE_BYTE (c);
3085                       if (c < 0xA0)
3086                         {
3087                           src = src_base;
3088                           break;
3089                         }
3090                       len++;
3091                     }
3092
3093                   if (len & 1 && src < src_end)
3094                     {
3095                       rejected |= CATEGORY_MASK_ISO_8_2;
3096                       if (composition_count >= 0)
3097                         composition_count += len;
3098                     }
3099                   else
3100                     {
3101                       found |= CATEGORY_MASK_ISO_8_2;
3102                       if (composition_count >= 0)
3103                         composition_count += len / 2;
3104                     }
3105                 }
3106               break;
3107             }
3108         check_extra_latin:
3109           if (! VECTORP (Vlatin_extra_code_table)
3110               || NILP (AREF (Vlatin_extra_code_table, c)))
3111             {
3112               rejected = CATEGORY_MASK_ISO;
3113               break;
3114             }
3115           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3116               & CODING_ISO_FLAG_LATIN_EXTRA)
3117             found |= CATEGORY_MASK_ISO_8_1;
3118           else
3119             rejected |= CATEGORY_MASK_ISO_8_1;
3120           rejected |= CATEGORY_MASK_ISO_8_2;
3121           break;
3122         }
3123     }
3124   detect_info->rejected |= CATEGORY_MASK_ISO;
3125   return 0;
3126
3127  no_more_source:
3128   detect_info->rejected |= rejected;
3129   detect_info->found |= (found & ~rejected);
3130   return 1;
3131 }
3132
3133
3134 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3135    escape sequence should be kept.  */
3136 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3137   do {                                                                  \
3138     int id, prev;                                                       \
3139                                                                         \
3140     if (final < '0' || final >= 128                                     \
3141         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3142         || !SAFE_CHARSET_P (coding, id))                                \
3143       {                                                                 \
3144         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3145         chars_96 = -1;                                                  \
3146         break;                                                          \
3147       }                                                                 \
3148     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3149     if (id == charset_jisx0201_roman)                                   \
3150       {                                                                 \
3151         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3152           id = charset_ascii;                                           \
3153       }                                                                 \
3154     else if (id == charset_jisx0208_1978)                               \
3155       {                                                                 \
3156         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3157           id = charset_jisx0208;                                        \
3158       }                                                                 \
3159     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3160     /* If there was an invalid designation to REG previously, and this  \
3161        designation is ASCII to REG, we should keep this designation     \
3162        sequence.  */                                                    \
3163     if (prev == -2 && id == charset_ascii)                              \
3164       chars_96 = -1;                                                    \
3165   } while (0)
3166
3167
3168 /* Handle these composition sequence (ALT: alternate char):
3169
3170    (1) relative composition: ESC 0 CHAR ... ESC 1
3171    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3172    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3173    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3174
3175    When the start sequence (ESC 0/2/3/4) is found, this annotation
3176    header is produced.
3177
3178         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3179
3180    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3181    produced until the end sequence (ESC 1) is found:
3182
3183    (1) CHAR ... CHAR
3184    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3185    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3186    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3187
3188    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3189    annotation header is updated as below:
3190
3191    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3192    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3193    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3194    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3195
3196    If an error is found while composing, the annotation header is
3197    changed to:
3198
3199         [ ESC '0'/'2'/'3'/'4' -2 0 ]
3200
3201    and the sequence [ -2 DECODED-RULE ] is changed to the original
3202    byte sequence as below:
3203         o the original byte sequence is B: [ B -1 ]
3204         o the original byte sequence is B1 B2: [ B1 B2 ]
3205    and the sequence [ -1 -1 ] is changed to the original byte
3206    sequence:
3207         [ ESC '0' ]
3208 */
3209
3210 /* Decode a composition rule C1 and maybe one more byte from the
3211    source, and set RULE to the encoded composition rule.  If the rule
3212    is invalid, goto invalid_code.  */
3213
3214 #define DECODE_COMPOSITION_RULE(rule)                                   \
3215   do {                                                                  \
3216     rule = c1 - 32;                                                     \
3217     if (rule < 0)                                                       \
3218       goto invalid_code;                                                \
3219     if (rule < 81)              /* old format (before ver.21) */        \
3220       {                                                                 \
3221         int gref = (rule) / 9;                                          \
3222         int nref = (rule) % 9;                                          \
3223         if (gref == 4) gref = 10;                                       \
3224         if (nref == 4) nref = 10;                                       \
3225         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3226       }                                                                 \
3227     else                        /* new format (after ver.21) */         \
3228       {                                                                 \
3229         int b;                                                          \
3230                                                                         \
3231         ONE_MORE_BYTE (b);                                              \
3232         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3233           goto invalid_code;                                            \
3234         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3235         rule += 0x100;   /* Distinguish it from the old format.  */     \
3236       }                                                                 \
3237   } while (0)
3238
3239 #define ENCODE_COMPOSITION_RULE(rule)                           \
3240   do {                                                          \
3241     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3242                                                                 \
3243     if (rule < 0x100)           /* old format */                \
3244       {                                                         \
3245         if (gref == 10) gref = 4;                               \
3246         if (nref == 10) nref = 4;                               \
3247         charbuf[idx] = 32 + gref * 9 + nref;                    \
3248         charbuf[idx + 1] = -1;                                  \
3249         new_chars++;                                            \
3250       }                                                         \
3251     else                                /* new format */        \
3252       {                                                         \
3253         charbuf[idx] = 32 + 81 + gref;                          \
3254         charbuf[idx + 1] = 32 + nref;                           \
3255         new_chars += 2;                                         \
3256       }                                                         \
3257   } while (0)
3258
3259 /* Finish the current composition as invalid.  */
3260
3261 static int
3262 finish_composition (int *charbuf, struct composition_status *cmp_status)
3263 {
3264   int idx = - cmp_status->length;
3265   int new_chars;
3266
3267   /* Recover the original ESC sequence */
3268   charbuf[idx++] = ISO_CODE_ESC;
3269   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3270                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3271                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3272                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3273                     : '4');
3274   charbuf[idx++] = -2;
3275   charbuf[idx++] = 0;
3276   charbuf[idx++] = -1;
3277   new_chars = cmp_status->nchars;
3278   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3279     for (; idx < 0; idx++)
3280       {
3281         int elt = charbuf[idx];
3282
3283         if (elt == -2)
3284           {
3285             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3286             idx++;
3287           }
3288         else if (elt == -1)
3289           {
3290             charbuf[idx++] = ISO_CODE_ESC;
3291             charbuf[idx] = '0';
3292             new_chars += 2;
3293           }
3294       }
3295   cmp_status->state = COMPOSING_NO;
3296   return new_chars;
3297 }
3298
3299 /* If characters are under composition, finish the composition.  */
3300 #define MAYBE_FINISH_COMPOSITION()                              \
3301   do {                                                          \
3302     if (cmp_status->state != COMPOSING_NO)                      \
3303       char_offset += finish_composition (charbuf, cmp_status);  \
3304   } while (0)
3305
3306 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3307
3308    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3309    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3310    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3311    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3312
3313    Produce this annotation sequence now:
3314
3315    [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3316 */
3317
3318 #define DECODE_COMPOSITION_START(c1)                                       \
3319   do {                                                                     \
3320     if (c1 == '0'                                                          \
3321         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3322              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3323             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3324                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3325       {                                                                    \
3326         *charbuf++ = -1;                                                   \
3327         *charbuf++= -1;                                                    \
3328         cmp_status->state = COMPOSING_CHAR;                                \
3329         cmp_status->length += 2;                                           \
3330       }                                                                    \
3331     else                                                                   \
3332       {                                                                    \
3333         MAYBE_FINISH_COMPOSITION ();                                       \
3334         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3335                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3336                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3337                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3338         cmp_status->state                                                  \
3339           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3340         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3341         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3342         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3343         coding->annotated = 1;                                             \
3344       }                                                                    \
3345   } while (0)
3346
3347
3348 /* Handle composition end sequence ESC 1.  */
3349
3350 #define DECODE_COMPOSITION_END()                                        \
3351   do {                                                                  \
3352     if (cmp_status->nchars == 0                                         \
3353         || ((cmp_status->state == COMPOSING_CHAR)                       \
3354             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3355       {                                                                 \
3356         MAYBE_FINISH_COMPOSITION ();                                    \
3357         goto invalid_code;                                              \
3358       }                                                                 \
3359     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3360       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3361     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3362       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3363     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3364     char_offset += cmp_status->nchars;                                  \
3365     cmp_status->state = COMPOSING_NO;                                   \
3366   } while (0)
3367
3368 /* Store a composition rule RULE in charbuf, and update cmp_status.  */
3369
3370 #define STORE_COMPOSITION_RULE(rule)    \
3371   do {                                  \
3372     *charbuf++ = -2;                    \
3373     *charbuf++ = rule;                  \
3374     cmp_status->length += 2;            \
3375     cmp_status->state--;                \
3376   } while (0)
3377
3378 /* Store a composed char or a component char C in charbuf, and update
3379    cmp_status.  */
3380
3381 #define STORE_COMPOSITION_CHAR(c)                                       \
3382   do {                                                                  \
3383     *charbuf++ = (c);                                                   \
3384     cmp_status->length++;                                               \
3385     if (cmp_status->state == COMPOSING_CHAR)                            \
3386       cmp_status->nchars++;                                             \
3387     else                                                                \
3388       cmp_status->ncomps++;                                             \
3389     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3390         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3391             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3392       cmp_status->state++;                                              \
3393   } while (0)
3394
3395
3396 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3397
3398 static void
3399 decode_coding_iso_2022 (struct coding_system *coding)
3400 {
3401   const unsigned char *src = coding->source + coding->consumed;
3402   const unsigned char *src_end = coding->source + coding->src_bytes;
3403   const unsigned char *src_base;
3404   int *charbuf = coding->charbuf + coding->charbuf_used;
3405   /* We may produce two annotations (charset and composition) in one
3406      loop and one more charset annotation at the end.  */
3407   int *charbuf_end
3408     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3409   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3410   bool multibytep = coding->src_multibyte;
3411   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3412   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3413   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3414   int charset_id_2, charset_id_3;
3415   struct charset *charset;
3416   int c;
3417   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3418   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3419   ptrdiff_t char_offset = coding->produced_char;
3420   ptrdiff_t last_offset = char_offset;
3421   int last_id = charset_ascii;
3422   bool eol_dos
3423     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3424   int byte_after_cr = -1;
3425   int i;
3426
3427   setup_iso_safe_charsets (attrs);
3428   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3429
3430   if (cmp_status->state != COMPOSING_NO)
3431     {
3432       if (charbuf_end - charbuf < cmp_status->length)
3433         emacs_abort ();
3434       for (i = 0; i < cmp_status->length; i++)
3435         *charbuf++ = cmp_status->carryover[i];
3436       coding->annotated = 1;
3437     }
3438
3439   while (1)
3440     {
3441       int c1, c2, c3;
3442
3443       src_base = src;
3444       consumed_chars_base = consumed_chars;
3445
3446       if (charbuf >= charbuf_end)
3447         {
3448           if (byte_after_cr >= 0)
3449             src_base--;
3450           break;
3451         }
3452
3453       if (byte_after_cr >= 0)
3454         c1 = byte_after_cr, byte_after_cr = -1;
3455       else
3456         ONE_MORE_BYTE (c1);
3457       if (c1 < 0)
3458         goto invalid_code;
3459
3460       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3461         {
3462           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3463           char_offset++;
3464           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3465           continue;
3466         }
3467
3468       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3469         {
3470           if (c1 == ISO_CODE_ESC)
3471             {
3472               if (src + 1 >= src_end)
3473                 goto no_more_source;
3474               *charbuf++ = ISO_CODE_ESC;
3475               char_offset++;
3476               if (src[0] == '%' && src[1] == '@')
3477                 {
3478                   src += 2;
3479                   consumed_chars += 2;
3480                   char_offset += 2;
3481                   /* We are sure charbuf can contain two more chars. */
3482                   *charbuf++ = '%';
3483                   *charbuf++ = '@';
3484                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3485                 }
3486             }
3487           else
3488             {
3489               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3490               char_offset++;
3491             }
3492           continue;
3493         }
3494
3495       if ((cmp_status->state == COMPOSING_RULE
3496            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3497           && c1 != ISO_CODE_ESC)
3498         {
3499           int rule;
3500
3501           DECODE_COMPOSITION_RULE (rule);
3502           STORE_COMPOSITION_RULE (rule);
3503           continue;
3504         }
3505
3506       /* We produce at most one character.  */
3507       switch (iso_code_class [c1])
3508         {
3509         case ISO_0x20_or_0x7F:
3510           if (charset_id_0 < 0
3511               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3512             /* This is SPACE or DEL.  */
3513             charset = CHARSET_FROM_ID (charset_ascii);
3514           else
3515             charset = CHARSET_FROM_ID (charset_id_0);
3516           break;
3517
3518         case ISO_graphic_plane_0:
3519           if (charset_id_0 < 0)
3520             charset = CHARSET_FROM_ID (charset_ascii);
3521           else
3522             charset = CHARSET_FROM_ID (charset_id_0);
3523           break;
3524
3525         case ISO_0xA0_or_0xFF:
3526           if (charset_id_1 < 0
3527               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3528               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3529             goto invalid_code;
3530           /* This is a graphic character, we fall down ... */
3531
3532         case ISO_graphic_plane_1:
3533           if (charset_id_1 < 0)
3534             goto invalid_code;
3535           charset = CHARSET_FROM_ID (charset_id_1);
3536           break;
3537
3538         case ISO_control_0:
3539           if (eol_dos && c1 == '\r')
3540             ONE_MORE_BYTE (byte_after_cr);
3541           MAYBE_FINISH_COMPOSITION ();
3542           charset = CHARSET_FROM_ID (charset_ascii);
3543           break;
3544
3545         case ISO_control_1:
3546           goto invalid_code;
3547
3548         case ISO_shift_out:
3549           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3550               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3551             goto invalid_code;
3552           CODING_ISO_INVOCATION (coding, 0) = 1;
3553           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3554           continue;
3555
3556         case ISO_shift_in:
3557           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3558             goto invalid_code;
3559           CODING_ISO_INVOCATION (coding, 0) = 0;
3560           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3561           continue;
3562
3563         case ISO_single_shift_2_7:
3564           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3565             goto invalid_code;
3566         case ISO_single_shift_2:
3567           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3568             goto invalid_code;
3569           /* SS2 is handled as an escape sequence of ESC 'N' */
3570           c1 = 'N';
3571           goto label_escape_sequence;
3572
3573         case ISO_single_shift_3:
3574           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3575             goto invalid_code;
3576           /* SS2 is handled as an escape sequence of ESC 'O' */
3577           c1 = 'O';
3578           goto label_escape_sequence;
3579
3580         case ISO_control_sequence_introducer:
3581           /* CSI is handled as an escape sequence of ESC '[' ...  */
3582           c1 = '[';
3583           goto label_escape_sequence;
3584
3585         case ISO_escape:
3586           ONE_MORE_BYTE (c1);
3587         label_escape_sequence:
3588           /* Escape sequences handled here are invocation,
3589              designation, direction specification, and character
3590              composition specification.  */
3591           switch (c1)
3592             {
3593             case '&':           /* revision of following character set */
3594               ONE_MORE_BYTE (c1);
3595               if (!(c1 >= '@' && c1 <= '~'))
3596                 goto invalid_code;
3597               ONE_MORE_BYTE (c1);
3598               if (c1 != ISO_CODE_ESC)
3599                 goto invalid_code;
3600               ONE_MORE_BYTE (c1);
3601               goto label_escape_sequence;
3602
3603             case '$':           /* designation of 2-byte character set */
3604               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3605                 goto invalid_code;
3606               {
3607                 int reg, chars96;
3608
3609                 ONE_MORE_BYTE (c1);
3610                 if (c1 >= '@' && c1 <= 'B')
3611                   {     /* designation of JISX0208.1978, GB2312.1980,
3612                            or JISX0208.1980 */
3613                     reg = 0, chars96 = 0;
3614                   }
3615                 else if (c1 >= 0x28 && c1 <= 0x2B)
3616                   { /* designation of DIMENSION2_CHARS94 character set */
3617                     reg = c1 - 0x28, chars96 = 0;
3618                     ONE_MORE_BYTE (c1);
3619                   }
3620                 else if (c1 >= 0x2C && c1 <= 0x2F)
3621                   { /* designation of DIMENSION2_CHARS96 character set */
3622                     reg = c1 - 0x2C, chars96 = 1;
3623                     ONE_MORE_BYTE (c1);
3624                   }
3625                 else
3626                   goto invalid_code;
3627                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3628                 /* We must update these variables now.  */
3629                 if (reg == 0)
3630                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3631                 else if (reg == 1)
3632                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3633                 if (chars96 < 0)
3634                   goto invalid_code;
3635               }
3636               continue;
3637
3638             case 'n':           /* invocation of locking-shift-2 */
3639               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3640                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3641                 goto invalid_code;
3642               CODING_ISO_INVOCATION (coding, 0) = 2;
3643               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3644               continue;
3645
3646             case 'o':           /* invocation of locking-shift-3 */
3647               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3648                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3649                 goto invalid_code;
3650               CODING_ISO_INVOCATION (coding, 0) = 3;
3651               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3652               continue;
3653
3654             case 'N':           /* invocation of single-shift-2 */
3655               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3656                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3657                 goto invalid_code;
3658               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3659               if (charset_id_2 < 0)
3660                 charset = CHARSET_FROM_ID (charset_ascii);
3661               else
3662                 charset = CHARSET_FROM_ID (charset_id_2);
3663               ONE_MORE_BYTE (c1);
3664               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3665                 goto invalid_code;
3666               break;
3667
3668             case 'O':           /* invocation of single-shift-3 */
3669               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3670                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3671                 goto invalid_code;
3672               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3673               if (charset_id_3 < 0)
3674                 charset = CHARSET_FROM_ID (charset_ascii);
3675               else
3676                 charset = CHARSET_FROM_ID (charset_id_3);
3677               ONE_MORE_BYTE (c1);
3678               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3679                 goto invalid_code;
3680               break;
3681
3682             case '0': case '2': case '3': case '4': /* start composition */
3683               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3684                 goto invalid_code;
3685               if (last_id != charset_ascii)
3686                 {
3687                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3688                   last_id = charset_ascii;
3689                   last_offset = char_offset;
3690                 }
3691               DECODE_COMPOSITION_START (c1);
3692               continue;
3693
3694             case '1':           /* end composition */
3695               if (cmp_status->state == COMPOSING_NO)
3696                 goto invalid_code;
3697               DECODE_COMPOSITION_END ();
3698               continue;
3699
3700             case '[':           /* specification of direction */
3701               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3702                 goto invalid_code;
3703               /* For the moment, nested direction is not supported.
3704                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3705                  left-to-right, and nonzero means right-to-left.  */
3706               ONE_MORE_BYTE (c1);
3707               switch (c1)
3708                 {
3709                 case ']':       /* end of the current direction */
3710                   coding->mode &= ~CODING_MODE_DIRECTION;
3711
3712                 case '0':       /* end of the current direction */
3713                 case '1':       /* start of left-to-right direction */
3714                   ONE_MORE_BYTE (c1);
3715                   if (c1 == ']')
3716                     coding->mode &= ~CODING_MODE_DIRECTION;
3717                   else
3718                     goto invalid_code;
3719                   break;
3720
3721                 case '2':       /* start of right-to-left direction */
3722                   ONE_MORE_BYTE (c1);
3723                   if (c1 == ']')
3724                     coding->mode |= CODING_MODE_DIRECTION;
3725                   else
3726                     goto invalid_code;
3727                   break;
3728
3729                 default:
3730                   goto invalid_code;
3731                 }
3732               continue;
3733
3734             case '%':
3735               ONE_MORE_BYTE (c1);
3736               if (c1 == '/')
3737                 {
3738                   /* CTEXT extended segment:
3739                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3740                      We keep these bytes as is for the moment.
3741                      They may be decoded by post-read-conversion.  */
3742                   int dim, M, L;
3743                   int size;
3744
3745                   ONE_MORE_BYTE (dim);
3746                   if (dim < '0' || dim > '4')
3747                     goto invalid_code;
3748                   ONE_MORE_BYTE (M);
3749                   if (M < 128)
3750                     goto invalid_code;
3751                   ONE_MORE_BYTE (L);
3752                   if (L < 128)
3753                     goto invalid_code;
3754                   size = ((M - 128) * 128) + (L - 128);
3755                   if (charbuf + 6 > charbuf_end)
3756                     goto break_loop;
3757                   *charbuf++ = ISO_CODE_ESC;
3758                   *charbuf++ = '%';
3759                   *charbuf++ = '/';
3760                   *charbuf++ = dim;
3761                   *charbuf++ = BYTE8_TO_CHAR (M);
3762                   *charbuf++ = BYTE8_TO_CHAR (L);
3763                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3764                 }
3765               else if (c1 == 'G')
3766                 {
3767                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3768                      ESC % G --UTF-8-BYTES-- ESC % @
3769                      We keep these bytes as is for the moment.
3770                      They may be decoded by post-read-conversion.  */
3771                   if (charbuf + 3 > charbuf_end)
3772                     goto break_loop;
3773                   *charbuf++ = ISO_CODE_ESC;
3774                   *charbuf++ = '%';
3775                   *charbuf++ = 'G';
3776                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3777                 }
3778               else
3779                 goto invalid_code;
3780               continue;
3781               break;
3782
3783             default:
3784               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3785                 goto invalid_code;
3786               {
3787                 int reg, chars96;
3788
3789                 if (c1 >= 0x28 && c1 <= 0x2B)
3790                   { /* designation of DIMENSION1_CHARS94 character set */
3791                     reg = c1 - 0x28, chars96 = 0;
3792                     ONE_MORE_BYTE (c1);
3793                   }
3794                 else if (c1 >= 0x2C && c1 <= 0x2F)
3795                   { /* designation of DIMENSION1_CHARS96 character set */
3796                     reg = c1 - 0x2C, chars96 = 1;
3797                     ONE_MORE_BYTE (c1);
3798                   }
3799                 else
3800                   goto invalid_code;
3801                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3802                 /* We must update these variables now.  */
3803                 if (reg == 0)
3804                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3805                 else if (reg == 1)
3806                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3807                 if (chars96 < 0)
3808                   goto invalid_code;
3809               }
3810               continue;
3811             }
3812           break;
3813
3814         default:
3815           emacs_abort ();
3816         }
3817
3818       if (cmp_status->state == COMPOSING_NO
3819           && charset->id != charset_ascii
3820           && last_id != charset->id)
3821         {
3822           if (last_id != charset_ascii)
3823             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3824           last_id = charset->id;
3825           last_offset = char_offset;
3826         }
3827
3828       /* Now we know CHARSET and 1st position code C1 of a character.
3829          Produce a decoded character while getting 2nd and 3rd
3830          position codes C2, C3 if necessary.  */
3831       if (CHARSET_DIMENSION (charset) > 1)
3832         {
3833           ONE_MORE_BYTE (c2);
3834           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3835               || ((c1 & 0x80) != (c2 & 0x80)))
3836             /* C2 is not in a valid range.  */
3837             goto invalid_code;
3838           if (CHARSET_DIMENSION (charset) == 2)
3839             c1 = (c1 << 8) | c2;
3840           else
3841             {
3842               ONE_MORE_BYTE (c3);
3843               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3844                   || ((c1 & 0x80) != (c3 & 0x80)))
3845                 /* C3 is not in a valid range.  */
3846                 goto invalid_code;
3847               c1 = (c1 << 16) | (c2 << 8) | c2;
3848             }
3849         }
3850       c1 &= 0x7F7F7F;
3851       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3852       if (c < 0)
3853         {
3854           MAYBE_FINISH_COMPOSITION ();
3855           for (; src_base < src; src_base++, char_offset++)
3856             {
3857               if (ASCII_BYTE_P (*src_base))
3858                 *charbuf++ = *src_base;
3859               else
3860                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3861             }
3862         }
3863       else if (cmp_status->state == COMPOSING_NO)
3864         {
3865           *charbuf++ = c;
3866           char_offset++;
3867         }
3868       else if ((cmp_status->state == COMPOSING_CHAR
3869                 ? cmp_status->nchars
3870                 : cmp_status->ncomps)
3871                >= MAX_COMPOSITION_COMPONENTS)
3872         {
3873           /* Too long composition.  */
3874           MAYBE_FINISH_COMPOSITION ();
3875           *charbuf++ = c;
3876           char_offset++;
3877         }
3878       else
3879         STORE_COMPOSITION_CHAR (c);
3880       continue;
3881
3882     invalid_code:
3883       MAYBE_FINISH_COMPOSITION ();
3884       src = src_base;
3885       consumed_chars = consumed_chars_base;
3886       ONE_MORE_BYTE (c);
3887       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3888       char_offset++;
3889       coding->errors++;
3890       continue;
3891
3892     break_loop:
3893       break;
3894     }
3895
3896  no_more_source:
3897   if (cmp_status->state != COMPOSING_NO)
3898     {
3899       if (coding->mode & CODING_MODE_LAST_BLOCK)
3900         MAYBE_FINISH_COMPOSITION ();
3901       else
3902         {
3903           charbuf -= cmp_status->length;
3904           for (i = 0; i < cmp_status->length; i++)
3905             cmp_status->carryover[i] = charbuf[i];
3906         }
3907     }
3908   else if (last_id != charset_ascii)
3909     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3910   coding->consumed_char += consumed_chars_base;
3911   coding->consumed = src_base - coding->source;
3912   coding->charbuf_used = charbuf - coding->charbuf;
3913 }
3914
3915
3916 /* ISO2022 encoding stuff.  */
3917
3918 /*
3919    It is not enough to say just "ISO2022" on encoding, we have to
3920    specify more details.  In Emacs, each coding system of ISO2022
3921    variant has the following specifications:
3922         1. Initial designation to G0 thru G3.
3923         2. Allows short-form designation?
3924         3. ASCII should be designated to G0 before control characters?
3925         4. ASCII should be designated to G0 at end of line?
3926         5. 7-bit environment or 8-bit environment?
3927         6. Use locking-shift?
3928         7. Use Single-shift?
3929    And the following two are only for Japanese:
3930         8. Use ASCII in place of JIS0201-1976-Roman?
3931         9. Use JISX0208-1983 in place of JISX0208-1978?
3932    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3933    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3934    details.
3935 */
3936
3937 /* Produce codes (escape sequence) for designating CHARSET to graphic
3938    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3939    '@', 'A', or 'B' and the coding system CODING allows, produce
3940    designation sequence of short-form.  */
3941
3942 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3943   do {                                                                  \
3944     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3945     const char *intermediate_char_94 = "()*+";                          \
3946     const char *intermediate_char_96 = ",-./";                          \
3947     int revision = -1;                                                  \
3948                                                                         \
3949     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3950       revision = CHARSET_ISO_REVISION (charset);                        \
3951                                                                         \
3952     if (revision >= 0)                                                  \
3953       {                                                                 \
3954         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3955         EMIT_ONE_BYTE ('@' + revision);                                 \
3956       }                                                                 \
3957     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3958     if (CHARSET_DIMENSION (charset) == 1)                               \
3959       {                                                                 \
3960         int b;                                                          \
3961         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3962           b = intermediate_char_94[reg];                                \
3963         else                                                            \
3964           b = intermediate_char_96[reg];                                \
3965         EMIT_ONE_ASCII_BYTE (b);                                        \
3966       }                                                                 \
3967     else                                                                \
3968       {                                                                 \
3969         EMIT_ONE_ASCII_BYTE ('$');                                      \
3970         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3971           {                                                             \
3972             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3973                 || reg != 0                                             \
3974                 || final_char < '@' || final_char > 'B')                \
3975               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3976           }                                                             \
3977         else                                                            \
3978           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3979       }                                                                 \
3980     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3981                                                                         \
3982     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3983   } while (0)
3984
3985
3986 /* The following two macros produce codes (control character or escape
3987    sequence) for ISO2022 single-shift functions (single-shift-2 and
3988    single-shift-3).  */
3989
3990 #define ENCODE_SINGLE_SHIFT_2                                           \
3991   do {                                                                  \
3992     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3993       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3994     else                                                                \
3995       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3996     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3997   } while (0)
3998
3999
4000 #define ENCODE_SINGLE_SHIFT_3                                           \
4001   do {                                                                  \
4002     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4003       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4004     else                                                                \
4005       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4006     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4007   } while (0)
4008
4009
4010 /* The following four macros produce codes (control character or
4011    escape sequence) for ISO2022 locking-shift functions (shift-in,
4012    shift-out, locking-shift-2, and locking-shift-3).  */
4013
4014 #define ENCODE_SHIFT_IN                                 \
4015   do {                                                  \
4016     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4017     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4018   } while (0)
4019
4020
4021 #define ENCODE_SHIFT_OUT                                \
4022   do {                                                  \
4023     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4024     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4025   } while (0)
4026
4027
4028 #define ENCODE_LOCKING_SHIFT_2                          \
4029   do {                                                  \
4030     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4031     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4032   } while (0)
4033
4034
4035 #define ENCODE_LOCKING_SHIFT_3                          \
4036   do {                                                  \
4037     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4038     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4039   } while (0)
4040
4041
4042 /* Produce codes for a DIMENSION1 character whose character set is
4043    CHARSET and whose position-code is C1.  Designation and invocation
4044    sequences are also produced in advance if necessary.  */
4045
4046 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4047   do {                                                                  \
4048     int id = CHARSET_ID (charset);                                      \
4049                                                                         \
4050     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4051         && id == charset_ascii)                                         \
4052       {                                                                 \
4053         id = charset_jisx0201_roman;                                    \
4054         charset = CHARSET_FROM_ID (id);                                 \
4055       }                                                                 \
4056                                                                         \
4057     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4058       {                                                                 \
4059         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4060           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4061         else                                                            \
4062           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4063         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4064         break;                                                          \
4065       }                                                                 \
4066     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4067       {                                                                 \
4068         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4069         break;                                                          \
4070       }                                                                 \
4071     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4072       {                                                                 \
4073         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4074         break;                                                          \
4075       }                                                                 \
4076     else                                                                \
4077       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4078          must invoke it, or, at first, designate it to some graphic     \
4079          register.  Then repeat the loop to actually produce the        \
4080          character.  */                                                 \
4081       dst = encode_invocation_designation (charset, coding, dst,        \
4082                                            &produced_chars);            \
4083   } while (1)
4084
4085
4086 /* Produce codes for a DIMENSION2 character whose character set is
4087    CHARSET and whose position-codes are C1 and C2.  Designation and
4088    invocation codes are also produced in advance if necessary.  */
4089
4090 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4091   do {                                                                  \
4092     int id = CHARSET_ID (charset);                                      \
4093                                                                         \
4094     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4095         && id == charset_jisx0208)                                      \
4096       {                                                                 \
4097         id = charset_jisx0208_1978;                                     \
4098         charset = CHARSET_FROM_ID (id);                                 \
4099       }                                                                 \
4100                                                                         \
4101     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4102       {                                                                 \
4103         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4104           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4105         else                                                            \
4106           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4107         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4108         break;                                                          \
4109       }                                                                 \
4110     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4111       {                                                                 \
4112         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4113         break;                                                          \
4114       }                                                                 \
4115     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4116       {                                                                 \
4117         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4118         break;                                                          \
4119       }                                                                 \
4120     else                                                                \
4121       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4122          must invoke it, or, at first, designate it to some graphic     \
4123          register.  Then repeat the loop to actually produce the        \
4124          character.  */                                                 \
4125       dst = encode_invocation_designation (charset, coding, dst,        \
4126                                            &produced_chars);            \
4127   } while (1)
4128
4129
4130 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4131   do {                                                                     \
4132     unsigned code;                                                         \
4133     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4134                                                                            \
4135     if (CHARSET_DIMENSION (charset) == 1)                                  \
4136       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4137     else                                                                   \
4138       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4139   } while (0)
4140
4141
4142 /* Produce designation and invocation codes at a place pointed by DST
4143    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4144    Return new DST.  */
4145
4146 static unsigned char *
4147 encode_invocation_designation (struct charset *charset,
4148                                struct coding_system *coding,
4149                                unsigned char *dst, ptrdiff_t *p_nchars)
4150 {
4151   bool multibytep = coding->dst_multibyte;
4152   ptrdiff_t produced_chars = *p_nchars;
4153   int reg;                      /* graphic register number */
4154   int id = CHARSET_ID (charset);
4155
4156   /* At first, check designations.  */
4157   for (reg = 0; reg < 4; reg++)
4158     if (id == CODING_ISO_DESIGNATION (coding, reg))
4159       break;
4160
4161   if (reg >= 4)
4162     {
4163       /* CHARSET is not yet designated to any graphic registers.  */
4164       /* At first check the requested designation.  */
4165       reg = CODING_ISO_REQUEST (coding, id);
4166       if (reg < 0)
4167         /* Since CHARSET requests no special designation, designate it
4168            to graphic register 0.  */
4169         reg = 0;
4170
4171       ENCODE_DESIGNATION (charset, reg, coding);
4172     }
4173
4174   if (CODING_ISO_INVOCATION (coding, 0) != reg
4175       && CODING_ISO_INVOCATION (coding, 1) != reg)
4176     {
4177       /* Since the graphic register REG is not invoked to any graphic
4178          planes, invoke it to graphic plane 0.  */
4179       switch (reg)
4180         {
4181         case 0:                 /* graphic register 0 */
4182           ENCODE_SHIFT_IN;
4183           break;
4184
4185         case 1:                 /* graphic register 1 */
4186           ENCODE_SHIFT_OUT;
4187           break;
4188
4189         case 2:                 /* graphic register 2 */
4190           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4191             ENCODE_SINGLE_SHIFT_2;
4192           else
4193             ENCODE_LOCKING_SHIFT_2;
4194           break;
4195
4196         case 3:                 /* graphic register 3 */
4197           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4198             ENCODE_SINGLE_SHIFT_3;
4199           else
4200             ENCODE_LOCKING_SHIFT_3;
4201           break;
4202         }
4203     }
4204
4205   *p_nchars = produced_chars;
4206   return dst;
4207 }
4208
4209
4210 /* Produce codes for designation and invocation to reset the graphic
4211    planes and registers to initial state.  */
4212 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4213   do {                                                                  \
4214     int reg;                                                            \
4215     struct charset *charset;                                            \
4216                                                                         \
4217     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4218       ENCODE_SHIFT_IN;                                                  \
4219     for (reg = 0; reg < 4; reg++)                                       \
4220       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4221           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4222               != CODING_ISO_INITIAL (coding, reg)))                     \
4223         {                                                               \
4224           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4225           ENCODE_DESIGNATION (charset, reg, coding);                    \
4226         }                                                               \
4227   } while (0)
4228
4229
4230 /* Produce designation sequences of charsets in the line started from
4231    CHARBUF to a place pointed by DST, and return the number of
4232    produced bytes.  DST should not directly point a buffer text area
4233    which may be relocated by char_charset call.
4234
4235    If the current block ends before any end-of-line, we may fail to
4236    find all the necessary designations.  */
4237
4238 static ptrdiff_t
4239 encode_designation_at_bol (struct coding_system *coding,
4240                            int *charbuf, int *charbuf_end,
4241                            unsigned char *dst)
4242 {
4243   unsigned char *orig = dst;
4244   struct charset *charset;
4245   /* Table of charsets to be designated to each graphic register.  */
4246   int r[4];
4247   int c, found = 0, reg;
4248   ptrdiff_t produced_chars = 0;
4249   bool multibytep = coding->dst_multibyte;
4250   Lisp_Object attrs;
4251   Lisp_Object charset_list;
4252
4253   attrs = CODING_ID_ATTRS (coding->id);
4254   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4255   if (EQ (charset_list, Qiso_2022))
4256     charset_list = Viso_2022_charset_list;
4257
4258   for (reg = 0; reg < 4; reg++)
4259     r[reg] = -1;
4260
4261   while (charbuf < charbuf_end && found < 4)
4262     {
4263       int id;
4264
4265       c = *charbuf++;
4266       if (c == '\n')
4267         break;
4268       charset = char_charset (c, charset_list, NULL);
4269       id = CHARSET_ID (charset);
4270       reg = CODING_ISO_REQUEST (coding, id);
4271       if (reg >= 0 && r[reg] < 0)
4272         {
4273           found++;
4274           r[reg] = id;
4275         }
4276     }
4277
4278   if (found)
4279     {
4280       for (reg = 0; reg < 4; reg++)
4281         if (r[reg] >= 0
4282             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4283           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4284     }
4285
4286   return dst - orig;
4287 }
4288
4289 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4290
4291 static bool
4292 encode_coding_iso_2022 (struct coding_system *coding)
4293 {
4294   bool multibytep = coding->dst_multibyte;
4295   int *charbuf = coding->charbuf;
4296   int *charbuf_end = charbuf + coding->charbuf_used;
4297   unsigned char *dst = coding->destination + coding->produced;
4298   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4299   int safe_room = 16;
4300   bool bol_designation
4301     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4302        && CODING_ISO_BOL (coding));
4303   ptrdiff_t produced_chars = 0;
4304   Lisp_Object attrs, eol_type, charset_list;
4305   bool ascii_compatible;
4306   int c;
4307   int preferred_charset_id = -1;
4308
4309   CODING_GET_INFO (coding, attrs, charset_list);
4310   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4311   if (VECTORP (eol_type))
4312     eol_type = Qunix;
4313
4314   setup_iso_safe_charsets (attrs);
4315   /* Charset list may have been changed.  */
4316   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4317   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4318
4319   ascii_compatible
4320     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4321        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4322                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4323
4324   while (charbuf < charbuf_end)
4325     {
4326       ASSURE_DESTINATION (safe_room);
4327
4328       if (bol_designation)
4329         {
4330           /* We have to produce designation sequences if any now.  */
4331           unsigned char desig_buf[16];
4332           int nbytes;
4333           ptrdiff_t offset;
4334
4335           charset_map_loaded = 0;
4336           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4337                                               desig_buf);
4338           if (charset_map_loaded
4339               && (offset = coding_change_destination (coding)))
4340             {
4341               dst += offset;
4342               dst_end += offset;
4343             }
4344           memcpy (dst, desig_buf, nbytes);
4345           dst += nbytes;
4346           /* We are sure that designation sequences are all ASCII bytes.  */
4347           produced_chars += nbytes;
4348           bol_designation = 0;
4349           ASSURE_DESTINATION (safe_room);
4350         }
4351
4352       c = *charbuf++;
4353
4354       if (c < 0)
4355         {
4356           /* Handle an annotation.  */
4357           switch (*charbuf)
4358             {
4359             case CODING_ANNOTATE_COMPOSITION_MASK:
4360               /* Not yet implemented.  */
4361               break;
4362             case CODING_ANNOTATE_CHARSET_MASK:
4363               preferred_charset_id = charbuf[2];
4364               if (preferred_charset_id >= 0
4365                   && NILP (Fmemq (make_number (preferred_charset_id),
4366                                   charset_list)))
4367                 preferred_charset_id = -1;
4368               break;
4369             default:
4370               emacs_abort ();
4371             }
4372           charbuf += -c - 1;
4373           continue;
4374         }
4375
4376       /* Now encode the character C.  */
4377       if (c < 0x20 || c == 0x7F)
4378         {
4379           if (c == '\n'
4380               || (c == '\r' && EQ (eol_type, Qmac)))
4381             {
4382               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4383                 ENCODE_RESET_PLANE_AND_REGISTER ();
4384               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4385                 {
4386                   int i;
4387
4388                   for (i = 0; i < 4; i++)
4389                     CODING_ISO_DESIGNATION (coding, i)
4390                       = CODING_ISO_INITIAL (coding, i);
4391                 }
4392               bol_designation = ((CODING_ISO_FLAGS (coding)
4393                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4394                                  != 0);
4395             }
4396           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4397             ENCODE_RESET_PLANE_AND_REGISTER ();
4398           EMIT_ONE_ASCII_BYTE (c);
4399         }
4400       else if (ASCII_CHAR_P (c))
4401         {
4402           if (ascii_compatible)
4403             EMIT_ONE_ASCII_BYTE (c);
4404           else
4405             {
4406               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4407               ENCODE_ISO_CHARACTER (charset, c);
4408             }
4409         }
4410       else if (CHAR_BYTE8_P (c))
4411         {
4412           c = CHAR_TO_BYTE8 (c);
4413           EMIT_ONE_BYTE (c);
4414         }
4415       else
4416         {
4417           struct charset *charset;
4418
4419           if (preferred_charset_id >= 0)
4420             {
4421               bool result;
4422
4423               charset = CHARSET_FROM_ID (preferred_charset_id);
4424               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4425               if (! result)
4426                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4427                                      NULL, charset);
4428             }
4429           else
4430             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4431                                  NULL, charset);
4432           if (!charset)
4433             {
4434               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4435                 {
4436                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4437                   charset = CHARSET_FROM_ID (charset_ascii);
4438                 }
4439               else
4440                 {
4441                   c = coding->default_char;
4442                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4443                                        charset_list, NULL, charset);
4444                 }
4445             }
4446           ENCODE_ISO_CHARACTER (charset, c);
4447         }
4448     }
4449
4450   if (coding->mode & CODING_MODE_LAST_BLOCK
4451       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4452     {
4453       ASSURE_DESTINATION (safe_room);
4454       ENCODE_RESET_PLANE_AND_REGISTER ();
4455     }
4456   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4457   CODING_ISO_BOL (coding) = bol_designation;
4458   coding->produced_char += produced_chars;
4459   coding->produced = dst - coding->destination;
4460   return 0;
4461 }
4462
4463 \f
4464 /*** 8,9. SJIS and BIG5 handlers ***/
4465
4466 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4467    quite widely.  So, for the moment, Emacs supports them in the bare
4468    C code.  But, in the future, they may be supported only by CCL.  */
4469
4470 /* SJIS is a coding system encoding three character sets: ASCII, right
4471    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4472    as is.  A character of charset katakana-jisx0201 is encoded by
4473    "position-code + 0x80".  A character of charset japanese-jisx0208
4474    is encoded in 2-byte but two position-codes are divided and shifted
4475    so that it fit in the range below.
4476
4477    --- CODE RANGE of SJIS ---
4478    (character set)      (range)
4479    ASCII                0x00 .. 0x7F
4480    KATAKANA-JISX0201    0xA0 .. 0xDF
4481    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4482             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4483    -------------------------------
4484
4485 */
4486
4487 /* BIG5 is a coding system encoding two character sets: ASCII and
4488    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4489    character set and is encoded in two-byte.
4490
4491    --- CODE RANGE of BIG5 ---
4492    (character set)      (range)
4493    ASCII                0x00 .. 0x7F
4494    Big5 (1st byte)      0xA1 .. 0xFE
4495         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4496    --------------------------
4497
4498   */
4499
4500 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4501    Return true if a text is encoded in SJIS.  */
4502
4503 static bool
4504 detect_coding_sjis (struct coding_system *coding,
4505                     struct coding_detection_info *detect_info)
4506 {
4507   const unsigned char *src = coding->source, *src_base;
4508   const unsigned char *src_end = coding->source + coding->src_bytes;
4509   bool multibytep = coding->src_multibyte;
4510   ptrdiff_t consumed_chars = 0;
4511   int found = 0;
4512   int c;
4513   Lisp_Object attrs, charset_list;
4514   int max_first_byte_of_2_byte_code;
4515
4516   CODING_GET_INFO (coding, attrs, charset_list);
4517   max_first_byte_of_2_byte_code
4518     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4519
4520   detect_info->checked |= CATEGORY_MASK_SJIS;
4521   /* A coding system of this category is always ASCII compatible.  */
4522   src += coding->head_ascii;
4523
4524   while (1)
4525     {
4526       src_base = src;
4527       ONE_MORE_BYTE (c);
4528       if (c < 0x80)
4529         continue;
4530       if ((c >= 0x81 && c <= 0x9F)
4531           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4532         {
4533           ONE_MORE_BYTE (c);
4534           if (c < 0x40 || c == 0x7F || c > 0xFC)
4535             break;
4536           found = CATEGORY_MASK_SJIS;
4537         }
4538       else if (c >= 0xA0 && c < 0xE0)
4539         found = CATEGORY_MASK_SJIS;
4540       else
4541         break;
4542     }
4543   detect_info->rejected |= CATEGORY_MASK_SJIS;
4544   return 0;
4545
4546  no_more_source:
4547   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4548     {
4549       detect_info->rejected |= CATEGORY_MASK_SJIS;
4550       return 0;
4551     }
4552   detect_info->found |= found;
4553   return 1;
4554 }
4555
4556 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4557    Return true if a text is encoded in BIG5.  */
4558
4559 static bool
4560 detect_coding_big5 (struct coding_system *coding,
4561                     struct coding_detection_info *detect_info)
4562 {
4563   const unsigned char *src = coding->source, *src_base;
4564   const unsigned char *src_end = coding->source + coding->src_bytes;
4565   bool multibytep = coding->src_multibyte;
4566   ptrdiff_t consumed_chars = 0;
4567   int found = 0;
4568   int c;
4569
4570   detect_info->checked |= CATEGORY_MASK_BIG5;
4571   /* A coding system of this category is always ASCII compatible.  */
4572   src += coding->head_ascii;
4573
4574   while (1)
4575     {
4576       src_base = src;
4577       ONE_MORE_BYTE (c);
4578       if (c < 0x80)
4579         continue;
4580       if (c >= 0xA1)
4581         {
4582           ONE_MORE_BYTE (c);
4583           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4584             return 0;
4585           found = CATEGORY_MASK_BIG5;
4586         }
4587       else
4588         break;
4589     }
4590   detect_info->rejected |= CATEGORY_MASK_BIG5;
4591   return 0;
4592
4593  no_more_source:
4594   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4595     {
4596       detect_info->rejected |= CATEGORY_MASK_BIG5;
4597       return 0;
4598     }
4599   detect_info->found |= found;
4600   return 1;
4601 }
4602
4603 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4604
4605 static void
4606 decode_coding_sjis (struct coding_system *coding)
4607 {
4608   const unsigned char *src = coding->source + coding->consumed;
4609   const unsigned char *src_end = coding->source + coding->src_bytes;
4610   const unsigned char *src_base;
4611   int *charbuf = coding->charbuf + coding->charbuf_used;
4612   /* We may produce one charset annotation in one loop and one more at
4613      the end.  */
4614   int *charbuf_end
4615     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4616   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4617   bool multibytep = coding->src_multibyte;
4618   struct charset *charset_roman, *charset_kanji, *charset_kana;
4619   struct charset *charset_kanji2;
4620   Lisp_Object attrs, charset_list, val;
4621   ptrdiff_t char_offset = coding->produced_char;
4622   ptrdiff_t last_offset = char_offset;
4623   int last_id = charset_ascii;
4624   bool eol_dos
4625     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4626   int byte_after_cr = -1;
4627
4628   CODING_GET_INFO (coding, attrs, charset_list);
4629
4630   val = charset_list;
4631   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4632   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4633   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4634   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4635
4636   while (1)
4637     {
4638       int c, c1;
4639       struct charset *charset;
4640
4641       src_base = src;
4642       consumed_chars_base = consumed_chars;
4643
4644       if (charbuf >= charbuf_end)
4645         {
4646           if (byte_after_cr >= 0)
4647             src_base--;
4648           break;
4649         }
4650
4651       if (byte_after_cr >= 0)
4652         c = byte_after_cr, byte_after_cr = -1;
4653       else
4654         ONE_MORE_BYTE (c);
4655       if (c < 0)
4656         goto invalid_code;
4657       if (c < 0x80)
4658         {
4659           if (eol_dos && c == '\r')
4660             ONE_MORE_BYTE (byte_after_cr);
4661           charset = charset_roman;
4662         }
4663       else if (c == 0x80 || c == 0xA0)
4664         goto invalid_code;
4665       else if (c >= 0xA1 && c <= 0xDF)
4666         {
4667           /* SJIS -> JISX0201-Kana */
4668           c &= 0x7F;
4669           charset = charset_kana;
4670         }
4671       else if (c <= 0xEF)
4672         {
4673           /* SJIS -> JISX0208 */
4674           ONE_MORE_BYTE (c1);
4675           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4676             goto invalid_code;
4677           c = (c << 8) | c1;
4678           SJIS_TO_JIS (c);
4679           charset = charset_kanji;
4680         }
4681       else if (c <= 0xFC && charset_kanji2)
4682         {
4683           /* SJIS -> JISX0213-2 */
4684           ONE_MORE_BYTE (c1);
4685           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4686             goto invalid_code;
4687           c = (c << 8) | c1;
4688           SJIS_TO_JIS2 (c);
4689           charset = charset_kanji2;
4690         }
4691       else
4692         goto invalid_code;
4693       if (charset->id != charset_ascii
4694           && last_id != charset->id)
4695         {
4696           if (last_id != charset_ascii)
4697             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4698           last_id = charset->id;
4699           last_offset = char_offset;
4700         }
4701       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4702       *charbuf++ = c;
4703       char_offset++;
4704       continue;
4705
4706     invalid_code:
4707       src = src_base;
4708       consumed_chars = consumed_chars_base;
4709       ONE_MORE_BYTE (c);
4710       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4711       char_offset++;
4712       coding->errors++;
4713     }
4714
4715  no_more_source:
4716   if (last_id != charset_ascii)
4717     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4718   coding->consumed_char += consumed_chars_base;
4719   coding->consumed = src_base - coding->source;
4720   coding->charbuf_used = charbuf - coding->charbuf;
4721 }
4722
4723 static void
4724 decode_coding_big5 (struct coding_system *coding)
4725 {
4726   const unsigned char *src = coding->source + coding->consumed;
4727   const unsigned char *src_end = coding->source + coding->src_bytes;
4728   const unsigned char *src_base;
4729   int *charbuf = coding->charbuf + coding->charbuf_used;
4730   /* We may produce one charset annotation in one loop and one more at
4731      the end.  */
4732   int *charbuf_end
4733     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4734   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4735   bool multibytep = coding->src_multibyte;
4736   struct charset *charset_roman, *charset_big5;
4737   Lisp_Object attrs, charset_list, val;
4738   ptrdiff_t char_offset = coding->produced_char;
4739   ptrdiff_t last_offset = char_offset;
4740   int last_id = charset_ascii;
4741   bool eol_dos
4742     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4743   int byte_after_cr = -1;
4744
4745   CODING_GET_INFO (coding, attrs, charset_list);
4746   val = charset_list;
4747   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4748   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4749
4750   while (1)
4751     {
4752       int c, c1;
4753       struct charset *charset;
4754
4755       src_base = src;
4756       consumed_chars_base = consumed_chars;
4757
4758       if (charbuf >= charbuf_end)
4759         {
4760           if (byte_after_cr >= 0)
4761             src_base--;
4762           break;
4763         }
4764
4765       if (byte_after_cr >= 0)
4766         c = byte_after_cr, byte_after_cr = -1;
4767       else
4768         ONE_MORE_BYTE (c);
4769
4770       if (c < 0)
4771         goto invalid_code;
4772       if (c < 0x80)
4773         {
4774           if (eol_dos && c == '\r')
4775             ONE_MORE_BYTE (byte_after_cr);
4776           charset = charset_roman;
4777         }
4778       else
4779         {
4780           /* BIG5 -> Big5 */
4781           if (c < 0xA1 || c > 0xFE)
4782             goto invalid_code;
4783           ONE_MORE_BYTE (c1);
4784           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4785             goto invalid_code;
4786           c = c << 8 | c1;
4787           charset = charset_big5;
4788         }
4789       if (charset->id != charset_ascii
4790           && last_id != charset->id)
4791         {
4792           if (last_id != charset_ascii)
4793             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4794           last_id = charset->id;
4795           last_offset = char_offset;
4796         }
4797       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4798       *charbuf++ = c;
4799       char_offset++;
4800       continue;
4801
4802     invalid_code:
4803       src = src_base;
4804       consumed_chars = consumed_chars_base;
4805       ONE_MORE_BYTE (c);
4806       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4807       char_offset++;
4808       coding->errors++;
4809     }
4810
4811  no_more_source:
4812   if (last_id != charset_ascii)
4813     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4814   coding->consumed_char += consumed_chars_base;
4815   coding->consumed = src_base - coding->source;
4816   coding->charbuf_used = charbuf - coding->charbuf;
4817 }
4818
4819 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4820    This function can encode charsets `ascii', `katakana-jisx0201',
4821    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4822    are sure that all these charsets are registered as official charset
4823    (i.e. do not have extended leading-codes).  Characters of other
4824    charsets are produced without any encoding.  */
4825
4826 static bool
4827 encode_coding_sjis (struct coding_system *coding)
4828 {
4829   bool multibytep = coding->dst_multibyte;
4830   int *charbuf = coding->charbuf;
4831   int *charbuf_end = charbuf + coding->charbuf_used;
4832   unsigned char *dst = coding->destination + coding->produced;
4833   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4834   int safe_room = 4;
4835   ptrdiff_t produced_chars = 0;
4836   Lisp_Object attrs, charset_list, val;
4837   bool ascii_compatible;
4838   struct charset *charset_kanji, *charset_kana;
4839   struct charset *charset_kanji2;
4840   int c;
4841
4842   CODING_GET_INFO (coding, attrs, charset_list);
4843   val = XCDR (charset_list);
4844   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4845   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4846   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4847
4848   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4849
4850   while (charbuf < charbuf_end)
4851     {
4852       ASSURE_DESTINATION (safe_room);
4853       c = *charbuf++;
4854       /* Now encode the character C.  */
4855       if (ASCII_CHAR_P (c) && ascii_compatible)
4856         EMIT_ONE_ASCII_BYTE (c);
4857       else if (CHAR_BYTE8_P (c))
4858         {
4859           c = CHAR_TO_BYTE8 (c);
4860           EMIT_ONE_BYTE (c);
4861         }
4862       else
4863         {
4864           unsigned code;
4865           struct charset *charset;
4866           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4867                                &code, charset);
4868
4869           if (!charset)
4870             {
4871               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4872                 {
4873                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4874                   charset = CHARSET_FROM_ID (charset_ascii);
4875                 }
4876               else
4877                 {
4878                   c = coding->default_char;
4879                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4880                                        charset_list, &code, charset);
4881                 }
4882             }
4883           if (code == CHARSET_INVALID_CODE (charset))
4884             emacs_abort ();
4885           if (charset == charset_kanji)
4886             {
4887               int c1, c2;
4888               JIS_TO_SJIS (code);
4889               c1 = code >> 8, c2 = code & 0xFF;
4890               EMIT_TWO_BYTES (c1, c2);
4891             }
4892           else if (charset == charset_kana)
4893             EMIT_ONE_BYTE (code | 0x80);
4894           else if (charset_kanji2 && charset == charset_kanji2)
4895             {
4896               int c1, c2;
4897
4898               c1 = code >> 8;
4899               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4900                   || c1 == 0x28
4901                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4902                 {
4903                   JIS_TO_SJIS2 (code);
4904                   c1 = code >> 8, c2 = code & 0xFF;
4905                   EMIT_TWO_BYTES (c1, c2);
4906                 }
4907               else
4908                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4909             }
4910           else
4911             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4912         }
4913     }
4914   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4915   coding->produced_char += produced_chars;
4916   coding->produced = dst - coding->destination;
4917   return 0;
4918 }
4919
4920 static bool
4921 encode_coding_big5 (struct coding_system *coding)
4922 {
4923   bool multibytep = coding->dst_multibyte;
4924   int *charbuf = coding->charbuf;
4925   int *charbuf_end = charbuf + coding->charbuf_used;
4926   unsigned char *dst = coding->destination + coding->produced;
4927   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4928   int safe_room = 4;
4929   ptrdiff_t produced_chars = 0;
4930   Lisp_Object attrs, charset_list, val;
4931   bool ascii_compatible;
4932   struct charset *charset_big5;
4933   int c;
4934
4935   CODING_GET_INFO (coding, attrs, charset_list);
4936   val = XCDR (charset_list);
4937   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4938   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4939
4940   while (charbuf < charbuf_end)
4941     {
4942       ASSURE_DESTINATION (safe_room);
4943       c = *charbuf++;
4944       /* Now encode the character C.  */
4945       if (ASCII_CHAR_P (c) && ascii_compatible)
4946         EMIT_ONE_ASCII_BYTE (c);
4947       else if (CHAR_BYTE8_P (c))
4948         {
4949           c = CHAR_TO_BYTE8 (c);
4950           EMIT_ONE_BYTE (c);
4951         }
4952       else
4953         {
4954           unsigned code;
4955           struct charset *charset;
4956           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4957                                &code, charset);
4958
4959           if (! charset)
4960             {
4961               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4962                 {
4963                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4964                   charset = CHARSET_FROM_ID (charset_ascii);
4965                 }
4966               else
4967                 {
4968                   c = coding->default_char;
4969                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4970                                        charset_list, &code, charset);
4971                 }
4972             }
4973           if (code == CHARSET_INVALID_CODE (charset))
4974             emacs_abort ();
4975           if (charset == charset_big5)
4976             {
4977               int c1, c2;
4978
4979               c1 = code >> 8, c2 = code & 0xFF;
4980               EMIT_TWO_BYTES (c1, c2);
4981             }
4982           else
4983             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4984         }
4985     }
4986   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4987   coding->produced_char += produced_chars;
4988   coding->produced = dst - coding->destination;
4989   return 0;
4990 }
4991
4992 \f
4993 /*** 10. CCL handlers ***/
4994
4995 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4996    Return true if a text is encoded in a coding system of which
4997    encoder/decoder are written in CCL program.  */
4998
4999 static bool
5000 detect_coding_ccl (struct coding_system *coding,
5001                    struct coding_detection_info *detect_info)
5002 {
5003   const unsigned char *src = coding->source, *src_base;
5004   const unsigned char *src_end = coding->source + coding->src_bytes;
5005   bool multibytep = coding->src_multibyte;
5006   ptrdiff_t consumed_chars = 0;
5007   int found = 0;
5008   unsigned char *valids;
5009   ptrdiff_t head_ascii = coding->head_ascii;
5010   Lisp_Object attrs;
5011
5012   detect_info->checked |= CATEGORY_MASK_CCL;
5013
5014   coding = &coding_categories[coding_category_ccl];
5015   valids = CODING_CCL_VALIDS (coding);
5016   attrs = CODING_ID_ATTRS (coding->id);
5017   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5018     src += head_ascii;
5019
5020   while (1)
5021     {
5022       int c;
5023
5024       src_base = src;
5025       ONE_MORE_BYTE (c);
5026       if (c < 0 || ! valids[c])
5027         break;
5028       if ((valids[c] > 1))
5029         found = CATEGORY_MASK_CCL;
5030     }
5031   detect_info->rejected |= CATEGORY_MASK_CCL;
5032   return 0;
5033
5034  no_more_source:
5035   detect_info->found |= found;
5036   return 1;
5037 }
5038
5039 static void
5040 decode_coding_ccl (struct coding_system *coding)
5041 {
5042   const unsigned char *src = coding->source + coding->consumed;
5043   const unsigned char *src_end = coding->source + coding->src_bytes;
5044   int *charbuf = coding->charbuf + coding->charbuf_used;
5045   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5046   ptrdiff_t consumed_chars = 0;
5047   bool multibytep = coding->src_multibyte;
5048   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5049   int source_charbuf[1024];
5050   int source_byteidx[1025];
5051   Lisp_Object attrs, charset_list;
5052
5053   CODING_GET_INFO (coding, attrs, charset_list);
5054
5055   while (1)
5056     {
5057       const unsigned char *p = src;
5058       ptrdiff_t offset;
5059       int i = 0;
5060
5061       if (multibytep)
5062         {
5063           while (i < 1024 && p < src_end)
5064             {
5065               source_byteidx[i] = p - src;
5066               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5067             }
5068           source_byteidx[i] = p - src;
5069         }
5070       else
5071         while (i < 1024 && p < src_end)
5072           source_charbuf[i++] = *p++;
5073
5074       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5075         ccl->last_block = 1;
5076       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5077       charset_map_loaded = 0;
5078       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5079                   charset_list);
5080       if (charset_map_loaded
5081           && (offset = coding_change_source (coding)))
5082         {
5083           p += offset;
5084           src += offset;
5085           src_end += offset;
5086         }
5087       charbuf += ccl->produced;
5088       if (multibytep)
5089         src += source_byteidx[ccl->consumed];
5090       else
5091         src += ccl->consumed;
5092       consumed_chars += ccl->consumed;
5093       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5094         break;
5095     }
5096
5097   switch (ccl->status)
5098     {
5099     case CCL_STAT_SUSPEND_BY_SRC:
5100       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5101       break;
5102     case CCL_STAT_SUSPEND_BY_DST:
5103       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5104       break;
5105     case CCL_STAT_QUIT:
5106     case CCL_STAT_INVALID_CMD:
5107       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5108       break;
5109     default:
5110       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5111       break;
5112     }
5113   coding->consumed_char += consumed_chars;
5114   coding->consumed = src - coding->source;
5115   coding->charbuf_used = charbuf - coding->charbuf;
5116 }
5117
5118 static bool
5119 encode_coding_ccl (struct coding_system *coding)
5120 {
5121   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5122   bool multibytep = coding->dst_multibyte;
5123   int *charbuf = coding->charbuf;
5124   int *charbuf_end = charbuf + coding->charbuf_used;
5125   unsigned char *dst = coding->destination + coding->produced;
5126   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5127   int destination_charbuf[1024];
5128   ptrdiff_t produced_chars = 0;
5129   int i;
5130   Lisp_Object attrs, charset_list;
5131
5132   CODING_GET_INFO (coding, attrs, charset_list);
5133   if (coding->consumed_char == coding->src_chars
5134       && coding->mode & CODING_MODE_LAST_BLOCK)
5135     ccl->last_block = 1;
5136
5137   do
5138     {
5139       ptrdiff_t offset;
5140
5141       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5142       charset_map_loaded = 0;
5143       ccl_driver (ccl, charbuf, destination_charbuf,
5144                   charbuf_end - charbuf, 1024, charset_list);
5145       if (charset_map_loaded
5146           && (offset = coding_change_destination (coding)))
5147         dst += offset;
5148       if (multibytep)
5149         {
5150           ASSURE_DESTINATION (ccl->produced * 2);
5151           for (i = 0; i < ccl->produced; i++)
5152             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5153         }
5154       else
5155         {
5156           ASSURE_DESTINATION (ccl->produced);
5157           for (i = 0; i < ccl->produced; i++)
5158             *dst++ = destination_charbuf[i] & 0xFF;
5159           produced_chars += ccl->produced;
5160         }
5161       charbuf += ccl->consumed;
5162       if (ccl->status == CCL_STAT_QUIT
5163           || ccl->status == CCL_STAT_INVALID_CMD)
5164         break;
5165     }
5166   while (charbuf < charbuf_end);
5167
5168   switch (ccl->status)
5169     {
5170     case CCL_STAT_SUSPEND_BY_SRC:
5171       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5172       break;
5173     case CCL_STAT_SUSPEND_BY_DST:
5174       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5175       break;
5176     case CCL_STAT_QUIT:
5177     case CCL_STAT_INVALID_CMD:
5178       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5179       break;
5180     default:
5181       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5182       break;
5183     }
5184
5185   coding->produced_char += produced_chars;
5186   coding->produced = dst - coding->destination;
5187   return 0;
5188 }
5189
5190 \f
5191 /*** 10, 11. no-conversion handlers ***/
5192
5193 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5194
5195 static void
5196 decode_coding_raw_text (struct coding_system *coding)
5197 {
5198   bool eol_dos
5199     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5200
5201   coding->chars_at_source = 1;
5202   coding->consumed_char = coding->src_chars;
5203   coding->consumed = coding->src_bytes;
5204   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5205     {
5206       coding->consumed_char--;
5207       coding->consumed--;
5208       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5209     }
5210   else
5211     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5212 }
5213
5214 static bool
5215 encode_coding_raw_text (struct coding_system *coding)
5216 {
5217   bool multibytep = coding->dst_multibyte;
5218   int *charbuf = coding->charbuf;
5219   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5220   unsigned char *dst = coding->destination + coding->produced;
5221   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5222   ptrdiff_t produced_chars = 0;
5223   int c;
5224
5225   if (multibytep)
5226     {
5227       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5228
5229       if (coding->src_multibyte)
5230         while (charbuf < charbuf_end)
5231           {
5232             ASSURE_DESTINATION (safe_room);
5233             c = *charbuf++;
5234             if (ASCII_CHAR_P (c))
5235               EMIT_ONE_ASCII_BYTE (c);
5236             else if (CHAR_BYTE8_P (c))
5237               {
5238                 c = CHAR_TO_BYTE8 (c);
5239                 EMIT_ONE_BYTE (c);
5240               }
5241             else
5242               {
5243                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5244
5245                 CHAR_STRING_ADVANCE (c, p1);
5246                 do
5247                   {
5248                     EMIT_ONE_BYTE (*p0);
5249                     p0++;
5250                   }
5251                 while (p0 < p1);
5252               }
5253           }
5254       else
5255         while (charbuf < charbuf_end)
5256           {
5257             ASSURE_DESTINATION (safe_room);
5258             c = *charbuf++;
5259             EMIT_ONE_BYTE (c);
5260           }
5261     }
5262   else
5263     {
5264       if (coding->src_multibyte)
5265         {
5266           int safe_room = MAX_MULTIBYTE_LENGTH;
5267
5268           while (charbuf < charbuf_end)
5269             {
5270               ASSURE_DESTINATION (safe_room);
5271               c = *charbuf++;
5272               if (ASCII_CHAR_P (c))
5273                 *dst++ = c;
5274               else if (CHAR_BYTE8_P (c))
5275                 *dst++ = CHAR_TO_BYTE8 (c);
5276               else
5277                 CHAR_STRING_ADVANCE (c, dst);
5278             }
5279         }
5280       else
5281         {
5282           ASSURE_DESTINATION (charbuf_end - charbuf);
5283           while (charbuf < charbuf_end && dst < dst_end)
5284             *dst++ = *charbuf++;
5285         }
5286       produced_chars = dst - (coding->destination + coding->produced);
5287     }
5288   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5289   coding->produced_char += produced_chars;
5290   coding->produced = dst - coding->destination;
5291   return 0;
5292 }
5293
5294 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5295    Return true if a text is encoded in a charset-based coding system.  */
5296
5297 static bool
5298 detect_coding_charset (struct coding_system *coding,
5299                        struct coding_detection_info *detect_info)
5300 {
5301   const unsigned char *src = coding->source, *src_base;
5302   const unsigned char *src_end = coding->source + coding->src_bytes;
5303   bool multibytep = coding->src_multibyte;
5304   ptrdiff_t consumed_chars = 0;
5305   Lisp_Object attrs, valids, name;
5306   int found = 0;
5307   ptrdiff_t head_ascii = coding->head_ascii;
5308   bool check_latin_extra = 0;
5309
5310   detect_info->checked |= CATEGORY_MASK_CHARSET;
5311
5312   coding = &coding_categories[coding_category_charset];
5313   attrs = CODING_ID_ATTRS (coding->id);
5314   valids = AREF (attrs, coding_attr_charset_valids);
5315   name = CODING_ID_NAME (coding->id);
5316   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5317                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5318       || strncmp (SSDATA (SYMBOL_NAME (name)),
5319                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5320     check_latin_extra = 1;
5321
5322   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5323     src += head_ascii;
5324
5325   while (1)
5326     {
5327       int c;
5328       Lisp_Object val;
5329       struct charset *charset;
5330       int dim, idx;
5331
5332       src_base = src;
5333       ONE_MORE_BYTE (c);
5334       if (c < 0)
5335         continue;
5336       val = AREF (valids, c);
5337       if (NILP (val))
5338         break;
5339       if (c >= 0x80)
5340         {
5341           if (c < 0xA0
5342               && check_latin_extra
5343               && (!VECTORP (Vlatin_extra_code_table)
5344                   || NILP (AREF (Vlatin_extra_code_table, c))))
5345             break;
5346           found = CATEGORY_MASK_CHARSET;
5347         }
5348       if (INTEGERP (val))
5349         {
5350           charset = CHARSET_FROM_ID (XFASTINT (val));
5351           dim = CHARSET_DIMENSION (charset);
5352           for (idx = 1; idx < dim; idx++)
5353             {
5354               if (src == src_end)
5355                 goto too_short;
5356               ONE_MORE_BYTE (c);
5357               if (c < charset->code_space[(dim - 1 - idx) * 4]
5358                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5359                 break;
5360             }
5361           if (idx < dim)
5362             break;
5363         }
5364       else
5365         {
5366           idx = 1;
5367           for (; CONSP (val); val = XCDR (val))
5368             {
5369               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5370               dim = CHARSET_DIMENSION (charset);
5371               while (idx < dim)
5372                 {
5373                   if (src == src_end)
5374                     goto too_short;
5375                   ONE_MORE_BYTE (c);
5376                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5377                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5378                     break;
5379                   idx++;
5380                 }
5381               if (idx == dim)
5382                 {
5383                   val = Qnil;
5384                   break;
5385                 }
5386             }
5387           if (CONSP (val))
5388             break;
5389         }
5390     }
5391  too_short:
5392   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5393   return 0;
5394
5395  no_more_source:
5396   detect_info->found |= found;
5397   return 1;
5398 }
5399
5400 static void
5401 decode_coding_charset (struct coding_system *coding)
5402 {
5403   const unsigned char *src = coding->source + coding->consumed;
5404   const unsigned char *src_end = coding->source + coding->src_bytes;
5405   const unsigned char *src_base;
5406   int *charbuf = coding->charbuf + coding->charbuf_used;
5407   /* We may produce one charset annotation in one loop and one more at
5408      the end.  */
5409   int *charbuf_end
5410     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5411   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5412   bool multibytep = coding->src_multibyte;
5413   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5414   Lisp_Object valids;
5415   ptrdiff_t char_offset = coding->produced_char;
5416   ptrdiff_t last_offset = char_offset;
5417   int last_id = charset_ascii;
5418   bool eol_dos
5419     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5420   int byte_after_cr = -1;
5421
5422   valids = AREF (attrs, coding_attr_charset_valids);
5423
5424   while (1)
5425     {
5426       int c;
5427       Lisp_Object val;
5428       struct charset *charset;
5429       int dim;
5430       int len = 1;
5431       unsigned code;
5432
5433       src_base = src;
5434       consumed_chars_base = consumed_chars;
5435
5436       if (charbuf >= charbuf_end)
5437         {
5438           if (byte_after_cr >= 0)
5439             src_base--;
5440           break;
5441         }
5442
5443       if (byte_after_cr >= 0)
5444         {
5445           c = byte_after_cr;
5446           byte_after_cr = -1;
5447         }
5448       else
5449         {
5450           ONE_MORE_BYTE (c);
5451           if (eol_dos && c == '\r')
5452             ONE_MORE_BYTE (byte_after_cr);
5453         }
5454       if (c < 0)
5455         goto invalid_code;
5456       code = c;
5457
5458       val = AREF (valids, c);
5459       if (! INTEGERP (val) && ! CONSP (val))
5460         goto invalid_code;
5461       if (INTEGERP (val))
5462         {
5463           charset = CHARSET_FROM_ID (XFASTINT (val));
5464           dim = CHARSET_DIMENSION (charset);
5465           while (len < dim)
5466             {
5467               ONE_MORE_BYTE (c);
5468               code = (code << 8) | c;
5469               len++;
5470             }
5471           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5472                               charset, code, c);
5473         }
5474       else
5475         {
5476           /* VAL is a list of charset IDs.  It is assured that the
5477              list is sorted by charset dimensions (smaller one
5478              comes first).  */
5479           while (CONSP (val))
5480             {
5481               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5482               dim = CHARSET_DIMENSION (charset);
5483               while (len < dim)
5484                 {
5485                   ONE_MORE_BYTE (c);
5486                   code = (code << 8) | c;
5487                   len++;
5488                 }
5489               CODING_DECODE_CHAR (coding, src, src_base,
5490                                   src_end, charset, code, c);
5491               if (c >= 0)
5492                 break;
5493               val = XCDR (val);
5494             }
5495         }
5496       if (c < 0)
5497         goto invalid_code;
5498       if (charset->id != charset_ascii
5499           && last_id != charset->id)
5500         {
5501           if (last_id != charset_ascii)
5502             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5503           last_id = charset->id;
5504           last_offset = char_offset;
5505         }
5506
5507       *charbuf++ = c;
5508       char_offset++;
5509       continue;
5510
5511     invalid_code:
5512       src = src_base;
5513       consumed_chars = consumed_chars_base;
5514       ONE_MORE_BYTE (c);
5515       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5516       char_offset++;
5517       coding->errors++;
5518     }
5519
5520  no_more_source:
5521   if (last_id != charset_ascii)
5522     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5523   coding->consumed_char += consumed_chars_base;
5524   coding->consumed = src_base - coding->source;
5525   coding->charbuf_used = charbuf - coding->charbuf;
5526 }
5527
5528 static bool
5529 encode_coding_charset (struct coding_system *coding)
5530 {
5531   bool multibytep = coding->dst_multibyte;
5532   int *charbuf = coding->charbuf;
5533   int *charbuf_end = charbuf + coding->charbuf_used;
5534   unsigned char *dst = coding->destination + coding->produced;
5535   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5536   int safe_room = MAX_MULTIBYTE_LENGTH;
5537   ptrdiff_t produced_chars = 0;
5538   Lisp_Object attrs, charset_list;
5539   bool ascii_compatible;
5540   int c;
5541
5542   CODING_GET_INFO (coding, attrs, charset_list);
5543   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5544
5545   while (charbuf < charbuf_end)
5546     {
5547       struct charset *charset;
5548       unsigned code;
5549
5550       ASSURE_DESTINATION (safe_room);
5551       c = *charbuf++;
5552       if (ascii_compatible && ASCII_CHAR_P (c))
5553         EMIT_ONE_ASCII_BYTE (c);
5554       else if (CHAR_BYTE8_P (c))
5555         {
5556           c = CHAR_TO_BYTE8 (c);
5557           EMIT_ONE_BYTE (c);
5558         }
5559       else
5560         {
5561           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5562                                &code, charset);
5563
5564           if (charset)
5565             {
5566               if (CHARSET_DIMENSION (charset) == 1)
5567                 EMIT_ONE_BYTE (code);
5568               else if (CHARSET_DIMENSION (charset) == 2)
5569                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5570               else if (CHARSET_DIMENSION (charset) == 3)
5571                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5572               else
5573                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5574                                  (code >> 8) & 0xFF, code & 0xFF);
5575             }
5576           else
5577             {
5578               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5579                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5580               else
5581                 c = coding->default_char;
5582               EMIT_ONE_BYTE (c);
5583             }
5584         }
5585     }
5586
5587   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5588   coding->produced_char += produced_chars;
5589   coding->produced = dst - coding->destination;
5590   return 0;
5591 }
5592
5593 \f
5594 /*** 7. C library functions ***/
5595
5596 /* Setup coding context CODING from information about CODING_SYSTEM.
5597    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5598    CODING_SYSTEM is invalid, signal an error.  */
5599
5600 void
5601 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5602 {
5603   Lisp_Object attrs;
5604   Lisp_Object eol_type;
5605   Lisp_Object coding_type;
5606   Lisp_Object val;
5607
5608   if (NILP (coding_system))
5609     coding_system = Qundecided;
5610
5611   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5612
5613   attrs = CODING_ID_ATTRS (coding->id);
5614   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5615
5616   coding->mode = 0;
5617   coding->head_ascii = -1;
5618   if (VECTORP (eol_type))
5619     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5620                             | CODING_REQUIRE_DETECTION_MASK);
5621   else if (! EQ (eol_type, Qunix))
5622     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5623                             | CODING_REQUIRE_ENCODING_MASK);
5624   else
5625     coding->common_flags = 0;
5626   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5627     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5628   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5629     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5630   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5631     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5632
5633   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5634   coding->max_charset_id = SCHARS (val) - 1;
5635   coding->safe_charsets = SDATA (val);
5636   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5637   coding->carryover_bytes = 0;
5638
5639   coding_type = CODING_ATTR_TYPE (attrs);
5640   if (EQ (coding_type, Qundecided))
5641     {
5642       coding->detector = NULL;
5643       coding->decoder = decode_coding_raw_text;
5644       coding->encoder = encode_coding_raw_text;
5645       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5646     }
5647   else if (EQ (coding_type, Qiso_2022))
5648     {
5649       int i;
5650       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5651
5652       /* Invoke graphic register 0 to plane 0.  */
5653       CODING_ISO_INVOCATION (coding, 0) = 0;
5654       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5655       CODING_ISO_INVOCATION (coding, 1)
5656         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5657       /* Setup the initial status of designation.  */
5658       for (i = 0; i < 4; i++)
5659         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5660       /* Not single shifting initially.  */
5661       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5662       /* Beginning of buffer should also be regarded as bol. */
5663       CODING_ISO_BOL (coding) = 1;
5664       coding->detector = detect_coding_iso_2022;
5665       coding->decoder = decode_coding_iso_2022;
5666       coding->encoder = encode_coding_iso_2022;
5667       if (flags & CODING_ISO_FLAG_SAFE)
5668         coding->mode |= CODING_MODE_SAFE_ENCODING;
5669       coding->common_flags
5670         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5671             | CODING_REQUIRE_FLUSHING_MASK);
5672       if (flags & CODING_ISO_FLAG_COMPOSITION)
5673         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5674       if (flags & CODING_ISO_FLAG_DESIGNATION)
5675         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5676       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5677         {
5678           setup_iso_safe_charsets (attrs);
5679           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5680           coding->max_charset_id = SCHARS (val) - 1;
5681           coding->safe_charsets = SDATA (val);
5682         }
5683       CODING_ISO_FLAGS (coding) = flags;
5684       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5685       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5686       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5687       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5688     }
5689   else if (EQ (coding_type, Qcharset))
5690     {
5691       coding->detector = detect_coding_charset;
5692       coding->decoder = decode_coding_charset;
5693       coding->encoder = encode_coding_charset;
5694       coding->common_flags
5695         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5696     }
5697   else if (EQ (coding_type, Qutf_8))
5698     {
5699       val = AREF (attrs, coding_attr_utf_bom);
5700       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5701                                    : EQ (val, Qt) ? utf_with_bom
5702                                    : utf_without_bom);
5703       coding->detector = detect_coding_utf_8;
5704       coding->decoder = decode_coding_utf_8;
5705       coding->encoder = encode_coding_utf_8;
5706       coding->common_flags
5707         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5708       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5709         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5710     }
5711   else if (EQ (coding_type, Qutf_16))
5712     {
5713       val = AREF (attrs, coding_attr_utf_bom);
5714       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5715                                     : EQ (val, Qt) ? utf_with_bom
5716                                     : utf_without_bom);
5717       val = AREF (attrs, coding_attr_utf_16_endian);
5718       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5719                                        : utf_16_little_endian);
5720       CODING_UTF_16_SURROGATE (coding) = 0;
5721       coding->detector = detect_coding_utf_16;
5722       coding->decoder = decode_coding_utf_16;
5723       coding->encoder = encode_coding_utf_16;
5724       coding->common_flags
5725         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5726       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5727         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5728     }
5729   else if (EQ (coding_type, Qccl))
5730     {
5731       coding->detector = detect_coding_ccl;
5732       coding->decoder = decode_coding_ccl;
5733       coding->encoder = encode_coding_ccl;
5734       coding->common_flags
5735         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5736             | CODING_REQUIRE_FLUSHING_MASK);
5737     }
5738   else if (EQ (coding_type, Qemacs_mule))
5739     {
5740       coding->detector = detect_coding_emacs_mule;
5741       coding->decoder = decode_coding_emacs_mule;
5742       coding->encoder = encode_coding_emacs_mule;
5743       coding->common_flags
5744         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5745       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5746           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5747         {
5748           Lisp_Object tail, safe_charsets;
5749           int max_charset_id = 0;
5750
5751           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5752                tail = XCDR (tail))
5753             if (max_charset_id < XFASTINT (XCAR (tail)))
5754               max_charset_id = XFASTINT (XCAR (tail));
5755           safe_charsets = make_uninit_string (max_charset_id + 1);
5756           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5757           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5758                tail = XCDR (tail))
5759             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5760           coding->max_charset_id = max_charset_id;
5761           coding->safe_charsets = SDATA (safe_charsets);
5762         }
5763       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5764       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5765     }
5766   else if (EQ (coding_type, Qshift_jis))
5767     {
5768       coding->detector = detect_coding_sjis;
5769       coding->decoder = decode_coding_sjis;
5770       coding->encoder = encode_coding_sjis;
5771       coding->common_flags
5772         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5773     }
5774   else if (EQ (coding_type, Qbig5))
5775     {
5776       coding->detector = detect_coding_big5;
5777       coding->decoder = decode_coding_big5;
5778       coding->encoder = encode_coding_big5;
5779       coding->common_flags
5780         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5781     }
5782   else                          /* EQ (coding_type, Qraw_text) */
5783     {
5784       coding->detector = NULL;
5785       coding->decoder = decode_coding_raw_text;
5786       coding->encoder = encode_coding_raw_text;
5787       if (! EQ (eol_type, Qunix))
5788         {
5789           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5790           if (! VECTORP (eol_type))
5791             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5792         }
5793
5794     }
5795
5796   return;
5797 }
5798
5799 /* Return a list of charsets supported by CODING.  */
5800
5801 Lisp_Object
5802 coding_charset_list (struct coding_system *coding)
5803 {
5804   Lisp_Object attrs, charset_list;
5805
5806   CODING_GET_INFO (coding, attrs, charset_list);
5807   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5808     {
5809       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5810
5811       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5812         charset_list = Viso_2022_charset_list;
5813     }
5814   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5815     {
5816       charset_list = Vemacs_mule_charset_list;
5817     }
5818   return charset_list;
5819 }
5820
5821
5822 /* Return a list of charsets supported by CODING-SYSTEM.  */
5823
5824 Lisp_Object
5825 coding_system_charset_list (Lisp_Object coding_system)
5826 {
5827   ptrdiff_t id;
5828   Lisp_Object attrs, charset_list;
5829
5830   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5831   attrs = CODING_ID_ATTRS (id);
5832
5833   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5834     {
5835       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5836
5837       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5838         charset_list = Viso_2022_charset_list;
5839       else
5840         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5841     }
5842   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5843     {
5844       charset_list = Vemacs_mule_charset_list;
5845     }
5846   else
5847     {
5848       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5849     }
5850   return charset_list;
5851 }
5852
5853
5854 /* Return raw-text or one of its subsidiaries that has the same
5855    eol_type as CODING-SYSTEM.  */
5856
5857 Lisp_Object
5858 raw_text_coding_system (Lisp_Object coding_system)
5859 {
5860   Lisp_Object spec, attrs;
5861   Lisp_Object eol_type, raw_text_eol_type;
5862
5863   if (NILP (coding_system))
5864     return Qraw_text;
5865   spec = CODING_SYSTEM_SPEC (coding_system);
5866   attrs = AREF (spec, 0);
5867
5868   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5869     return coding_system;
5870
5871   eol_type = AREF (spec, 2);
5872   if (VECTORP (eol_type))
5873     return Qraw_text;
5874   spec = CODING_SYSTEM_SPEC (Qraw_text);
5875   raw_text_eol_type = AREF (spec, 2);
5876   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5877           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5878           : AREF (raw_text_eol_type, 2));
5879 }
5880
5881
5882 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5883    the subsidiary that has the same eol-spec as PARENT (if it is not
5884    nil and specifies end-of-line format) or the system's setting
5885    (system_eol_type).  */
5886
5887 Lisp_Object
5888 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5889 {
5890   Lisp_Object spec, eol_type;
5891
5892   if (NILP (coding_system))
5893     coding_system = Qraw_text;
5894   spec = CODING_SYSTEM_SPEC (coding_system);
5895   eol_type = AREF (spec, 2);
5896   if (VECTORP (eol_type))
5897     {
5898       Lisp_Object parent_eol_type;
5899
5900       if (! NILP (parent))
5901         {
5902           Lisp_Object parent_spec;
5903
5904           parent_spec = CODING_SYSTEM_SPEC (parent);
5905           parent_eol_type = AREF (parent_spec, 2);
5906           if (VECTORP (parent_eol_type))
5907             parent_eol_type = system_eol_type;
5908         }
5909       else
5910         parent_eol_type = system_eol_type;
5911       if (EQ (parent_eol_type, Qunix))
5912         coding_system = AREF (eol_type, 0);
5913       else if (EQ (parent_eol_type, Qdos))
5914         coding_system = AREF (eol_type, 1);
5915       else if (EQ (parent_eol_type, Qmac))
5916         coding_system = AREF (eol_type, 2);
5917     }
5918   return coding_system;
5919 }
5920
5921
5922 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5923    decided for writing to a process.  If not, complement them, and
5924    return a new coding system.  */
5925
5926 Lisp_Object
5927 complement_process_encoding_system (Lisp_Object coding_system)
5928 {
5929   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5930   Lisp_Object spec, attrs;
5931   int i;
5932
5933   for (i = 0; i < 3; i++)
5934     {
5935       if (i == 1)
5936         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5937       else if (i == 2)
5938         coding_system = preferred_coding_system ();
5939       spec = CODING_SYSTEM_SPEC (coding_system);
5940       if (NILP (spec))
5941         continue;
5942       attrs = AREF (spec, 0);
5943       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5944         coding_base = CODING_ATTR_BASE_NAME (attrs);
5945       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5946         eol_base = coding_system;
5947       if (! NILP (coding_base) && ! NILP (eol_base))
5948         break;
5949     }
5950
5951   if (i > 0)
5952     /* The original CODING_SYSTEM didn't specify text-conversion or
5953        eol-conversion.  Be sure that we return a fully complemented
5954        coding system.  */
5955     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5956   return coding_system;
5957 }
5958
5959
5960 /* Emacs has a mechanism to automatically detect a coding system if it
5961    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5962    it's impossible to distinguish some coding systems accurately
5963    because they use the same range of codes.  So, at first, coding
5964    systems are categorized into 7, those are:
5965
5966    o coding-category-emacs-mule
5967
5968         The category for a coding system which has the same code range
5969         as Emacs' internal format.  Assigned the coding-system (Lisp
5970         symbol) `emacs-mule' by default.
5971
5972    o coding-category-sjis
5973
5974         The category for a coding system which has the same code range
5975         as SJIS.  Assigned the coding-system (Lisp
5976         symbol) `japanese-shift-jis' by default.
5977
5978    o coding-category-iso-7
5979
5980         The category for a coding system which has the same code range
5981         as ISO2022 of 7-bit environment.  This doesn't use any locking
5982         shift and single shift functions.  This can encode/decode all
5983         charsets.  Assigned the coding-system (Lisp symbol)
5984         `iso-2022-7bit' by default.
5985
5986    o coding-category-iso-7-tight
5987
5988         Same as coding-category-iso-7 except that this can
5989         encode/decode only the specified charsets.
5990
5991    o coding-category-iso-8-1
5992
5993         The category for a coding system which has the same code range
5994         as ISO2022 of 8-bit environment and graphic plane 1 used only
5995         for DIMENSION1 charset.  This doesn't use any locking shift
5996         and single shift functions.  Assigned the coding-system (Lisp
5997         symbol) `iso-latin-1' by default.
5998
5999    o coding-category-iso-8-2
6000
6001         The category for a coding system which has the same code range
6002         as ISO2022 of 8-bit environment and graphic plane 1 used only
6003         for DIMENSION2 charset.  This doesn't use any locking shift
6004         and single shift functions.  Assigned the coding-system (Lisp
6005         symbol) `japanese-iso-8bit' by default.
6006
6007    o coding-category-iso-7-else
6008
6009         The category for a coding system which has the same code range
6010         as ISO2022 of 7-bit environment but uses locking shift or
6011         single shift functions.  Assigned the coding-system (Lisp
6012         symbol) `iso-2022-7bit-lock' by default.
6013
6014    o coding-category-iso-8-else
6015
6016         The category for a coding system which has the same code range
6017         as ISO2022 of 8-bit environment but uses locking shift or
6018         single shift functions.  Assigned the coding-system (Lisp
6019         symbol) `iso-2022-8bit-ss2' by default.
6020
6021    o coding-category-big5
6022
6023         The category for a coding system which has the same code range
6024         as BIG5.  Assigned the coding-system (Lisp symbol)
6025         `cn-big5' by default.
6026
6027    o coding-category-utf-8
6028
6029         The category for a coding system which has the same code range
6030         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6031         symbol) `utf-8' by default.
6032
6033    o coding-category-utf-16-be
6034
6035         The category for a coding system in which a text has an
6036         Unicode signature (cf. Unicode Standard) in the order of BIG
6037         endian at the head.  Assigned the coding-system (Lisp symbol)
6038         `utf-16-be' by default.
6039
6040    o coding-category-utf-16-le
6041
6042         The category for a coding system in which a text has an
6043         Unicode signature (cf. Unicode Standard) in the order of
6044         LITTLE endian at the head.  Assigned the coding-system (Lisp
6045         symbol) `utf-16-le' by default.
6046
6047    o coding-category-ccl
6048
6049         The category for a coding system of which encoder/decoder is
6050         written in CCL programs.  The default value is nil, i.e., no
6051         coding system is assigned.
6052
6053    o coding-category-binary
6054
6055         The category for a coding system not categorized in any of the
6056         above.  Assigned the coding-system (Lisp symbol)
6057         `no-conversion' by default.
6058
6059    Each of them is a Lisp symbol and the value is an actual
6060    `coding-system's (this is also a Lisp symbol) assigned by a user.
6061    What Emacs does actually is to detect a category of coding system.
6062    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6063    decide only one possible category, it selects a category of the
6064    highest priority.  Priorities of categories are also specified by a
6065    user in a Lisp variable `coding-category-list'.
6066
6067 */
6068
6069 #define EOL_SEEN_NONE   0
6070 #define EOL_SEEN_LF     1
6071 #define EOL_SEEN_CR     2
6072 #define EOL_SEEN_CRLF   4
6073
6074
6075 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6076                                            int eol_seen);
6077
6078
6079 /* Return the number of ASCII characters at the head of the source.
6080    By side effects, set coding->head_ascii and coding->eol_seen.  The
6081    value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
6082    EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
6083    all the source bytes are ASCII.  */
6084
6085 static int
6086 check_ascii (struct coding_system *coding)
6087 {
6088   const unsigned char *src, *end;
6089   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6090   int eol_seen;
6091
6092   eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
6093               : EQ (eol_type, Qunix) ? EOL_SEEN_LF
6094               : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6095               : EOL_SEEN_CR);
6096   coding_set_source (coding);
6097   src = coding->source;
6098   end = src + coding->src_bytes;
6099
6100   if (inhibit_eol_conversion
6101       || eol_seen != EOL_SEEN_NONE)
6102     {
6103       /* We don't have to check EOL format.  */
6104       while (src < end && !( *src & 0x80)) src++;
6105       if (inhibit_eol_conversion)
6106         {
6107           eol_seen = EOL_SEEN_LF;
6108           adjust_coding_eol_type (coding, eol_seen);
6109         }
6110     }
6111   else
6112     {
6113       end--;                /* We look ahead one byte for "CR LF".  */
6114       while (src < end)
6115         {
6116           int c = *src;
6117
6118           if (c & 0x80)
6119             break;
6120           src++;
6121           if (c == '\r')
6122             {
6123               if (*src == '\n')
6124                 {
6125                   eol_seen |= EOL_SEEN_CRLF;
6126                   src++;
6127                 }
6128               else
6129                 eol_seen |= EOL_SEEN_CR;
6130             }
6131           else if (c == '\n')
6132             eol_seen |= EOL_SEEN_LF;
6133         }
6134       if (src == end)
6135         {
6136           int c = *src;
6137
6138           /* All bytes but the last one C are ASCII.  */
6139           if (! (c & 0x80))
6140             {
6141               if (c == '\r')
6142                 eol_seen |= EOL_SEEN_CR;
6143               else if (c  == '\n')
6144                 eol_seen |= EOL_SEEN_LF;
6145               src++;
6146             }
6147         }
6148     }
6149   coding->head_ascii = src - coding->source;
6150   coding->eol_seen = eol_seen;
6151   return (coding->head_ascii);
6152 }
6153
6154
6155 /* Return the number of characters at the source if all the bytes are
6156    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6157    effects, update coding->eol_seen.  The value of coding->eol_seen is
6158    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6159    the value is reliable only when all the source bytes are valid
6160    UTF-8.  */
6161
6162 static int
6163 check_utf_8 (struct coding_system *coding)
6164 {
6165   const unsigned char *src, *end;
6166   int eol_seen = coding->eol_seen;
6167   int nchars = coding->head_ascii;
6168
6169   if (coding->head_ascii < 0)
6170     check_ascii (coding);
6171   else
6172     coding_set_source (coding);
6173   src = coding->source + coding->head_ascii;
6174   /* We look ahead one byte for CR LF.  */
6175   end = coding->source + coding->src_bytes - 1;
6176
6177   while (src < end)
6178     {
6179       int c = *src;
6180
6181       if (UTF_8_1_OCTET_P (*src))
6182         {
6183           src++;
6184           if (c < 0x20)
6185             {
6186               if (c == '\r')
6187                 {
6188                   if (*src == '\n')
6189                     {
6190                       eol_seen |= EOL_SEEN_CRLF;
6191                       src++;
6192                       nchars++;
6193                     }
6194                   else
6195                     eol_seen |= EOL_SEEN_CR;
6196                 }
6197               else if (c == '\n')
6198                 eol_seen |= EOL_SEEN_LF;
6199             }
6200         }
6201       else if (UTF_8_2_OCTET_LEADING_P (c))
6202         {
6203           if (c < 0xC2          /* overlong sequence */
6204               || src + 1 >= end
6205               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6206             return -1;
6207           src += 2;
6208         }
6209       else if (UTF_8_3_OCTET_LEADING_P (c))
6210         {
6211           if (src + 2 >= end
6212               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6213                     && UTF_8_EXTRA_OCTET_P (src[2])))
6214             return -1;
6215           c = (((c & 0xF) << 12)
6216                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6217           if (c < 0x800                       /* overlong sequence */
6218               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6219             return -1;
6220           src += 3;
6221         }
6222       else if (UTF_8_4_OCTET_LEADING_P (c))
6223         {
6224           if (src + 3 >= end
6225               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6226                     && UTF_8_EXTRA_OCTET_P (src[2])
6227                     && UTF_8_EXTRA_OCTET_P (src[3])))
6228             return -1;
6229           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6230                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6231           if (c < 0x10000       /* overlong sequence */
6232               || c >= 0x110000) /* non-Unicode character  */
6233             return -1;
6234           src += 4;
6235         }
6236       else
6237         return -1;
6238       nchars++;
6239     }
6240
6241   if (src == end)
6242     {
6243       if (! UTF_8_1_OCTET_P (*src))
6244         return -1;
6245       nchars++;
6246       if (*src == '\r')
6247         eol_seen |= EOL_SEEN_CR;
6248       else if (*src  == '\n')
6249         eol_seen |= EOL_SEEN_LF;
6250     }
6251   coding->eol_seen = eol_seen;
6252   return nchars;
6253 }
6254
6255
6256 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6257    SOURCE is encoded.  If CATEGORY is one of
6258    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6259    two-byte, else they are encoded by one-byte.
6260
6261    Return one of EOL_SEEN_XXX.  */
6262
6263 #define MAX_EOL_CHECK_COUNT 3
6264
6265 static int
6266 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6267             enum coding_category category)
6268 {
6269   const unsigned char *src = source, *src_end = src + src_bytes;
6270   unsigned char c;
6271   int total  = 0;
6272   int eol_seen = EOL_SEEN_NONE;
6273
6274   if ((1 << category) & CATEGORY_MASK_UTF_16)
6275     {
6276       bool msb = category == (coding_category_utf_16_le
6277                               | coding_category_utf_16_le_nosig);
6278       bool lsb = !msb;
6279
6280       while (src + 1 < src_end)
6281         {
6282           c = src[lsb];
6283           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6284             {
6285               int this_eol;
6286
6287               if (c == '\n')
6288                 this_eol = EOL_SEEN_LF;
6289               else if (src + 3 >= src_end
6290                        || src[msb + 2] != 0
6291                        || src[lsb + 2] != '\n')
6292                 this_eol = EOL_SEEN_CR;
6293               else
6294                 {
6295                   this_eol = EOL_SEEN_CRLF;
6296                   src += 2;
6297                 }
6298
6299               if (eol_seen == EOL_SEEN_NONE)
6300                 /* This is the first end-of-line.  */
6301                 eol_seen = this_eol;
6302               else if (eol_seen != this_eol)
6303                 {
6304                   /* The found type is different from what found before.
6305                      Allow for stray ^M characters in DOS EOL files.  */
6306                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6307                       || (eol_seen == EOL_SEEN_CRLF
6308                           && this_eol == EOL_SEEN_CR))
6309                     eol_seen = EOL_SEEN_CRLF;
6310                   else
6311                     {
6312                       eol_seen = EOL_SEEN_LF;
6313                       break;
6314                     }
6315                 }
6316               if (++total == MAX_EOL_CHECK_COUNT)
6317                 break;
6318             }
6319           src += 2;
6320         }
6321     }
6322   else
6323     while (src < src_end)
6324       {
6325         c = *src++;
6326         if (c == '\n' || c == '\r')
6327           {
6328             int this_eol;
6329
6330             if (c == '\n')
6331               this_eol = EOL_SEEN_LF;
6332             else if (src >= src_end || *src != '\n')
6333               this_eol = EOL_SEEN_CR;
6334             else
6335               this_eol = EOL_SEEN_CRLF, src++;
6336
6337             if (eol_seen == EOL_SEEN_NONE)
6338               /* This is the first end-of-line.  */
6339               eol_seen = this_eol;
6340             else if (eol_seen != this_eol)
6341               {
6342                 /* The found type is different from what found before.
6343                    Allow for stray ^M characters in DOS EOL files.  */
6344                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6345                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6346                   eol_seen = EOL_SEEN_CRLF;
6347                 else
6348                   {
6349                     eol_seen = EOL_SEEN_LF;
6350                     break;
6351                   }
6352               }
6353             if (++total == MAX_EOL_CHECK_COUNT)
6354               break;
6355           }
6356       }
6357   return eol_seen;
6358 }
6359
6360
6361 static Lisp_Object
6362 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6363 {
6364   Lisp_Object eol_type;
6365
6366   eol_type = CODING_ID_EOL_TYPE (coding->id);
6367   if (! VECTORP (eol_type))
6368     /* Already adjusted.  */
6369     return eol_type;
6370   if (eol_seen & EOL_SEEN_LF)
6371     {
6372       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6373       eol_type = Qunix;
6374     }
6375   else if (eol_seen & EOL_SEEN_CRLF)
6376     {
6377       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6378       eol_type = Qdos;
6379     }
6380   else if (eol_seen & EOL_SEEN_CR)
6381     {
6382       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6383       eol_type = Qmac;
6384     }
6385   return eol_type;
6386 }
6387
6388 /* Detect how a text specified in CODING is encoded.  If a coding
6389    system is detected, update fields of CODING by the detected coding
6390    system.  */
6391
6392 static void
6393 detect_coding (struct coding_system *coding)
6394 {
6395   const unsigned char *src, *src_end;
6396   unsigned int saved_mode = coding->mode;
6397
6398   coding->consumed = coding->consumed_char = 0;
6399   coding->produced = coding->produced_char = 0;
6400   coding_set_source (coding);
6401
6402   src_end = coding->source + coding->src_bytes;
6403
6404   /* If we have not yet decided the text encoding type, detect it
6405      now.  */
6406   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6407     {
6408       int c, i;
6409       struct coding_detection_info detect_info;
6410       bool null_byte_found = 0, eight_bit_found = 0;
6411
6412       coding->head_ascii = 0;
6413       coding->eol_seen = EOL_SEEN_NONE;
6414       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6415       for (src = coding->source; src < src_end; src++)
6416         {
6417           c = *src;
6418           if (c & 0x80)
6419             {
6420               eight_bit_found = 1;
6421               if (null_byte_found)
6422                 break;
6423             }
6424           else if (c < 0x20)
6425             {
6426               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6427                   && ! inhibit_iso_escape_detection
6428                   && ! detect_info.checked)
6429                 {
6430                   if (detect_coding_iso_2022 (coding, &detect_info))
6431                     {
6432                       /* We have scanned the whole data.  */
6433                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6434                         {
6435                           /* We didn't find an 8-bit code.  We may
6436                              have found a null-byte, but it's very
6437                              rare that a binary file conforms to
6438                              ISO-2022.  */
6439                           src = src_end;
6440                           coding->head_ascii = src - coding->source;
6441                         }
6442                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6443                       break;
6444                     }
6445                 }
6446               else if (! c && !inhibit_null_byte_detection)
6447                 {
6448                   null_byte_found = 1;
6449                   if (eight_bit_found)
6450                     break;
6451                 }
6452               else if (! disable_ascii_optimization
6453                        && ! inhibit_eol_conversion)
6454                 {
6455                   if (c == '\r')
6456                     {
6457                       if (src < src_end && src[1] == '\n')
6458                         {
6459                           coding->eol_seen |= EOL_SEEN_CRLF;
6460                           src++;
6461                           if (! eight_bit_found)
6462                             coding->head_ascii++;
6463                         }
6464                       else
6465                         coding->eol_seen |= EOL_SEEN_CR;
6466                     }
6467                   else if (c == '\n')
6468                     {
6469                       coding->eol_seen |= EOL_SEEN_LF;
6470                     }
6471                 }
6472
6473               if (! eight_bit_found)
6474                 coding->head_ascii++;
6475             }
6476           else if (! eight_bit_found)
6477             coding->head_ascii++;
6478         }
6479
6480       if (null_byte_found || eight_bit_found
6481           || coding->head_ascii < coding->src_bytes
6482           || detect_info.found)
6483         {
6484           enum coding_category category;
6485           struct coding_system *this;
6486
6487           if (coding->head_ascii == coding->src_bytes)
6488             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6489             for (i = 0; i < coding_category_raw_text; i++)
6490               {
6491                 category = coding_priorities[i];
6492                 this = coding_categories + category;
6493                 if (detect_info.found & (1 << category))
6494                   break;
6495               }
6496           else
6497             {
6498               if (null_byte_found)
6499                 {
6500                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6501                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6502                 }
6503               for (i = 0; i < coding_category_raw_text; i++)
6504                 {
6505                   category = coding_priorities[i];
6506                   this = coding_categories + category;
6507                   /* Some of this->detector (e.g. detect_coding_sjis)
6508                      require this information.  */
6509                   coding->id = this->id;
6510                   if (this->id < 0)
6511                     {
6512                       /* No coding system of this category is defined.  */
6513                       detect_info.rejected |= (1 << category);
6514                     }
6515                   else if (category >= coding_category_raw_text)
6516                     continue;
6517                   else if (detect_info.checked & (1 << category))
6518                     {
6519                       if (detect_info.found & (1 << category))
6520                         break;
6521                     }
6522                   else if ((*(this->detector)) (coding, &detect_info)
6523                            && detect_info.found & (1 << category))
6524                     {
6525                       if (category == coding_category_utf_16_auto)
6526                         {
6527                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6528                             category = coding_category_utf_16_le;
6529                           else
6530                             category = coding_category_utf_16_be;
6531                         }
6532                       break;
6533                     }
6534                 }
6535             }
6536
6537           if (i < coding_category_raw_text)
6538             setup_coding_system (CODING_ID_NAME (this->id), coding);
6539           else if (null_byte_found)
6540             setup_coding_system (Qno_conversion, coding);
6541           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6542                    == CATEGORY_MASK_ANY)
6543             setup_coding_system (Qraw_text, coding);
6544           else if (detect_info.rejected)
6545             for (i = 0; i < coding_category_raw_text; i++)
6546               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6547                 {
6548                   this = coding_categories + coding_priorities[i];
6549                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6550                   break;
6551                 }
6552         }
6553     }
6554   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6555            == coding_category_utf_8_auto)
6556     {
6557       Lisp_Object coding_systems;
6558       struct coding_detection_info detect_info;
6559
6560       coding_systems
6561         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6562       detect_info.found = detect_info.rejected = 0;
6563       if (check_ascii (coding) == coding->src_bytes)
6564         {
6565           int head_ascii = coding->head_ascii;
6566
6567           if (coding->eol_seen != EOL_SEEN_NONE)
6568             adjust_coding_eol_type (coding, coding->eol_seen);
6569           setup_coding_system (XCDR (coding_systems), coding);
6570           coding->head_ascii = head_ascii;
6571         }
6572       else
6573         {
6574           if (CONSP (coding_systems)
6575               && detect_coding_utf_8 (coding, &detect_info))
6576             {
6577               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6578                 setup_coding_system (XCAR (coding_systems), coding);
6579               else
6580                 setup_coding_system (XCDR (coding_systems), coding);
6581             }
6582         }
6583     }
6584   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6585            == coding_category_utf_16_auto)
6586     {
6587       Lisp_Object coding_systems;
6588       struct coding_detection_info detect_info;
6589
6590       coding_systems
6591         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6592       detect_info.found = detect_info.rejected = 0;
6593       coding->head_ascii = 0;
6594       coding->eol_seen = EOL_SEEN_NONE;
6595       if (CONSP (coding_systems)
6596           && detect_coding_utf_16 (coding, &detect_info))
6597         {
6598           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6599             setup_coding_system (XCAR (coding_systems), coding);
6600           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6601             setup_coding_system (XCDR (coding_systems), coding);
6602         }
6603     }
6604   coding->mode = saved_mode;
6605 }
6606
6607
6608 static void
6609 decode_eol (struct coding_system *coding)
6610 {
6611   Lisp_Object eol_type;
6612   unsigned char *p, *pbeg, *pend;
6613
6614   eol_type = CODING_ID_EOL_TYPE (coding->id);
6615   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6616     return;
6617
6618   if (NILP (coding->dst_object))
6619     pbeg = coding->destination;
6620   else
6621     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6622   pend = pbeg + coding->produced;
6623
6624   if (VECTORP (eol_type))
6625     {
6626       int eol_seen = EOL_SEEN_NONE;
6627
6628       for (p = pbeg; p < pend; p++)
6629         {
6630           if (*p == '\n')
6631             eol_seen |= EOL_SEEN_LF;
6632           else if (*p == '\r')
6633             {
6634               if (p + 1 < pend && *(p + 1) == '\n')
6635                 {
6636                   eol_seen |= EOL_SEEN_CRLF;
6637                   p++;
6638                 }
6639               else
6640                 eol_seen |= EOL_SEEN_CR;
6641             }
6642         }
6643       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6644       if ((eol_seen & EOL_SEEN_CRLF) != 0
6645           && (eol_seen & EOL_SEEN_CR) != 0
6646           && (eol_seen & EOL_SEEN_LF) == 0)
6647         eol_seen = EOL_SEEN_CRLF;
6648       else if (eol_seen != EOL_SEEN_NONE
6649           && eol_seen != EOL_SEEN_LF
6650           && eol_seen != EOL_SEEN_CRLF
6651           && eol_seen != EOL_SEEN_CR)
6652         eol_seen = EOL_SEEN_LF;
6653       if (eol_seen != EOL_SEEN_NONE)
6654         eol_type = adjust_coding_eol_type (coding, eol_seen);
6655     }
6656
6657   if (EQ (eol_type, Qmac))
6658     {
6659       for (p = pbeg; p < pend; p++)
6660         if (*p == '\r')
6661           *p = '\n';
6662     }
6663   else if (EQ (eol_type, Qdos))
6664     {
6665       ptrdiff_t n = 0;
6666
6667       if (NILP (coding->dst_object))
6668         {
6669           /* Start deleting '\r' from the tail to minimize the memory
6670              movement.  */
6671           for (p = pend - 2; p >= pbeg; p--)
6672             if (*p == '\r')
6673               {
6674                 memmove (p, p + 1, pend-- - p - 1);
6675                 n++;
6676               }
6677         }
6678       else
6679         {
6680           ptrdiff_t pos_byte = coding->dst_pos_byte;
6681           ptrdiff_t pos = coding->dst_pos;
6682           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6683
6684           while (pos < pos_end)
6685             {
6686               p = BYTE_POS_ADDR (pos_byte);
6687               if (*p == '\r' && p[1] == '\n')
6688                 {
6689                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6690                   n++;
6691                   pos_end--;
6692                 }
6693               pos++;
6694               if (coding->dst_multibyte)
6695                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6696               else
6697                 pos_byte++;
6698             }
6699         }
6700       coding->produced -= n;
6701       coding->produced_char -= n;
6702     }
6703 }
6704
6705
6706 /* Return a translation table (or list of them) from coding system
6707    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6708    not ENCODEP). */
6709
6710 static Lisp_Object
6711 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6712 {
6713   Lisp_Object standard, translation_table;
6714   Lisp_Object val;
6715
6716   if (NILP (Venable_character_translation))
6717     {
6718       if (max_lookup)
6719         *max_lookup = 0;
6720       return Qnil;
6721     }
6722   if (encodep)
6723     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6724       standard = Vstandard_translation_table_for_encode;
6725   else
6726     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6727       standard = Vstandard_translation_table_for_decode;
6728   if (NILP (translation_table))
6729     translation_table = standard;
6730   else
6731     {
6732       if (SYMBOLP (translation_table))
6733         translation_table = Fget (translation_table, Qtranslation_table);
6734       else if (CONSP (translation_table))
6735         {
6736           translation_table = Fcopy_sequence (translation_table);
6737           for (val = translation_table; CONSP (val); val = XCDR (val))
6738             if (SYMBOLP (XCAR (val)))
6739               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6740         }
6741       if (CHAR_TABLE_P (standard))
6742         {
6743           if (CONSP (translation_table))
6744             translation_table = nconc2 (translation_table,
6745                                         Fcons (standard, Qnil));
6746           else
6747             translation_table = Fcons (translation_table,
6748                                        Fcons (standard, Qnil));
6749         }
6750     }
6751
6752   if (max_lookup)
6753     {
6754       *max_lookup = 1;
6755       if (CHAR_TABLE_P (translation_table)
6756           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6757         {
6758           val = XCHAR_TABLE (translation_table)->extras[1];
6759           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6760             *max_lookup = XFASTINT (val);
6761         }
6762       else if (CONSP (translation_table))
6763         {
6764           Lisp_Object tail;
6765
6766           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6767             if (CHAR_TABLE_P (XCAR (tail))
6768                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6769               {
6770                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6771                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6772                   *max_lookup = XFASTINT (tailval);
6773               }
6774         }
6775     }
6776   return translation_table;
6777 }
6778
6779 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6780   do {                                                          \
6781     trans = Qnil;                                               \
6782     if (CHAR_TABLE_P (table))                                   \
6783       {                                                         \
6784         trans = CHAR_TABLE_REF (table, c);                      \
6785         if (CHARACTERP (trans))                                 \
6786           c = XFASTINT (trans), trans = Qnil;                   \
6787       }                                                         \
6788     else if (CONSP (table))                                     \
6789       {                                                         \
6790         Lisp_Object tail;                                       \
6791                                                                 \
6792         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6793           if (CHAR_TABLE_P (XCAR (tail)))                       \
6794             {                                                   \
6795               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6796               if (CHARACTERP (trans))                           \
6797                 c = XFASTINT (trans), trans = Qnil;             \
6798               else if (! NILP (trans))                          \
6799                 break;                                          \
6800             }                                                   \
6801       }                                                         \
6802   } while (0)
6803
6804
6805 /* Return a translation of character(s) at BUF according to TRANS.
6806    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6807    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6808    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6809    translation is found, and Qnil if not found..
6810    If BUF is too short to lookup characters in FROM, return Qt.  */
6811
6812 static Lisp_Object
6813 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6814 {
6815
6816   if (INTEGERP (trans))
6817     return trans;
6818   for (; CONSP (trans); trans = XCDR (trans))
6819     {
6820       Lisp_Object val = XCAR (trans);
6821       Lisp_Object from = XCAR (val);
6822       ptrdiff_t len = ASIZE (from);
6823       ptrdiff_t i;
6824
6825       for (i = 0; i < len; i++)
6826         {
6827           if (buf + i == buf_end)
6828             return Qt;
6829           if (XINT (AREF (from, i)) != buf[i])
6830             break;
6831         }
6832       if (i == len)
6833         return val;
6834     }
6835   return Qnil;
6836 }
6837
6838
6839 static int
6840 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6841                bool last_block)
6842 {
6843   unsigned char *dst = coding->destination + coding->produced;
6844   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6845   ptrdiff_t produced;
6846   ptrdiff_t produced_chars = 0;
6847   int carryover = 0;
6848
6849   if (! coding->chars_at_source)
6850     {
6851       /* Source characters are in coding->charbuf.  */
6852       int *buf = coding->charbuf;
6853       int *buf_end = buf + coding->charbuf_used;
6854
6855       if (EQ (coding->src_object, coding->dst_object))
6856         {
6857           coding_set_source (coding);
6858           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6859         }
6860
6861       while (buf < buf_end)
6862         {
6863           int c = *buf;
6864           ptrdiff_t i;
6865
6866           if (c >= 0)
6867             {
6868               ptrdiff_t from_nchars = 1, to_nchars = 1;
6869               Lisp_Object trans = Qnil;
6870
6871               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6872               if (! NILP (trans))
6873                 {
6874                   trans = get_translation (trans, buf, buf_end);
6875                   if (INTEGERP (trans))
6876                     c = XINT (trans);
6877                   else if (CONSP (trans))
6878                     {
6879                       from_nchars = ASIZE (XCAR (trans));
6880                       trans = XCDR (trans);
6881                       if (INTEGERP (trans))
6882                         c = XINT (trans);
6883                       else
6884                         {
6885                           to_nchars = ASIZE (trans);
6886                           c = XINT (AREF (trans, 0));
6887                         }
6888                     }
6889                   else if (EQ (trans, Qt) && ! last_block)
6890                     break;
6891                 }
6892
6893               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6894                 {
6895                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6896                        / MAX_MULTIBYTE_LENGTH)
6897                       < to_nchars)
6898                     memory_full (SIZE_MAX);
6899                   dst = alloc_destination (coding,
6900                                            buf_end - buf
6901                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6902                                            dst);
6903                   if (EQ (coding->src_object, coding->dst_object))
6904                     {
6905                       coding_set_source (coding);
6906                       dst_end = (((unsigned char *) coding->source)
6907                                  + coding->consumed);
6908                     }
6909                   else
6910                     dst_end = coding->destination + coding->dst_bytes;
6911                 }
6912
6913               for (i = 0; i < to_nchars; i++)
6914                 {
6915                   if (i > 0)
6916                     c = XINT (AREF (trans, i));
6917                   if (coding->dst_multibyte
6918                       || ! CHAR_BYTE8_P (c))
6919                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6920                   else
6921                     *dst++ = CHAR_TO_BYTE8 (c);
6922                 }
6923               produced_chars += to_nchars;
6924               buf += from_nchars;
6925             }
6926           else
6927             /* This is an annotation datum.  (-C) is the length.  */
6928             buf += -c;
6929         }
6930       carryover = buf_end - buf;
6931     }
6932   else
6933     {
6934       /* Source characters are at coding->source.  */
6935       const unsigned char *src = coding->source;
6936       const unsigned char *src_end = src + coding->consumed;
6937
6938       if (EQ (coding->dst_object, coding->src_object))
6939         dst_end = (unsigned char *) src;
6940       if (coding->src_multibyte != coding->dst_multibyte)
6941         {
6942           if (coding->src_multibyte)
6943             {
6944               bool multibytep = 1;
6945               ptrdiff_t consumed_chars = 0;
6946
6947               while (1)
6948                 {
6949                   const unsigned char *src_base = src;
6950                   int c;
6951
6952                   ONE_MORE_BYTE (c);
6953                   if (dst == dst_end)
6954                     {
6955                       if (EQ (coding->src_object, coding->dst_object))
6956                         dst_end = (unsigned char *) src;
6957                       if (dst == dst_end)
6958                         {
6959                           ptrdiff_t offset = src - coding->source;
6960
6961                           dst = alloc_destination (coding, src_end - src + 1,
6962                                                    dst);
6963                           dst_end = coding->destination + coding->dst_bytes;
6964                           coding_set_source (coding);
6965                           src = coding->source + offset;
6966                           src_end = coding->source + coding->consumed;
6967                           if (EQ (coding->src_object, coding->dst_object))
6968                             dst_end = (unsigned char *) src;
6969                         }
6970                     }
6971                   *dst++ = c;
6972                   produced_chars++;
6973                 }
6974             no_more_source:
6975               ;
6976             }
6977           else
6978             while (src < src_end)
6979               {
6980                 bool multibytep = 1;
6981                 int c = *src++;
6982
6983                 if (dst >= dst_end - 1)
6984                   {
6985                     if (EQ (coding->src_object, coding->dst_object))
6986                       dst_end = (unsigned char *) src;
6987                     if (dst >= dst_end - 1)
6988                       {
6989                         ptrdiff_t offset = src - coding->source;
6990                         ptrdiff_t more_bytes;
6991
6992                         if (EQ (coding->src_object, coding->dst_object))
6993                           more_bytes = ((src_end - src) / 2) + 2;
6994                         else
6995                           more_bytes = src_end - src + 2;
6996                         dst = alloc_destination (coding, more_bytes, dst);
6997                         dst_end = coding->destination + coding->dst_bytes;
6998                         coding_set_source (coding);
6999                         src = coding->source + offset;
7000                         src_end = coding->source + coding->consumed;
7001                         if (EQ (coding->src_object, coding->dst_object))
7002                           dst_end = (unsigned char *) src;
7003                       }
7004                   }
7005                 EMIT_ONE_BYTE (c);
7006               }
7007         }
7008       else
7009         {
7010           if (!EQ (coding->src_object, coding->dst_object))
7011             {
7012               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7013
7014               if (require > 0)
7015                 {
7016                   ptrdiff_t offset = src - coding->source;
7017
7018                   dst = alloc_destination (coding, require, dst);
7019                   coding_set_source (coding);
7020                   src = coding->source + offset;
7021                   src_end = coding->source + coding->consumed;
7022                 }
7023             }
7024           produced_chars = coding->consumed_char;
7025           while (src < src_end)
7026             *dst++ = *src++;
7027         }
7028     }
7029
7030   produced = dst - (coding->destination + coding->produced);
7031   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7032     insert_from_gap (produced_chars, produced, 0);
7033   coding->produced += produced;
7034   coding->produced_char += produced_chars;
7035   return carryover;
7036 }
7037
7038 /* Compose text in CODING->object according to the annotation data at
7039    CHARBUF.  CHARBUF is an array:
7040      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7041  */
7042
7043 static void
7044 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7045 {
7046   int len;
7047   ptrdiff_t to;
7048   enum composition_method method;
7049   Lisp_Object components;
7050
7051   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7052   to = pos + charbuf[2];
7053   method = (enum composition_method) (charbuf[4]);
7054
7055   if (method == COMPOSITION_RELATIVE)
7056     components = Qnil;
7057   else
7058     {
7059       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7060       int i, j;
7061
7062       if (method == COMPOSITION_WITH_RULE)
7063         len = charbuf[2] * 3 - 2;
7064       charbuf += MAX_ANNOTATION_LENGTH;
7065       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7066       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7067         {
7068           if (charbuf[i] >= 0)
7069             args[j] = make_number (charbuf[i]);
7070           else
7071             {
7072               i++;
7073               args[j] = make_number (charbuf[i] % 0x100);
7074             }
7075         }
7076       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7077     }
7078   compose_text (pos, to, components, Qnil, coding->dst_object);
7079 }
7080
7081
7082 /* Put `charset' property on text in CODING->object according to
7083    the annotation data at CHARBUF.  CHARBUF is an array:
7084      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7085  */
7086
7087 static void
7088 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7089 {
7090   ptrdiff_t from = pos - charbuf[2];
7091   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7092
7093   Fput_text_property (make_number (from), make_number (pos),
7094                       Qcharset, CHARSET_NAME (charset),
7095                       coding->dst_object);
7096 }
7097
7098
7099 #define CHARBUF_SIZE 0x4000
7100
7101 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
7102   do {                                                                  \
7103     coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));        \
7104     coding->charbuf_size = CHARBUF_SIZE;                                \
7105   } while (0)
7106
7107
7108 static void
7109 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7110 {
7111   int *charbuf = coding->charbuf;
7112   int *charbuf_end = charbuf + coding->charbuf_used;
7113
7114   if (NILP (coding->dst_object))
7115     return;
7116
7117   while (charbuf < charbuf_end)
7118     {
7119       if (*charbuf >= 0)
7120         pos++, charbuf++;
7121       else
7122         {
7123           int len = -*charbuf;
7124
7125           if (len > 2)
7126             switch (charbuf[1])
7127               {
7128               case CODING_ANNOTATE_COMPOSITION_MASK:
7129                 produce_composition (coding, charbuf, pos);
7130                 break;
7131               case CODING_ANNOTATE_CHARSET_MASK:
7132                 produce_charset (coding, charbuf, pos);
7133                 break;
7134               }
7135           charbuf += len;
7136         }
7137     }
7138 }
7139
7140 /* Decode the data at CODING->src_object into CODING->dst_object.
7141    CODING->src_object is a buffer, a string, or nil.
7142    CODING->dst_object is a buffer.
7143
7144    If CODING->src_object is a buffer, it must be the current buffer.
7145    In this case, if CODING->src_pos is positive, it is a position of
7146    the source text in the buffer, otherwise, the source text is in the
7147    gap area of the buffer, and CODING->src_pos specifies the offset of
7148    the text from GPT (which must be the same as PT).  If this is the
7149    same buffer as CODING->dst_object, CODING->src_pos must be
7150    negative.
7151
7152    If CODING->src_object is a string, CODING->src_pos is an index to
7153    that string.
7154
7155    If CODING->src_object is nil, CODING->source must already point to
7156    the non-relocatable memory area.  In this case, CODING->src_pos is
7157    an offset from CODING->source.
7158
7159    The decoded data is inserted at the current point of the buffer
7160    CODING->dst_object.
7161 */
7162
7163 static void
7164 decode_coding (struct coding_system *coding)
7165 {
7166   Lisp_Object attrs;
7167   Lisp_Object undo_list;
7168   Lisp_Object translation_table;
7169   struct ccl_spec cclspec;
7170   int carryover;
7171   int i;
7172
7173   USE_SAFE_ALLOCA;
7174
7175   if (BUFFERP (coding->src_object)
7176       && coding->src_pos > 0
7177       && coding->src_pos < GPT
7178       && coding->src_pos + coding->src_chars > GPT)
7179     move_gap_both (coding->src_pos, coding->src_pos_byte);
7180
7181   undo_list = Qt;
7182   if (BUFFERP (coding->dst_object))
7183     {
7184       set_buffer_internal (XBUFFER (coding->dst_object));
7185       if (GPT != PT)
7186         move_gap_both (PT, PT_BYTE);
7187
7188       /* We must disable undo_list in order to record the whole insert
7189          transaction via record_insert at the end.  But doing so also
7190          disables the recording of the first change to the undo_list.
7191          Therefore we check for first change here and record it via
7192          record_first_change if needed.  */
7193       if (MODIFF <= SAVE_MODIFF)
7194         record_first_change ();
7195
7196       undo_list = BVAR (current_buffer, undo_list);
7197       bset_undo_list (current_buffer, Qt);
7198     }
7199
7200   coding->consumed = coding->consumed_char = 0;
7201   coding->produced = coding->produced_char = 0;
7202   coding->chars_at_source = 0;
7203   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7204   coding->errors = 0;
7205
7206   ALLOC_CONVERSION_WORK_AREA (coding);
7207
7208   attrs = CODING_ID_ATTRS (coding->id);
7209   translation_table = get_translation_table (attrs, 0, NULL);
7210
7211   carryover = 0;
7212   if (coding->decoder == decode_coding_ccl)
7213     {
7214       coding->spec.ccl = &cclspec;
7215       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7216     }
7217   do
7218     {
7219       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7220
7221       coding_set_source (coding);
7222       coding->annotated = 0;
7223       coding->charbuf_used = carryover;
7224       (*(coding->decoder)) (coding);
7225       coding_set_destination (coding);
7226       carryover = produce_chars (coding, translation_table, 0);
7227       if (coding->annotated)
7228         produce_annotation (coding, pos);
7229       for (i = 0; i < carryover; i++)
7230         coding->charbuf[i]
7231           = coding->charbuf[coding->charbuf_used - carryover + i];
7232     }
7233   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7234          || (coding->consumed < coding->src_bytes
7235              && (coding->result == CODING_RESULT_SUCCESS
7236                  || coding->result == CODING_RESULT_INVALID_SRC)));
7237
7238   if (carryover > 0)
7239     {
7240       coding_set_destination (coding);
7241       coding->charbuf_used = carryover;
7242       produce_chars (coding, translation_table, 1);
7243     }
7244
7245   coding->carryover_bytes = 0;
7246   if (coding->consumed < coding->src_bytes)
7247     {
7248       int nbytes = coding->src_bytes - coding->consumed;
7249       const unsigned char *src;
7250
7251       coding_set_source (coding);
7252       coding_set_destination (coding);
7253       src = coding->source + coding->consumed;
7254
7255       if (coding->mode & CODING_MODE_LAST_BLOCK)
7256         {
7257           /* Flush out unprocessed data as binary chars.  We are sure
7258              that the number of data is less than the size of
7259              coding->charbuf.  */
7260           coding->charbuf_used = 0;
7261           coding->chars_at_source = 0;
7262
7263           while (nbytes-- > 0)
7264             {
7265               int c = *src++;
7266
7267               if (c & 0x80)
7268                 c = BYTE8_TO_CHAR (c);
7269               coding->charbuf[coding->charbuf_used++] = c;
7270             }
7271           produce_chars (coding, Qnil, 1);
7272         }
7273       else
7274         {
7275           /* Record unprocessed bytes in coding->carryover.  We are
7276              sure that the number of data is less than the size of
7277              coding->carryover.  */
7278           unsigned char *p = coding->carryover;
7279
7280           if (nbytes > sizeof coding->carryover)
7281             nbytes = sizeof coding->carryover;
7282           coding->carryover_bytes = nbytes;
7283           while (nbytes-- > 0)
7284             *p++ = *src++;
7285         }
7286       coding->consumed = coding->src_bytes;
7287     }
7288
7289   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7290       && !inhibit_eol_conversion)
7291     decode_eol (coding);
7292   if (BUFFERP (coding->dst_object))
7293     {
7294       bset_undo_list (current_buffer, undo_list);
7295       record_insert (coding->dst_pos, coding->produced_char);
7296     }
7297
7298   SAFE_FREE ();
7299 }
7300
7301
7302 /* Extract an annotation datum from a composition starting at POS and
7303    ending before LIMIT of CODING->src_object (buffer or string), store
7304    the data in BUF, set *STOP to a starting position of the next
7305    composition (if any) or to LIMIT, and return the address of the
7306    next element of BUF.
7307
7308    If such an annotation is not found, set *STOP to a starting
7309    position of a composition after POS (if any) or to LIMIT, and
7310    return BUF.  */
7311
7312 static int *
7313 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7314                                struct coding_system *coding, int *buf,
7315                                ptrdiff_t *stop)
7316 {
7317   ptrdiff_t start, end;
7318   Lisp_Object prop;
7319
7320   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7321       || end > limit)
7322     *stop = limit;
7323   else if (start > pos)
7324     *stop = start;
7325   else
7326     {
7327       if (start == pos)
7328         {
7329           /* We found a composition.  Store the corresponding
7330              annotation data in BUF.  */
7331           int *head = buf;
7332           enum composition_method method = COMPOSITION_METHOD (prop);
7333           int nchars = COMPOSITION_LENGTH (prop);
7334
7335           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7336           if (method != COMPOSITION_RELATIVE)
7337             {
7338               Lisp_Object components;
7339               ptrdiff_t i, len, i_byte;
7340
7341               components = COMPOSITION_COMPONENTS (prop);
7342               if (VECTORP (components))
7343                 {
7344                   len = ASIZE (components);
7345                   for (i = 0; i < len; i++)
7346                     *buf++ = XINT (AREF (components, i));
7347                 }
7348               else if (STRINGP (components))
7349                 {
7350                   len = SCHARS (components);
7351                   i = i_byte = 0;
7352                   while (i < len)
7353                     {
7354                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7355                       buf++;
7356                     }
7357                 }
7358               else if (INTEGERP (components))
7359                 {
7360                   len = 1;
7361                   *buf++ = XINT (components);
7362                 }
7363               else if (CONSP (components))
7364                 {
7365                   for (len = 0; CONSP (components);
7366                        len++, components = XCDR (components))
7367                     *buf++ = XINT (XCAR (components));
7368                 }
7369               else
7370                 emacs_abort ();
7371               *head -= len;
7372             }
7373         }
7374
7375       if (find_composition (end, limit, &start, &end, &prop,
7376                             coding->src_object)
7377           && end <= limit)
7378         *stop = start;
7379       else
7380         *stop = limit;
7381     }
7382   return buf;
7383 }
7384
7385
7386 /* Extract an annotation datum from a text property `charset' at POS of
7387    CODING->src_object (buffer of string), store the data in BUF, set
7388    *STOP to the position where the value of `charset' property changes
7389    (limiting by LIMIT), and return the address of the next element of
7390    BUF.
7391
7392    If the property value is nil, set *STOP to the position where the
7393    property value is non-nil (limiting by LIMIT), and return BUF.  */
7394
7395 static int *
7396 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7397                            struct coding_system *coding, int *buf,
7398                            ptrdiff_t *stop)
7399 {
7400   Lisp_Object val, next;
7401   int id;
7402
7403   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7404   if (! NILP (val) && CHARSETP (val))
7405     id = XINT (CHARSET_SYMBOL_ID (val));
7406   else
7407     id = -1;
7408   ADD_CHARSET_DATA (buf, 0, id);
7409   next = Fnext_single_property_change (make_number (pos), Qcharset,
7410                                        coding->src_object,
7411                                        make_number (limit));
7412   *stop = XINT (next);
7413   return buf;
7414 }
7415
7416
7417 static void
7418 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7419                int max_lookup)
7420 {
7421   int *buf = coding->charbuf;
7422   int *buf_end = coding->charbuf + coding->charbuf_size;
7423   const unsigned char *src = coding->source + coding->consumed;
7424   const unsigned char *src_end = coding->source + coding->src_bytes;
7425   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7426   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7427   bool multibytep = coding->src_multibyte;
7428   Lisp_Object eol_type;
7429   int c;
7430   ptrdiff_t stop, stop_composition, stop_charset;
7431   int *lookup_buf = NULL;
7432
7433   if (! NILP (translation_table))
7434     lookup_buf = alloca (sizeof (int) * max_lookup);
7435
7436   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7437   if (VECTORP (eol_type))
7438     eol_type = Qunix;
7439
7440   /* Note: composition handling is not yet implemented.  */
7441   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7442
7443   if (NILP (coding->src_object))
7444     stop = stop_composition = stop_charset = end_pos;
7445   else
7446     {
7447       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7448         stop = stop_composition = pos;
7449       else
7450         stop = stop_composition = end_pos;
7451       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7452         stop = stop_charset = pos;
7453       else
7454         stop_charset = end_pos;
7455     }
7456
7457   /* Compensate for CRLF and conversion.  */
7458   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7459   while (buf < buf_end)
7460     {
7461       Lisp_Object trans;
7462
7463       if (pos == stop)
7464         {
7465           if (pos == end_pos)
7466             break;
7467           if (pos == stop_composition)
7468             buf = handle_composition_annotation (pos, end_pos, coding,
7469                                                  buf, &stop_composition);
7470           if (pos == stop_charset)
7471             buf = handle_charset_annotation (pos, end_pos, coding,
7472                                              buf, &stop_charset);
7473           stop = (stop_composition < stop_charset
7474                   ? stop_composition : stop_charset);
7475         }
7476
7477       if (! multibytep)
7478         {
7479           int bytes;
7480
7481           if (coding->encoder == encode_coding_raw_text
7482               || coding->encoder == encode_coding_ccl)
7483             c = *src++, pos++;
7484           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7485             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7486           else
7487             c = BYTE8_TO_CHAR (*src), src++, pos++;
7488         }
7489       else
7490         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7491       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7492         c = '\n';
7493       if (! EQ (eol_type, Qunix))
7494         {
7495           if (c == '\n')
7496             {
7497               if (EQ (eol_type, Qdos))
7498                 *buf++ = '\r';
7499               else
7500                 c = '\r';
7501             }
7502         }
7503
7504       trans = Qnil;
7505       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7506       if (NILP (trans))
7507         *buf++ = c;
7508       else
7509         {
7510           ptrdiff_t from_nchars = 1, to_nchars = 1;
7511           int *lookup_buf_end;
7512           const unsigned char *p = src;
7513           int i;
7514
7515           lookup_buf[0] = c;
7516           for (i = 1; i < max_lookup && p < src_end; i++)
7517             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7518           lookup_buf_end = lookup_buf + i;
7519           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7520           if (INTEGERP (trans))
7521             c = XINT (trans);
7522           else if (CONSP (trans))
7523             {
7524               from_nchars = ASIZE (XCAR (trans));
7525               trans = XCDR (trans);
7526               if (INTEGERP (trans))
7527                 c = XINT (trans);
7528               else
7529                 {
7530                   to_nchars = ASIZE (trans);
7531                   if (buf_end - buf < to_nchars)
7532                     break;
7533                   c = XINT (AREF (trans, 0));
7534                 }
7535             }
7536           else
7537             break;
7538           *buf++ = c;
7539           for (i = 1; i < to_nchars; i++)
7540             *buf++ = XINT (AREF (trans, i));
7541           for (i = 1; i < from_nchars; i++, pos++)
7542             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7543         }
7544     }
7545
7546   coding->consumed = src - coding->source;
7547   coding->consumed_char = pos - coding->src_pos;
7548   coding->charbuf_used = buf - coding->charbuf;
7549   coding->chars_at_source = 0;
7550 }
7551
7552
7553 /* Encode the text at CODING->src_object into CODING->dst_object.
7554    CODING->src_object is a buffer or a string.
7555    CODING->dst_object is a buffer or nil.
7556
7557    If CODING->src_object is a buffer, it must be the current buffer.
7558    In this case, if CODING->src_pos is positive, it is a position of
7559    the source text in the buffer, otherwise. the source text is in the
7560    gap area of the buffer, and coding->src_pos specifies the offset of
7561    the text from GPT (which must be the same as PT).  If this is the
7562    same buffer as CODING->dst_object, CODING->src_pos must be
7563    negative and CODING should not have `pre-write-conversion'.
7564
7565    If CODING->src_object is a string, CODING should not have
7566    `pre-write-conversion'.
7567
7568    If CODING->dst_object is a buffer, the encoded data is inserted at
7569    the current point of that buffer.
7570
7571    If CODING->dst_object is nil, the encoded data is placed at the
7572    memory area specified by CODING->destination.  */
7573
7574 static void
7575 encode_coding (struct coding_system *coding)
7576 {
7577   Lisp_Object attrs;
7578   Lisp_Object translation_table;
7579   int max_lookup;
7580   struct ccl_spec cclspec;
7581
7582   USE_SAFE_ALLOCA;
7583
7584   attrs = CODING_ID_ATTRS (coding->id);
7585   if (coding->encoder == encode_coding_raw_text)
7586     translation_table = Qnil, max_lookup = 0;
7587   else
7588     translation_table = get_translation_table (attrs, 1, &max_lookup);
7589
7590   if (BUFFERP (coding->dst_object))
7591     {
7592       set_buffer_internal (XBUFFER (coding->dst_object));
7593       coding->dst_multibyte
7594         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7595     }
7596
7597   coding->consumed = coding->consumed_char = 0;
7598   coding->produced = coding->produced_char = 0;
7599   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7600   coding->errors = 0;
7601
7602   ALLOC_CONVERSION_WORK_AREA (coding);
7603
7604   if (coding->encoder == encode_coding_ccl)
7605     {
7606       coding->spec.ccl = &cclspec;
7607       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7608     }
7609   do {
7610     coding_set_source (coding);
7611     consume_chars (coding, translation_table, max_lookup);
7612     coding_set_destination (coding);
7613     (*(coding->encoder)) (coding);
7614   } while (coding->consumed_char < coding->src_chars);
7615
7616   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7617     insert_from_gap (coding->produced_char, coding->produced, 0);
7618
7619   SAFE_FREE ();
7620 }
7621
7622
7623 /* Name (or base name) of work buffer for code conversion.  */
7624 static Lisp_Object Vcode_conversion_workbuf_name;
7625
7626 /* A working buffer used by the top level conversion.  Once it is
7627    created, it is never destroyed.  It has the name
7628    Vcode_conversion_workbuf_name.  The other working buffers are
7629    destroyed after the use is finished, and their names are modified
7630    versions of Vcode_conversion_workbuf_name.  */
7631 static Lisp_Object Vcode_conversion_reused_workbuf;
7632
7633 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7634 static bool reused_workbuf_in_use;
7635
7636
7637 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7638    multibyteness of returning buffer.  */
7639
7640 static Lisp_Object
7641 make_conversion_work_buffer (bool multibyte)
7642 {
7643   Lisp_Object name, workbuf;
7644   struct buffer *current;
7645
7646   if (reused_workbuf_in_use)
7647     {
7648       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7649       workbuf = Fget_buffer_create (name);
7650     }
7651   else
7652     {
7653       reused_workbuf_in_use = 1;
7654       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7655         Vcode_conversion_reused_workbuf
7656           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7657       workbuf = Vcode_conversion_reused_workbuf;
7658     }
7659   current = current_buffer;
7660   set_buffer_internal (XBUFFER (workbuf));
7661   /* We can't allow modification hooks to run in the work buffer.  For
7662      instance, directory_files_internal assumes that file decoding
7663      doesn't compile new regexps.  */
7664   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7665   Ferase_buffer ();
7666   bset_undo_list (current_buffer, Qt);
7667   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7668   set_buffer_internal (current);
7669   return workbuf;
7670 }
7671
7672
7673 static Lisp_Object
7674 code_conversion_restore (Lisp_Object arg)
7675 {
7676   Lisp_Object current, workbuf;
7677   struct gcpro gcpro1;
7678
7679   GCPRO1 (arg);
7680   current = XCAR (arg);
7681   workbuf = XCDR (arg);
7682   if (! NILP (workbuf))
7683     {
7684       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7685         reused_workbuf_in_use = 0;
7686       else
7687         Fkill_buffer (workbuf);
7688     }
7689   set_buffer_internal (XBUFFER (current));
7690   UNGCPRO;
7691   return Qnil;
7692 }
7693
7694 Lisp_Object
7695 code_conversion_save (bool with_work_buf, bool multibyte)
7696 {
7697   Lisp_Object workbuf = Qnil;
7698
7699   if (with_work_buf)
7700     workbuf = make_conversion_work_buffer (multibyte);
7701   record_unwind_protect (code_conversion_restore,
7702                          Fcons (Fcurrent_buffer (), workbuf));
7703   return workbuf;
7704 }
7705
7706 void
7707 decode_coding_gap (struct coding_system *coding,
7708                    ptrdiff_t chars, ptrdiff_t bytes)
7709 {
7710   ptrdiff_t count = SPECPDL_INDEX ();
7711   Lisp_Object attrs;
7712
7713   coding->src_object = Fcurrent_buffer ();
7714   coding->src_chars = chars;
7715   coding->src_bytes = bytes;
7716   coding->src_pos = -chars;
7717   coding->src_pos_byte = -bytes;
7718   coding->src_multibyte = chars < bytes;
7719   coding->dst_object = coding->src_object;
7720   coding->dst_pos = PT;
7721   coding->dst_pos_byte = PT_BYTE;
7722   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7723
7724   if (CODING_REQUIRE_DETECTION (coding))
7725     detect_coding (coding);
7726   attrs = CODING_ID_ATTRS (coding->id);
7727   if (! disable_ascii_optimization
7728       && ! coding->src_multibyte
7729       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7730       && NILP (CODING_ATTR_POST_READ (attrs))
7731       && NILP (get_translation_table (attrs, 0, NULL)))
7732     {
7733       chars = coding->head_ascii;
7734       if (chars < 0)
7735         chars = check_ascii (coding);
7736       if (chars != bytes)
7737         {
7738           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7739             chars = check_utf_8 (coding);
7740           else
7741             chars = -1;
7742         }
7743       if (chars >= 0)
7744         {
7745           if (coding->eol_seen != EOL_SEEN_NONE)
7746             adjust_coding_eol_type (coding, coding->eol_seen);
7747
7748           if (coding->eol_seen == EOL_SEEN_CR)
7749             {
7750               unsigned char *src_end = GAP_END_ADDR;
7751               unsigned char *src = src_end - coding->src_bytes;
7752
7753               while (src < src_end)
7754                 {
7755                   if (*src++ == '\r')
7756                     src[-1] = '\n';
7757                 }
7758             }
7759           else if (coding->eol_seen == EOL_SEEN_CRLF)
7760             {
7761               unsigned char *src = GAP_END_ADDR;
7762               unsigned char *src_beg = src - coding->src_bytes;
7763               unsigned char *dst = src;
7764               ptrdiff_t diff;
7765
7766               while (src_beg < src)
7767                 {
7768                   *--dst = *--src;
7769                   if (*src == '\n')
7770                     src--;
7771                 }
7772               diff = dst - src;
7773               bytes -= diff;
7774               chars -= diff;
7775             }
7776           coding->produced = bytes;
7777           coding->produced_char = chars;
7778           insert_from_gap (chars, bytes, 1);
7779           return;
7780         }
7781     }
7782   code_conversion_save (0, 0);
7783
7784   coding->mode |= CODING_MODE_LAST_BLOCK;
7785   current_buffer->text->inhibit_shrinking = 1;
7786   decode_coding (coding);
7787   current_buffer->text->inhibit_shrinking = 0;
7788
7789   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7790     {
7791       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7792       Lisp_Object val;
7793
7794       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7795       val = call1 (CODING_ATTR_POST_READ (attrs),
7796                    make_number (coding->produced_char));
7797       CHECK_NATNUM (val);
7798       coding->produced_char += Z - prev_Z;
7799       coding->produced += Z_BYTE - prev_Z_BYTE;
7800     }
7801
7802   unbind_to (count, Qnil);
7803 }
7804
7805
7806 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7807    SRC_OBJECT into DST_OBJECT by coding context CODING.
7808
7809    SRC_OBJECT is a buffer, a string, or Qnil.
7810
7811    If it is a buffer, the text is at point of the buffer.  FROM and TO
7812    are positions in the buffer.
7813
7814    If it is a string, the text is at the beginning of the string.
7815    FROM and TO are indices to the string.
7816
7817    If it is nil, the text is at coding->source.  FROM and TO are
7818    indices to coding->source.
7819
7820    DST_OBJECT is a buffer, Qt, or Qnil.
7821
7822    If it is a buffer, the decoded text is inserted at point of the
7823    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7824    is deleted.
7825
7826    If it is Qt, a string is made from the decoded text, and
7827    set in CODING->dst_object.
7828
7829    If it is Qnil, the decoded text is stored at CODING->destination.
7830    The caller must allocate CODING->dst_bytes bytes at
7831    CODING->destination by xmalloc.  If the decoded text is longer than
7832    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7833  */
7834
7835 void
7836 decode_coding_object (struct coding_system *coding,
7837                       Lisp_Object src_object,
7838                       ptrdiff_t from, ptrdiff_t from_byte,
7839                       ptrdiff_t to, ptrdiff_t to_byte,
7840                       Lisp_Object dst_object)
7841 {
7842   ptrdiff_t count = SPECPDL_INDEX ();
7843   unsigned char *destination IF_LINT (= NULL);
7844   ptrdiff_t dst_bytes IF_LINT (= 0);
7845   ptrdiff_t chars = to - from;
7846   ptrdiff_t bytes = to_byte - from_byte;
7847   Lisp_Object attrs;
7848   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7849   bool need_marker_adjustment = 0;
7850   Lisp_Object old_deactivate_mark;
7851
7852   old_deactivate_mark = Vdeactivate_mark;
7853
7854   if (NILP (dst_object))
7855     {
7856       destination = coding->destination;
7857       dst_bytes = coding->dst_bytes;
7858     }
7859
7860   coding->src_object = src_object;
7861   coding->src_chars = chars;
7862   coding->src_bytes = bytes;
7863   coding->src_multibyte = chars < bytes;
7864
7865   if (STRINGP (src_object))
7866     {
7867       coding->src_pos = from;
7868       coding->src_pos_byte = from_byte;
7869     }
7870   else if (BUFFERP (src_object))
7871     {
7872       set_buffer_internal (XBUFFER (src_object));
7873       if (from != GPT)
7874         move_gap_both (from, from_byte);
7875       if (EQ (src_object, dst_object))
7876         {
7877           struct Lisp_Marker *tail;
7878
7879           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7880             {
7881               tail->need_adjustment
7882                 = tail->charpos == (tail->insertion_type ? from : to);
7883               need_marker_adjustment |= tail->need_adjustment;
7884             }
7885           saved_pt = PT, saved_pt_byte = PT_BYTE;
7886           TEMP_SET_PT_BOTH (from, from_byte);
7887           current_buffer->text->inhibit_shrinking = 1;
7888           del_range_both (from, from_byte, to, to_byte, 1);
7889           coding->src_pos = -chars;
7890           coding->src_pos_byte = -bytes;
7891         }
7892       else
7893         {
7894           coding->src_pos = from;
7895           coding->src_pos_byte = from_byte;
7896         }
7897     }
7898
7899   if (CODING_REQUIRE_DETECTION (coding))
7900     detect_coding (coding);
7901   attrs = CODING_ID_ATTRS (coding->id);
7902
7903   if (EQ (dst_object, Qt)
7904       || (! NILP (CODING_ATTR_POST_READ (attrs))
7905           && NILP (dst_object)))
7906     {
7907       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7908       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7909       coding->dst_pos = BEG;
7910       coding->dst_pos_byte = BEG_BYTE;
7911     }
7912   else if (BUFFERP (dst_object))
7913     {
7914       code_conversion_save (0, 0);
7915       coding->dst_object = dst_object;
7916       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7917       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7918       coding->dst_multibyte
7919         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7920     }
7921   else
7922     {
7923       code_conversion_save (0, 0);
7924       coding->dst_object = Qnil;
7925       /* Most callers presume this will return a multibyte result, and they
7926          won't use `binary' or `raw-text' anyway, so let's not worry about
7927          CODING_FOR_UNIBYTE.  */
7928       coding->dst_multibyte = 1;
7929     }
7930
7931   decode_coding (coding);
7932
7933   if (BUFFERP (coding->dst_object))
7934     set_buffer_internal (XBUFFER (coding->dst_object));
7935
7936   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7937     {
7938       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7939       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7940       Lisp_Object val;
7941
7942       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7943       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7944               old_deactivate_mark);
7945       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7946                         make_number (coding->produced_char));
7947       UNGCPRO;
7948       CHECK_NATNUM (val);
7949       coding->produced_char += Z - prev_Z;
7950       coding->produced += Z_BYTE - prev_Z_BYTE;
7951     }
7952
7953   if (EQ (dst_object, Qt))
7954     {
7955       coding->dst_object = Fbuffer_string ();
7956     }
7957   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7958     {
7959       set_buffer_internal (XBUFFER (coding->dst_object));
7960       if (dst_bytes < coding->produced)
7961         {
7962           eassert (coding->produced > 0);
7963           destination = xrealloc (destination, coding->produced);
7964           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7965             move_gap_both (BEGV, BEGV_BYTE);
7966           memcpy (destination, BEGV_ADDR, coding->produced);
7967           coding->destination = destination;
7968         }
7969     }
7970
7971   if (saved_pt >= 0)
7972     {
7973       /* This is the case of:
7974          (BUFFERP (src_object) && EQ (src_object, dst_object))
7975          As we have moved PT while replacing the original buffer
7976          contents, we must recover it now.  */
7977       set_buffer_internal (XBUFFER (src_object));
7978       current_buffer->text->inhibit_shrinking = 0;
7979       if (saved_pt < from)
7980         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7981       else if (saved_pt < from + chars)
7982         TEMP_SET_PT_BOTH (from, from_byte);
7983       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7984         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7985                           saved_pt_byte + (coding->produced - bytes));
7986       else
7987         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7988                           saved_pt_byte + (coding->produced - bytes));
7989
7990       if (need_marker_adjustment)
7991         {
7992           struct Lisp_Marker *tail;
7993
7994           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7995             if (tail->need_adjustment)
7996               {
7997                 tail->need_adjustment = 0;
7998                 if (tail->insertion_type)
7999                   {
8000                     tail->bytepos = from_byte;
8001                     tail->charpos = from;
8002                   }
8003                 else
8004                   {
8005                     tail->bytepos = from_byte + coding->produced;
8006                     tail->charpos
8007                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8008                          ? tail->bytepos : from + coding->produced_char);
8009                   }
8010               }
8011         }
8012     }
8013
8014   Vdeactivate_mark = old_deactivate_mark;
8015   unbind_to (count, coding->dst_object);
8016 }
8017
8018
8019 void
8020 encode_coding_object (struct coding_system *coding,
8021                       Lisp_Object src_object,
8022                       ptrdiff_t from, ptrdiff_t from_byte,
8023                       ptrdiff_t to, ptrdiff_t to_byte,
8024                       Lisp_Object dst_object)
8025 {
8026   ptrdiff_t count = SPECPDL_INDEX ();
8027   ptrdiff_t chars = to - from;
8028   ptrdiff_t bytes = to_byte - from_byte;
8029   Lisp_Object attrs;
8030   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8031   bool need_marker_adjustment = 0;
8032   bool kill_src_buffer = 0;
8033   Lisp_Object old_deactivate_mark;
8034
8035   old_deactivate_mark = Vdeactivate_mark;
8036
8037   coding->src_object = src_object;
8038   coding->src_chars = chars;
8039   coding->src_bytes = bytes;
8040   coding->src_multibyte = chars < bytes;
8041
8042   attrs = CODING_ID_ATTRS (coding->id);
8043
8044   if (EQ (src_object, dst_object))
8045     {
8046       struct Lisp_Marker *tail;
8047
8048       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8049         {
8050           tail->need_adjustment
8051             = tail->charpos == (tail->insertion_type ? from : to);
8052           need_marker_adjustment |= tail->need_adjustment;
8053         }
8054     }
8055
8056   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8057     {
8058       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8059       set_buffer_internal (XBUFFER (coding->src_object));
8060       if (STRINGP (src_object))
8061         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8062       else if (BUFFERP (src_object))
8063         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8064       else
8065         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8066
8067       if (EQ (src_object, dst_object))
8068         {
8069           set_buffer_internal (XBUFFER (src_object));
8070           saved_pt = PT, saved_pt_byte = PT_BYTE;
8071           del_range_both (from, from_byte, to, to_byte, 1);
8072           set_buffer_internal (XBUFFER (coding->src_object));
8073         }
8074
8075       {
8076         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8077
8078         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8079                 old_deactivate_mark);
8080         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8081                     make_number (BEG), make_number (Z));
8082         UNGCPRO;
8083       }
8084       if (XBUFFER (coding->src_object) != current_buffer)
8085         kill_src_buffer = 1;
8086       coding->src_object = Fcurrent_buffer ();
8087       if (BEG != GPT)
8088         move_gap_both (BEG, BEG_BYTE);
8089       coding->src_chars = Z - BEG;
8090       coding->src_bytes = Z_BYTE - BEG_BYTE;
8091       coding->src_pos = BEG;
8092       coding->src_pos_byte = BEG_BYTE;
8093       coding->src_multibyte = Z < Z_BYTE;
8094     }
8095   else if (STRINGP (src_object))
8096     {
8097       code_conversion_save (0, 0);
8098       coding->src_pos = from;
8099       coding->src_pos_byte = from_byte;
8100     }
8101   else if (BUFFERP (src_object))
8102     {
8103       code_conversion_save (0, 0);
8104       set_buffer_internal (XBUFFER (src_object));
8105       if (EQ (src_object, dst_object))
8106         {
8107           saved_pt = PT, saved_pt_byte = PT_BYTE;
8108           coding->src_object = del_range_1 (from, to, 1, 1);
8109           coding->src_pos = 0;
8110           coding->src_pos_byte = 0;
8111         }
8112       else
8113         {
8114           if (from < GPT && to >= GPT)
8115             move_gap_both (from, from_byte);
8116           coding->src_pos = from;
8117           coding->src_pos_byte = from_byte;
8118         }
8119     }
8120   else
8121     code_conversion_save (0, 0);
8122
8123   if (BUFFERP (dst_object))
8124     {
8125       coding->dst_object = dst_object;
8126       if (EQ (src_object, dst_object))
8127         {
8128           coding->dst_pos = from;
8129           coding->dst_pos_byte = from_byte;
8130         }
8131       else
8132         {
8133           struct buffer *current = current_buffer;
8134
8135           set_buffer_temp (XBUFFER (dst_object));
8136           coding->dst_pos = PT;
8137           coding->dst_pos_byte = PT_BYTE;
8138           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8139           set_buffer_temp (current);
8140         }
8141       coding->dst_multibyte
8142         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8143     }
8144   else if (EQ (dst_object, Qt))
8145     {
8146       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8147       coding->dst_object = Qnil;
8148       coding->destination = xmalloc (dst_bytes);
8149       coding->dst_bytes = dst_bytes;
8150       coding->dst_multibyte = 0;
8151     }
8152   else
8153     {
8154       coding->dst_object = Qnil;
8155       coding->dst_multibyte = 0;
8156     }
8157
8158   encode_coding (coding);
8159
8160   if (EQ (dst_object, Qt))
8161     {
8162       if (BUFFERP (coding->dst_object))
8163         coding->dst_object = Fbuffer_string ();
8164       else
8165         {
8166           coding->dst_object
8167             = make_unibyte_string ((char *) coding->destination,
8168                                    coding->produced);
8169           xfree (coding->destination);
8170         }
8171     }
8172
8173   if (saved_pt >= 0)
8174     {
8175       /* This is the case of:
8176          (BUFFERP (src_object) && EQ (src_object, dst_object))
8177          As we have moved PT while replacing the original buffer
8178          contents, we must recover it now.  */
8179       set_buffer_internal (XBUFFER (src_object));
8180       if (saved_pt < from)
8181         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8182       else if (saved_pt < from + chars)
8183         TEMP_SET_PT_BOTH (from, from_byte);
8184       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8185         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8186                           saved_pt_byte + (coding->produced - bytes));
8187       else
8188         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8189                           saved_pt_byte + (coding->produced - bytes));
8190
8191       if (need_marker_adjustment)
8192         {
8193           struct Lisp_Marker *tail;
8194
8195           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8196             if (tail->need_adjustment)
8197               {
8198                 tail->need_adjustment = 0;
8199                 if (tail->insertion_type)
8200                   {
8201                     tail->bytepos = from_byte;
8202                     tail->charpos = from;
8203                   }
8204                 else
8205                   {
8206                     tail->bytepos = from_byte + coding->produced;
8207                     tail->charpos
8208                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8209                          ? tail->bytepos : from + coding->produced_char);
8210                   }
8211               }
8212         }
8213     }
8214
8215   if (kill_src_buffer)
8216     Fkill_buffer (coding->src_object);
8217
8218   Vdeactivate_mark = old_deactivate_mark;
8219   unbind_to (count, Qnil);
8220 }
8221
8222
8223 Lisp_Object
8224 preferred_coding_system (void)
8225 {
8226   int id = coding_categories[coding_priorities[0]].id;
8227
8228   return CODING_ID_NAME (id);
8229 }
8230
8231 #if defined (WINDOWSNT) || defined (CYGWIN)
8232
8233 Lisp_Object
8234 from_unicode (Lisp_Object str)
8235 {
8236   CHECK_STRING (str);
8237   if (!STRING_MULTIBYTE (str) &&
8238       SBYTES (str) & 1)
8239     {
8240       str = Fsubstring (str, make_number (0), make_number (-1));
8241     }
8242
8243   return code_convert_string_norecord (str, Qutf_16le, 0);
8244 }
8245
8246 Lisp_Object
8247 from_unicode_buffer (const wchar_t* wstr)
8248 {
8249     return from_unicode (
8250         make_unibyte_string (
8251             (char*) wstr,
8252             /* we get one of the two final 0 bytes for free. */
8253             1 + sizeof (wchar_t) * wcslen (wstr)));
8254 }
8255
8256 wchar_t *
8257 to_unicode (Lisp_Object str, Lisp_Object *buf)
8258 {
8259   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8260   /* We need to make another copy (in addition to the one made by
8261      code_convert_string_norecord) to ensure that the final string is
8262      _doubly_ zero terminated --- that is, that the string is
8263      terminated by two zero bytes and one utf-16le null character.
8264      Because strings are already terminated with a single zero byte,
8265      we just add one additional zero. */
8266   str = make_uninit_string (SBYTES (*buf) + 1);
8267   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8268   SDATA (str) [SBYTES (*buf)] = '\0';
8269   *buf = str;
8270   return WCSDATA (*buf);
8271 }
8272
8273 #endif /* WINDOWSNT || CYGWIN */
8274
8275 \f
8276 #ifdef emacs
8277 /*** 8. Emacs Lisp library functions ***/
8278
8279 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8280        doc: /* Return t if OBJECT is nil or a coding-system.
8281 See the documentation of `define-coding-system' for information
8282 about coding-system objects.  */)
8283   (Lisp_Object object)
8284 {
8285   if (NILP (object)
8286       || CODING_SYSTEM_ID (object) >= 0)
8287     return Qt;
8288   if (! SYMBOLP (object)
8289       || NILP (Fget (object, Qcoding_system_define_form)))
8290     return Qnil;
8291   return Qt;
8292 }
8293
8294 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8295        Sread_non_nil_coding_system, 1, 1, 0,
8296        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8297   (Lisp_Object prompt)
8298 {
8299   Lisp_Object val;
8300   do
8301     {
8302       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8303                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8304     }
8305   while (SCHARS (val) == 0);
8306   return (Fintern (val, Qnil));
8307 }
8308
8309 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8310        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8311 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8312 Ignores case when completing coding systems (all Emacs coding systems
8313 are lower-case).  */)
8314   (Lisp_Object prompt, Lisp_Object default_coding_system)
8315 {
8316   Lisp_Object val;
8317   ptrdiff_t count = SPECPDL_INDEX ();
8318
8319   if (SYMBOLP (default_coding_system))
8320     default_coding_system = SYMBOL_NAME (default_coding_system);
8321   specbind (Qcompletion_ignore_case, Qt);
8322   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8323                           Qt, Qnil, Qcoding_system_history,
8324                           default_coding_system, Qnil);
8325   unbind_to (count, Qnil);
8326   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8327 }
8328
8329 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8330        1, 1, 0,
8331        doc: /* Check validity of CODING-SYSTEM.
8332 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8333 It is valid if it is nil or a symbol defined as a coding system by the
8334 function `define-coding-system'.  */)
8335   (Lisp_Object coding_system)
8336 {
8337   Lisp_Object define_form;
8338
8339   define_form = Fget (coding_system, Qcoding_system_define_form);
8340   if (! NILP (define_form))
8341     {
8342       Fput (coding_system, Qcoding_system_define_form, Qnil);
8343       safe_eval (define_form);
8344     }
8345   if (!NILP (Fcoding_system_p (coding_system)))
8346     return coding_system;
8347   xsignal1 (Qcoding_system_error, coding_system);
8348 }
8349
8350 \f
8351 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8352    HIGHEST, return the coding system of the highest
8353    priority among the detected coding systems.  Otherwise return a
8354    list of detected coding systems sorted by their priorities.  If
8355    MULTIBYTEP, it is assumed that the bytes are in correct
8356    multibyte form but contains only ASCII and eight-bit chars.
8357    Otherwise, the bytes are raw bytes.
8358
8359    CODING-SYSTEM controls the detection as below:
8360
8361    If it is nil, detect both text-format and eol-format.  If the
8362    text-format part of CODING-SYSTEM is already specified
8363    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8364    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8365    detect only text-format.  */
8366
8367 Lisp_Object
8368 detect_coding_system (const unsigned char *src,
8369                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8370                       bool highest, bool multibytep,
8371                       Lisp_Object coding_system)
8372 {
8373   const unsigned char *src_end = src + src_bytes;
8374   Lisp_Object attrs, eol_type;
8375   Lisp_Object val = Qnil;
8376   struct coding_system coding;
8377   ptrdiff_t id;
8378   struct coding_detection_info detect_info;
8379   enum coding_category base_category;
8380   bool null_byte_found = 0, eight_bit_found = 0;
8381
8382   if (NILP (coding_system))
8383     coding_system = Qundecided;
8384   setup_coding_system (coding_system, &coding);
8385   attrs = CODING_ID_ATTRS (coding.id);
8386   eol_type = CODING_ID_EOL_TYPE (coding.id);
8387   coding_system = CODING_ATTR_BASE_NAME (attrs);
8388
8389   coding.source = src;
8390   coding.src_chars = src_chars;
8391   coding.src_bytes = src_bytes;
8392   coding.src_multibyte = multibytep;
8393   coding.consumed = 0;
8394   coding.mode |= CODING_MODE_LAST_BLOCK;
8395   coding.head_ascii = 0;
8396
8397   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8398
8399   /* At first, detect text-format if necessary.  */
8400   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8401   if (base_category == coding_category_undecided)
8402     {
8403       enum coding_category category IF_LINT (= 0);
8404       struct coding_system *this IF_LINT (= NULL);
8405       int c, i;
8406
8407       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8408       for (; src < src_end; src++)
8409         {
8410           c = *src;
8411           if (c & 0x80)
8412             {
8413               eight_bit_found = 1;
8414               if (null_byte_found)
8415                 break;
8416             }
8417           else if (c < 0x20)
8418             {
8419               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8420                   && ! inhibit_iso_escape_detection
8421                   && ! detect_info.checked)
8422                 {
8423                   if (detect_coding_iso_2022 (&coding, &detect_info))
8424                     {
8425                       /* We have scanned the whole data.  */
8426                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8427                         {
8428                           /* We didn't find an 8-bit code.  We may
8429                              have found a null-byte, but it's very
8430                              rare that a binary file confirm to
8431                              ISO-2022.  */
8432                           src = src_end;
8433                           coding.head_ascii = src - coding.source;
8434                         }
8435                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8436                       break;
8437                     }
8438                 }
8439               else if (! c && !inhibit_null_byte_detection)
8440                 {
8441                   null_byte_found = 1;
8442                   if (eight_bit_found)
8443                     break;
8444                 }
8445               if (! eight_bit_found)
8446                 coding.head_ascii++;
8447             }
8448           else if (! eight_bit_found)
8449             coding.head_ascii++;
8450         }
8451
8452       if (null_byte_found || eight_bit_found
8453           || coding.head_ascii < coding.src_bytes
8454           || detect_info.found)
8455         {
8456           if (coding.head_ascii == coding.src_bytes)
8457             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8458             for (i = 0; i < coding_category_raw_text; i++)
8459               {
8460                 category = coding_priorities[i];
8461                 this = coding_categories + category;
8462                 if (detect_info.found & (1 << category))
8463                   break;
8464               }
8465           else
8466             {
8467               if (null_byte_found)
8468                 {
8469                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8470                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8471                 }
8472               for (i = 0; i < coding_category_raw_text; i++)
8473                 {
8474                   category = coding_priorities[i];
8475                   this = coding_categories + category;
8476
8477                   if (this->id < 0)
8478                     {
8479                       /* No coding system of this category is defined.  */
8480                       detect_info.rejected |= (1 << category);
8481                     }
8482                   else if (category >= coding_category_raw_text)
8483                     continue;
8484                   else if (detect_info.checked & (1 << category))
8485                     {
8486                       if (highest
8487                           && (detect_info.found & (1 << category)))
8488                         break;
8489                     }
8490                   else if ((*(this->detector)) (&coding, &detect_info)
8491                            && highest
8492                            && (detect_info.found & (1 << category)))
8493                     {
8494                       if (category == coding_category_utf_16_auto)
8495                         {
8496                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8497                             category = coding_category_utf_16_le;
8498                           else
8499                             category = coding_category_utf_16_be;
8500                         }
8501                       break;
8502                     }
8503                 }
8504             }
8505         }
8506
8507       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8508           || null_byte_found)
8509         {
8510           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8511           id = CODING_SYSTEM_ID (Qno_conversion);
8512           val = Fcons (make_number (id), Qnil);
8513         }
8514       else if (! detect_info.rejected && ! detect_info.found)
8515         {
8516           detect_info.found = CATEGORY_MASK_ANY;
8517           id = coding_categories[coding_category_undecided].id;
8518           val = Fcons (make_number (id), Qnil);
8519         }
8520       else if (highest)
8521         {
8522           if (detect_info.found)
8523             {
8524               detect_info.found = 1 << category;
8525               val = Fcons (make_number (this->id), Qnil);
8526             }
8527           else
8528             for (i = 0; i < coding_category_raw_text; i++)
8529               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8530                 {
8531                   detect_info.found = 1 << coding_priorities[i];
8532                   id = coding_categories[coding_priorities[i]].id;
8533                   val = Fcons (make_number (id), Qnil);
8534                   break;
8535                 }
8536         }
8537       else
8538         {
8539           int mask = detect_info.rejected | detect_info.found;
8540           int found = 0;
8541
8542           for (i = coding_category_raw_text - 1; i >= 0; i--)
8543             {
8544               category = coding_priorities[i];
8545               if (! (mask & (1 << category)))
8546                 {
8547                   found |= 1 << category;
8548                   id = coding_categories[category].id;
8549                   if (id >= 0)
8550                     val = Fcons (make_number (id), val);
8551                 }
8552             }
8553           for (i = coding_category_raw_text - 1; i >= 0; i--)
8554             {
8555               category = coding_priorities[i];
8556               if (detect_info.found & (1 << category))
8557                 {
8558                   id = coding_categories[category].id;
8559                   val = Fcons (make_number (id), val);
8560                 }
8561             }
8562           detect_info.found |= found;
8563         }
8564     }
8565   else if (base_category == coding_category_utf_8_auto)
8566     {
8567       if (detect_coding_utf_8 (&coding, &detect_info))
8568         {
8569           struct coding_system *this;
8570
8571           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8572             this = coding_categories + coding_category_utf_8_sig;
8573           else
8574             this = coding_categories + coding_category_utf_8_nosig;
8575           val = Fcons (make_number (this->id), Qnil);
8576         }
8577     }
8578   else if (base_category == coding_category_utf_16_auto)
8579     {
8580       if (detect_coding_utf_16 (&coding, &detect_info))
8581         {
8582           struct coding_system *this;
8583
8584           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8585             this = coding_categories + coding_category_utf_16_le;
8586           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8587             this = coding_categories + coding_category_utf_16_be;
8588           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8589             this = coding_categories + coding_category_utf_16_be_nosig;
8590           else
8591             this = coding_categories + coding_category_utf_16_le_nosig;
8592           val = Fcons (make_number (this->id), Qnil);
8593         }
8594     }
8595   else
8596     {
8597       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8598       val = Fcons (make_number (coding.id), Qnil);
8599     }
8600
8601   /* Then, detect eol-format if necessary.  */
8602   {
8603     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8604     Lisp_Object tail;
8605
8606     if (VECTORP (eol_type))
8607       {
8608         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8609           {
8610             if (null_byte_found)
8611               normal_eol = EOL_SEEN_LF;
8612             else
8613               normal_eol = detect_eol (coding.source, src_bytes,
8614                                        coding_category_raw_text);
8615           }
8616         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8617                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8618           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8619                                       coding_category_utf_16_be);
8620         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8621                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8622           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8623                                       coding_category_utf_16_le);
8624       }
8625     else
8626       {
8627         if (EQ (eol_type, Qunix))
8628           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8629         else if (EQ (eol_type, Qdos))
8630           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8631         else
8632           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8633       }
8634
8635     for (tail = val; CONSP (tail); tail = XCDR (tail))
8636       {
8637         enum coding_category category;
8638         int this_eol;
8639
8640         id = XINT (XCAR (tail));
8641         attrs = CODING_ID_ATTRS (id);
8642         category = XINT (CODING_ATTR_CATEGORY (attrs));
8643         eol_type = CODING_ID_EOL_TYPE (id);
8644         if (VECTORP (eol_type))
8645           {
8646             if (category == coding_category_utf_16_be
8647                 || category == coding_category_utf_16_be_nosig)
8648               this_eol = utf_16_be_eol;
8649             else if (category == coding_category_utf_16_le
8650                      || category == coding_category_utf_16_le_nosig)
8651               this_eol = utf_16_le_eol;
8652             else
8653               this_eol = normal_eol;
8654
8655             if (this_eol == EOL_SEEN_LF)
8656               XSETCAR (tail, AREF (eol_type, 0));
8657             else if (this_eol == EOL_SEEN_CRLF)
8658               XSETCAR (tail, AREF (eol_type, 1));
8659             else if (this_eol == EOL_SEEN_CR)
8660               XSETCAR (tail, AREF (eol_type, 2));
8661             else
8662               XSETCAR (tail, CODING_ID_NAME (id));
8663           }
8664         else
8665           XSETCAR (tail, CODING_ID_NAME (id));
8666       }
8667   }
8668
8669   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8670 }
8671
8672
8673 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8674        2, 3, 0,
8675        doc: /* Detect coding system of the text in the region between START and END.
8676 Return a list of possible coding systems ordered by priority.
8677 The coding systems to try and their priorities follows what
8678 the function `coding-system-priority-list' (which see) returns.
8679
8680 If only ASCII characters are found (except for such ISO-2022 control
8681 characters as ESC), it returns a list of single element `undecided'
8682 or its subsidiary coding system according to a detected end-of-line
8683 format.
8684
8685 If optional argument HIGHEST is non-nil, return the coding system of
8686 highest priority.  */)
8687   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8688 {
8689   ptrdiff_t from, to;
8690   ptrdiff_t from_byte, to_byte;
8691
8692   validate_region (&start, &end);
8693   from = XINT (start), to = XINT (end);
8694   from_byte = CHAR_TO_BYTE (from);
8695   to_byte = CHAR_TO_BYTE (to);
8696
8697   if (from < GPT && to >= GPT)
8698     move_gap_both (to, to_byte);
8699
8700   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8701                                to - from, to_byte - from_byte,
8702                                !NILP (highest),
8703                                !NILP (BVAR (current_buffer
8704                                       , enable_multibyte_characters)),
8705                                Qnil);
8706 }
8707
8708 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8709        1, 2, 0,
8710        doc: /* Detect coding system of the text in STRING.
8711 Return a list of possible coding systems ordered by priority.
8712 The coding systems to try and their priorities follows what
8713 the function `coding-system-priority-list' (which see) returns.
8714
8715 If only ASCII characters are found (except for such ISO-2022 control
8716 characters as ESC), it returns a list of single element `undecided'
8717 or its subsidiary coding system according to a detected end-of-line
8718 format.
8719
8720 If optional argument HIGHEST is non-nil, return the coding system of
8721 highest priority.  */)
8722   (Lisp_Object string, Lisp_Object highest)
8723 {
8724   CHECK_STRING (string);
8725
8726   return detect_coding_system (SDATA (string),
8727                                SCHARS (string), SBYTES (string),
8728                                !NILP (highest), STRING_MULTIBYTE (string),
8729                                Qnil);
8730 }
8731
8732
8733 static bool
8734 char_encodable_p (int c, Lisp_Object attrs)
8735 {
8736   Lisp_Object tail;
8737   struct charset *charset;
8738   Lisp_Object translation_table;
8739
8740   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8741   if (! NILP (translation_table))
8742     c = translate_char (translation_table, c);
8743   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8744        CONSP (tail); tail = XCDR (tail))
8745     {
8746       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8747       if (CHAR_CHARSET_P (c, charset))
8748         break;
8749     }
8750   return (! NILP (tail));
8751 }
8752
8753
8754 /* Return a list of coding systems that safely encode the text between
8755    START and END.  If EXCLUDE is non-nil, it is a list of coding
8756    systems not to check.  The returned list doesn't contain any such
8757    coding systems.  In any case, if the text contains only ASCII or is
8758    unibyte, return t.  */
8759
8760 DEFUN ("find-coding-systems-region-internal",
8761        Ffind_coding_systems_region_internal,
8762        Sfind_coding_systems_region_internal, 2, 3, 0,
8763        doc: /* Internal use only.  */)
8764   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8765 {
8766   Lisp_Object coding_attrs_list, safe_codings;
8767   ptrdiff_t start_byte, end_byte;
8768   const unsigned char *p, *pbeg, *pend;
8769   int c;
8770   Lisp_Object tail, elt, work_table;
8771
8772   if (STRINGP (start))
8773     {
8774       if (!STRING_MULTIBYTE (start)
8775           || SCHARS (start) == SBYTES (start))
8776         return Qt;
8777       start_byte = 0;
8778       end_byte = SBYTES (start);
8779     }
8780   else
8781     {
8782       CHECK_NUMBER_COERCE_MARKER (start);
8783       CHECK_NUMBER_COERCE_MARKER (end);
8784       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8785         args_out_of_range (start, end);
8786       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8787         return Qt;
8788       start_byte = CHAR_TO_BYTE (XINT (start));
8789       end_byte = CHAR_TO_BYTE (XINT (end));
8790       if (XINT (end) - XINT (start) == end_byte - start_byte)
8791         return Qt;
8792
8793       if (XINT (start) < GPT && XINT (end) > GPT)
8794         {
8795           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8796             move_gap_both (XINT (start), start_byte);
8797           else
8798             move_gap_both (XINT (end), end_byte);
8799         }
8800     }
8801
8802   coding_attrs_list = Qnil;
8803   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8804     if (NILP (exclude)
8805         || NILP (Fmemq (XCAR (tail), exclude)))
8806       {
8807         Lisp_Object attrs;
8808
8809         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8810         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8811             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8812           {
8813             ASET (attrs, coding_attr_trans_tbl,
8814                   get_translation_table (attrs, 1, NULL));
8815             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8816           }
8817       }
8818
8819   if (STRINGP (start))
8820     p = pbeg = SDATA (start);
8821   else
8822     p = pbeg = BYTE_POS_ADDR (start_byte);
8823   pend = p + (end_byte - start_byte);
8824
8825   while (p < pend && ASCII_BYTE_P (*p)) p++;
8826   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8827
8828   work_table = Fmake_char_table (Qnil, Qnil);
8829   while (p < pend)
8830     {
8831       if (ASCII_BYTE_P (*p))
8832         p++;
8833       else
8834         {
8835           c = STRING_CHAR_ADVANCE (p);
8836           if (!NILP (char_table_ref (work_table, c)))
8837             /* This character was already checked.  Ignore it.  */
8838             continue;
8839
8840           charset_map_loaded = 0;
8841           for (tail = coding_attrs_list; CONSP (tail);)
8842             {
8843               elt = XCAR (tail);
8844               if (NILP (elt))
8845                 tail = XCDR (tail);
8846               else if (char_encodable_p (c, elt))
8847                 tail = XCDR (tail);
8848               else if (CONSP (XCDR (tail)))
8849                 {
8850                   XSETCAR (tail, XCAR (XCDR (tail)));
8851                   XSETCDR (tail, XCDR (XCDR (tail)));
8852                 }
8853               else
8854                 {
8855                   XSETCAR (tail, Qnil);
8856                   tail = XCDR (tail);
8857                 }
8858             }
8859           if (charset_map_loaded)
8860             {
8861               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8862
8863               if (STRINGP (start))
8864                 pbeg = SDATA (start);
8865               else
8866                 pbeg = BYTE_POS_ADDR (start_byte);
8867               p = pbeg + p_offset;
8868               pend = pbeg + pend_offset;
8869             }
8870           char_table_set (work_table, c, Qt);
8871         }
8872     }
8873
8874   safe_codings = list2 (Qraw_text, Qno_conversion);
8875   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8876     if (! NILP (XCAR (tail)))
8877       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8878
8879   return safe_codings;
8880 }
8881
8882
8883 DEFUN ("unencodable-char-position", Funencodable_char_position,
8884        Sunencodable_char_position, 3, 5, 0,
8885        doc: /*
8886 Return position of first un-encodable character in a region.
8887 START and END specify the region and CODING-SYSTEM specifies the
8888 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8889
8890 If optional 4th argument COUNT is non-nil, it specifies at most how
8891 many un-encodable characters to search.  In this case, the value is a
8892 list of positions.
8893
8894 If optional 5th argument STRING is non-nil, it is a string to search
8895 for un-encodable characters.  In that case, START and END are indexes
8896 to the string.  */)
8897   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8898 {
8899   EMACS_INT n;
8900   struct coding_system coding;
8901   Lisp_Object attrs, charset_list, translation_table;
8902   Lisp_Object positions;
8903   ptrdiff_t from, to;
8904   const unsigned char *p, *stop, *pend;
8905   bool ascii_compatible;
8906
8907   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8908   attrs = CODING_ID_ATTRS (coding.id);
8909   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8910     return Qnil;
8911   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8912   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8913   translation_table = get_translation_table (attrs, 1, NULL);
8914
8915   if (NILP (string))
8916     {
8917       validate_region (&start, &end);
8918       from = XINT (start);
8919       to = XINT (end);
8920       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8921           || (ascii_compatible
8922               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8923         return Qnil;
8924       p = CHAR_POS_ADDR (from);
8925       pend = CHAR_POS_ADDR (to);
8926       if (from < GPT && to >= GPT)
8927         stop = GPT_ADDR;
8928       else
8929         stop = pend;
8930     }
8931   else
8932     {
8933       CHECK_STRING (string);
8934       CHECK_NATNUM (start);
8935       CHECK_NATNUM (end);
8936       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8937         args_out_of_range_3 (string, start, end);
8938       from = XINT (start);
8939       to = XINT (end);
8940       if (! STRING_MULTIBYTE (string))
8941         return Qnil;
8942       p = SDATA (string) + string_char_to_byte (string, from);
8943       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8944       if (ascii_compatible && (to - from) == (pend - p))
8945         return Qnil;
8946     }
8947
8948   if (NILP (count))
8949     n = 1;
8950   else
8951     {
8952       CHECK_NATNUM (count);
8953       n = XINT (count);
8954     }
8955
8956   positions = Qnil;
8957   charset_map_loaded = 0;
8958   while (1)
8959     {
8960       int c;
8961
8962       if (ascii_compatible)
8963         while (p < stop && ASCII_BYTE_P (*p))
8964           p++, from++;
8965       if (p >= stop)
8966         {
8967           if (p >= pend)
8968             break;
8969           stop = pend;
8970           p = GAP_END_ADDR;
8971         }
8972
8973       c = STRING_CHAR_ADVANCE (p);
8974       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8975           && ! char_charset (translate_char (translation_table, c),
8976                              charset_list, NULL))
8977         {
8978           positions = Fcons (make_number (from), positions);
8979           n--;
8980           if (n == 0)
8981             break;
8982         }
8983
8984       from++;
8985       if (charset_map_loaded && NILP (string))
8986         {
8987           p = CHAR_POS_ADDR (from);
8988           pend = CHAR_POS_ADDR (to);
8989           if (from < GPT && to >= GPT)
8990             stop = GPT_ADDR;
8991           else
8992             stop = pend;
8993           charset_map_loaded = 0;
8994         }
8995     }
8996
8997   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8998 }
8999
9000
9001 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9002        Scheck_coding_systems_region, 3, 3, 0,
9003        doc: /* Check if the region is encodable by coding systems.
9004
9005 START and END are buffer positions specifying the region.
9006 CODING-SYSTEM-LIST is a list of coding systems to check.
9007
9008 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9009 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9010 whole region, POS0, POS1, ... are buffer positions where non-encodable
9011 characters are found.
9012
9013 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9014 value is nil.
9015
9016 START may be a string.  In that case, check if the string is
9017 encodable, and the value contains indices to the string instead of
9018 buffer positions.  END is ignored.
9019
9020 If the current buffer (or START if it is a string) is unibyte, the value
9021 is nil.  */)
9022   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9023 {
9024   Lisp_Object list;
9025   ptrdiff_t start_byte, end_byte;
9026   ptrdiff_t pos;
9027   const unsigned char *p, *pbeg, *pend;
9028   int c;
9029   Lisp_Object tail, elt, attrs;
9030
9031   if (STRINGP (start))
9032     {
9033       if (!STRING_MULTIBYTE (start)
9034           || SCHARS (start) == SBYTES (start))
9035         return Qnil;
9036       start_byte = 0;
9037       end_byte = SBYTES (start);
9038       pos = 0;
9039     }
9040   else
9041     {
9042       CHECK_NUMBER_COERCE_MARKER (start);
9043       CHECK_NUMBER_COERCE_MARKER (end);
9044       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9045         args_out_of_range (start, end);
9046       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9047         return Qnil;
9048       start_byte = CHAR_TO_BYTE (XINT (start));
9049       end_byte = CHAR_TO_BYTE (XINT (end));
9050       if (XINT (end) - XINT (start) == end_byte - start_byte)
9051         return Qnil;
9052
9053       if (XINT (start) < GPT && XINT (end) > GPT)
9054         {
9055           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9056             move_gap_both (XINT (start), start_byte);
9057           else
9058             move_gap_both (XINT (end), end_byte);
9059         }
9060       pos = XINT (start);
9061     }
9062
9063   list = Qnil;
9064   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9065     {
9066       elt = XCAR (tail);
9067       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9068       ASET (attrs, coding_attr_trans_tbl,
9069             get_translation_table (attrs, 1, NULL));
9070       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
9071     }
9072
9073   if (STRINGP (start))
9074     p = pbeg = SDATA (start);
9075   else
9076     p = pbeg = BYTE_POS_ADDR (start_byte);
9077   pend = p + (end_byte - start_byte);
9078
9079   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9080   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9081
9082   while (p < pend)
9083     {
9084       if (ASCII_BYTE_P (*p))
9085         p++;
9086       else
9087         {
9088           c = STRING_CHAR_ADVANCE (p);
9089
9090           charset_map_loaded = 0;
9091           for (tail = list; CONSP (tail); tail = XCDR (tail))
9092             {
9093               elt = XCDR (XCAR (tail));
9094               if (! char_encodable_p (c, XCAR (elt)))
9095                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9096             }
9097           if (charset_map_loaded)
9098             {
9099               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9100
9101               if (STRINGP (start))
9102                 pbeg = SDATA (start);
9103               else
9104                 pbeg = BYTE_POS_ADDR (start_byte);
9105               p = pbeg + p_offset;
9106               pend = pbeg + pend_offset;
9107             }
9108         }
9109       pos++;
9110     }
9111
9112   tail = list;
9113   list = Qnil;
9114   for (; CONSP (tail); tail = XCDR (tail))
9115     {
9116       elt = XCAR (tail);
9117       if (CONSP (XCDR (XCDR (elt))))
9118         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9119                       list);
9120     }
9121
9122   return list;
9123 }
9124
9125
9126 static Lisp_Object
9127 code_convert_region (Lisp_Object start, Lisp_Object end,
9128                      Lisp_Object coding_system, Lisp_Object dst_object,
9129                      bool encodep, bool norecord)
9130 {
9131   struct coding_system coding;
9132   ptrdiff_t from, from_byte, to, to_byte;
9133   Lisp_Object src_object;
9134
9135   if (NILP (coding_system))
9136     coding_system = Qno_conversion;
9137   else
9138     CHECK_CODING_SYSTEM (coding_system);
9139   src_object = Fcurrent_buffer ();
9140   if (NILP (dst_object))
9141     dst_object = src_object;
9142   else if (! EQ (dst_object, Qt))
9143     CHECK_BUFFER (dst_object);
9144
9145   validate_region (&start, &end);
9146   from = XFASTINT (start);
9147   from_byte = CHAR_TO_BYTE (from);
9148   to = XFASTINT (end);
9149   to_byte = CHAR_TO_BYTE (to);
9150
9151   setup_coding_system (coding_system, &coding);
9152   coding.mode |= CODING_MODE_LAST_BLOCK;
9153
9154   if (encodep)
9155     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9156                           dst_object);
9157   else
9158     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9159                           dst_object);
9160   if (! norecord)
9161     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9162
9163   return (BUFFERP (dst_object)
9164           ? make_number (coding.produced_char)
9165           : coding.dst_object);
9166 }
9167
9168
9169 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9170        3, 4, "r\nzCoding system: ",
9171        doc: /* Decode the current region from the specified coding system.
9172 When called from a program, takes four arguments:
9173         START, END, CODING-SYSTEM, and DESTINATION.
9174 START and END are buffer positions.
9175
9176 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9177 If nil, the region between START and END is replaced by the decoded text.
9178 If buffer, the decoded text is inserted in that buffer after point (point
9179 does not move).
9180 In those cases, the length of the decoded text is returned.
9181 If DESTINATION is t, the decoded text is returned.
9182
9183 This function sets `last-coding-system-used' to the precise coding system
9184 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9185 not fully specified.)  */)
9186   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9187 {
9188   return code_convert_region (start, end, coding_system, destination, 0, 0);
9189 }
9190
9191 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9192        3, 4, "r\nzCoding system: ",
9193        doc: /* Encode the current region by specified coding system.
9194 When called from a program, takes four arguments:
9195         START, END, CODING-SYSTEM and DESTINATION.
9196 START and END are buffer positions.
9197
9198 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9199 If nil, the region between START and END is replace by the encoded text.
9200 If buffer, the encoded text is inserted in that buffer after point (point
9201 does not move).
9202 In those cases, the length of the encoded text is returned.
9203 If DESTINATION is t, the encoded text is returned.
9204
9205 This function sets `last-coding-system-used' to the precise coding system
9206 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9207 not fully specified.)  */)
9208   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9209 {
9210   return code_convert_region (start, end, coding_system, destination, 1, 0);
9211 }
9212
9213 Lisp_Object
9214 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9215                      Lisp_Object dst_object, bool encodep, bool nocopy,
9216                      bool norecord)
9217 {
9218   struct coding_system coding;
9219   ptrdiff_t chars, bytes;
9220
9221   CHECK_STRING (string);
9222   if (NILP (coding_system))
9223     {
9224       if (! norecord)
9225         Vlast_coding_system_used = Qno_conversion;
9226       if (NILP (dst_object))
9227         return (nocopy ? Fcopy_sequence (string) : string);
9228     }
9229
9230   if (NILP (coding_system))
9231     coding_system = Qno_conversion;
9232   else
9233     CHECK_CODING_SYSTEM (coding_system);
9234   if (NILP (dst_object))
9235     dst_object = Qt;
9236   else if (! EQ (dst_object, Qt))
9237     CHECK_BUFFER (dst_object);
9238
9239   setup_coding_system (coding_system, &coding);
9240   coding.mode |= CODING_MODE_LAST_BLOCK;
9241   chars = SCHARS (string);
9242   bytes = SBYTES (string);
9243   if (encodep)
9244     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9245   else
9246     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9247   if (! norecord)
9248     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9249
9250   return (BUFFERP (dst_object)
9251           ? make_number (coding.produced_char)
9252           : coding.dst_object);
9253 }
9254
9255
9256 /* Encode or decode STRING according to CODING_SYSTEM.
9257    Do not set Vlast_coding_system_used.
9258
9259    This function is called only from macros DECODE_FILE and
9260    ENCODE_FILE, thus we ignore character composition.  */
9261
9262 Lisp_Object
9263 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9264                               bool encodep)
9265 {
9266   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9267 }
9268
9269
9270 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9271        2, 4, 0,
9272        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9273
9274 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9275 if the decoding operation is trivial.
9276
9277 Optional fourth arg BUFFER non-nil means that the decoded text is
9278 inserted in that buffer after point (point does not move).  In this
9279 case, the return value is the length of the decoded text.
9280
9281 This function sets `last-coding-system-used' to the precise coding system
9282 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9283 not fully specified.)  */)
9284   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9285 {
9286   return code_convert_string (string, coding_system, buffer,
9287                               0, ! NILP (nocopy), 0);
9288 }
9289
9290 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9291        2, 4, 0,
9292        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9293
9294 Optional third arg NOCOPY non-nil means it is OK to return STRING
9295 itself if the encoding operation is trivial.
9296
9297 Optional fourth arg BUFFER non-nil means that the encoded text is
9298 inserted in that buffer after point (point does not move).  In this
9299 case, the return value is the length of the encoded text.
9300
9301 This function sets `last-coding-system-used' to the precise coding system
9302 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9303 not fully specified.)  */)
9304   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9305 {
9306   return code_convert_string (string, coding_system, buffer,
9307                               1, ! NILP (nocopy), 0);
9308 }
9309
9310 \f
9311 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9312        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9313 Return the corresponding character.  */)
9314   (Lisp_Object code)
9315 {
9316   Lisp_Object spec, attrs, val;
9317   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9318   EMACS_INT ch;
9319   int c;
9320
9321   CHECK_NATNUM (code);
9322   ch = XFASTINT (code);
9323   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9324   attrs = AREF (spec, 0);
9325
9326   if (ASCII_BYTE_P (ch)
9327       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9328     return code;
9329
9330   val = CODING_ATTR_CHARSET_LIST (attrs);
9331   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9332   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9333   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9334
9335   if (ch <= 0x7F)
9336     {
9337       c = ch;
9338       charset = charset_roman;
9339     }
9340   else if (ch >= 0xA0 && ch < 0xDF)
9341     {
9342       c = ch - 0x80;
9343       charset = charset_kana;
9344     }
9345   else
9346     {
9347       EMACS_INT c1 = ch >> 8;
9348       int c2 = ch & 0xFF;
9349
9350       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9351           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9352         error ("Invalid code: %"pI"d", ch);
9353       c = ch;
9354       SJIS_TO_JIS (c);
9355       charset = charset_kanji;
9356     }
9357   c = DECODE_CHAR (charset, c);
9358   if (c < 0)
9359     error ("Invalid code: %"pI"d", ch);
9360   return make_number (c);
9361 }
9362
9363
9364 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9365        doc: /* Encode a Japanese character CH to shift_jis encoding.
9366 Return the corresponding code in SJIS.  */)
9367   (Lisp_Object ch)
9368 {
9369   Lisp_Object spec, attrs, charset_list;
9370   int c;
9371   struct charset *charset;
9372   unsigned code;
9373
9374   CHECK_CHARACTER (ch);
9375   c = XFASTINT (ch);
9376   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9377   attrs = AREF (spec, 0);
9378
9379   if (ASCII_CHAR_P (c)
9380       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9381     return ch;
9382
9383   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9384   charset = char_charset (c, charset_list, &code);
9385   if (code == CHARSET_INVALID_CODE (charset))
9386     error ("Can't encode by shift_jis encoding: %c", c);
9387   JIS_TO_SJIS (code);
9388
9389   return make_number (code);
9390 }
9391
9392 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9393        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9394 Return the corresponding character.  */)
9395   (Lisp_Object code)
9396 {
9397   Lisp_Object spec, attrs, val;
9398   struct charset *charset_roman, *charset_big5, *charset;
9399   EMACS_INT ch;
9400   int c;
9401
9402   CHECK_NATNUM (code);
9403   ch = XFASTINT (code);
9404   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9405   attrs = AREF (spec, 0);
9406
9407   if (ASCII_BYTE_P (ch)
9408       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9409     return code;
9410
9411   val = CODING_ATTR_CHARSET_LIST (attrs);
9412   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9413   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9414
9415   if (ch <= 0x7F)
9416     {
9417       c = ch;
9418       charset = charset_roman;
9419     }
9420   else
9421     {
9422       EMACS_INT b1 = ch >> 8;
9423       int b2 = ch & 0x7F;
9424       if (b1 < 0xA1 || b1 > 0xFE
9425           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9426         error ("Invalid code: %"pI"d", ch);
9427       c = ch;
9428       charset = charset_big5;
9429     }
9430   c = DECODE_CHAR (charset, c);
9431   if (c < 0)
9432     error ("Invalid code: %"pI"d", ch);
9433   return make_number (c);
9434 }
9435
9436 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9437        doc: /* Encode the Big5 character CH to BIG5 coding system.
9438 Return the corresponding character code in Big5.  */)
9439   (Lisp_Object ch)
9440 {
9441   Lisp_Object spec, attrs, charset_list;
9442   struct charset *charset;
9443   int c;
9444   unsigned code;
9445
9446   CHECK_CHARACTER (ch);
9447   c = XFASTINT (ch);
9448   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9449   attrs = AREF (spec, 0);
9450   if (ASCII_CHAR_P (c)
9451       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9452     return ch;
9453
9454   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9455   charset = char_charset (c, charset_list, &code);
9456   if (code == CHARSET_INVALID_CODE (charset))
9457     error ("Can't encode by Big5 encoding: %c", c);
9458
9459   return make_number (code);
9460 }
9461
9462 \f
9463 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9464        Sset_terminal_coding_system_internal, 1, 2, 0,
9465        doc: /* Internal use only.  */)
9466   (Lisp_Object coding_system, Lisp_Object terminal)
9467 {
9468   struct terminal *term = get_terminal (terminal, 1);
9469   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9470   CHECK_SYMBOL (coding_system);
9471   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9472   /* We had better not send unsafe characters to terminal.  */
9473   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9474   /* Character composition should be disabled.  */
9475   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9476   terminal_coding->src_multibyte = 1;
9477   terminal_coding->dst_multibyte = 0;
9478   tset_charset_list
9479     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9480             ? coding_charset_list (terminal_coding)
9481             : Fcons (make_number (charset_ascii), Qnil)));
9482   return Qnil;
9483 }
9484
9485 DEFUN ("set-safe-terminal-coding-system-internal",
9486        Fset_safe_terminal_coding_system_internal,
9487        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9488        doc: /* Internal use only.  */)
9489   (Lisp_Object coding_system)
9490 {
9491   CHECK_SYMBOL (coding_system);
9492   setup_coding_system (Fcheck_coding_system (coding_system),
9493                        &safe_terminal_coding);
9494   /* Character composition should be disabled.  */
9495   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9496   safe_terminal_coding.src_multibyte = 1;
9497   safe_terminal_coding.dst_multibyte = 0;
9498   return Qnil;
9499 }
9500
9501 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9502        Sterminal_coding_system, 0, 1, 0,
9503        doc: /* Return coding system specified for terminal output on the given terminal.
9504 TERMINAL may be a terminal object, a frame, or nil for the selected
9505 frame's terminal device.  */)
9506   (Lisp_Object terminal)
9507 {
9508   struct coding_system *terminal_coding
9509     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9510   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9511
9512   /* For backward compatibility, return nil if it is `undecided'.  */
9513   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9514 }
9515
9516 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9517        Sset_keyboard_coding_system_internal, 1, 2, 0,
9518        doc: /* Internal use only.  */)
9519   (Lisp_Object coding_system, Lisp_Object terminal)
9520 {
9521   struct terminal *t = get_terminal (terminal, 1);
9522   CHECK_SYMBOL (coding_system);
9523   if (NILP (coding_system))
9524     coding_system = Qno_conversion;
9525   else
9526     Fcheck_coding_system (coding_system);
9527   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9528   /* Character composition should be disabled.  */
9529   TERMINAL_KEYBOARD_CODING (t)->common_flags
9530     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9531   return Qnil;
9532 }
9533
9534 DEFUN ("keyboard-coding-system",
9535        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9536        doc: /* Return coding system specified for decoding keyboard input.  */)
9537   (Lisp_Object terminal)
9538 {
9539   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9540                          (get_terminal (terminal, 1))->id);
9541 }
9542
9543 \f
9544 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9545        Sfind_operation_coding_system,  1, MANY, 0,
9546        doc: /* Choose a coding system for an operation based on the target name.
9547 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9548 DECODING-SYSTEM is the coding system to use for decoding
9549 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9550 for encoding (in case OPERATION does encoding).
9551
9552 The first argument OPERATION specifies an I/O primitive:
9553   For file I/O, `insert-file-contents' or `write-region'.
9554   For process I/O, `call-process', `call-process-region', or `start-process'.
9555   For network I/O, `open-network-stream'.
9556
9557 The remaining arguments should be the same arguments that were passed
9558 to the primitive.  Depending on which primitive, one of those arguments
9559 is selected as the TARGET.  For example, if OPERATION does file I/O,
9560 whichever argument specifies the file name is TARGET.
9561
9562 TARGET has a meaning which depends on OPERATION:
9563   For file I/O, TARGET is a file name (except for the special case below).
9564   For process I/O, TARGET is a process name.
9565   For network I/O, TARGET is a service name or a port number.
9566
9567 This function looks up what is specified for TARGET in
9568 `file-coding-system-alist', `process-coding-system-alist',
9569 or `network-coding-system-alist' depending on OPERATION.
9570 They may specify a coding system, a cons of coding systems,
9571 or a function symbol to call.
9572 In the last case, we call the function with one argument,
9573 which is a list of all the arguments given to this function.
9574 If the function can't decide a coding system, it can return
9575 `undecided' so that the normal code-detection is performed.
9576
9577 If OPERATION is `insert-file-contents', the argument corresponding to
9578 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9579 file name to look up, and BUFFER is a buffer that contains the file's
9580 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9581 function to call for FILENAME, that function should examine the
9582 contents of BUFFER instead of reading the file.
9583
9584 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9585   (ptrdiff_t nargs, Lisp_Object *args)
9586 {
9587   Lisp_Object operation, target_idx, target, val;
9588   register Lisp_Object chain;
9589
9590   if (nargs < 2)
9591     error ("Too few arguments");
9592   operation = args[0];
9593   if (!SYMBOLP (operation)
9594       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9595     error ("Invalid first argument");
9596   if (nargs <= 1 + XFASTINT (target_idx))
9597     error ("Too few arguments for operation `%s'",
9598            SDATA (SYMBOL_NAME (operation)));
9599   target = args[XFASTINT (target_idx) + 1];
9600   if (!(STRINGP (target)
9601         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9602             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9603         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9604     error ("Invalid argument %"pI"d of operation `%s'",
9605            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9606   if (CONSP (target))
9607     target = XCAR (target);
9608
9609   chain = ((EQ (operation, Qinsert_file_contents)
9610             || EQ (operation, Qwrite_region))
9611            ? Vfile_coding_system_alist
9612            : (EQ (operation, Qopen_network_stream)
9613               ? Vnetwork_coding_system_alist
9614               : Vprocess_coding_system_alist));
9615   if (NILP (chain))
9616     return Qnil;
9617
9618   for (; CONSP (chain); chain = XCDR (chain))
9619     {
9620       Lisp_Object elt;
9621
9622       elt = XCAR (chain);
9623       if (CONSP (elt)
9624           && ((STRINGP (target)
9625                && STRINGP (XCAR (elt))
9626                && fast_string_match (XCAR (elt), target) >= 0)
9627               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9628         {
9629           val = XCDR (elt);
9630           /* Here, if VAL is both a valid coding system and a valid
9631              function symbol, we return VAL as a coding system.  */
9632           if (CONSP (val))
9633             return val;
9634           if (! SYMBOLP (val))
9635             return Qnil;
9636           if (! NILP (Fcoding_system_p (val)))
9637             return Fcons (val, val);
9638           if (! NILP (Ffboundp (val)))
9639             {
9640               /* We use call1 rather than safe_call1
9641                  so as to get bug reports about functions called here
9642                  which don't handle the current interface.  */
9643               val = call1 (val, Flist (nargs, args));
9644               if (CONSP (val))
9645                 return val;
9646               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9647                 return Fcons (val, val);
9648             }
9649           return Qnil;
9650         }
9651     }
9652   return Qnil;
9653 }
9654
9655 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9656        Sset_coding_system_priority, 0, MANY, 0,
9657        doc: /* Assign higher priority to the coding systems given as arguments.
9658 If multiple coding systems belong to the same category,
9659 all but the first one are ignored.
9660
9661 usage: (set-coding-system-priority &rest coding-systems)  */)
9662   (ptrdiff_t nargs, Lisp_Object *args)
9663 {
9664   ptrdiff_t i, j;
9665   bool changed[coding_category_max];
9666   enum coding_category priorities[coding_category_max];
9667
9668   memset (changed, 0, sizeof changed);
9669
9670   for (i = j = 0; i < nargs; i++)
9671     {
9672       enum coding_category category;
9673       Lisp_Object spec, attrs;
9674
9675       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9676       attrs = AREF (spec, 0);
9677       category = XINT (CODING_ATTR_CATEGORY (attrs));
9678       if (changed[category])
9679         /* Ignore this coding system because a coding system of the
9680            same category already had a higher priority.  */
9681         continue;
9682       changed[category] = 1;
9683       priorities[j++] = category;
9684       if (coding_categories[category].id >= 0
9685           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9686         setup_coding_system (args[i], &coding_categories[category]);
9687       Fset (AREF (Vcoding_category_table, category), args[i]);
9688     }
9689
9690   /* Now we have decided top J priorities.  Reflect the order of the
9691      original priorities to the remaining priorities.  */
9692
9693   for (i = j, j = 0; i < coding_category_max; i++, j++)
9694     {
9695       while (j < coding_category_max
9696              && changed[coding_priorities[j]])
9697         j++;
9698       if (j == coding_category_max)
9699         emacs_abort ();
9700       priorities[i] = coding_priorities[j];
9701     }
9702
9703   memcpy (coding_priorities, priorities, sizeof priorities);
9704
9705   /* Update `coding-category-list'.  */
9706   Vcoding_category_list = Qnil;
9707   for (i = coding_category_max; i-- > 0; )
9708     Vcoding_category_list
9709       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9710                Vcoding_category_list);
9711
9712   return Qnil;
9713 }
9714
9715 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9716        Scoding_system_priority_list, 0, 1, 0,
9717        doc: /* Return a list of coding systems ordered by their priorities.
9718 The list contains a subset of coding systems; i.e. coding systems
9719 assigned to each coding category (see `coding-category-list').
9720
9721 HIGHESTP non-nil means just return the highest priority one.  */)
9722   (Lisp_Object highestp)
9723 {
9724   int i;
9725   Lisp_Object val;
9726
9727   for (i = 0, val = Qnil; i < coding_category_max; i++)
9728     {
9729       enum coding_category category = coding_priorities[i];
9730       int id = coding_categories[category].id;
9731       Lisp_Object attrs;
9732
9733       if (id < 0)
9734         continue;
9735       attrs = CODING_ID_ATTRS (id);
9736       if (! NILP (highestp))
9737         return CODING_ATTR_BASE_NAME (attrs);
9738       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9739     }
9740   return Fnreverse (val);
9741 }
9742
9743 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9744
9745 static Lisp_Object
9746 make_subsidiaries (Lisp_Object base)
9747 {
9748   Lisp_Object subsidiaries;
9749   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9750   char *buf = alloca (base_name_len + 6);
9751   int i;
9752
9753   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9754   subsidiaries = make_uninit_vector (3);
9755   for (i = 0; i < 3; i++)
9756     {
9757       strcpy (buf + base_name_len, suffixes[i]);
9758       ASET (subsidiaries, i, intern (buf));
9759     }
9760   return subsidiaries;
9761 }
9762
9763
9764 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9765        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9766        doc: /* For internal use only.
9767 usage: (define-coding-system-internal ...)  */)
9768   (ptrdiff_t nargs, Lisp_Object *args)
9769 {
9770   Lisp_Object name;
9771   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9772   Lisp_Object attrs;            /* Vector of attributes.  */
9773   Lisp_Object eol_type;
9774   Lisp_Object aliases;
9775   Lisp_Object coding_type, charset_list, safe_charsets;
9776   enum coding_category category;
9777   Lisp_Object tail, val;
9778   int max_charset_id = 0;
9779   int i;
9780
9781   if (nargs < coding_arg_max)
9782     goto short_args;
9783
9784   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9785
9786   name = args[coding_arg_name];
9787   CHECK_SYMBOL (name);
9788   ASET (attrs, coding_attr_base_name, name);
9789
9790   val = args[coding_arg_mnemonic];
9791   if (! STRINGP (val))
9792     CHECK_CHARACTER (val);
9793   ASET (attrs, coding_attr_mnemonic, val);
9794
9795   coding_type = args[coding_arg_coding_type];
9796   CHECK_SYMBOL (coding_type);
9797   ASET (attrs, coding_attr_type, coding_type);
9798
9799   charset_list = args[coding_arg_charset_list];
9800   if (SYMBOLP (charset_list))
9801     {
9802       if (EQ (charset_list, Qiso_2022))
9803         {
9804           if (! EQ (coding_type, Qiso_2022))
9805             error ("Invalid charset-list");
9806           charset_list = Viso_2022_charset_list;
9807         }
9808       else if (EQ (charset_list, Qemacs_mule))
9809         {
9810           if (! EQ (coding_type, Qemacs_mule))
9811             error ("Invalid charset-list");
9812           charset_list = Vemacs_mule_charset_list;
9813         }
9814       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9815         {
9816           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9817             error ("Invalid charset-list");
9818           if (max_charset_id < XFASTINT (XCAR (tail)))
9819             max_charset_id = XFASTINT (XCAR (tail));
9820         }
9821     }
9822   else
9823     {
9824       charset_list = Fcopy_sequence (charset_list);
9825       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9826         {
9827           struct charset *charset;
9828
9829           val = XCAR (tail);
9830           CHECK_CHARSET_GET_CHARSET (val, charset);
9831           if (EQ (coding_type, Qiso_2022)
9832               ? CHARSET_ISO_FINAL (charset) < 0
9833               : EQ (coding_type, Qemacs_mule)
9834               ? CHARSET_EMACS_MULE_ID (charset) < 0
9835               : 0)
9836             error ("Can't handle charset `%s'",
9837                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9838
9839           XSETCAR (tail, make_number (charset->id));
9840           if (max_charset_id < charset->id)
9841             max_charset_id = charset->id;
9842         }
9843     }
9844   ASET (attrs, coding_attr_charset_list, charset_list);
9845
9846   safe_charsets = make_uninit_string (max_charset_id + 1);
9847   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9848   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9849     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9850   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9851
9852   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9853
9854   val = args[coding_arg_decode_translation_table];
9855   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9856     CHECK_SYMBOL (val);
9857   ASET (attrs, coding_attr_decode_tbl, val);
9858
9859   val = args[coding_arg_encode_translation_table];
9860   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9861     CHECK_SYMBOL (val);
9862   ASET (attrs, coding_attr_encode_tbl, val);
9863
9864   val = args[coding_arg_post_read_conversion];
9865   CHECK_SYMBOL (val);
9866   ASET (attrs, coding_attr_post_read, val);
9867
9868   val = args[coding_arg_pre_write_conversion];
9869   CHECK_SYMBOL (val);
9870   ASET (attrs, coding_attr_pre_write, val);
9871
9872   val = args[coding_arg_default_char];
9873   if (NILP (val))
9874     ASET (attrs, coding_attr_default_char, make_number (' '));
9875   else
9876     {
9877       CHECK_CHARACTER (val);
9878       ASET (attrs, coding_attr_default_char, val);
9879     }
9880
9881   val = args[coding_arg_for_unibyte];
9882   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9883
9884   val = args[coding_arg_plist];
9885   CHECK_LIST (val);
9886   ASET (attrs, coding_attr_plist, val);
9887
9888   if (EQ (coding_type, Qcharset))
9889     {
9890       /* Generate a lisp vector of 256 elements.  Each element is nil,
9891          integer, or a list of charset IDs.
9892
9893          If Nth element is nil, the byte code N is invalid in this
9894          coding system.
9895
9896          If Nth element is a number NUM, N is the first byte of a
9897          charset whose ID is NUM.
9898
9899          If Nth element is a list of charset IDs, N is the first byte
9900          of one of them.  The list is sorted by dimensions of the
9901          charsets.  A charset of smaller dimension comes first. */
9902       val = Fmake_vector (make_number (256), Qnil);
9903
9904       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9905         {
9906           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9907           int dim = CHARSET_DIMENSION (charset);
9908           int idx = (dim - 1) * 4;
9909
9910           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9911             ASET (attrs, coding_attr_ascii_compat, Qt);
9912
9913           for (i = charset->code_space[idx];
9914                i <= charset->code_space[idx + 1]; i++)
9915             {
9916               Lisp_Object tmp, tmp2;
9917               int dim2;
9918
9919               tmp = AREF (val, i);
9920               if (NILP (tmp))
9921                 tmp = XCAR (tail);
9922               else if (NUMBERP (tmp))
9923                 {
9924                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9925                   if (dim < dim2)
9926                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9927                   else
9928                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9929                 }
9930               else
9931                 {
9932                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9933                     {
9934                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9935                       if (dim < dim2)
9936                         break;
9937                     }
9938                   if (NILP (tmp2))
9939                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9940                   else
9941                     {
9942                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9943                       XSETCAR (tmp2, XCAR (tail));
9944                     }
9945                 }
9946               ASET (val, i, tmp);
9947             }
9948         }
9949       ASET (attrs, coding_attr_charset_valids, val);
9950       category = coding_category_charset;
9951     }
9952   else if (EQ (coding_type, Qccl))
9953     {
9954       Lisp_Object valids;
9955
9956       if (nargs < coding_arg_ccl_max)
9957         goto short_args;
9958
9959       val = args[coding_arg_ccl_decoder];
9960       CHECK_CCL_PROGRAM (val);
9961       if (VECTORP (val))
9962         val = Fcopy_sequence (val);
9963       ASET (attrs, coding_attr_ccl_decoder, val);
9964
9965       val = args[coding_arg_ccl_encoder];
9966       CHECK_CCL_PROGRAM (val);
9967       if (VECTORP (val))
9968         val = Fcopy_sequence (val);
9969       ASET (attrs, coding_attr_ccl_encoder, val);
9970
9971       val = args[coding_arg_ccl_valids];
9972       valids = Fmake_string (make_number (256), make_number (0));
9973       for (tail = val; CONSP (tail); tail = XCDR (tail))
9974         {
9975           int from, to;
9976
9977           val = XCAR (tail);
9978           if (INTEGERP (val))
9979             {
9980               if (! (0 <= XINT (val) && XINT (val) <= 255))
9981                 args_out_of_range_3 (val, make_number (0), make_number (255));
9982               from = to = XINT (val);
9983             }
9984           else
9985             {
9986               CHECK_CONS (val);
9987               CHECK_NATNUM_CAR (val);
9988               CHECK_NUMBER_CDR (val);
9989               if (XINT (XCAR (val)) > 255)
9990                 args_out_of_range_3 (XCAR (val),
9991                                      make_number (0), make_number (255));
9992               from = XINT (XCAR (val));
9993               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9994                 args_out_of_range_3 (XCDR (val),
9995                                      XCAR (val), make_number (255));
9996               to = XINT (XCDR (val));
9997             }
9998           for (i = from; i <= to; i++)
9999             SSET (valids, i, 1);
10000         }
10001       ASET (attrs, coding_attr_ccl_valids, valids);
10002
10003       category = coding_category_ccl;
10004     }
10005   else if (EQ (coding_type, Qutf_16))
10006     {
10007       Lisp_Object bom, endian;
10008
10009       ASET (attrs, coding_attr_ascii_compat, Qnil);
10010
10011       if (nargs < coding_arg_utf16_max)
10012         goto short_args;
10013
10014       bom = args[coding_arg_utf16_bom];
10015       if (! NILP (bom) && ! EQ (bom, Qt))
10016         {
10017           CHECK_CONS (bom);
10018           val = XCAR (bom);
10019           CHECK_CODING_SYSTEM (val);
10020           val = XCDR (bom);
10021           CHECK_CODING_SYSTEM (val);
10022         }
10023       ASET (attrs, coding_attr_utf_bom, bom);
10024
10025       endian = args[coding_arg_utf16_endian];
10026       CHECK_SYMBOL (endian);
10027       if (NILP (endian))
10028         endian = Qbig;
10029       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10030         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10031       ASET (attrs, coding_attr_utf_16_endian, endian);
10032
10033       category = (CONSP (bom)
10034                   ? coding_category_utf_16_auto
10035                   : NILP (bom)
10036                   ? (EQ (endian, Qbig)
10037                      ? coding_category_utf_16_be_nosig
10038                      : coding_category_utf_16_le_nosig)
10039                   : (EQ (endian, Qbig)
10040                      ? coding_category_utf_16_be
10041                      : coding_category_utf_16_le));
10042     }
10043   else if (EQ (coding_type, Qiso_2022))
10044     {
10045       Lisp_Object initial, reg_usage, request, flags;
10046
10047       if (nargs < coding_arg_iso2022_max)
10048         goto short_args;
10049
10050       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10051       CHECK_VECTOR (initial);
10052       for (i = 0; i < 4; i++)
10053         {
10054           val = AREF (initial, i);
10055           if (! NILP (val))
10056             {
10057               struct charset *charset;
10058
10059               CHECK_CHARSET_GET_CHARSET (val, charset);
10060               ASET (initial, i, make_number (CHARSET_ID (charset)));
10061               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10062                 ASET (attrs, coding_attr_ascii_compat, Qt);
10063             }
10064           else
10065             ASET (initial, i, make_number (-1));
10066         }
10067
10068       reg_usage = args[coding_arg_iso2022_reg_usage];
10069       CHECK_CONS (reg_usage);
10070       CHECK_NUMBER_CAR (reg_usage);
10071       CHECK_NUMBER_CDR (reg_usage);
10072
10073       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10074       for (tail = request; CONSP (tail); tail = XCDR (tail))
10075         {
10076           int id;
10077           Lisp_Object tmp1;
10078
10079           val = XCAR (tail);
10080           CHECK_CONS (val);
10081           tmp1 = XCAR (val);
10082           CHECK_CHARSET_GET_ID (tmp1, id);
10083           CHECK_NATNUM_CDR (val);
10084           if (XINT (XCDR (val)) >= 4)
10085             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10086           XSETCAR (val, make_number (id));
10087         }
10088
10089       flags = args[coding_arg_iso2022_flags];
10090       CHECK_NATNUM (flags);
10091       i = XINT (flags) & INT_MAX;
10092       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10093         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10094       flags = make_number (i);
10095
10096       ASET (attrs, coding_attr_iso_initial, initial);
10097       ASET (attrs, coding_attr_iso_usage, reg_usage);
10098       ASET (attrs, coding_attr_iso_request, request);
10099       ASET (attrs, coding_attr_iso_flags, flags);
10100       setup_iso_safe_charsets (attrs);
10101
10102       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10103         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10104                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10105                     ? coding_category_iso_7_else
10106                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10107                     ? coding_category_iso_7
10108                     : coding_category_iso_7_tight);
10109       else
10110         {
10111           int id = XINT (AREF (initial, 1));
10112
10113           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10114                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10115                        || id < 0)
10116                       ? coding_category_iso_8_else
10117                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10118                       ? coding_category_iso_8_1
10119                       : coding_category_iso_8_2);
10120         }
10121       if (category != coding_category_iso_8_1
10122           && category != coding_category_iso_8_2)
10123         ASET (attrs, coding_attr_ascii_compat, Qnil);
10124     }
10125   else if (EQ (coding_type, Qemacs_mule))
10126     {
10127       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10128         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10129       ASET (attrs, coding_attr_ascii_compat, Qt);
10130       category = coding_category_emacs_mule;
10131     }
10132   else if (EQ (coding_type, Qshift_jis))
10133     {
10134
10135       struct charset *charset;
10136
10137       if (XINT (Flength (charset_list)) != 3
10138           && XINT (Flength (charset_list)) != 4)
10139         error ("There should be three or four charsets");
10140
10141       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10142       if (CHARSET_DIMENSION (charset) != 1)
10143         error ("Dimension of charset %s is not one",
10144                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10145       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10146         ASET (attrs, coding_attr_ascii_compat, Qt);
10147
10148       charset_list = XCDR (charset_list);
10149       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10150       if (CHARSET_DIMENSION (charset) != 1)
10151         error ("Dimension of charset %s is not one",
10152                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10153
10154       charset_list = XCDR (charset_list);
10155       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10156       if (CHARSET_DIMENSION (charset) != 2)
10157         error ("Dimension of charset %s is not two",
10158                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10159
10160       charset_list = XCDR (charset_list);
10161       if (! NILP (charset_list))
10162         {
10163           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10164           if (CHARSET_DIMENSION (charset) != 2)
10165             error ("Dimension of charset %s is not two",
10166                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10167         }
10168
10169       category = coding_category_sjis;
10170       Vsjis_coding_system = name;
10171     }
10172   else if (EQ (coding_type, Qbig5))
10173     {
10174       struct charset *charset;
10175
10176       if (XINT (Flength (charset_list)) != 2)
10177         error ("There should be just two charsets");
10178
10179       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10180       if (CHARSET_DIMENSION (charset) != 1)
10181         error ("Dimension of charset %s is not one",
10182                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10183       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10184         ASET (attrs, coding_attr_ascii_compat, Qt);
10185
10186       charset_list = XCDR (charset_list);
10187       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10188       if (CHARSET_DIMENSION (charset) != 2)
10189         error ("Dimension of charset %s is not two",
10190                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10191
10192       category = coding_category_big5;
10193       Vbig5_coding_system = name;
10194     }
10195   else if (EQ (coding_type, Qraw_text))
10196     {
10197       category = coding_category_raw_text;
10198       ASET (attrs, coding_attr_ascii_compat, Qt);
10199     }
10200   else if (EQ (coding_type, Qutf_8))
10201     {
10202       Lisp_Object bom;
10203
10204       if (nargs < coding_arg_utf8_max)
10205         goto short_args;
10206
10207       bom = args[coding_arg_utf8_bom];
10208       if (! NILP (bom) && ! EQ (bom, Qt))
10209         {
10210           CHECK_CONS (bom);
10211           val = XCAR (bom);
10212           CHECK_CODING_SYSTEM (val);
10213           val = XCDR (bom);
10214           CHECK_CODING_SYSTEM (val);
10215         }
10216       ASET (attrs, coding_attr_utf_bom, bom);
10217       if (NILP (bom))
10218         ASET (attrs, coding_attr_ascii_compat, Qt);
10219
10220       category = (CONSP (bom) ? coding_category_utf_8_auto
10221                   : NILP (bom) ? coding_category_utf_8_nosig
10222                   : coding_category_utf_8_sig);
10223     }
10224   else if (EQ (coding_type, Qundecided))
10225     category = coding_category_undecided;
10226   else
10227     error ("Invalid coding system type: %s",
10228            SDATA (SYMBOL_NAME (coding_type)));
10229
10230   ASET (attrs, coding_attr_category, make_number (category));
10231   ASET (attrs, coding_attr_plist,
10232         Fcons (QCcategory,
10233                Fcons (AREF (Vcoding_category_table, category),
10234                       CODING_ATTR_PLIST (attrs))));
10235   ASET (attrs, coding_attr_plist,
10236         Fcons (QCascii_compatible_p,
10237                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10238                       CODING_ATTR_PLIST (attrs))));
10239
10240   eol_type = args[coding_arg_eol_type];
10241   if (! NILP (eol_type)
10242       && ! EQ (eol_type, Qunix)
10243       && ! EQ (eol_type, Qdos)
10244       && ! EQ (eol_type, Qmac))
10245     error ("Invalid eol-type");
10246
10247   aliases = Fcons (name, Qnil);
10248
10249   if (NILP (eol_type))
10250     {
10251       eol_type = make_subsidiaries (name);
10252       for (i = 0; i < 3; i++)
10253         {
10254           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10255
10256           this_name = AREF (eol_type, i);
10257           this_aliases = Fcons (this_name, Qnil);
10258           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10259           this_spec = make_uninit_vector (3);
10260           ASET (this_spec, 0, attrs);
10261           ASET (this_spec, 1, this_aliases);
10262           ASET (this_spec, 2, this_eol_type);
10263           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10264           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10265           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10266           if (NILP (val))
10267             Vcoding_system_alist
10268               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10269                        Vcoding_system_alist);
10270         }
10271     }
10272
10273   spec_vec = make_uninit_vector (3);
10274   ASET (spec_vec, 0, attrs);
10275   ASET (spec_vec, 1, aliases);
10276   ASET (spec_vec, 2, eol_type);
10277
10278   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10279   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10280   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10281   if (NILP (val))
10282     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10283                                   Vcoding_system_alist);
10284
10285   {
10286     int id = coding_categories[category].id;
10287
10288     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10289       setup_coding_system (name, &coding_categories[category]);
10290   }
10291
10292   return Qnil;
10293
10294  short_args:
10295   return Fsignal (Qwrong_number_of_arguments,
10296                   Fcons (intern ("define-coding-system-internal"),
10297                          make_number (nargs)));
10298 }
10299
10300
10301 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10302        3, 3, 0,
10303        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10304   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10305 {
10306   Lisp_Object spec, attrs;
10307
10308   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10309   attrs = AREF (spec, 0);
10310   if (EQ (prop, QCmnemonic))
10311     {
10312       if (! STRINGP (val))
10313         CHECK_CHARACTER (val);
10314       ASET (attrs, coding_attr_mnemonic, val);
10315     }
10316   else if (EQ (prop, QCdefault_char))
10317     {
10318       if (NILP (val))
10319         val = make_number (' ');
10320       else
10321         CHECK_CHARACTER (val);
10322       ASET (attrs, coding_attr_default_char, val);
10323     }
10324   else if (EQ (prop, QCdecode_translation_table))
10325     {
10326       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10327         CHECK_SYMBOL (val);
10328       ASET (attrs, coding_attr_decode_tbl, val);
10329     }
10330   else if (EQ (prop, QCencode_translation_table))
10331     {
10332       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10333         CHECK_SYMBOL (val);
10334       ASET (attrs, coding_attr_encode_tbl, val);
10335     }
10336   else if (EQ (prop, QCpost_read_conversion))
10337     {
10338       CHECK_SYMBOL (val);
10339       ASET (attrs, coding_attr_post_read, val);
10340     }
10341   else if (EQ (prop, QCpre_write_conversion))
10342     {
10343       CHECK_SYMBOL (val);
10344       ASET (attrs, coding_attr_pre_write, val);
10345     }
10346   else if (EQ (prop, QCascii_compatible_p))
10347     {
10348       ASET (attrs, coding_attr_ascii_compat, val);
10349     }
10350
10351   ASET (attrs, coding_attr_plist,
10352         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10353   return val;
10354 }
10355
10356
10357 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10358        Sdefine_coding_system_alias, 2, 2, 0,
10359        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10360   (Lisp_Object alias, Lisp_Object coding_system)
10361 {
10362   Lisp_Object spec, aliases, eol_type, val;
10363
10364   CHECK_SYMBOL (alias);
10365   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10366   aliases = AREF (spec, 1);
10367   /* ALIASES should be a list of length more than zero, and the first
10368      element is a base coding system.  Append ALIAS at the tail of the
10369      list.  */
10370   while (!NILP (XCDR (aliases)))
10371     aliases = XCDR (aliases);
10372   XSETCDR (aliases, Fcons (alias, Qnil));
10373
10374   eol_type = AREF (spec, 2);
10375   if (VECTORP (eol_type))
10376     {
10377       Lisp_Object subsidiaries;
10378       int i;
10379
10380       subsidiaries = make_subsidiaries (alias);
10381       for (i = 0; i < 3; i++)
10382         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10383                                      AREF (eol_type, i));
10384     }
10385
10386   Fputhash (alias, spec, Vcoding_system_hash_table);
10387   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10388   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10389   if (NILP (val))
10390     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10391                                   Vcoding_system_alist);
10392
10393   return Qnil;
10394 }
10395
10396 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10397        1, 1, 0,
10398        doc: /* Return the base of CODING-SYSTEM.
10399 Any alias or subsidiary coding system is not a base coding system.  */)
10400   (Lisp_Object coding_system)
10401 {
10402   Lisp_Object spec, attrs;
10403
10404   if (NILP (coding_system))
10405     return (Qno_conversion);
10406   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10407   attrs = AREF (spec, 0);
10408   return CODING_ATTR_BASE_NAME (attrs);
10409 }
10410
10411 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10412        1, 1, 0,
10413        doc: "Return the property list of CODING-SYSTEM.")
10414   (Lisp_Object coding_system)
10415 {
10416   Lisp_Object spec, attrs;
10417
10418   if (NILP (coding_system))
10419     coding_system = Qno_conversion;
10420   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10421   attrs = AREF (spec, 0);
10422   return CODING_ATTR_PLIST (attrs);
10423 }
10424
10425
10426 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10427        1, 1, 0,
10428        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10429   (Lisp_Object coding_system)
10430 {
10431   Lisp_Object spec;
10432
10433   if (NILP (coding_system))
10434     coding_system = Qno_conversion;
10435   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10436   return AREF (spec, 1);
10437 }
10438
10439 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10440        Scoding_system_eol_type, 1, 1, 0,
10441        doc: /* Return eol-type of CODING-SYSTEM.
10442 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10443
10444 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10445 and CR respectively.
10446
10447 A vector value indicates that a format of end-of-line should be
10448 detected automatically.  Nth element of the vector is the subsidiary
10449 coding system whose eol-type is N.  */)
10450   (Lisp_Object coding_system)
10451 {
10452   Lisp_Object spec, eol_type;
10453   int n;
10454
10455   if (NILP (coding_system))
10456     coding_system = Qno_conversion;
10457   if (! CODING_SYSTEM_P (coding_system))
10458     return Qnil;
10459   spec = CODING_SYSTEM_SPEC (coding_system);
10460   eol_type = AREF (spec, 2);
10461   if (VECTORP (eol_type))
10462     return Fcopy_sequence (eol_type);
10463   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10464   return make_number (n);
10465 }
10466
10467 #endif /* emacs */
10468
10469 \f
10470 /*** 9. Post-amble ***/
10471
10472 void
10473 init_coding_once (void)
10474 {
10475   int i;
10476
10477   for (i = 0; i < coding_category_max; i++)
10478     {
10479       coding_categories[i].id = -1;
10480       coding_priorities[i] = i;
10481     }
10482
10483   /* ISO2022 specific initialize routine.  */
10484   for (i = 0; i < 0x20; i++)
10485     iso_code_class[i] = ISO_control_0;
10486   for (i = 0x21; i < 0x7F; i++)
10487     iso_code_class[i] = ISO_graphic_plane_0;
10488   for (i = 0x80; i < 0xA0; i++)
10489     iso_code_class[i] = ISO_control_1;
10490   for (i = 0xA1; i < 0xFF; i++)
10491     iso_code_class[i] = ISO_graphic_plane_1;
10492   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10493   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10494   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10495   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10496   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10497   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10498   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10499   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10500   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10501
10502   for (i = 0; i < 256; i++)
10503     {
10504       emacs_mule_bytes[i] = 1;
10505     }
10506   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10507   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10508   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10509   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10510 }
10511
10512 #ifdef emacs
10513
10514 void
10515 syms_of_coding (void)
10516 {
10517   staticpro (&Vcoding_system_hash_table);
10518   {
10519     Lisp_Object args[2];
10520     args[0] = QCtest;
10521     args[1] = Qeq;
10522     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10523   }
10524
10525   staticpro (&Vsjis_coding_system);
10526   Vsjis_coding_system = Qnil;
10527
10528   staticpro (&Vbig5_coding_system);
10529   Vbig5_coding_system = Qnil;
10530
10531   staticpro (&Vcode_conversion_reused_workbuf);
10532   Vcode_conversion_reused_workbuf = Qnil;
10533
10534   staticpro (&Vcode_conversion_workbuf_name);
10535   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10536
10537   reused_workbuf_in_use = 0;
10538
10539   DEFSYM (Qcharset, "charset");
10540   DEFSYM (Qtarget_idx, "target-idx");
10541   DEFSYM (Qcoding_system_history, "coding-system-history");
10542   Fset (Qcoding_system_history, Qnil);
10543
10544   /* Target FILENAME is the first argument.  */
10545   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10546   /* Target FILENAME is the third argument.  */
10547   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10548
10549   DEFSYM (Qcall_process, "call-process");
10550   /* Target PROGRAM is the first argument.  */
10551   Fput (Qcall_process, Qtarget_idx, make_number (0));
10552
10553   DEFSYM (Qcall_process_region, "call-process-region");
10554   /* Target PROGRAM is the third argument.  */
10555   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10556
10557   DEFSYM (Qstart_process, "start-process");
10558   /* Target PROGRAM is the third argument.  */
10559   Fput (Qstart_process, Qtarget_idx, make_number (2));
10560
10561   DEFSYM (Qopen_network_stream, "open-network-stream");
10562   /* Target SERVICE is the fourth argument.  */
10563   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10564
10565   DEFSYM (Qcoding_system, "coding-system");
10566   DEFSYM (Qcoding_aliases, "coding-aliases");
10567
10568   DEFSYM (Qeol_type, "eol-type");
10569   DEFSYM (Qunix, "unix");
10570   DEFSYM (Qdos, "dos");
10571   DEFSYM (Qmac, "mac");
10572
10573   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10574   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10575   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10576   DEFSYM (Qdefault_char, "default-char");
10577   DEFSYM (Qundecided, "undecided");
10578   DEFSYM (Qno_conversion, "no-conversion");
10579   DEFSYM (Qraw_text, "raw-text");
10580
10581   DEFSYM (Qiso_2022, "iso-2022");
10582
10583   DEFSYM (Qutf_8, "utf-8");
10584   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10585
10586 #if defined (WINDOWSNT) || defined (CYGWIN)
10587   /* No, not utf-16-le: that one has a BOM.  */
10588   DEFSYM (Qutf_16le, "utf-16le");
10589 #endif
10590
10591   DEFSYM (Qutf_16, "utf-16");
10592   DEFSYM (Qbig, "big");
10593   DEFSYM (Qlittle, "little");
10594
10595   DEFSYM (Qshift_jis, "shift-jis");
10596   DEFSYM (Qbig5, "big5");
10597
10598   DEFSYM (Qcoding_system_p, "coding-system-p");
10599
10600   DEFSYM (Qcoding_system_error, "coding-system-error");
10601   Fput (Qcoding_system_error, Qerror_conditions,
10602         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10603   Fput (Qcoding_system_error, Qerror_message,
10604         build_pure_c_string ("Invalid coding system"));
10605
10606   /* Intern this now in case it isn't already done.
10607      Setting this variable twice is harmless.
10608      But don't staticpro it here--that is done in alloc.c.  */
10609   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10610
10611   DEFSYM (Qtranslation_table, "translation-table");
10612   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10613   DEFSYM (Qtranslation_table_id, "translation-table-id");
10614   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10615   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10616
10617   DEFSYM (Qvalid_codes, "valid-codes");
10618
10619   DEFSYM (Qemacs_mule, "emacs-mule");
10620
10621   DEFSYM (QCcategory, ":category");
10622   DEFSYM (QCmnemonic, ":mnemonic");
10623   DEFSYM (QCdefault_char, ":default-char");
10624   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10625   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10626   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10627   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10628   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10629
10630   Vcoding_category_table
10631     = Fmake_vector (make_number (coding_category_max), Qnil);
10632   staticpro (&Vcoding_category_table);
10633   /* Followings are target of code detection.  */
10634   ASET (Vcoding_category_table, coding_category_iso_7,
10635         intern_c_string ("coding-category-iso-7"));
10636   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10637         intern_c_string ("coding-category-iso-7-tight"));
10638   ASET (Vcoding_category_table, coding_category_iso_8_1,
10639         intern_c_string ("coding-category-iso-8-1"));
10640   ASET (Vcoding_category_table, coding_category_iso_8_2,
10641         intern_c_string ("coding-category-iso-8-2"));
10642   ASET (Vcoding_category_table, coding_category_iso_7_else,
10643         intern_c_string ("coding-category-iso-7-else"));
10644   ASET (Vcoding_category_table, coding_category_iso_8_else,
10645         intern_c_string ("coding-category-iso-8-else"));
10646   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10647         intern_c_string ("coding-category-utf-8-auto"));
10648   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10649         intern_c_string ("coding-category-utf-8"));
10650   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10651         intern_c_string ("coding-category-utf-8-sig"));
10652   ASET (Vcoding_category_table, coding_category_utf_16_be,
10653         intern_c_string ("coding-category-utf-16-be"));
10654   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10655         intern_c_string ("coding-category-utf-16-auto"));
10656   ASET (Vcoding_category_table, coding_category_utf_16_le,
10657         intern_c_string ("coding-category-utf-16-le"));
10658   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10659         intern_c_string ("coding-category-utf-16-be-nosig"));
10660   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10661         intern_c_string ("coding-category-utf-16-le-nosig"));
10662   ASET (Vcoding_category_table, coding_category_charset,
10663         intern_c_string ("coding-category-charset"));
10664   ASET (Vcoding_category_table, coding_category_sjis,
10665         intern_c_string ("coding-category-sjis"));
10666   ASET (Vcoding_category_table, coding_category_big5,
10667         intern_c_string ("coding-category-big5"));
10668   ASET (Vcoding_category_table, coding_category_ccl,
10669         intern_c_string ("coding-category-ccl"));
10670   ASET (Vcoding_category_table, coding_category_emacs_mule,
10671         intern_c_string ("coding-category-emacs-mule"));
10672   /* Followings are NOT target of code detection.  */
10673   ASET (Vcoding_category_table, coding_category_raw_text,
10674         intern_c_string ("coding-category-raw-text"));
10675   ASET (Vcoding_category_table, coding_category_undecided,
10676         intern_c_string ("coding-category-undecided"));
10677
10678   DEFSYM (Qinsufficient_source, "insufficient-source");
10679   DEFSYM (Qinvalid_source, "invalid-source");
10680   DEFSYM (Qinterrupted, "interrupted");
10681   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10682
10683   defsubr (&Scoding_system_p);
10684   defsubr (&Sread_coding_system);
10685   defsubr (&Sread_non_nil_coding_system);
10686   defsubr (&Scheck_coding_system);
10687   defsubr (&Sdetect_coding_region);
10688   defsubr (&Sdetect_coding_string);
10689   defsubr (&Sfind_coding_systems_region_internal);
10690   defsubr (&Sunencodable_char_position);
10691   defsubr (&Scheck_coding_systems_region);
10692   defsubr (&Sdecode_coding_region);
10693   defsubr (&Sencode_coding_region);
10694   defsubr (&Sdecode_coding_string);
10695   defsubr (&Sencode_coding_string);
10696   defsubr (&Sdecode_sjis_char);
10697   defsubr (&Sencode_sjis_char);
10698   defsubr (&Sdecode_big5_char);
10699   defsubr (&Sencode_big5_char);
10700   defsubr (&Sset_terminal_coding_system_internal);
10701   defsubr (&Sset_safe_terminal_coding_system_internal);
10702   defsubr (&Sterminal_coding_system);
10703   defsubr (&Sset_keyboard_coding_system_internal);
10704   defsubr (&Skeyboard_coding_system);
10705   defsubr (&Sfind_operation_coding_system);
10706   defsubr (&Sset_coding_system_priority);
10707   defsubr (&Sdefine_coding_system_internal);
10708   defsubr (&Sdefine_coding_system_alias);
10709   defsubr (&Scoding_system_put);
10710   defsubr (&Scoding_system_base);
10711   defsubr (&Scoding_system_plist);
10712   defsubr (&Scoding_system_aliases);
10713   defsubr (&Scoding_system_eol_type);
10714   defsubr (&Scoding_system_priority_list);
10715
10716   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10717                doc: /* List of coding systems.
10718
10719 Do not alter the value of this variable manually.  This variable should be
10720 updated by the functions `define-coding-system' and
10721 `define-coding-system-alias'.  */);
10722   Vcoding_system_list = Qnil;
10723
10724   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10725                doc: /* Alist of coding system names.
10726 Each element is one element list of coding system name.
10727 This variable is given to `completing-read' as COLLECTION argument.
10728
10729 Do not alter the value of this variable manually.  This variable should be
10730 updated by the functions `make-coding-system' and
10731 `define-coding-system-alias'.  */);
10732   Vcoding_system_alist = Qnil;
10733
10734   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10735                doc: /* List of coding-categories (symbols) ordered by priority.
10736
10737 On detecting a coding system, Emacs tries code detection algorithms
10738 associated with each coding-category one by one in this order.  When
10739 one algorithm agrees with a byte sequence of source text, the coding
10740 system bound to the corresponding coding-category is selected.
10741
10742 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10743   {
10744     int i;
10745
10746     Vcoding_category_list = Qnil;
10747     for (i = coding_category_max - 1; i >= 0; i--)
10748       Vcoding_category_list
10749         = Fcons (AREF (Vcoding_category_table, i),
10750                  Vcoding_category_list);
10751   }
10752
10753   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10754                doc: /* Specify the coding system for read operations.
10755 It is useful to bind this variable with `let', but do not set it globally.
10756 If the value is a coding system, it is used for decoding on read operation.
10757 If not, an appropriate element is used from one of the coding system alists.
10758 There are three such tables: `file-coding-system-alist',
10759 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10760   Vcoding_system_for_read = Qnil;
10761
10762   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10763                doc: /* Specify the coding system for write operations.
10764 Programs bind this variable with `let', but you should not set it globally.
10765 If the value is a coding system, it is used for encoding of output,
10766 when writing it to a file and when sending it to a file or subprocess.
10767
10768 If this does not specify a coding system, an appropriate element
10769 is used from one of the coding system alists.
10770 There are three such tables: `file-coding-system-alist',
10771 `process-coding-system-alist', and `network-coding-system-alist'.
10772 For output to files, if the above procedure does not specify a coding system,
10773 the value of `buffer-file-coding-system' is used.  */);
10774   Vcoding_system_for_write = Qnil;
10775
10776   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10777                doc: /*
10778 Coding system used in the latest file or process I/O.  */);
10779   Vlast_coding_system_used = Qnil;
10780
10781   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10782                doc: /*
10783 Error status of the last code conversion.
10784
10785 When an error was detected in the last code conversion, this variable
10786 is set to one of the following symbols.
10787   `insufficient-source'
10788   `inconsistent-eol'
10789   `invalid-source'
10790   `interrupted'
10791   `insufficient-memory'
10792 When no error was detected, the value doesn't change.  So, to check
10793 the error status of a code conversion by this variable, you must
10794 explicitly set this variable to nil before performing code
10795 conversion.  */);
10796   Vlast_code_conversion_error = Qnil;
10797
10798   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10799                doc: /*
10800 *Non-nil means always inhibit code conversion of end-of-line format.
10801 See info node `Coding Systems' and info node `Text and Binary' concerning
10802 such conversion.  */);
10803   inhibit_eol_conversion = 0;
10804
10805   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10806                doc: /*
10807 Non-nil means process buffer inherits coding system of process output.
10808 Bind it to t if the process output is to be treated as if it were a file
10809 read from some filesystem.  */);
10810   inherit_process_coding_system = 0;
10811
10812   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10813                doc: /*
10814 Alist to decide a coding system to use for a file I/O operation.
10815 The format is ((PATTERN . VAL) ...),
10816 where PATTERN is a regular expression matching a file name,
10817 VAL is a coding system, a cons of coding systems, or a function symbol.
10818 If VAL is a coding system, it is used for both decoding and encoding
10819 the file contents.
10820 If VAL is a cons of coding systems, the car part is used for decoding,
10821 and the cdr part is used for encoding.
10822 If VAL is a function symbol, the function must return a coding system
10823 or a cons of coding systems which are used as above.  The function is
10824 called with an argument that is a list of the arguments with which
10825 `find-operation-coding-system' was called.  If the function can't decide
10826 a coding system, it can return `undecided' so that the normal
10827 code-detection is performed.
10828
10829 See also the function `find-operation-coding-system'
10830 and the variable `auto-coding-alist'.  */);
10831   Vfile_coding_system_alist = Qnil;
10832
10833   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10834                doc: /*
10835 Alist to decide a coding system to use for a process I/O operation.
10836 The format is ((PATTERN . VAL) ...),
10837 where PATTERN is a regular expression matching a program name,
10838 VAL is a coding system, a cons of coding systems, or a function symbol.
10839 If VAL is a coding system, it is used for both decoding what received
10840 from the program and encoding what sent to the program.
10841 If VAL is a cons of coding systems, the car part is used for decoding,
10842 and the cdr part is used for encoding.
10843 If VAL is a function symbol, the function must return a coding system
10844 or a cons of coding systems which are used as above.
10845
10846 See also the function `find-operation-coding-system'.  */);
10847   Vprocess_coding_system_alist = Qnil;
10848
10849   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10850                doc: /*
10851 Alist to decide a coding system to use for a network I/O operation.
10852 The format is ((PATTERN . VAL) ...),
10853 where PATTERN is a regular expression matching a network service name
10854 or is a port number to connect to,
10855 VAL is a coding system, a cons of coding systems, or a function symbol.
10856 If VAL is a coding system, it is used for both decoding what received
10857 from the network stream and encoding what sent to the network stream.
10858 If VAL is a cons of coding systems, the car part is used for decoding,
10859 and the cdr part is used for encoding.
10860 If VAL is a function symbol, the function must return a coding system
10861 or a cons of coding systems which are used as above.
10862
10863 See also the function `find-operation-coding-system'.  */);
10864   Vnetwork_coding_system_alist = Qnil;
10865
10866   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10867                doc: /* Coding system to use with system messages.
10868 Also used for decoding keyboard input on X Window system.  */);
10869   Vlocale_coding_system = Qnil;
10870
10871   /* The eol mnemonics are reset in startup.el system-dependently.  */
10872   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10873                doc: /*
10874 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10875   eol_mnemonic_unix = build_pure_c_string (":");
10876
10877   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10878                doc: /*
10879 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10880   eol_mnemonic_dos = build_pure_c_string ("\\");
10881
10882   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10883                doc: /*
10884 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10885   eol_mnemonic_mac = build_pure_c_string ("/");
10886
10887   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10888                doc: /*
10889 *String displayed in mode line when end-of-line format is not yet determined.  */);
10890   eol_mnemonic_undecided = build_pure_c_string (":");
10891
10892   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10893                doc: /*
10894 *Non-nil enables character translation while encoding and decoding.  */);
10895   Venable_character_translation = Qt;
10896
10897   DEFVAR_LISP ("standard-translation-table-for-decode",
10898                Vstandard_translation_table_for_decode,
10899                doc: /* Table for translating characters while decoding.  */);
10900   Vstandard_translation_table_for_decode = Qnil;
10901
10902   DEFVAR_LISP ("standard-translation-table-for-encode",
10903                Vstandard_translation_table_for_encode,
10904                doc: /* Table for translating characters while encoding.  */);
10905   Vstandard_translation_table_for_encode = Qnil;
10906
10907   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10908                doc: /* Alist of charsets vs revision numbers.
10909 While encoding, if a charset (car part of an element) is found,
10910 designate it with the escape sequence identifying revision (cdr part
10911 of the element).  */);
10912   Vcharset_revision_table = Qnil;
10913
10914   DEFVAR_LISP ("default-process-coding-system",
10915                Vdefault_process_coding_system,
10916                doc: /* Cons of coding systems used for process I/O by default.
10917 The car part is used for decoding a process output,
10918 the cdr part is used for encoding a text to be sent to a process.  */);
10919   Vdefault_process_coding_system = Qnil;
10920
10921   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10922                doc: /*
10923 Table of extra Latin codes in the range 128..159 (inclusive).
10924 This is a vector of length 256.
10925 If Nth element is non-nil, the existence of code N in a file
10926 \(or output of subprocess) doesn't prevent it to be detected as
10927 a coding system of ISO 2022 variant which has a flag
10928 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10929 or reading output of a subprocess.
10930 Only 128th through 159th elements have a meaning.  */);
10931   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10932
10933   DEFVAR_LISP ("select-safe-coding-system-function",
10934                Vselect_safe_coding_system_function,
10935                doc: /*
10936 Function to call to select safe coding system for encoding a text.
10937
10938 If set, this function is called to force a user to select a proper
10939 coding system which can encode the text in the case that a default
10940 coding system used in each operation can't encode the text.  The
10941 function should take care that the buffer is not modified while
10942 the coding system is being selected.
10943
10944 The default value is `select-safe-coding-system' (which see).  */);
10945   Vselect_safe_coding_system_function = Qnil;
10946
10947   DEFVAR_BOOL ("coding-system-require-warning",
10948                coding_system_require_warning,
10949                doc: /* Internal use only.
10950 If non-nil, on writing a file, `select-safe-coding-system-function' is
10951 called even if `coding-system-for-write' is non-nil.  The command
10952 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10953   coding_system_require_warning = 0;
10954
10955
10956   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10957                inhibit_iso_escape_detection,
10958                doc: /*
10959 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10960
10961 When Emacs reads text, it tries to detect how the text is encoded.
10962 This code detection is sensitive to escape sequences.  If Emacs sees
10963 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10964 of the ISO2022 encodings, and decodes text by the corresponding coding
10965 system (e.g. `iso-2022-7bit').
10966
10967 However, there may be a case that you want to read escape sequences in
10968 a file as is.  In such a case, you can set this variable to non-nil.
10969 Then the code detection will ignore any escape sequences, and no text is
10970 detected as encoded in some ISO-2022 encoding.  The result is that all
10971 escape sequences become visible in a buffer.
10972
10973 The default value is nil, and it is strongly recommended not to change
10974 it.  That is because many Emacs Lisp source files that contain
10975 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10976 in Emacs's distribution, and they won't be decoded correctly on
10977 reading if you suppress escape sequence detection.
10978
10979 The other way to read escape sequences in a file without decoding is
10980 to explicitly specify some coding system that doesn't use ISO-2022
10981 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
10982   inhibit_iso_escape_detection = 0;
10983
10984   DEFVAR_BOOL ("inhibit-null-byte-detection",
10985                inhibit_null_byte_detection,
10986                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10987 By default, Emacs treats it as binary data, and does not attempt to
10988 decode it.  The effect is as if you specified `no-conversion' for
10989 reading that text.
10990
10991 Set this to non-nil when a regular text happens to include null bytes.
10992 Examples are Index nodes of Info files and null-byte delimited output
10993 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10994 decode text as usual.  */);
10995   inhibit_null_byte_detection = 0;
10996
10997   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
10998                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
10999 Internal use only.  Removed after the experimental optimizer gets stable. */);
11000   disable_ascii_optimization = 0;
11001
11002   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11003                doc: /* Char table for translating self-inserting characters.
11004 This is applied to the result of input methods, not their input.
11005 See also `keyboard-translate-table'.
11006
11007 Use of this variable for character code unification was rendered
11008 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11009 internal character representation.  */);
11010     Vtranslation_table_for_input = Qnil;
11011
11012   {
11013     Lisp_Object args[coding_arg_max];
11014     Lisp_Object plist[16];
11015     int i;
11016
11017     for (i = 0; i < coding_arg_max; i++)
11018       args[i] = Qnil;
11019
11020     plist[0] = intern_c_string (":name");
11021     plist[1] = args[coding_arg_name] = Qno_conversion;
11022     plist[2] = intern_c_string (":mnemonic");
11023     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11024     plist[4] = intern_c_string (":coding-type");
11025     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11026     plist[6] = intern_c_string (":ascii-compatible-p");
11027     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11028     plist[8] = intern_c_string (":default-char");
11029     plist[9] = args[coding_arg_default_char] = make_number (0);
11030     plist[10] = intern_c_string (":for-unibyte");
11031     plist[11] = args[coding_arg_for_unibyte] = Qt;
11032     plist[12] = intern_c_string (":docstring");
11033     plist[13] = build_pure_c_string ("Do no conversion.\n\
11034 \n\
11035 When you visit a file with this coding, the file is read into a\n\
11036 unibyte buffer as is, thus each byte of a file is treated as a\n\
11037 character.");
11038     plist[14] = intern_c_string (":eol-type");
11039     plist[15] = args[coding_arg_eol_type] = Qunix;
11040     args[coding_arg_plist] = Flist (16, plist);
11041     Fdefine_coding_system_internal (coding_arg_max, args);
11042
11043     plist[1] = args[coding_arg_name] = Qundecided;
11044     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11045     plist[5] = args[coding_arg_coding_type] = Qundecided;
11046     /* This is already set.
11047        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11048     plist[8] = intern_c_string (":charset-list");
11049     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11050     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11051     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11052     plist[15] = args[coding_arg_eol_type] = Qnil;
11053     args[coding_arg_plist] = Flist (16, plist);
11054     Fdefine_coding_system_internal (coding_arg_max, args);
11055   }
11056
11057   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11058
11059   {
11060     int i;
11061
11062     for (i = 0; i < coding_category_max; i++)
11063       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11064   }
11065 #if defined (DOS_NT)
11066   system_eol_type = Qdos;
11067 #else
11068   system_eol_type = Qunix;
11069 #endif
11070   staticpro (&system_eol_type);
11071 }
11072
11073 char *
11074 emacs_strerror (int error_number)
11075 {
11076   char *str;
11077
11078   synchronize_system_messages_locale ();
11079   str = strerror (error_number);
11080
11081   if (! NILP (Vlocale_coding_system))
11082     {
11083       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11084                                                       Vlocale_coding_system,
11085                                                       0);
11086       str = SSDATA (dec);
11087     }
11088
11089   return str;
11090 }
11091
11092 #endif /* emacs */