src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2013 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
     /* Template only (never compiled).  Scan the source bytes of CODING:
        on the first nonconforming byte, add CATEGORY_MASK_XXX to
        DETECT_INFO->rejected and return 0; if the source is exhausted
        first, merge FOUND into DETECT_INFO->found and return 1.
        `...;' stands for the remaining declarations of a real detector;
        ONE_MORE_BYTE additionally needs C and SRC_BASE in scope.  */
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
           /* Only a byte that positively indicates XXX counts as
              evidence for the category.  */
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
     /* Template only (never compiled).  Decode bytes of CODING->source,
        starting at CODING->consumed, into CODING->charbuf, and update
        CODING->consumed, CODING->consumed_char and CODING->charbuf_used.  */
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
       /* ONE_MORE_BYTE stores the fetched byte in C and increments
          CONSUMED_CHARS, so both must be declared here.  */
       ptrdiff_t consumed_chars = 0;
       int c;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that the source area and the destination area
 255   overlap, which means that we can produce encoded text only until it
 256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
     /* Template only (never compiled).  Encode the CODING->charbuf_used
        characters in CODING->charbuf into CODING->destination, starting
        at offset CODING->produced, and update CODING->produced and
        CODING->produced_char.  */
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
       /* CHARBUF is a plain `int *', so the end is computed from the
          pointer itself, not from a member of it.  */
 265   int *charbuf_end = charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Stop early enough that one more loop iteration cannot write
          past DST_END.  */
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
     /* NOTE(review): presumably the hash table that the general comments
        at the top of this file describe as mapping a coding system
        symbol to its attributes vector -- confirm.  */
 304 Lisp_Object Vcoding_system_hash_table;
 305
     /* Lisp objects used by this file.  NOTE(review): the Q and QC
        prefixes suggest these hold symbols and keyword symbols that are
        initialized outside this section -- confirm.  */
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding system emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
     /* Only defined when building for WINDOWSNT or CYGWIN.  */
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* Coding-systems are handed between Emacs Lisp programs and C internal
 355    routines by the following three variables.  */
     /* NOTE(review): only safe_terminal_coding is defined in the rest of
        this `#ifdef emacs' section, so the "three variables" wording
        above looks stale -- confirm and update.  */
 356 /* Coding system to be used to encode text for terminal display when
 357    terminal coding system is nil.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
     /* Lisp objects related to translation tables.  NOTE(review):
        presumably symbols initialized outside this section -- confirm.  */
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  NOTE(review): judging by the names,
        the coding systems used for Shift-JIS and BIG5 -- confirm.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372
     /* Fetch element REG of the `coding_attr_iso_initial' attribute in
        the attribute vector of coding system CS, converted by XINT.  */
 373 #define CODING_ISO_INITIAL(cs, reg)                                \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((cs)->id),                   \
 375                      coding_attr_iso_initial),                     \
 376                reg)))
 377
 378
     /* Yield CS->safe_charsets[CHARSET_ID], or -1 when CHARSET_ID is
        greater than CS->max_charset_id or the stored value is 255.
        CS and CHARSET_ID may be evaluated more than once.  */
 379 #define CODING_ISO_REQUEST(cs, charset_id)              \
 380   (((charset_id) <= (cs)->max_charset_id                \
 381     && (cs)->safe_charsets[charset_id] != 255)          \
 382    ? (cs)->safe_charsets[charset_id]                    \
 383    : -1)
 385
 386
     /* Accessors for the ISO-2022 specific members kept in
        CS->spec.iso_2022.  Each expands to the member itself.  */
 387 #define CODING_ISO_FLAGS(cs) ((cs)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(cs, reg) \
 390   ((cs)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(cs, plane) \
 392   ((cs)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(cs) ((cs)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(cs) ((cs)->spec.iso_2022.bol)
     /* The designation stored for the register currently invoked on
        PLANE.  CS is evaluated twice.  */
 397 #define CODING_ISO_INVOKED_CHARSET(cs, plane)                   \
 398   (((cs))->spec.iso_2022.current_designation                    \
         [(((cs))->spec.iso_2022.current_invocation[(plane)])])
     /* Address of the cmp_status member (note the `&').  */
 399 #define CODING_ISO_CMP_STATUS(cs) (&(cs)->spec.iso_2022.cmp_status)
 401 #define CODING_ISO_EXTSEGMENT_LEN(cs) \
 402   ((cs)->spec.iso_2022.ctext_extended_segment_len)
 403 #define CODING_ISO_EMBEDDED_UTF_8(cs) ((cs)->spec.iso_2022.embedded_utf_8)
 405
 406 /* Control characters of ISO2022.  Each of them has a dedicated
        class in enum iso_code_class_type.  */
 407                         /* code */      /* function */
 408 #define ISO_CODE_SO     0x0E            /* shift-out */
 409 #define ISO_CODE_SI     0x0F            /* shift-in */
 410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 411 #define ISO_CODE_ESC    0x1B            /* escape */
 412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415
 416 /* Every 1-byte code of ISO2022 is classified into one of the
 417    following classes.  */
 418 enum iso_code_class_type
 419   {
 420     ISO_control_0,              /* Control codes in the range
 421                                    0x00..0x1F and 0x7F, except for the
 422                                    following 4 codes.  NOTE(review):
                                        0x7F is also claimed by
                                        ISO_0x20_or_0x7F below; confirm
                                        which class it really gets.  */
 423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 427     ISO_control_1,              /* Control codes in the range
 428                                    0x80..0x9F, except for the
 429                                    following 3 codes.  */
 430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 437   };
 438
 439 /** Each macro CODING_ISO_FLAG_XXX defines one flag bit of the
 440     `iso-flags' attribute of an iso2022 coding system.  */
 441
 442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 443    instead of the correct short-form sequence (e.g. ESC $ A).  */
 444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 445
 446 /* If set, reset graphic planes and registers at end-of-line to the
 447    initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 449
 450 /* If set, reset graphic planes and registers before any control
 451    characters to the initial state.  */
 452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 453
 454 /* If set, encode by 7-bit environment.  */
 455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 456
 457 /* If set, use locking-shift function.  */
 458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 459
 460 /* If set, use single-shift function.  Overwrite
 461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 463
 464 /* If set, use designation escape sequence.  */
 465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 466
 467 /* If set, produce revision number sequence.  */
 468 #define CODING_ISO_FLAG_REVISION        0x0080
 469
 470 /* If set, produce ISO6429's direction specifying sequence.  */
 471 #define CODING_ISO_FLAG_DIRECTION       0x0100
 472
 473 /* If set, assume designation states are reset at beginning of line on
 474    output.  */
 475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 476
 477 /* If set, designation sequence should be placed at beginning of line
 478    on output.  */
 479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 480
 481 /* If set, do not encode unsafe characters on output.  */
 482 #define CODING_ISO_FLAG_SAFE            0x0800
 483
 484 /* If set, extra latin codes (128..159) are accepted as a valid code
 485    on input.  */
 486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 487
     /* NOTE(review): the remaining flags carry no description here.
        Judging by the names they concern composition, JIS-Roman, old
        JIS and "full support" handling -- confirm and document.  Bit
        0x4000 is kept for the disabled CODING_ISO_FLAG_EUC_TW_SHIFT.  */
 488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 489
 490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 491
 492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 493
 494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 495
 496 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 497
 498 /* A character to be produced on output if encoding of the original
 499    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 500 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 501
 502 /* UTF-8 section */
     /* Accessor for the utf_8_bom member of CS->spec.  */
 503 #define CODING_UTF_8_BOM(cs) ((cs)->spec.utf_8_bom)
 505
 506 /* UTF-16 section */
     /* Accessors for the bom, endian and surrogate members of
        CS->spec.utf_16.  */
 507 #define CODING_UTF_16_BOM(cs) ((cs)->spec.utf_16.bom)
 509
 510 #define CODING_UTF_16_ENDIAN(cs) ((cs)->spec.utf_16.endian)
 512
 513 #define CODING_UTF_16_SURROGATE(cs) ((cs)->spec.utf_16.surrogate)
 515
 516
 517 /* CCL section */
     /* The coding_attr_ccl_decoder / coding_attr_ccl_encoder elements of
        the attribute vector of coding system CS.  */
 518 #define CODING_CCL_DECODER(cs) \
 519   AREF (CODING_ID_ATTRS ((cs)->id), coding_attr_ccl_decoder)
 520 #define CODING_CCL_ENCODER(cs) \
 521   AREF (CODING_ID_ATTRS ((cs)->id), coding_attr_ccl_encoder)
     /* SDATA of the coding_attr_ccl_valids element of the attribute
        vector of CS.  */
 522 #define CODING_CCL_VALIDS(cs) \
 523   (SDATA (AREF (CODING_ID_ATTRS ((cs)->id), coding_attr_ccl_valids)))
 524
 525 /* Index for each coding category in `coding_categories'.  The
        enumerator values also serve as the bit positions of the
        CATEGORY_MASK_XXX macros, so the order matters, and
        coding_category_max is the length of the `coding_priorities'
        and `coding_categories' arrays.  */
 526
 527 enum coding_category
 528   {
 529     coding_category_iso_7,
 530     coding_category_iso_7_tight,
 531     coding_category_iso_8_1,
 532     coding_category_iso_8_2,
 533     coding_category_iso_7_else,
 534     coding_category_iso_8_else,
 535     coding_category_utf_8_auto,
 536     coding_category_utf_8_nosig,
 537     coding_category_utf_8_sig,
 538     coding_category_utf_16_auto,
 539     coding_category_utf_16_be,
 540     coding_category_utf_16_le,
 541     coding_category_utf_16_be_nosig,
 542     coding_category_utf_16_le_nosig,
 543     coding_category_charset,
 544     coding_category_sjis,
 545     coding_category_big5,
 546     coding_category_ccl,
 547     coding_category_emacs_mule,
 548     /* All above are targets of code detection.  */
 549     coding_category_raw_text,
 550     coding_category_undecided,
 551     coding_category_max
 552   };
 553
 554 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
        the single bit whose position is the corresponding enum
        coding_category value.  No mask is defined in this list for
        coding_category_undecided.  */
 555 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 556 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 557 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 558 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 559 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 560 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 561 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 562 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 563 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 564 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 565 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 566 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 567 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 568 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 569 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 570 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 571 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 572 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 573 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 574 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 575
 576 /* This value is returned if detect_coding_mask () finds nothing other
 577    than ASCII characters.  It is the union of the masks of all
        categories that are targets of code detection, i.e. every
        single-category mask except CATEGORY_MASK_RAW_TEXT.  */
 578 #define CATEGORY_MASK_ANY               \
 579   (CATEGORY_MASK_ISO                    \
 580    | CATEGORY_MASK_UTF_8                \
 581    | CATEGORY_MASK_UTF_16               \
 582    | CATEGORY_MASK_CHARSET              \
 583    | CATEGORY_MASK_SJIS                 \
 584    | CATEGORY_MASK_BIG5                 \
 585    | CATEGORY_MASK_CCL                  \
 586    | CATEGORY_MASK_EMACS_MULE)
 598
 599
     /* Unions of related single-category masks.  */
 600 #define CATEGORY_MASK_ISO_7BIT (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 602
 603 #define CATEGORY_MASK_ISO_8BIT (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 605
 606 #define CATEGORY_MASK_ISO_ELSE \
 607   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 608
     /* The 7-bit categories plus both `else' categories.  */
 609 #define CATEGORY_MASK_ISO_ESCAPE \
 610   (CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_ELSE)
 614
     /* All six ISO categories.  */
 615 #define CATEGORY_MASK_ISO                                       \
 616   (CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_ELSE)
 619
     /* All five UTF-16 categories.  */
 620 #define CATEGORY_MASK_UTF_16                                            \
 621   (CATEGORY_MASK_UTF_16_AUTO                                            \
 622    | CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE                  \
 624    | CATEGORY_MASK_UTF_16_BE_NOSIG | CATEGORY_MASK_UTF_16_LE_NOSIG)
 626
     /* All three UTF-8 categories.  */
 627 #define CATEGORY_MASK_UTF_8                                             \
 628   (CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG                 \
 630    | CATEGORY_MASK_UTF_8_SIG)
 631
 632 /* Table of coding categories (Lisp symbols).  This variable is for
 633    internal use only.  */
 634 static Lisp_Object Vcoding_category_table;
 635
 636 /* Table of coding-categories ordered by priority.  It has
        coding_category_max elements.  NOTE(review): presumably the
        highest-priority category comes first -- confirm.  */
 637 static enum coding_category coding_priorities[coding_category_max];
 638
 639 /* Nth element is a coding context for the coding system bound to the
 640    Nth coding category, i.e. the array is indexed by enum
        coding_category.  */
 641 static struct coding_system coding_categories[coding_category_max];
 642
 643 /*** Commonly used macros and functions ***/
 644
     /* Fallback definitions, used only when `min' / `max' are not already
        defined as macros.  Each argument may be evaluated twice.  */
 645 #ifndef min
 646 # define min(x, y) ((x) < (y) ? (x) : (y))
 647 #endif
 648 #ifndef max
 649 # define max(x, y) ((x) > (y) ? (x) : (y))
 650 #endif
 651
     /* Store the attribute vector of CODING in ATTRS, then store the
        charset list taken from those attributes in CHARSET_LIST.  ATTRS
        and CHARSET_LIST must be assignable.  Use as a statement.  */
 652 #define CODING_GET_INFO(coding, attrs, charset_list)            \
 653   ((void) ((attrs) = CODING_ID_ATTRS ((coding)->id),            \
 654            (charset_list) = CODING_ATTR_CHARSET_LIST (attrs)))
 657
 658
 659 /* Safely get one byte from the source text pointed by SRC which ends
 660    at SRC_END, and set C to that byte.  If the source is already
 661    exhausted (SRC == SRC_END), jump to the label `no_more_source';
 662    before jumping, CODING_RESULT_INSUFFICIENT_SRC is recorded in CODING
 663    when SRC_BASE < SRC.
 664
 665    If MULTIBYTEP and the fetched byte has its high bit set:
        - a lead byte of 0xC0 or 0xC1 is combined with the byte that
          follows it, giving C = ((C & 1) << 6) | next byte.  That next
          byte is read without comparing SRC against SRC_END.
        - otherwise SRC is moved back to the lead byte, the whole
          multibyte character is read with string_char (which advances
          SRC), C is set to the negative value of the character code,
          and CODING_RESULT_INVALID_SRC is recorded in CODING.

        CONSUMED_CHARS is incremented once whenever a byte or character
        was fetched.  The caller should declare and set these
        appropriately in advance:
             src, src_end, src_base, multibytep, coding, consumed_chars
        and must provide the label `no_more_source'.  */
 666
 667 #define ONE_MORE_BYTE(c)                                \
 668   do {                                                  \
 669     if (src == src_end)                                 \
 670       {                                                 \
 671         if (src_base < src)                             \
 672           record_conversion_result                      \
 673             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 674         goto no_more_source;                            \
 675       }                                                 \
 676     c = *src++;                                         \
 677     if (multibytep && (c & 0x80))                       \
 678       {                                                 \
 679         if ((c & 0xFE) == 0xC0)                         \
 680           c = ((c & 1) << 6) | *src++;                  \
 681         else                                            \
 682           {                                             \
 683             src--;                                      \
 684             c = - string_char (src, &src, NULL);        \
 685             record_conversion_result                    \
 686               (coding, CODING_RESULT_INVALID_SRC);      \
 687           }                                             \
 688       }                                                 \
 689     consumed_chars++;                                   \
 690   } while (0)
 691
 692 /* Safely get two bytes from the source text pointed by SRC which ends
 693    at SRC_END, and set C1 and C2 to those bytes while skipping the
 694    heading multibyte characters.  If there are not enough bytes in the
 695    source, it jumps to 'no_more_source'.
 696
 697    If MULTIBYTEP, a lead byte of 0xC0 or 0xC1 is combined with the
 698    byte that follows it (for both C1 and C2).  Any other multibyte
 699    character found for C1 is skipped, using BYTES_BY_CHAR_HEAD to step
 700    over its remaining bytes, and the fetch of C1 is retried.  Any other
        multibyte character found for C2 sets C2 to -1; SRC is then left
        just after that character's lead byte.

        Unlike ONE_MORE_BYTE, this macro neither counts consumed
        characters nor records a conversion result.  The caller should
        declare and set these variables appropriately in advance:
             src, src_end, multibytep
        and must provide the label `no_more_source'.
        It is intended that this macro is used in detect_coding_utf_16.  */
 701
 702 #define TWO_MORE_BYTES(c1, c2)                          \
 703   do {                                                  \
 704     do {                                                \
 705       if (src == src_end)                               \
 706         goto no_more_source;                            \
 707       c1 = *src++;                                      \
 708       if (multibytep && (c1 & 0x80))                    \
 709         {                                               \
 710           if ((c1 & 0xFE) == 0xC0)                      \
 711             c1 = ((c1 & 1) << 6) | *src++;              \
 712           else                                          \
 713             {                                           \
 714               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 715               c1 = -1;                                  \
 716             }                                           \
 717         }                                               \
 718     } while (c1 < 0);                                   \
 719     if (src == src_end)                                 \
 720       goto no_more_source;                              \
 721     c2 = *src++;                                        \
 722     if (multibytep && (c2 & 0x80))                      \
 723       {                                                 \
 724         if ((c2 & 0xFE) == 0xC0)                        \
 725           c2 = ((c2 & 1) << 6) | *src++;                \
 726         else                                            \
 727           c2 = -1;                                      \
 728       }                                                 \
 729   } while (0)
 730
 731
 732 /* Count one produced character, then write byte C at DST and advance
 733    DST by one.  The caller should make sure that C is 0..127, and
 734    declare and set the variables `dst' and `produced_chars'
 735    appropriately in advance.
 736 */
 737
 738
 739 #define EMIT_ONE_ASCII_BYTE(c)          \
 740   do {                                  \
 741     produced_chars += 1;                \
 742     *dst = (c);                         \
 743     dst++;                              \
         } while (0)
 744
 745
 746 /* Two-byte variant of EMIT_ONE_ASCII_BYTE: count two produced
        characters, then write C1 followed by C2 at DST.  */
 747
 748 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 749   do {                                  \
 750     produced_chars += 2;                \
 751     *dst++ = (c1);                      \
         *dst++ = (c2);                      \
 752   } while (0)
 753
 754
 755 /* Count one produced character and store byte C at DST, advancing
 756    DST.  Without MULTIBYTEP the byte is stored as is.  With
 757    MULTIBYTEP, a value of 0x80 or more is first converted with
 758    BYTE8_TO_CHAR, and the result is written with CHAR_STRING_ADVANCE.
 759    The caller should declare and set the variables `dst',
        `multibytep' and `produced_chars' appropriately in advance.  */
 760
 761 #define EMIT_ONE_BYTE(c)                                \
 762   do {                                                  \
 763     produced_chars++;                                   \
 764     if (! multibytep)                                   \
 765       *dst++ = (c);                                     \
 766     else                                                \
 767       {                                                 \
 768         unsigned ch = (c);                              \
 769         ch = ch >= 0x80 ? BYTE8_TO_CHAR (ch) : ch;      \
 770         CHAR_STRING_ADVANCE (ch, dst);                  \
 771       }                                                 \
 773   } while (0)
 774
 775
 776 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 777
 778 #define EMIT_TWO_BYTES(c1, c2)          \
 779   do {                                  \
 780     produced_chars += 2;                \
 781     if (multibytep)                     \
 782       {                                 \
 783         unsigned ch;                    \
 784                                         \
 785         ch = (c1);                      \
 786         if (ch >= 0x80)                 \
 787           ch = BYTE8_TO_CHAR (ch);      \
 788         CHAR_STRING_ADVANCE (ch, dst);  \
 789         ch = (c2);                      \
 790         if (ch >= 0x80)                 \
 791           ch = BYTE8_TO_CHAR (ch);      \
 792         CHAR_STRING_ADVANCE (ch, dst);  \
 793       }                                 \
 794     else                                \
 795       {                                 \
 796         *dst++ = (c1);                  \
 797         *dst++ = (c2);                  \
 798       }                                 \
 799   } while (0)
 800
 801
 802 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 803   do {                                  \
 804     EMIT_ONE_BYTE (c1);                 \
 805     EMIT_TWO_BYTES (c2, c3);            \
 806   } while (0)
 807
 808
 809 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 810   do {                                          \
 811     EMIT_TWO_BYTES (c1, c2);                    \
 812     EMIT_TWO_BYTES (c3, c4);                    \
 813   } while (0)
 814
 815
 816 static void
 817 record_conversion_result (struct coding_system *coding,
 818                           enum coding_result_code result)
 819 {
 820   coding->result = result;
 821   switch (result)
 822     {
 823     case CODING_RESULT_INSUFFICIENT_SRC:
 824       Vlast_code_conversion_error = Qinsufficient_source;
 825       break;
 826     case CODING_RESULT_INVALID_SRC:
 827       Vlast_code_conversion_error = Qinvalid_source;
 828       break;
 829     case CODING_RESULT_INTERRUPT:
 830       Vlast_code_conversion_error = Qinterrupted;
 831       break;
 832     case CODING_RESULT_INSUFFICIENT_DST:
 833       /* Don't record this error in Vlast_code_conversion_error
 834          because it happens just temporarily and is resolved when the
 835          whole conversion is finished.  */
 836       break;
 837     case CODING_RESULT_SUCCESS:
 838       break;
 839     default:
 840       Vlast_code_conversion_error = intern ("Unknown error");
 841     }
 842 }
 843
 844 /* These wrapper macros are used to preserve validity of pointers into
 845    buffer text across calls to decode_char, encode_char, etc, which
 846    could cause relocation of buffers if it loads a charset map,
 847    because loading a charset map allocates large structures.  */
 848
 849 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 850   do {                                                                       \
 851     ptrdiff_t offset;                                                        \
 852                                                                              \
 853     charset_map_loaded = 0;                                                  \
 854     c = DECODE_CHAR (charset, code);                                         \
 855     if (charset_map_loaded                                                   \
 856         && (offset = coding_change_source (coding)))                         \
 857       {                                                                      \
 858         src += offset;                                                       \
 859         src_base += offset;                                                  \
 860         src_end += offset;                                                   \
 861       }                                                                      \
 862   } while (0)
 863
 864 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 865   do {                                                                  \
 866     ptrdiff_t offset;                                                   \
 867                                                                         \
 868     charset_map_loaded = 0;                                             \
 869     code = ENCODE_CHAR (charset, c);                                    \
 870     if (charset_map_loaded                                              \
 871         && (offset = coding_change_destination (coding)))               \
 872       {                                                                 \
 873         dst += offset;                                                  \
 874         dst_end += offset;                                              \
 875       }                                                                 \
 876   } while (0)
 877
 878 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 879   do {                                                                  \
 880     ptrdiff_t offset;                                                   \
 881                                                                         \
 882     charset_map_loaded = 0;                                             \
 883     charset = char_charset (c, charset_list, code_return);              \
 884     if (charset_map_loaded                                              \
 885         && (offset = coding_change_destination (coding)))               \
 886       {                                                                 \
 887         dst += offset;                                                  \
 888         dst_end += offset;                                              \
 889       }                                                                 \
 890   } while (0)
 891
 892 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 893   do {                                                                  \
 894     ptrdiff_t offset;                                                   \
 895                                                                         \
 896     charset_map_loaded = 0;                                             \
 897     result = CHAR_CHARSET_P (c, charset);                               \
 898     if (charset_map_loaded                                              \
 899         && (offset = coding_change_destination (coding)))               \
 900       {                                                                 \
 901         dst += offset;                                                  \
 902         dst_end += offset;                                              \
 903       }                                                                 \
 904   } while (0)
 905
 906
 907 /* If there are at least BYTES length of room at dst, allocate memory
 908    for coding->destination and update dst and dst_end.  We don't have
 909    to take care of coding->source which will be relocated.  It is
 910    handled by calling coding_set_source in encode_coding.  */
 911
 912 #define ASSURE_DESTINATION(bytes)                               \
 913   do {                                                          \
 914     if (dst + (bytes) >= dst_end)                               \
 915       {                                                         \
 916         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 917                                                                 \
 918         dst = alloc_destination (coding, more_bytes, dst);      \
 919         dst_end = coding->destination + coding->dst_bytes;      \
 920       }                                                         \
 921   } while (0)
 922
 923
 924 /* Store multibyte form of the character C in P, and advance P to the
 925    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 926    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 927    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 928
 929 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 930
 931 /* Return the character code of character whose multibyte form is at
 932    P, and advance P to the end of the multibyte form.  This used to be
 933    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 934    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 935
 936 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 937
 938 /* Set coding->source from coding->src_object.  */
 939
 940 static void
 941 coding_set_source (struct coding_system *coding)
 942 {
 943   if (BUFFERP (coding->src_object))
 944     {
 945       struct buffer *buf = XBUFFER (coding->src_object);
 946
 947       if (coding->src_pos < 0)
 948         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 949       else
 950         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 951     }
 952   else if (STRINGP (coding->src_object))
 953     {
 954       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 955     }
 956   else
 957     {
 958       /* Otherwise, the source is C string and is never relocated
 959          automatically.  Thus we don't have to update anything.  */
 960     }
 961 }
 962
 963
 964 /* Set coding->source from coding->src_object, and return how many
 965    bytes coding->source was changed.  */
 966
 967 static ptrdiff_t
 968 coding_change_source (struct coding_system *coding)
 969 {
 970   const unsigned char *orig = coding->source;
 971   coding_set_source (coding);
 972   return coding->source - orig;
 973 }
 974
 975
 976 /* Set coding->destination from coding->dst_object.  */
 977
 978 static void
 979 coding_set_destination (struct coding_system *coding)
 980 {
 981   if (BUFFERP (coding->dst_object))
 982     {
 983       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 984         {
 985           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 986           coding->dst_bytes = (GAP_END_ADDR
 987                                - (coding->src_bytes - coding->consumed)
 988                                - coding->destination);
 989         }
 990       else
 991         {
 992           /* We are sure that coding->dst_pos_byte is before the gap
 993              of the buffer. */
 994           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 995                                  + coding->dst_pos_byte - BEG_BYTE);
 996           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 997                                - coding->destination);
 998         }
 999     }
1000   else
1001     {
1002       /* Otherwise, the destination is C string and is never relocated
1003          automatically.  Thus we don't have to update anything.  */
1004     }
1005 }
1006
1007
1008 /* Set coding->destination from coding->dst_object, and return how
1009    many bytes coding->destination was changed.  */
1010
1011 static ptrdiff_t
1012 coding_change_destination (struct coding_system *coding)
1013 {
1014   const unsigned char *orig = coding->destination;
1015   coding_set_destination (coding);
1016   return coding->destination - orig;
1017 }
1018
1019
1020 static void
1021 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1022 {
1023   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1024     string_overflow ();
1025   coding->destination = xrealloc (coding->destination,
1026                                   coding->dst_bytes + bytes);
1027   coding->dst_bytes += bytes;
1028 }
1029
1030 static void
1031 coding_alloc_by_making_gap (struct coding_system *coding,
1032                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1033 {
1034   if (EQ (coding->src_object, coding->dst_object))
1035     {
1036       /* The gap may contain the produced data at the head and not-yet
1037          consumed data at the tail.  To preserve those data, we at
1038          first make the gap size to zero, then increase the gap
1039          size.  */
1040       ptrdiff_t add = GAP_SIZE;
1041
1042       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1043       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1044       make_gap (bytes);
1045       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1046       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1047     }
1048   else
1049     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1050 }
1051
1052
1053 static unsigned char *
1054 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1055                    unsigned char *dst)
1056 {
1057   ptrdiff_t offset = dst - coding->destination;
1058
1059   if (BUFFERP (coding->dst_object))
1060     {
1061       struct buffer *buf = XBUFFER (coding->dst_object);
1062
1063       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1064     }
1065   else
1066     coding_alloc_by_realloc (coding, nbytes);
1067   coding_set_destination (coding);
1068   dst = coding->destination + offset;
1069   return dst;
1070 }
1071
1072 /** Macros for annotations.  */
1073
1074 /* An annotation data is stored in the array coding->charbuf in this
1075    format:
1076      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1077    LENGTH is the number of elements in the annotation.
1078    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1079    NCHARS is the number of characters in the text annotated.
1080
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1085      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1086
1087    NBYTES is the number of bytes specified in the header part of
1088    old-style emacs-mule encoding, or 0 for the other kind of
1089    composition.
1090
1091    METHOD is one of enum composition_method.
1092
1093    Optional COMPOSITION-COMPONENTS are characters and composition
1094    rules.
1095
1096    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1097    follows.
1098
1099    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1100    recover from an invalid annotation, and should be skipped by
1101    produce_annotation.  */
1102
/* Maximum length of the header of annotation data.  The longest
   header written by the macros below is the five-element one of
   ADD_COMPOSITION_DATA: -LENGTH, ANNOTATION_MASK, NCHARS, NBYTES and
   METHOD.  */
#define MAX_ANNOTATION_LENGTH 5
1105
/* Write the three common header elements of an annotation at BUF,
   advancing BUF past them: the negated length LEN, the annotation
   mask MASK, and the number of annotated characters NCHARS.  Also
   set coding->annotated.  The caller must have `coding' in scope.

   The expansion deliberately does not end in a semicolon, so that
   the macro behaves as a single statement, e.g. between `if' and
   `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1113
/* Write the five-element header of a composition annotation at BUF,
   advancing BUF past it: the common header (length 5, mask
   CODING_ANNOTATE_COMPOSITION_MASK, NCHARS) followed by NBYTES and
   METHOD.  Every parameter is parenthesized in the expansion so
   that any lvalue expression may be given for BUF and any
   expression for NBYTES and METHOD.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1120
1121
/* Write the four-element charset annotation at BUF, advancing BUF
   past it: the common header (length 4, mask
   CODING_ANNOTATE_CHARSET_MASK, NCHARS) followed by the charset ID.
   Every parameter is parenthesized in the expansion so that any
   lvalue expression may be given for BUF and any expression for
   ID.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1127
1128 \f
1129 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1130
1131
1132
1133 \f
1134 /*** 3. UTF-8 ***/
1135
1136 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1137    Return true if a text is encoded in UTF-8.  */
1138
/* Predicates on a single octet C of a UTF-8 sequence.  Each one
   compares the high bits of C with the bit pattern shown.  */

/* C is below 0x80 (0xxxxxxx): a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C is 10xxxxxx: an octet that follows a leading octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C is 110xxxxx: leading octet of a two-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C is 1110xxxx: leading octet of a three-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C is 11110xxx: leading octet of a four-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C is 111110xx: leading octet of a five-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets that the UTF-8 detector and decoder below test
   for as a byte order mark, and that the encoder emits as one.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1149
/* Check whether the bytes of CODING->source, after the first
   CODING->head_ascii bytes, form well-formed UTF-8 sequences of one
   to five octets.  CATEGORY_MASK_UTF_8 is always added to
   DETECT_INFO->checked.

   Return 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
   if a malformed sequence is seen, or if the source ends inside a
   sequence while CODING_MODE_LAST_BLOCK is set.  Otherwise return 1
   with DETECT_INFO->found and DETECT_INFO->rejected updated
   according to whether a BOM and any multi-octet sequence were
   seen.

   NOTE(review): the label `no_more_source' is reached only through
   ONE_MORE_BYTE, which is defined earlier in this file; presumably
   it jumps there when SRC reaches SRC_END -- confirm against the
   macro.  */
static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  bool bom_found = 0;
  bool found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts; used below for the BOM
         test and for spotting a sequence cut short by the end of the
         source.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      /* Negative values and one-octet sequences need no further
         checking.  */
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a sequence starting at the very first source byte
             counts as a BOM.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      break;
    }
  /* The loop was left by `break': a malformed sequence was seen.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* SRC_BASE < SRC means the source ended after the start of a
     sequence; that is an error only in the last block.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading U+FEFF (octets EF BB BF) doesn't necessarily mean
         a BOM, so both the with-signature and the no-signature
         categories are reported as found.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      /* Pure ASCII text is neither found nor rejected as
         no-signature UTF-8.  */
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1232
1233
/* Decode the UTF-8 bytes of CODING->source, starting at offset
   CODING->consumed, into character codes stored in CODING->charbuf
   after its first CODING->charbuf_used elements.  Decoding stops
   when the charbuf is full or the source is exhausted.  On return
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   reflect what was processed; a sequence cut short by the end of the
   source is left unconsumed.

   A byte that does not begin a valid sequence (including overlong
   forms and surrogates) is stored by itself, as is if ASCII and
   through BYTE8_TO_CHAR otherwise, and counted in CODING->errors.

   NOTE(review): the label `no_more_source' is reached only through
   ONE_MORE_BYTE, which is defined earlier in this file; presumably
   it jumps there when SRC reaches SRC_END -- confirm against the
   macro.  */
static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte read ahead after a CR when EOL_DOS is set, or -1 if none.  */
  int byte_after_cr = -1;

  /* Unless the coding system is marked as having no BOM, look for
     the octets EF BB BF at the current position and skip them.  On
     any mismatch, rewind SRC to SRC_BASE so that the bytes are
     decoded normally.  */
  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    /* NOTE(review): redundant with the unconditional
                       assignment that follows this block.  */
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever was found, do not look for a BOM again on later
     calls.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Everything before SRC_BASE has been completely decoded.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The byte read ahead after a CR has not been decoded yet;
             step back so that it is not reported as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* Use the byte read ahead after a CR, if any, before reading
         a new one.  */
      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* ONE_MORE_BYTE yielded a negative value; store its
             negation.  NOTE(review): presumably this is how the
             macro hands back a non-byte character from a multibyte
             source -- confirm against ONE_MORE_BYTE.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS end-of-line conversion, read the byte that
             follows a CR right away; it becomes C1 of the next
             iteration.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Besides overlong forms, reject codes
                             above MAX_CHAR.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the bad sequence and store just its
         first byte; decoding resumes at the byte after it.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Report only what was completely decoded, i.e. up to SRC_BASE.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1383
1384
1385 static bool
1386 encode_coding_utf_8 (struct coding_system *coding)
1387 {
1388   bool multibytep = coding->dst_multibyte;
1389   int *charbuf = coding->charbuf;
1390   int *charbuf_end = charbuf + coding->charbuf_used;
1391   unsigned char *dst = coding->destination + coding->produced;
1392   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1393   ptrdiff_t produced_chars = 0;
1394   int c;
1395
1396   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1397     {
1398       ASSURE_DESTINATION (3);
1399       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1400       CODING_UTF_8_BOM (coding) = utf_without_bom;
1401     }
1402
1403   if (multibytep)
1404     {
1405       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1406
1407       while (charbuf < charbuf_end)
1408         {
1409           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1410
1411           ASSURE_DESTINATION (safe_room);
1412           c = *charbuf++;
1413           if (CHAR_BYTE8_P (c))
1414             {
1415               c = CHAR_TO_BYTE8 (c);
1416               EMIT_ONE_BYTE (c);
1417             }
1418           else
1419             {
1420               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1421               for (p = str; p < pend; p++)
1422                 EMIT_ONE_BYTE (*p);
1423             }
1424         }
1425     }
1426   else
1427     {
1428       int safe_room = MAX_MULTIBYTE_LENGTH;
1429
1430       while (charbuf < charbuf_end)
1431         {
1432           ASSURE_DESTINATION (safe_room);
1433           c = *charbuf++;
1434           if (CHAR_BYTE8_P (c))
1435             *dst++ = CHAR_TO_BYTE8 (c);
1436           else
1437             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1438           produced_chars++;
1439         }
1440     }
1441   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1442   coding->produced_char += produced_chars;
1443   coding->produced = dst - coding->destination;
1444   return 0;
1445 }
1446
1447
1448 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1449    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1450
/* True if the bits of VAL selected by 0xFC00 equal 0xD800, as they
   do for the 16-bit code units 0xD800..0xDBFF (high surrogates).  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if the bits of VAL selected by 0xFC00 equal 0xDC00, as they
   do for the 16-bit code units 0xDC00..0xDFFF (low surrogates).  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1456
1457
1458 static bool
1459 detect_coding_utf_16 (struct coding_system *coding,
1460                       struct coding_detection_info *detect_info)
1461 {
1462   const unsigned char *src = coding->source;
1463   const unsigned char *src_end = coding->source + coding->src_bytes;
1464   bool multibytep = coding->src_multibyte;
1465   int c1, c2;
1466
1467   detect_info->checked |= CATEGORY_MASK_UTF_16;
1468   if (coding->mode & CODING_MODE_LAST_BLOCK
1469       && (coding->src_chars & 1))
1470     {
1471       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1472       return 0;
1473     }
1474
1475   TWO_MORE_BYTES (c1, c2);
1476   if ((c1 == 0xFF) && (c2 == 0xFE))
1477     {
1478       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1479                              | CATEGORY_MASK_UTF_16_AUTO);
1480       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1481                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1482                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1483     }
1484   else if ((c1 == 0xFE) && (c2 == 0xFF))
1485     {
1486       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1487                              | CATEGORY_MASK_UTF_16_AUTO);
1488       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1489                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1490                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1491     }
1492   else if (c2 < 0)
1493     {
1494       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1495       return 0;
1496     }
1497   else
1498     {
1499       /* We check the dispersion of Eth and Oth bytes where E is even and
1500          O is odd.  If both are high, we assume binary data.*/
1501       unsigned char e[256], o[256];
1502       unsigned e_num = 1, o_num = 1;
1503
1504       memset (e, 0, 256);
1505       memset (o, 0, 256);
1506       e[c1] = 1;
1507       o[c2] = 1;
1508
1509       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1510                                 |CATEGORY_MASK_UTF_16_BE
1511                                 | CATEGORY_MASK_UTF_16_LE);
1512
1513       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1514              != CATEGORY_MASK_UTF_16)
1515         {
1516           TWO_MORE_BYTES (c1, c2);
1517           if (c2 < 0)
1518             break;
1519           if (! e[c1])
1520             {
1521               e[c1] = 1;
1522               e_num++;
1523               if (e_num >= 128)
1524                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1525             }
1526           if (! o[c2])
1527             {
1528               o[c2] = 1;
1529               o_num++;
1530               if (o_num >= 128)
1531                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1532             }
1533         }
1534       return 0;
1535     }
1536
1537  no_more_source:
1538   return 1;
1539 }
1540
     /* Decode the UTF-16 bytes of CODING's source, starting at offset
        coding->consumed, and append the decoded character codes to
        coding->charbuf.

        If the coding system expects a BOM, the first two bytes are
        compared with the BOM for the configured endianness; a mismatch
        is counted in coding->errors and the bytes are then decoded as
        ordinary text.  A high surrogate still waiting for its low
        surrogate is stored through CODING_UTF_16_SURROGATE (coding).

        On return coding->consumed, coding->consumed_char and
        coding->charbuf_used are updated.  They are taken from the
        values saved at the top of the loop, so a unit that was only
        partly read when the source ran out is not counted as
        consumed.  */
1541 static void
1542 decode_coding_utf_16 (struct coding_system *coding)
1543 {
1544   const unsigned char *src = coding->source + coding->consumed;
1545   const unsigned char *src_end = coding->source + coding->src_bytes;
1546   const unsigned char *src_base;
1547   int *charbuf = coding->charbuf + coding->charbuf_used;
1548   /* We may produce at most 3 chars in one loop, so the loop stops
          while three slots of the buffer are still free.  */
1549   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
       /* NOTE(review): src_end, multibytep and consumed_chars are not
          updated by name below; presumably ONE_MORE_BYTE (defined
          outside this chunk) uses them -- confirm.  */
1550   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1551   bool multibytep = coding->src_multibyte;
1552   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1553   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
       /* Pending high surrogate from an earlier unit, or 0 if none.  */
1554   int surrogate = CODING_UTF_16_SURROGATE (coding);
       /* True if the coding system's EOL type is dos and EOL conversion
          is not inhibited.  */
1555   bool eol_dos
1556     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The two bytes pre-read after a CR, or -1 when there are none.  */
1557   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1558
       /* BOM handling.  In both branches the coding system is switched
          to utf_without_bom afterwards.  */
1559   if (bom == utf_with_bom)
1560     {
1561       int c, c1, c2;
1562
1563       src_base = src;
1564       ONE_MORE_BYTE (c1);
1565       ONE_MORE_BYTE (c2);
           /* C is always assembled high byte first here, so a
              little-endian BOM (bytes 0xFF 0xFE) reads as 0xFFFE.  */
1566       c = (c1 << 8) | c2;
1567
1568       if (endian == utf_16_big_endian
1569           ? c != 0xFEFF : c != 0xFFFE)
1570         {
1571           /* The first two bytes are not BOM.  Treat them as bytes
1572              for a normal character.  */
1573           src = src_base;
1574           coding->errors++;
1575         }
1576       CODING_UTF_16_BOM (coding) = utf_without_bom;
1577     }
1578   else if (bom == utf_detect_bom)
1579     {
1580       /* We have already tried to detect BOM and failed in
1581          detect_coding.  */
1582       CODING_UTF_16_BOM (coding) = utf_without_bom;
1583     }
1584
1585   while (1)
1586     {
1587       int c, c1, c2;
1588
           /* Remember where this unit starts; these values are what gets
              reported as consumed if we leave the loop from here on.  */
1589       src_base = src;
1590       consumed_chars_base = consumed_chars;
1591
1592       if (charbuf >= charbuf_end)
1593         {
               /* Not enough room left.  If two bytes were pre-read after
                  a CR, step back over them so they are not reported as
                  consumed.  */
1594           if (byte_after_cr1 >= 0)
1595             src_base -= 2;
1596           break;
1597         }
1598
           /* Take the first byte from the pre-read pair if there is
              one, otherwise from the source.  */
1599       if (byte_after_cr1 >= 0)
1600         c1 = byte_after_cr1, byte_after_cr1 = -1;
1601       else
1602         ONE_MORE_BYTE (c1);
1603       if (c1 < 0)
1604         {
               /* NOTE(review): a negative value from ONE_MORE_BYTE
                  presumably stands for a non-byte character met in
                  multibyte source; its negation is stored unchanged.
                  Confirm against the macro.  */
1605           *charbuf++ = -c1;
1606           continue;
1607         }
           /* Same for the second byte of the unit.  */
1608       if (byte_after_cr2 >= 0)
1609         c2 = byte_after_cr2, byte_after_cr2 = -1;
1610       else
1611         ONE_MORE_BYTE (c2);
1612       if (c2 < 0)
1613         {
               /* C1 has no partner byte: store it on its own (as a raw
                  8-bit character unless it is ASCII), followed by the
                  negation of C2.  */
1614           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1615           *charbuf++ = -c2;
1616           continue;
1617         }
           /* Combine the two bytes according to the endianness.  */
1618       c = (endian == utf_16_big_endian
1619            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1620
1621       if (surrogate)
1622         {
               /* A high surrogate is pending.  */
1623           if (! UTF_16_LOW_SURROGATE_P (c))
1624             {
                   /* It is not followed by a low surrogate.  Store the
                      two bytes of the pending surrogate in source byte
                      order and count an error.  */
1625               if (endian == utf_16_big_endian)
1626                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1627               else
1628                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1629               *charbuf++ = c1;
1630               *charbuf++ = c2;
1631               coding->errors++;
                   /* C either becomes the new pending high surrogate or
                      is stored as the third character of this loop.  */
1632               if (UTF_16_HIGH_SURROGATE_P (c))
1633                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1634               else
1635                 *charbuf++ = c;
1636             }
1637           else
1638             {
                   /* Valid pair: combine both halves into one code point
                      at or above 0x10000 and clear the pending state.  */
1639               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1640               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1641               *charbuf++ = 0x10000 + c;
1642             }
1643         }
1644       else
1645         {
               /* A high surrogate is held back until the next unit.  */
1646           if (UTF_16_HIGH_SURROGATE_P (c))
1647             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1648           else
1649             {
1650               if (eol_dos && c == '\r')
1651                 {
                       /* Pre-read the two bytes after the CR before
                          storing it.  NOTE(review): if the source ends
                          here, ONE_MORE_BYTE presumably jumps to
                          no_more_source, leaving the CR unconsumed
                          because src_base still points at it.  */
1652                   ONE_MORE_BYTE (byte_after_cr1);
1653                   ONE_MORE_BYTE (byte_after_cr2);
1654                 }
1655               *charbuf++ = c;
1656             }
1657         }
1658     }
1659
       /* Reached both by falling out of the loop and by jumping here.  */
1660  no_more_source:
1661   coding->consumed_char += consumed_chars_base;
1662   coding->consumed = src_base - coding->source;
1663   coding->charbuf_used = charbuf - coding->charbuf;
1664 }
1665
     /* Encode the coding->charbuf_used character codes held in
        coding->charbuf as UTF-16 and append the bytes to
        coding->destination, starting at offset coding->produced.

        Unless the coding system is utf_without_bom, a BOM matching the
        configured endianness is written first and the coding system is
        then switched to utf_without_bom.  Characters above
        MAX_UNICODE_CHAR are replaced by coding->default_char, and
        characters at or above 0x10000 are written as a surrogate pair.

        The visible path records CODING_RESULT_SUCCESS, updates
        coding->produced and coding->produced_char, and returns 0.
        NOTE(review): ASSURE_DESTINATION is defined outside this chunk
        and may leave the function early -- confirm.  */
1666 static bool
1667 encode_coding_utf_16 (struct coding_system *coding)
1668 {
       /* NOTE(review): multibytep, dst_end and produced_chars are not
          updated by name below; presumably the EMIT_* and
          ASSURE_DESTINATION macros use them -- confirm.  */
1669   bool multibytep = coding->dst_multibyte;
1670   int *charbuf = coding->charbuf;
1671   int *charbuf_end = charbuf + coding->charbuf_used;
1672   unsigned char *dst = coding->destination + coding->produced;
1673   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Amount passed to ASSURE_DESTINATION before every write; each
          write below emits at most four bytes.  */
1674   int safe_room = 8;
1675   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1676   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1677   ptrdiff_t produced_chars = 0;
1678   int c;
1679
1680   if (bom != utf_without_bom)
1681     {
           /* Write the BOM once, in the byte order of the encoding.  */
1682       ASSURE_DESTINATION (safe_room);
1683       if (big_endian)
1684         EMIT_TWO_BYTES (0xFE, 0xFF);
1685       else
1686         EMIT_TWO_BYTES (0xFF, 0xFE);
1687       CODING_UTF_16_BOM (coding) = utf_without_bom;
1688     }
1689
1690   while (charbuf < charbuf_end)
1691     {
1692       ASSURE_DESTINATION (safe_room);
1693       c = *charbuf++;
           /* Substitute characters that are outside the Unicode range.  */
1694       if (c > MAX_UNICODE_CHAR)
1695         c = coding->default_char;
1696
1697       if (c < 0x10000)
1698         {
               /* One 16-bit unit.  */
1699           if (big_endian)
1700             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1701           else
1702             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1703         }
1704       else
1705         {
               /* Surrogate pair: C1 carries the upper bits on top of
                  0xD800, C2 the low ten bits on top of 0xDC00.  */
1706           int c1, c2;
1707
1708           c -= 0x10000;
1709           c1 = (c >> 10) + 0xD800;
1710           c2 = (c & 0x3FF) + 0xDC00;
1711           if (big_endian)
1712             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1713           else
1714             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1715         }
1716     }
1717   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1718   coding->produced = dst - coding->destination;
1719   coding->produced_char += produced_chars;
1720   return 0;
1721 }
1722
1723 \f
1724 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1725
1726 /* Emacs' internal format for representation of multiple character
1727    sets is a kind of multi-byte encoding, i.e. characters are
1728    represented by variable-length sequences of one-byte codes.
1729
1730    ASCII characters and control characters (e.g. `tab', `newline') are
1731    represented by one-byte sequences which are their ASCII codes, in
1732    the range 0x00 through 0x7F.
1733
1734    8-bit characters of the range 0x80..0x9F are represented by
1735    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1736    code + 0x20).
1737
1738    8-bit characters of the range 0xA0..0xFF are represented by
1739    one-byte sequences which are their 8-bit code.
1740
1741    The other characters are represented by a sequence of `base
1742    leading-code', optional `extended leading-code', and one or two
1743    `position-code's.  The length of the sequence is determined by the
1744    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1745    whereas extended leading-code and position-code take the range 0xA0
1746    through 0xFF.  See `charset.h' for more details about leading-code
1747    and position-code.
1748
1749    --- CODE RANGE of Emacs' internal format ---
1750    character set        range
1751    -------------        -----
1752    ascii                0x00..0x7F
1753    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1754    eight-bit-graphic    0xA0..0xFF
1755    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1756    ---------------------------------------------
1757
1758    As this is the internal character representation, the format is
1759    usually not used externally (i.e. in a file or in data sent to a
1760    process).  However, it is possible to have text in this format
1761    externally (i.e. by encoding with the coding system `emacs-mule').
1762
1763    In that case, a sequence of one-byte codes has a slightly different
1764    form.
1765
1766    First, all characters in eight-bit-control are represented by
1767    one-byte sequences which are their 8-bit code.
1768
1769    Next, character composition data are represented by the byte
1770    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1771    where,
1772         METHOD is 0xF2 plus one of composition method (enum
1773         composition_method),
1774
1775         BYTES is 0xA0 plus a byte length of this composition data,
1776
1777         CHARS is 0xA0 plus a number of characters composed by this
1778         data,
1779
1780         COMPONENTs are characters of multibyte form or composition
1781         rules encoded by two-byte of ASCII codes.
1782
1783    In addition, for backward compatibility, the following formats are
1784    also recognized as composition data on decoding.
1785
1786    0x80 MSEQ ...
1787    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1788
1789    Here,
1790         MSEQ is a multibyte form, but in one of these special formats:
1791           ASCII: 0xA0 ASCII_CODE+0x80,
1792           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1793         RULE is a one byte code of the range 0xA0..0xF0 that
1794         represents a composition rule.
1795   */
1796
     /* Table indexed by a byte value.  detect_coding_emacs_mule and
        emacs_mule_char read an entry as the total byte length (1 to 4)
        of the emacs-mule sequence that begins with that byte.
        NOTE(review): the table is presumably filled in elsewhere in
        this file -- confirm.  */
1797 char emacs_mule_bytes[256];
1798
1799
1800 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1801    Return true if a text is encoded in 'emacs-mule'.
        CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
        Scanning starts coding->head_ascii bytes into the source.  If an
        invalid sequence, or one of ESC, SI and SO, is met, the category
        is added to DETECT_INFO->rejected and 0 is returned.  If the
        source ends inside a sequence and this is the last block
        (CODING_MODE_LAST_BLOCK), the category is rejected as well.
        Otherwise 1 is returned, and the category is added to
        DETECT_INFO->found if at least one non-ASCII sequence or
        composition was seen.  */
1802
1803 static bool
1804 detect_coding_emacs_mule (struct coding_system *coding,
1805                           struct coding_detection_info *detect_info)
1806 {
1807   const unsigned char *src = coding->source, *src_base;
1808   const unsigned char *src_end = coding->source + coding->src_bytes;
1809   bool multibytep = coding->src_multibyte;
1810   ptrdiff_t consumed_chars = 0;
1811   int c;
       /* Becomes CATEGORY_MASK_EMACS_MULE once positive evidence is seen.  */
1812   int found = 0;
1813
1814   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1815   /* A coding system of this category is always ASCII compatible.  */
1816   src += coding->head_ascii;
1817
1818   while (1)
1819     {
           /* Start of the sequence examined in this iteration; compared
              with SRC at no_more_source to spot a truncated sequence.  */
1820       src_base = src;
1821       ONE_MORE_BYTE (c);
1822       if (c < 0)
1823         continue;
1824       if (c == 0x80)
1825         {
1826           /* Perhaps the start of composite character.  We simply skip
1827              it because analyzing it is too heavy for detecting.  But,
1828              at least, we check that the composite character
1829              consists of more than 4 bytes.  */
1830           const unsigned char *src_start;
1831
1832         repeat:
1833           src_start = src;
               /* Skip bytes up to and including the first one below
                  0xA0.  */
1834           do
1835             {
1836               ONE_MORE_BYTE (c);
1837             }
1838           while (c >= 0xA0);
1839
               /* Too short to be a composition: reject.  */
1840           if (src - src_start <= 4)
1841             break;
1842           found = CATEGORY_MASK_EMACS_MULE;
               /* Another composition starts right after this one.  */
1843           if (c == 0x80)
1844             goto repeat;
1845         }
1846
1847       if (c < 0x80)
1848         {
               /* ESC, SI and SO cause rejection of this category.  */
1849           if (c < 0x20
1850               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1851             break;
1852         }
1853       else
1854         {
               /* C starts a sequence; every remaining byte of it must be
                  0xA0 or above.  */
1855           int more_bytes = emacs_mule_bytes[c] - 1;
1856
1857           while (more_bytes > 0)
1858             {
1859               ONE_MORE_BYTE (c);
1860               if (c < 0xA0)
1861                 {
1862                   src--;        /* Unread the last byte.  */
1863                   break;
1864                 }
1865               more_bytes--;
1866             }
               /* The sequence was cut short by a byte below 0xA0.  */
1867           if (more_bytes != 0)
1868             break;
1869           found = CATEGORY_MASK_EMACS_MULE;
1870         }
1871     }
       /* Reached only through the `break's above.  */
1872   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1873   return 0;
1874
1875  no_more_source:
       /* The source ended in the middle of a sequence and no further
          block will follow.  */
1876   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1877     {
1878       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1879       return 0;
1880     }
1881   detect_info->found |= found;
1882   return 1;
1883 }
1884
1885
1886 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1887    character.  If CMP_STATUS indicates that we must expect MSEQ or
1888    RULE described above, decode it and return the negative value of
1889    the decoded character or rule.  If an invalid byte is found, return
1890    -1.  If SRC is too short, return -2.
        On success, store in *NBYTES the number of source bytes read and
        in *NCHARS the value of the local counter consumed_chars
        (NOTE(review): presumably advanced by ONE_MORE_BYTE -- confirm).
        If ID is non-null, also store the charset ID of the character in
        *ID; this is not done when a RULE byte is returned.  */
1891
1892 static int
1893 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1894                  int *nbytes, int *nchars, int *id,
1895                  struct composition_status *cmp_status)
1896 {
1897   const unsigned char *src_end = coding->source + coding->src_bytes;
1898   const unsigned char *src_base = src;
1899   bool multibytep = coding->src_multibyte;
1900   int charset_ID;
1901   unsigned code;
1902   int c;
1903   int consumed_chars = 0;
       /* Set when C was read in the old-form MSEQ representation; the
          result is then returned negated.  */
1904   bool mseq_found = 0;
1905
1906   ONE_MORE_BYTE (c);
1907   if (c < 0)
1908     {
           /* A negative value is negated and returned as the character,
              with the charset taken from emacs_mule_charset[0]; no
              further decoding is done.  */
1909       c = -c;
1910       charset_ID = emacs_mule_charset[0];
1911     }
1912   else
1913     {
1914       if (c >= 0xA0)
1915         {
               /* A first byte of 0xA0 or above is valid only inside an
                  old-form composition.  */
1916           if (cmp_status->state != COMPOSING_NO
1917               && cmp_status->old_form)
1918             {
1919               if (cmp_status->state == COMPOSING_CHAR)
1920                 {
                       /* MSEQ: 0xA0 introduces an ASCII character stored
                          as ASCII_CODE + 0x80; any other byte is a
                          leading code stored as LEADING_CODE + 0x20.  */
1921                   if (c == 0xA0)
1922                     {
1923                       ONE_MORE_BYTE (c);
1924                       c -= 0x80;
1925                       if (c < 0)
1926                         goto invalid_code;
1927                     }
1928                   else
1929                     c -= 0x20;
1930                   mseq_found = 1;
1931                 }
1932               else
1933                 {
                       /* Not expecting a character: hand the byte back
                          negated, undecoded.  */
1934                   *nbytes = src - src_base;
1935                   *nchars = consumed_chars;
1936                   return -c;
1937                 }
1938             }
1939           else
1940             goto invalid_code;
1941         }
1942
           /* Dispatch on the total length of the sequence that starts
              with C.  */
1943       switch (emacs_mule_bytes[c])
1944         {
1945         case 2:
               /* Leading code followed by one position code.  */
1946           if ((charset_ID = emacs_mule_charset[c]) < 0)
1947             goto invalid_code;
1948           ONE_MORE_BYTE (c);
1949           if (c < 0xA0)
1950             goto invalid_code;
1951           code = c & 0x7F;
1952           break;
1953
1954         case 3:
1955           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1956               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1957             {
                   /* Private leading code: the next byte selects the
                      charset and one position code follows.  */
1958               ONE_MORE_BYTE (c);
1959               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1960                 goto invalid_code;
1961               ONE_MORE_BYTE (c);
1962               if (c < 0xA0)
1963                 goto invalid_code;
1964               code = c & 0x7F;
1965             }
1966           else
1967             {
                   /* Leading code followed by two position codes.  */
1968               if ((charset_ID = emacs_mule_charset[c]) < 0)
1969                 goto invalid_code;
1970               ONE_MORE_BYTE (c);
1971               if (c < 0xA0)
1972                 goto invalid_code;
1973               code = (c & 0x7F) << 8;
1974               ONE_MORE_BYTE (c);
1975               if (c < 0xA0)
1976                 goto invalid_code;
1977               code |= c & 0x7F;
1978             }
1979           break;
1980
1981         case 4:
               /* The next byte selects the charset and two position
                  codes follow.  */
1982           ONE_MORE_BYTE (c);
1983           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1984             goto invalid_code;
1985           ONE_MORE_BYTE (c);
1986           if (c < 0xA0)
1987             goto invalid_code;
1988           code = (c & 0x7F) << 8;
1989           ONE_MORE_BYTE (c);
1990           if (c < 0xA0)
1991             goto invalid_code;
1992           code |= c & 0x7F;
1993           break;
1994
1995         case 1:
               /* Single byte: ASCII, or else an eight-bit byte.  */
1996           code = c;
1997           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
1998           break;
1999
2000         default:
2001           emacs_abort ();
2002         }
           /* Decode CODE in the selected charset into C; a negative C
              is treated as failure just below.  The comment in
              decode_coding_emacs_mule notes that this function may
              cause buffer text to be relocated.  */
2003       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2004                           CHARSET_FROM_ID (charset_ID), code, c);
2005       if (c < 0)
2006         goto invalid_code;
2007     }
2008   *nbytes = src - src_base;
2009   *nchars = consumed_chars;
2010   if (id)
2011     *id = charset_ID;
2012   return (mseq_found ? -c : c);
2013
2014  no_more_source:
2015   return -2;
2016
2017  invalid_code:
2018   return -1;
2019 }
2020
2021
2022 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2023
2024 /* Handle these composition sequences ('|': the end of header elements,
2025    BYTES and CHARS >= 0xA0):
2026
2027    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2028    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2029    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2030
2031    and these old forms:
2032
2033    (4) relative composition: 0x80 | MSEQ ... MSEQ
2034    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2035
2036    When the starter 0x80 and the following header elements are found,
2037    this annotation header is produced.
2038
2039         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2040
2041    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2042    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2043
2044    Then, upon reading the following elements, these codes are produced
2045    until the composition end is found:
2046
2047    (1) CHAR ... CHAR
2048    (2) ALT ... ALT CHAR ... CHAR
2049    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2050    (4) CHAR ... CHAR
2051    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2052
2053    When the composition end is found, LENGTH and NCHARS in the
2054    annotation header are updated as below:
2055
2056    (1) LENGTH: unchanged, NCHARS: unchanged
2057    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2058    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2059    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2060    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2061
2062    If an error is found while composing, the annotation header is
2063    changed to the original composition header (plus filler -1s) as
2064    below:
2065
2066    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2067    (5)          [ 0x80 0xFF -1 -1 -1 ]
2068
2069    and the sequence [ -2 DECODED-RULE ] is changed to the original
2070    byte sequence as below:
2071         o the original byte sequence is B: [ B -1 ]
2072         o the original byte sequence is B1 B2: [ B1 B2 ]
2073
2074    Most of the routines are implemented by macros because many
2075    variables and labels in the caller decode_coding_emacs_mule must be
2076    accessible, and they are usually called just once (so this does not
2077    increase the size of the compiled object).  */
2078
2079 /* Decode a composition rule represented by C as a component of
2080    composition sequence of Emacs 20 style.  Set RULE to the decoded
2081    rule.  C is modified: 0xA0 is subtracted from it, and the result
        must lie in 0..80, otherwise control jumps to the label
        invalid_code of the expanding function.  The value is split into
        GREF = C / 9 and NREF = C % 9, and a value of 4 in either one is
        replaced by 10 before COMPOSITION_ENCODE_RULE is applied.  */
2082
2083 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2084   do {                                                  \
2085     int gref, nref;                                     \
2086                                                         \
2087     c -= 0xA0;                                          \
2088     if (c < 0 || c >= 81)                               \
2089       goto invalid_code;                                \
2090     gref = c / 9, nref = c % 9;                         \
2091     if (gref == 4) gref = 10;                           \
2092     if (nref == 4) nref = 10;                           \
2093     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2094   } while (0)
2095
2096
2097 /* Decode a composition rule represented by C and the following byte
2098    at SRC as a component of composition sequence of Emacs 21 style.
2099    Set RULE to the decoded rule.  C gives GREF and the next byte,
        fetched with ONE_MORE_BYTE into C, gives NREF; each is the byte
        value minus 0x20 and must lie in 0..80, otherwise control jumps
        to the label invalid_code of the expanding function.  */
2100
2101 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2102   do {                                                  \
2103     int gref, nref;                                     \
2104                                                         \
2105     gref = c - 0x20;                                    \
2106     if (gref < 0 || gref >= 81)                         \
2107       goto invalid_code;                                \
2108     ONE_MORE_BYTE (c);                                  \
2109     nref = c - 0x20;                                    \
2110     if (nref < 0 || nref >= 81)                         \
2111       goto invalid_code;                                \
2112     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2113   } while (0)
2114
2115
2116 /* Start of Emacs 21 style format.  On entry C holds the method byte
2117    (0xF2 + METHOD); the next two bytes at SRC are (0xA0 + BYTES) and
2118    (0xA0 + CHARS), where BYTES is the byte length of this composition
2119    information and CHARS is the number of characters composed by this
        composition.  BYTES must be at least 3, and exactly 4 for a
        relative composition; CHARS must be positive and less than
        MAX_COMPOSITION_COMPONENTS.  Otherwise control jumps to the label
        invalid_code of the expanding function.  On success CMP_STATUS is
        initialized (ncomps is set to BYTES - 4) and the annotation
        header is added through ADD_COMPOSITION_DATA.  */
2120
2121 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2122   do {                                                                  \
2123     enum composition_method method = c - 0xF2;                          \
2124     int nbytes, nchars;                                                 \
2125                                                                         \
2126     ONE_MORE_BYTE (c);                                                  \
2127     if (c < 0)                                                          \
2128       goto invalid_code;                                                \
2129     nbytes = c - 0xA0;                                                  \
2130     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2131       goto invalid_code;                                                \
2132     ONE_MORE_BYTE (c);                                                  \
2133     nchars = c - 0xA0;                                                  \
2134     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2135       goto invalid_code;                                                \
2136     cmp_status->old_form = 0;                                           \
2137     cmp_status->method = method;                                        \
2138     if (method == COMPOSITION_RELATIVE)                                 \
2139       cmp_status->state = COMPOSING_CHAR;                               \
2140     else                                                                \
2141       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2142     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2143     cmp_status->nchars = nchars;                                        \
2144     cmp_status->ncomps = nbytes - 4;                                    \
2145     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2146   } while (0)
2147
2148
2149 /* Start of Emacs 20 style format for relative composition.  Mark
        CMP_STATUS as an old-form relative composition that expects a
        character next, with zero characters and components so far, and
        add an annotation header whose NCHARS and NBYTES are 0.  */
2150
2151 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2152   do {                                                          \
2153     cmp_status->old_form = 1;                                   \
2154     cmp_status->method = COMPOSITION_RELATIVE;                  \
2155     cmp_status->state = COMPOSING_CHAR;                         \
2156     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2157     cmp_status->nchars = cmp_status->ncomps = 0;                \
2158     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2159   } while (0)
2160
2161
2162 /* Start of Emacs 20 style format for rule-base composition.  Same
        as the relative case except that the method recorded in
        CMP_STATUS and in the annotation header is
        COMPOSITION_WITH_RULE.  */
2163
2164 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2165   do {                                                          \
2166     cmp_status->old_form = 1;                                   \
2167     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2168     cmp_status->state = COMPOSING_CHAR;                         \
2169     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2170     cmp_status->nchars = cmp_status->ncomps = 0;                \
2171     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2172   } while (0)
2173
2174
     /* Start a composition.  decode_coding_emacs_mule expands this after
        reading the starter byte 0x80.  The next byte is read into C and
        selects the format:
          0xF2 + a composition method: Emacs 21 style header;
          0xA0..0xBF: Emacs 20 style relative composition; the byte
            belongs to the first component, so SRC is moved back to it;
          0xFF: Emacs 20 style rule-base composition.
        A negative C or any other byte makes control jump to the label
        invalid_code of the expanding function.  */
2175 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2176   do {                                                  \
2177     const unsigned char *current_src = src;             \
2178                                                         \
2179     ONE_MORE_BYTE (c);                                  \
2180     if (c < 0)                                          \
2181       goto invalid_code;                                \
2182     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2183         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2184       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2185     else if (c < 0xA0)                                  \
2186       goto invalid_code;                                \
2187     else if (c < 0xC0)                                  \
2188       {                                                 \
2189         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2190         /* Re-read C as a composition component.  */    \
2191         src = current_src;                              \
2192       }                                                 \
2193     else if (c == 0xFF)                                 \
2194       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2195     else                                                \
2196       goto invalid_code;                                \
2197   } while (0)
2198
     /* Close a successfully decoded composition.  IDX is the offset,
        relative to CHARBUF, of the annotation header
        [ -LENGTH MASK NCHARS NBYTES METHOD ] (NOTE(review): this assumes
        cmp_status->length counts every element stored since the start
        of the header -- confirm).  For the old form the NCHARS slot
        receives the number of characters counted so far; for a new-form
        method other than COMPOSITION_RELATIVE the first slot is set to
        the NCHARS slot minus cmp_status->length.  The state then
        returns to COMPOSING_NO.  */
2199 #define EMACS_MULE_COMPOSITION_END()                            \
2200   do {                                                          \
2201     int idx = - cmp_status->length;                             \
2202                                                                 \
2203     if (cmp_status->old_form)                                   \
2204       charbuf[idx + 2] = cmp_status->nchars;                    \
2205     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2206       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2207     cmp_status->state = COMPOSING_NO;                           \
2208   } while (0)
2209
2210
     /* Finish a composition that is still open in CMP_STATUS, where
        CHARBUF points just past the last element stored for it.

        If the composition is of the old form and holds at least one
        character, its annotation header is kept and only the NCHARS
        slot is filled in.  Otherwise the five header slots are
        overwritten in place: the first ones receive the original
        header bytes as raw 8-bit characters, the rest the filler
        values stored below.

        Set cmp_status->state to COMPOSING_NO and return a count that
        EMACS_MULE_MAYBE_FINISH_COMPOSITION adds to the caller's
        character offset.  */
2211 static int
2212 emacs_mule_finish_composition (int *charbuf,
2213                                struct composition_status *cmp_status)
2214 {
       /* Offset of the annotation header relative to CHARBUF.
          NOTE(review): assumes cmp_status->length counts every element
          stored since the start of the header -- confirm.  */
2215   int idx = - cmp_status->length;
2216   int new_chars;
2217
2218   if (cmp_status->old_form && cmp_status->nchars > 0)
2219     {
           /* Keep the composition; record how many characters it has.  */
2220       charbuf[idx + 2] = cmp_status->nchars;
2221       new_chars = 0;
2222       if (cmp_status->method == COMPOSITION_WITH_RULE
2223           && cmp_status->state == COMPOSING_CHAR)
2224         {
2225           /* The last rule was invalid.  */
               /* The trailing pair [ -2 RULE ] is replaced by the raw
                  8-bit character for RULE + 0xA0 and a filler -1.  */
2226           int rule = charbuf[-1] + 0xA0;
2227
2228           charbuf[-2] = BYTE8_TO_CHAR (rule);
2229           charbuf[-1] = -1;
2230           new_chars = 1;
2231         }
2232     }
2233   else
2234     {
           /* Turn the header back into source bytes, beginning with the
              starter byte 0x80.  */
2235       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2236
2237       if (cmp_status->method == COMPOSITION_WITH_RULE)
2238         {
2239           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2240           charbuf[idx++] = -3;
2241           charbuf[idx++] = 0;
2242           new_chars = 1;
2243         }
2244       else
2245         {
               /* IDX now addresses the second header slot, so these read
                  the NCHARS and NBYTES slots.  They must be read before
                  the stores below overwrite them.  */
2246           int nchars = charbuf[idx + 1] + 0xA0;
2247           int nbytes = charbuf[idx + 2] + 0xA0;
2248
2249           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2250           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2251           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2252           charbuf[idx++] = -1;
2253           new_chars = 4;
2254         }
2255     }
2256   cmp_status->state = COMPOSING_NO;
2257   return new_chars;
2258 }
2259
     /* If a composition is open in CMP_STATUS, finish it with
        emacs_mule_finish_composition and add the count it returns to
        the variable char_offset of the expanding function.  */
2260 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2261   do {                                                                    \
2262     if (cmp_status->state != COMPOSING_NO)                                \
2263       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2264   } while (0)
2265
2266
2267 static void
2268 decode_coding_emacs_mule (struct coding_system *coding)
2269 {
2270   const unsigned char *src = coding->source + coding->consumed;
2271   const unsigned char *src_end = coding->source + coding->src_bytes;
2272   const unsigned char *src_base;
2273   int *charbuf = coding->charbuf + coding->charbuf_used;
2274   /* We may produce two annotations (charset and composition) in one
2275      loop and one more charset annotation at the end.  */
2276   int *charbuf_end
2277     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2278       /* We can produce up to 2 characters in a loop.  */
2279       - 1;
2280   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2281   bool multibytep = coding->src_multibyte;
2282   ptrdiff_t char_offset = coding->produced_char;
2283   ptrdiff_t last_offset = char_offset;
2284   int last_id = charset_ascii;
2285   bool eol_dos
2286     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2287   int byte_after_cr = -1;
2288   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2289
2290   if (cmp_status->state != COMPOSING_NO)
2291     {
2292       int i;
2293
2294       if (charbuf_end - charbuf < cmp_status->length)
2295         emacs_abort ();
2296       for (i = 0; i < cmp_status->length; i++)
2297         *charbuf++ = cmp_status->carryover[i];
2298       coding->annotated = 1;
2299     }
2300
2301   while (1)
2302     {
2303       int c, id IF_LINT (= 0);
2304
2305       src_base = src;
2306       consumed_chars_base = consumed_chars;
2307
2308       if (charbuf >= charbuf_end)
2309         {
2310           if (byte_after_cr >= 0)
2311             src_base--;
2312           break;
2313         }
2314
2315       if (byte_after_cr >= 0)
2316         c = byte_after_cr, byte_after_cr = -1;
2317       else
2318         ONE_MORE_BYTE (c);
2319
2320       if (c < 0 || c == 0x80)
2321         {
2322           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2323           if (c < 0)
2324             {
2325               *charbuf++ = -c;
2326               char_offset++;
2327             }
2328           else
2329             DECODE_EMACS_MULE_COMPOSITION_START ();
2330           continue;
2331         }
2332
2333       if (c < 0x80)
2334         {
2335           if (eol_dos && c == '\r')
2336             ONE_MORE_BYTE (byte_after_cr);
2337           id = charset_ascii;
2338           if (cmp_status->state != COMPOSING_NO)
2339             {
2340               if (cmp_status->old_form)
2341                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2342               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2343                 cmp_status->ncomps--;
2344             }
2345         }
2346       else
2347         {
2348           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2349           /* emacs_mule_char can load a charset map from a file, which
2350              allocates a large structure and might cause buffer text
2351              to be relocated as result.  Thus, we need to remember the
2352              original pointer to buffer text, and fix up all related
2353              pointers after the call.  */
2354           const unsigned char *orig = coding->source;
2355           ptrdiff_t offset;
2356
2357           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2358                                cmp_status);
2359           offset = coding->source - orig;
2360           if (offset)
2361             {
2362               src += offset;
2363               src_base += offset;
2364               src_end += offset;
2365             }
2366           if (c < 0)
2367             {
2368               if (c == -1)
2369                 goto invalid_code;
2370               if (c == -2)
2371                 break;
2372             }
2373           src = src_base + nbytes;
2374           consumed_chars = consumed_chars_base + nchars;
2375           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2376             cmp_status->ncomps -= nchars;
2377         }
2378
2379       /* Now if C >= 0, we found a normally encoded character, if C <
2380          0, we found an old-style composition component character or
2381          rule.  */
2382
2383       if (cmp_status->state == COMPOSING_NO)
2384         {
2385           if (last_id != id)
2386             {
2387               if (last_id != charset_ascii)
2388                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2389                                   last_id);
2390               last_id = id;
2391               last_offset = char_offset;
2392             }
2393           *charbuf++ = c;
2394           char_offset++;
2395         }
2396       else if (cmp_status->state == COMPOSING_CHAR)
2397         {
2398           if (cmp_status->old_form)
2399             {
2400               if (c >= 0)
2401                 {
2402                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2403                   *charbuf++ = c;
2404                   char_offset++;
2405                 }
2406               else
2407                 {
2408                   *charbuf++ = -c;
2409                   cmp_status->nchars++;
2410                   cmp_status->length++;
2411                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2412                     EMACS_MULE_COMPOSITION_END ();
2413                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2414                     cmp_status->state = COMPOSING_RULE;
2415                 }
2416             }
2417           else
2418             {
2419               *charbuf++ = c;
2420               cmp_status->length++;
2421               cmp_status->nchars--;
2422               if (cmp_status->nchars == 0)
2423                 EMACS_MULE_COMPOSITION_END ();
2424             }
2425         }
2426       else if (cmp_status->state == COMPOSING_RULE)
2427         {
2428           int rule;
2429
2430           if (c >= 0)
2431             {
2432               EMACS_MULE_COMPOSITION_END ();
2433               *charbuf++ = c;
2434               char_offset++;
2435             }
2436           else
2437             {
2438               c = -c;
2439               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2440               if (rule < 0)
2441                 goto invalid_code;
2442               *charbuf++ = -2;
2443               *charbuf++ = rule;
2444               cmp_status->length += 2;
2445               cmp_status->state = COMPOSING_CHAR;
2446             }
2447         }
2448       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2449         {
2450           *charbuf++ = c;
2451           cmp_status->length++;
2452           if (cmp_status->ncomps == 0)
2453             cmp_status->state = COMPOSING_CHAR;
2454           else if (cmp_status->ncomps > 0)
2455             {
2456               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2457                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2458             }
2459           else
2460             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2461         }
2462       else                      /* COMPOSING_COMPONENT_RULE */
2463         {
2464           int rule;
2465
2466           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2467           if (rule < 0)
2468             goto invalid_code;
2469           *charbuf++ = -2;
2470           *charbuf++ = rule;
2471           cmp_status->length += 2;
2472           cmp_status->ncomps--;
2473           if (cmp_status->ncomps > 0)
2474             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2475           else
2476             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2477         }
2478       continue;
2479
2480     invalid_code:
2481       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2482       src = src_base;
2483       consumed_chars = consumed_chars_base;
2484       ONE_MORE_BYTE (c);
2485       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2486       char_offset++;
2487       coding->errors++;
2488     }
2489
2490  no_more_source:
2491   if (cmp_status->state != COMPOSING_NO)
2492     {
2493       if (coding->mode & CODING_MODE_LAST_BLOCK)
2494         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2495       else
2496         {
2497           int i;
2498
2499           charbuf -= cmp_status->length;
2500           for (i = 0; i < cmp_status->length; i++)
2501             cmp_status->carryover[i] = charbuf[i];
2502         }
2503     }
2504   if (last_id != charset_ascii)
2505     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2506   coding->consumed_char += consumed_chars_base;
2507   coding->consumed = src_base - coding->source;
2508   coding->charbuf_used = charbuf - coding->charbuf;
2509 }
2510
2511
     /* Store in CODES the emacs-mule leading code(s) for the charset
        whose emacs-mule ID is ID.  If ID is below 0xA0, ID itself is the
        only leading code and CODES[1] is set to 0.  Otherwise CODES[0]
        is one of the bytes 0x9A, 0x9B, 0x9C or 0x9D, chosen by the range
        ID falls in, and CODES[1] is ID.

        ID and CODES are evaluated more than once, so pass expressions
        without side effects.  The expansion is a single statement with
        no trailing semicolon; the caller supplies it, which keeps the
        macro usable in an unbraced if/else.  */
2512 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2513   do {                                          \
2514     if ((id) < 0xA0)                            \
2515       (codes)[0] = (id), (codes)[1] = 0;        \
2516     else if ((id) < 0xE0)                       \
2517       (codes)[0] = 0x9A, (codes)[1] = (id);     \
2518     else if ((id) < 0xF0)                       \
2519       (codes)[0] = 0x9B, (codes)[1] = (id);     \
2520     else if ((id) < 0xF5)                       \
2521       (codes)[0] = 0x9C, (codes)[1] = (id);     \
2522     else                                        \
2523       (codes)[0] = 0x9D, (codes)[1] = (id);     \
2524   } while (0)
2525
2526
     /* Encode the characters in CODING->charbuf (CODING->charbuf_used
        elements) in the emacs-mule format and write the bytes to
        CODING->destination, starting at offset CODING->produced.

        A negative element -LEN starts an annotation occupying LEN
        elements in total.  Composition annotations are skipped (not yet
        implemented); a charset annotation selects a preferred charset
        for the following characters, provided that charset is in the
        charset list.  Any other annotation type aborts.

        ASCII characters are emitted unchanged and raw 8-bit characters
        as their single byte.  Every other character is emitted as the
        leading code(s) of its charset followed by one code byte
        (dimension 1) or two code bytes (otherwise), each with the high
        bit set.  A character for which no charset is found is replaced
        by CODING->default_char.

        On completion, record CODING_RESULT_SUCCESS, add the local count
        PRODUCED_CHARS to CODING->produced_char, set CODING->produced to
        the number of bytes now in the destination, and return 0.  */
2527 static bool
2528 encode_coding_emacs_mule (struct coding_system *coding)
2529 {
     /* MULTIBYTEP, DST, DST_END and PRODUCED_CHARS are not all named
        explicitly below; presumably the ASSURE_DESTINATION and EMIT_*
        macros use them implicitly -- confirm against their definitions.  */
2530   bool multibytep = coding->dst_multibyte;
2531   int *charbuf = coding->charbuf;
2532   int *charbuf_end = charbuf + coding->charbuf_used;
2533   unsigned char *dst = coding->destination + coding->produced;
2534   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2535   int safe_room = 8;
2536   ptrdiff_t produced_chars = 0;
2537   Lisp_Object attrs, charset_list;
2538   int c;
2539   int preferred_charset_id = -1;
2540
2541   CODING_GET_INFO (coding, attrs, charset_list);
       /* This encoder always works with Vemacs_mule_charset_list; install
          it in the attribute vector if another list is recorded there.  */
2542   if (! EQ (charset_list, Vemacs_mule_charset_list))
2543     {
2544       charset_list = Vemacs_mule_charset_list;
2545       ASET (attrs, coding_attr_charset_list, charset_list);
2546     }
2547
2548   while (charbuf < charbuf_end)
2549     {
           /* Presumably makes sure SAFE_ROOM bytes are available at DST
              before anything is emitted for this element -- confirm.  */
2550       ASSURE_DESTINATION (safe_room);
2551       c = *charbuf++;
2552
2553       if (c < 0)
2554         {
2555           /* Handle an annotation.  */
               /* CHARBUF now points at the element after the length
                  word; that element identifies the annotation type.  */
2556           switch (*charbuf)
2557             {
2558             case CODING_ANNOTATE_COMPOSITION_MASK:
2559               /* Not yet implemented.  */
2560               break;
2561             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): index 3 is counted from the type word,
                      i.e. the fifth element of the annotation including
                      the length word.  Confirm that this is where the
                      producer of charset annotations stores the charset
                      ID and that it lies inside the annotation.  */
2562               preferred_charset_id = charbuf[3];
                   /* Ignore a preferred charset that is not in the list.  */
2563               if (preferred_charset_id >= 0
2564                   && NILP (Fmemq (make_number (preferred_charset_id),
2565                                   charset_list)))
2566                 preferred_charset_id = -1;
2567               break;
2568             default:
2569               emacs_abort ();
2570             }
               /* Skip the rest of the annotation: -C elements in total,
                  one of which (the length word) was consumed above.  */
2571           charbuf += -c - 1;
2572           continue;
2573         }
2574
2575       if (ASCII_CHAR_P (c))
2576         EMIT_ONE_ASCII_BYTE (c);
2577       else if (CHAR_BYTE8_P (c))
2578         {
2579           c = CHAR_TO_BYTE8 (c);
2580           EMIT_ONE_BYTE (c);
2581         }
2582       else
2583         {
2584           struct charset *charset;
2585           unsigned code;
2586           int dimension;
2587           int emacs_mule_id;
2588           unsigned char leading_codes[2];
2589
               /* Try the preferred charset first, if any; fall back to a
                  search through CHARSET_LIST when it cannot encode C.  */
2590           if (preferred_charset_id >= 0)
2591             {
2592               bool result;
2593
2594               charset = CHARSET_FROM_ID (preferred_charset_id);
2595               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2596               if (result)
2597                 code = ENCODE_CHAR (charset, c);
2598               else
2599                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2600                                      &code, charset);
2601             }
2602           else
2603             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2604                                  &code, charset);
2605           if (! charset)
2606             {
                   /* No charset can encode C: substitute the coding
                      system's default character.  */
2607               c = coding->default_char;
2608               if (ASCII_CHAR_P (c))
2609                 {
2610                   EMIT_ONE_ASCII_BYTE (c);
2611                   continue;
2612                 }
2613               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2614                                    &code, charset);
                   /* NOTE(review): CHARSET is dereferenced below without
                      a second null check; presumably the default
                      character is always encodable -- confirm.  */
2615             }
2616           dimension = CHARSET_DIMENSION (charset);
2617           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2618           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2619           EMIT_ONE_BYTE (leading_codes[0]);
               /* A zero second leading code means there is only one.  */
2620           if (leading_codes[1])
2621             EMIT_ONE_BYTE (leading_codes[1]);
               /* Code bytes are emitted with the high bit set.  */
2622           if (dimension == 1)
2623             EMIT_ONE_BYTE (code | 0x80);
2624           else
2625             {
2626               code |= 0x8080;
2627               EMIT_ONE_BYTE (code >> 8);
2628               EMIT_ONE_BYTE (code & 0xFF);
2629             }
2630         }
2631     }
2632   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2633   coding->produced_char += produced_chars;
2634   coding->produced = dst - coding->destination;
2635   return 0;
2636 }
2637
2638 \f
2639 /*** 7. ISO2022 handlers ***/
2640
2641 /* The following note describes the coding system ISO2022 briefly.
2642    Since the intention of this note is to help understand the
2643    functions in this file, some parts are NOT ACCURATE or are OVERLY
2644    SIMPLIFIED.  For thorough understanding, please refer to the
2645    original document of ISO2022.  This is equivalent to the standard
2646    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2647
2648    ISO2022 provides many mechanisms to encode several character sets
2649    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2650    is encoded using bytes less than 128.  This may make the encoded
2651    text a little bit longer, but the text passes more easily through
2652    several types of gateway, some of which strip off the MSB (Most
2653    Significant Bit).
2654
2655    There are two kinds of character sets: control character sets and
2656    graphic character sets.  The former contain control characters such
2657    as `newline' and `escape' to provide control functions (control
2658    functions are also provided by escape sequences).  The latter
2659    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2660    two control character sets and many graphic character sets.
2661
2662    Graphic character sets are classified into one of the following
2663    four classes, according to the number of bytes (DIMENSION) and
2664    number of characters in one dimension (CHARS) of the set:
2665    - DIMENSION1_CHARS94
2666    - DIMENSION1_CHARS96
2667    - DIMENSION2_CHARS94
2668    - DIMENSION2_CHARS96
2669
2670    In addition, each character set is assigned an identification tag,
2671    unique for each set, called the "final character" (denoted as <F>
2672    hereafter).  The <F> of each character set is decided by ECMA(*)
2673    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2674    (0x30..0x3F are for private use only).
2675
2676    Note (*): ECMA = European Computer Manufacturers Association
2677
2678    Here are examples of graphic character sets [NAME(<F>)]:
2679         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2680         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2681         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2682         o DIMENSION2_CHARS96 -- none for the moment
2683
2684    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2685         C0 [0x00..0x1F] -- control character plane 0
2686         GL [0x20..0x7F] -- graphic character plane 0
2687         C1 [0x80..0x9F] -- control character plane 1
2688         GR [0xA0..0xFF] -- graphic character plane 1
2689
2690    A control character set is directly designated and invoked to C0 or
2691    C1 by an escape sequence.  The most common case is that:
2692    - ISO646's  control character set is designated/invoked to C0, and
2693    - ISO6429's control character set is designated/invoked to C1,
2694    and usually these designations/invocations are omitted in encoded
2695    text.  In a 7-bit environment, only C0 can be used, and a control
2696    character for C1 is encoded by an appropriate escape sequence to
2697    fit into the environment.  All control characters for C1 are
2698    defined to have corresponding escape sequences.
2699
2700    A graphic character set is at first designated to one of four
2701    graphic registers (G0 through G3), then these graphic registers are
2702    invoked to GL or GR.  These designations and invocations can be
2703    done independently.  The most common case is that G0 is invoked to
2704    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2705    these invocations and designations are omitted in encoded text.
2706    In a 7-bit environment, only GL can be used.
2707
2708    When a graphic character set of CHARS94 is invoked to GL, codes
2709    0x20 and 0x7F of the GL area work as control characters SPACE and
2710    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2711    be used.
2712
2713    There are two ways of invocation: locking-shift and single-shift.
2714    With locking-shift, the invocation lasts until the next different
2715    invocation, whereas with single-shift, the invocation affects the
2716    following character only and doesn't affect the locking-shift
2717    state.  Invocations are done by the following control characters or
2718    escape sequences:
2719
2720    ----------------------------------------------------------------------
2721    abbrev  function                  cntrl escape seq   description
2722    ----------------------------------------------------------------------
2723    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2724    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2725    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2726    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2727    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2728    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2729    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2730    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2731    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2732    ----------------------------------------------------------------------
2733    (*) These are not used by any known coding system.
2734
2735    Control characters for these functions are defined by macros
2736    ISO_CODE_XXX in `coding.h'.
2737
2738    Designations are done by the following escape sequences:
2739    ----------------------------------------------------------------------
2740    escape sequence      description
2741    ----------------------------------------------------------------------
2742    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2743    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2744    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2745    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2746    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2747    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2748    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2749    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2750    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2751    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2752    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2753    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2754    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2755    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2756    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2757    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2758    ----------------------------------------------------------------------
2759
2760    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2761    of dimension 1, chars 94, and final character <F>, etc...
2762
2763    Note (*): Although these designations are not allowed in ISO2022,
2764    Emacs accepts them on decoding, and produces them on encoding
2765    CHARS96 character sets in a coding system which is characterized as
2766    7-bit environment, non-locking-shift, and non-single-shift.
2767
2768    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2769    '(' must be omitted.  We refer to this as "short-form" hereafter.
2770
2771    Now you may notice that there are a lot of ways of encoding the
2772    same multilingual text in ISO2022.  Actually, there exist many
2773    coding systems such as Compound Text (used in X11's inter-client
2774    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2775    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2776    localized platforms), and all of these are variants of ISO2022.
2777
2778    In addition to the above, Emacs handles two more kinds of escape
2779    sequences: ISO6429's direction specification and Emacs' private
2780    sequence for specifying character composition.
2781
2782    ISO6429's direction specification takes the following form:
2783         o CSI ']'      -- end of the current direction
2784         o CSI '0' ']'  -- end of the current direction
2785         o CSI '1' ']'  -- start of left-to-right text
2786         o CSI '2' ']'  -- start of right-to-left text
2787    The control character CSI (0x9B: control sequence introducer) is
2788    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2789
2790    Character composition specification takes the following form:
2791         o ESC '0' -- start relative composition
2792         o ESC '1' -- end composition
2793         o ESC '2' -- start rule-base composition (*)
2794         o ESC '3' -- start relative composition with alternate chars  (**)
2795         o ESC '4' -- start rule-base composition with alternate chars  (**)
2796   Since these are not standard escape sequences of any ISO standard,
2797   the use of them with these meanings is restricted to Emacs only.
2798
2799   (*) This form is used only in Emacs 20.7 and older versions,
2800   but newer versions can safely decode it.
2801   (**) This form is used only in Emacs 21.1 and newer versions,
2802   and older versions can't decode it.
2803
2804   Here's a list of example usages of these composition escape
2805   sequences (categorized by `enum composition_method').
2806
2807   COMPOSITION_RELATIVE:
2808         ESC 0 CHAR [ CHAR ] ESC 1
2809   COMPOSITION_WITH_RULE:
2810         ESC 2 CHAR [ RULE CHAR ] ESC 1
2811   COMPOSITION_WITH_ALTCHARS:
2812         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2813   COMPOSITION_WITH_RULE_ALTCHARS:
2814         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2815
     /* Table of 256 `enum iso_code_class_type' values.  Presumably
        indexed by byte value to give the ISO 2022 class of each code;
        it is filled in elsewhere -- confirm at its initialization
        site.  */
2816 static enum iso_code_class_type iso_code_class[256];
2817
     /* True if the charset whose ID is ID has an entry in CODING's
        safe_charsets table (ID <= CODING->max_charset_id) and that entry
        is not 255.  setup_iso_safe_charsets fills the table with 255 and
        overwrites the entries of the charsets it accepts.  ID is
        evaluated twice and is not checked for being negative.  */
2818 #define SAFE_CHARSET_P(coding, id)      \
2819   ((id) <= (coding)->max_charset_id     \
2820    && (coding)->safe_charsets[id] != 255)
2821
     /* Make sure the coding_attr_safe_charsets slot of the coding system
        attribute vector ATTRS holds a string, computing it if necessary.

        The string is indexed by charset ID and is one byte longer than
        the largest ID in the charset list.  Each byte is 255 by default;
        for a charset in the list it becomes the value paired with that
        charset in the coding_attr_iso_request alist or, failing that,
        the car (for charsets without iso_chars_96) or the cdr (for
        charsets with it) of coding_attr_iso_usage, when that value is
        less than 4.

        If CODING_ISO_FLAG_FULL_SUPPORT is set and the charset list is
        not Viso_2022_charset_list, first install that list and discard
        any existing string so that it is rebuilt.  */
2822 static void
2823 setup_iso_safe_charsets (Lisp_Object attrs)
2824 {
2825   Lisp_Object charset_list, safe_charsets;
2826   Lisp_Object request;
2827   Lisp_Object reg_usage;
2828   Lisp_Object tail;
2829   EMACS_INT reg94, reg96;
2830   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2831   int max_charset_id;
2832
2833   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2834   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2835       && ! EQ (charset_list, Viso_2022_charset_list))
2836     {
2837       charset_list = Viso_2022_charset_list;
2838       ASET (attrs, coding_attr_charset_list, charset_list);
           /* Invalidate the cached string; it was built for the old list.  */
2839       ASET (attrs, coding_attr_safe_charsets, Qnil);
2840     }
2841
       /* A string here means the table was already computed.  */
2842   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2843     return;
2844
       /* Find the largest charset ID to size the table.  */
2845   max_charset_id = 0;
2846   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2847     {
2848       int id = XINT (XCAR (tail));
2849       if (max_charset_id < id)
2850         max_charset_id = id;
2851     }
2852
       /* Start with every charset marked 255, the value SAFE_CHARSET_P
          treats as "not safe".  */
2853   safe_charsets = make_uninit_string (max_charset_id + 1);
2854   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2855   request = AREF (attrs, coding_attr_iso_request);
2856   reg_usage = AREF (attrs, coding_attr_iso_usage);
2857   reg94 = XINT (XCAR (reg_usage));
2858   reg96 = XINT (XCDR (reg_usage));
2859
2860   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2861     {
2862       Lisp_Object id;
2863       Lisp_Object reg;
2864       struct charset *charset;
2865
2866       id = XCAR (tail);
2867       charset = CHARSET_FROM_ID (XINT (id));
           /* An explicit entry in REQUEST takes precedence over the
              default values REG94 and REG96.  */
2868       reg = Fcdr (Fassq (id, request));
2869       if (! NILP (reg))
2870         SSET (safe_charsets, XINT (id), XINT (reg));
2871       else if (charset->iso_chars_96)
2872         {
2873           if (reg96 < 4)
2874             SSET (safe_charsets, XINT (id), reg96);
2875         }
2876       else
2877         {
2878           if (reg94 < 4)
2879             SSET (safe_charsets, XINT (id), reg94);
2880         }
2881     }
2882   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2883 }
2884
2885
2886 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2887    Return true if a text is encoded in one of ISO-2022 based coding
2888    systems.  */
     /* DETECT_INFO->checked gets CATEGORY_MASK_ISO.  The scan starts
        after the first CODING->head_ascii bytes of CODING->source and
        accumulates two local category masks, FOUND and REJECTED, from
        escape sequences, shift codes and 8-bit bytes.

        If every ISO category gets rejected during the scan, add
        CATEGORY_MASK_ISO to DETECT_INFO->rejected and return 0.
        Otherwise control reaches the `no_more_source' label (there is no
        explicit goto to it in this function; presumably ONE_MORE_BYTE
        jumps there when the source is exhausted), where REJECTED is
        merged into DETECT_INFO->rejected, the categories found but not
        rejected into DETECT_INFO->found, and 1 is returned.  */
2889
2890 static bool
2891 detect_coding_iso_2022 (struct coding_system *coding,
2892                         struct coding_detection_info *detect_info)
2893 {
2894   const unsigned char *src = coding->source, *src_base = src;
2895   const unsigned char *src_end = coding->source + coding->src_bytes;
2896   bool multibytep = coding->src_multibyte;
2897   bool single_shifting = 0;
2898   int id;
2899   int c, c1;
2900   ptrdiff_t consumed_chars = 0;
2901   int i;
2902   int rejected = 0;
2903   int found = 0;
       /* Number of characters seen since a composition start sequence,
          or -1 when not inside a composition.  */
2904   int composition_count = -1;
2905
2906   detect_info->checked |= CATEGORY_MASK_ISO;
2907
       /* Refresh the safe-charsets data of each ISO coding category so
          that SAFE_CHARSET_P can be used on them below.  */
2908   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2909     {
2910       struct coding_system *this = &(coding_categories[i]);
2911       Lisp_Object attrs, val;
2912
           /* Skip categories with a negative coding system ID.  */
2913       if (this->id < 0)
2914         continue;
2915       attrs = CODING_ID_ATTRS (this->id);
2916       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2917           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2918         setup_iso_safe_charsets (attrs);
2919       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2920       this->max_charset_id = SCHARS (val) - 1;
2921       this->safe_charsets = SDATA (val);
2922     }
2923
2924   /* A coding system of this category is always ASCII compatible.  */
2925   src += coding->head_ascii;
2926
2927   while (rejected != CATEGORY_MASK_ISO)
2928     {
2929       src_base = src;
2930       ONE_MORE_BYTE (c);
2931       switch (c)
2932         {
2933         case ISO_CODE_ESC:
2934           if (inhibit_iso_escape_detection)
2935             break;
2936           single_shifting = 0;
2937           ONE_MORE_BYTE (c);
2938           if (c == 'N' || c == 'O')
2939             {
2940               /* ESC <Fe> for SS2 or SS3.  */
2941               single_shifting = 1;
2942               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2943             }
2944           else if (c == '1')
2945             {
2946               /* End of composition.  */
2947               if (composition_count < 0
2948                   || composition_count > MAX_COMPOSITION_COMPONENTS)
2949                 /* Invalid */
2950                 break;
2951               composition_count = -1;
2952               found |= CATEGORY_MASK_ISO;
2953             }
2954           else if (c >= '0' && c <= '4')
2955             {
2956               /* ESC <Fp> for start/end composition.  */
                   /* '1' was handled above, so this is a start sequence:
                      begin counting the composed characters.  */
2957               composition_count = 0;
2958             }
2959           else
2960             {
2961               if (c >= '(' && c <= '/')
2962                 {
2963                   /* Designation sequence for a charset of dimension 1.  */
2964                   ONE_MORE_BYTE (c1);
                       /* The second table index (c >= ',') selects between
                          the intermediates '(' .. '+' and ',' .. '/'.  */
2965                   if (c1 < ' ' || c1 >= 0x80
2966                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2967                     /* Invalid designation sequence.  Just ignore.  */
2968                     break;
2969                 }
2970               else if (c == '$')
2971                 {
2972                   /* Designation sequence for a charset of dimension 2.  */
2973                   ONE_MORE_BYTE (c);
2974                   if (c >= '@' && c <= 'B')
2975                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2976                     id = iso_charset_table[1][0][c];
2977                   else if (c >= '(' && c <= '/')
2978                     {
2979                       ONE_MORE_BYTE (c1);
2980                       if (c1 < ' ' || c1 >= 0x80
2981                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2982                         /* Invalid designation sequence.  Just ignore.  */
2983                         break;
2984                     }
2985                   else
2986                     /* Invalid designation sequence.  Just ignore it.  */
2987                     break;
2988                 }
2989               else
2990                 {
2991                   /* Invalid escape sequence.  Just ignore it.  */
2992                   break;
2993                 }
2994
2995               /* We found a valid designation sequence for CHARSET.  */
                   /* Each of the four categories below is marked found if
                      the designated charset is safe for it, and rejected
                      otherwise.  */
2996               rejected |= CATEGORY_MASK_ISO_8BIT;
2997               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2998                                   id))
2999                 found |= CATEGORY_MASK_ISO_7;
3000               else
3001                 rejected |= CATEGORY_MASK_ISO_7;
3002               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3003                                   id))
3004                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3005               else
3006                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3007               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3008                                   id))
3009                 found |= CATEGORY_MASK_ISO_7_ELSE;
3010               else
3011                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3012               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3013                                   id))
3014                 found |= CATEGORY_MASK_ISO_8_ELSE;
3015               else
3016                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3017             }
3018           break;
3019
3020         case ISO_CODE_SO:
3021         case ISO_CODE_SI:
3022           /* Locking shift out/in.  */
3023           if (inhibit_iso_escape_detection)
3024             break;
3025           single_shifting = 0;
3026           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3027           break;
3028
3029         case ISO_CODE_CSI:
3030           /* Control sequence introducer.  */
3031           single_shifting = 0;
3032           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3033           found |= CATEGORY_MASK_ISO_8_ELSE;
               /* Continue at the label inside the `default' case.  */
3034           goto check_extra_latin;
3035
3036         case ISO_CODE_SS2:
3037         case ISO_CODE_SS3:
3038           /* Single shift.   */
3039           if (inhibit_iso_escape_detection)
3040             break;
3041           single_shifting = 0;
3042           rejected |= CATEGORY_MASK_ISO_7BIT;
3043           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3044               & CODING_ISO_FLAG_SINGLE_SHIFT)
3045             {
3046               found |= CATEGORY_MASK_ISO_8_1;
3047               single_shifting = 1;
3048             }
3049           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3050               & CODING_ISO_FLAG_SINGLE_SHIFT)
3051             {
3052               found |= CATEGORY_MASK_ISO_8_2;
3053               single_shifting = 1;
3054             }
3055           if (single_shifting)
3056             break;
               /* Neither 8-bit category uses single shift, so treat the
                  byte as a possible extra Latin code instead.  */
3057           goto check_extra_latin;
3058
3059         default:
               /* Presumably ONE_MORE_BYTE yields a negative value for
                  something other than a plain byte -- confirm against
                  its definition.  Such values are skipped.  */
3060           if (c < 0)
3061             continue;
3062           if (c < 0x80)
3063             {
3064               if (composition_count >= 0)
3065                 composition_count++;
3066               single_shifting = 0;
3067               break;
3068             }
3069           if (c >= 0xA0)
3070             {
3071               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3072               found |= CATEGORY_MASK_ISO_8_1;
3073               /* Check the length of succeeding codes of the range
3074                  0xA0..0xFF.  If the byte length is even, we include
3075                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3076                  only when we are not single shifting.  */
3077               if (! single_shifting
3078                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3079                 {
3080                   int len = 1;
3081                   while (src < src_end)
3082                     {
3083                       src_base = src;
3084                       ONE_MORE_BYTE (c);
3085                       if (c < 0xA0)
3086                         {
                               /* Put back the byte that ended the run.  */
3087                           src = src_base;
3088                           break;
3089                         }
3090                       len++;
3091                     }
3092
                       /* An odd-length run rejects the iso-8-2 category
                          only if it ended before the end of the source;
                          a run reaching the end is counted as found.  */
3093                   if (len & 1 && src < src_end)
3094                     {
3095                       rejected |= CATEGORY_MASK_ISO_8_2;
3096                       if (composition_count >= 0)
3097                         composition_count += len;
3098                     }
3099                   else
3100                     {
3101                       found |= CATEGORY_MASK_ISO_8_2;
3102                       if (composition_count >= 0)
3103                         composition_count += len / 2;
3104                     }
3105                 }
3106               break;
3107             }
               /* Reached by fall-through for bytes 0x80..0x9F without a
                  case of their own, and by goto from the CSI and SS2/SS3
                  cases.  Unless Vlatin_extra_code_table is a vector with
                  a non-nil entry for C, reject every ISO category.  */
3108         check_extra_latin:
3109           if (! VECTORP (Vlatin_extra_code_table)
3110               || NILP (AREF (Vlatin_extra_code_table, c)))
3111             {
3112               rejected = CATEGORY_MASK_ISO;
3113               break;
3114             }
3115           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3116               & CODING_ISO_FLAG_LATIN_EXTRA)
3117             found |= CATEGORY_MASK_ISO_8_1;
3118           else
3119             rejected |= CATEGORY_MASK_ISO_8_1;
3120           rejected |= CATEGORY_MASK_ISO_8_2;
3121           break;
3122         }
3123     }
       /* The loop ends only when all ISO categories have been rejected.  */
3124   detect_info->rejected |= CATEGORY_MASK_ISO;
3125   return 0;
3126
3127  no_more_source:
3128   detect_info->rejected |= rejected;
3129   detect_info->found |= (found & ~rejected);
3130   return 1;
3131 }
3132
3133
3134 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3135    escape sequence should be kept.  */
     /* REG is the graphic register being designated; DIM, CHARS_96 and
        FINAL are passed to ISO_CHARSET_TABLE to look up the charset ID.
        The macro uses a variable named `coding' from the enclosing scope
        (it is not a parameter), and CHARS_96 must be an lvalue because
        it is assigned to.

        If FINAL is below '0' or not below 128, the table has no charset
        for it (negative ID), or the charset is not safe for CODING, the
        designation of REG is set to -2, CHARS_96 to -1, and the `break'
        leaves the do-while body.

        Otherwise the charset ID is recorded as the designation of REG,
        with charset_jisx0201_roman replaced by charset_ascii under
        CODING_ISO_FLAG_USE_ROMAN and charset_jisx0208_1978 replaced by
        charset_jisx0208 under CODING_ISO_FLAG_USE_OLDJIS.  */
3136 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3137   do {                                                                  \
3138     int id, prev;                                                       \
3139                                                                         \
3140     if (final < '0' || final >= 128                                     \
3141         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3142         || !SAFE_CHARSET_P (coding, id))                                \
3143       {                                                                 \
3144         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3145         chars_96 = -1;                                                  \
3146         break;                                                          \
3147       }                                                                 \
3148     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3149     if (id == charset_jisx0201_roman)                                   \
3150       {                                                                 \
3151         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3152           id = charset_ascii;                                           \
3153       }                                                                 \
3154     else if (id == charset_jisx0208_1978)                               \
3155       {                                                                 \
3156         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3157           id = charset_jisx0208;                                        \
3158       }                                                                 \
3159     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3160     /* If there was an invalid designation to REG previously, and this  \
3161        designation is ASCII to REG, we should keep this designation     \
3162        sequence.  */                                                    \
3163     if (prev == -2 && id == charset_ascii)                              \
3164       chars_96 = -1;                                                    \
3165   } while (0)
3166
3167
3168 /* Handle these composition sequences (ALT: alternate char):
3169
3170    (1) relative composition: ESC 0 CHAR ... ESC 1
3171    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3172    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3173    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3174
3175    When the start sequence (ESC 0/2/3/4) is found, this annotation
3176    header is produced.
3177
3178         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3179
3180    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3181    produced until the end sequence (ESC 1) is found:
3182
3183    (1) CHAR ... CHAR
3184    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3185    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3186    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3187
3188    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3189    annotation header are updated as below:
3190
3191    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3192    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3193    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3194    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3195
3196    If an error is found while composing, the annotation header is
3197    changed to:
3198
3199         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3200
3201    and the sequence [ -2 DECODED-RULE ] is changed to the original
3202    byte sequence as below:
3203         o the original byte sequence is B: [ B -1 ]
3204         o the original byte sequence is B1 B2: [ B1 B2 ]
3205    and the sequence [ -1 -1 ] is changed to the original byte
3206    sequence:
3207         [ ESC '0' ]
3208 */
3209
/* Decode a composition rule whose first byte is C1 (a variable of the
   expansion site) and store the encoded rule in RULE, which must be
   an int lvalue.

   C1 - 32 below 81 is an old-format, one-byte rule.  Otherwise the
   rule is in the new format and one more byte is read from the source
   with ONE_MORE_BYTE; the encoded value then has 0x100 added so that
   it can be told apart from an old-format rule.

   If the rule is invalid, goto invalid_code.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int second_byte_;                                               \
                                                                        \
        ONE_MORE_BYTE (second_byte_);                                   \
        if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81,                 \
                                             second_byte_ - 32))        \
          goto invalid_code;                                            \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, second_byte_ - 32);  \
        rule += 0x100;   /* Distinguish it from the old format.  */     \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref_pt_ = (rule) / 9;                                      \
        int nref_pt_ = (rule) % 9;                                      \
                                                                        \
        /* The old format writes reference point 10 as 4.  */           \
        if (gref_pt_ == 4)                                              \
          gref_pt_ = 10;                                                \
        if (nref_pt_ == 4)                                              \
          nref_pt_ = 10;                                                \
        rule = COMPOSITION_ENCODE_RULE (gref_pt_, nref_pt_);            \
      }                                                                 \
  } while (0)
3238
/* Turn the encoded composition rule RULE back into the byte(s) it was
   decoded from and store them at charbuf[idx] and charbuf[idx + 1],
   replacing a [ -2 RULE ] pair.  `charbuf', `idx' and `new_chars'
   are variables of the expansion site.

   A value below 0x100 is an old-format rule: it becomes the single
   byte 32 + GREF * 9 + NREF (reference point 10 is written as 4)
   followed by the filler -1, and new_chars grows by one.  Any other
   value is a new-format rule: it becomes the two bytes 32 + 81 + GREF
   and 32 + NREF, and new_chars grows by two.

   RULE is evaluated exactly once, before charbuf is modified, so it
   may be any int expression, including charbuf[idx + 1] itself.  */

#define ENCODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    int encoded_rule_ = (rule);                                         \
    int gref = (encoded_rule_ % 0x100) / 12;                            \
    int nref = (encoded_rule_ % 0x100) % 12;                            \
                                                                        \
    if (encoded_rule_ < 0x100)  /* old format */                        \
      {                                                                 \
        if (gref == 10) gref = 4;                                       \
        if (nref == 10) nref = 4;                                       \
        charbuf[idx] = 32 + gref * 9 + nref;                            \
        charbuf[idx + 1] = -1;                                          \
        new_chars++;                                                    \
      }                                                                 \
    else                        /* new format */                        \
      {                                                                 \
        charbuf[idx] = 32 + 81 + gref;                                  \
        charbuf[idx + 1] = 32 + nref;                                   \
        new_chars += 2;                                                 \
      }                                                                 \
  } while (0)
3258
3259 /* Finish the current composition as invalid.  */
3260
3261 static int
3262 finish_composition (int *charbuf, struct composition_status *cmp_status)
3263 {
3264   int idx = - cmp_status->length;
3265   int new_chars;
3266
3267   /* Recover the original ESC sequence */
3268   charbuf[idx++] = ISO_CODE_ESC;
3269   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3270                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3271                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3272                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3273                     : '4');
3274   charbuf[idx++] = -2;
3275   charbuf[idx++] = 0;
3276   charbuf[idx++] = -1;
3277   new_chars = cmp_status->nchars;
3278   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3279     for (; idx < 0; idx++)
3280       {
3281         int elt = charbuf[idx];
3282
3283         if (elt == -2)
3284           {
3285             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3286             idx++;
3287           }
3288         else if (elt == -1)
3289           {
3290             charbuf[idx++] = ISO_CODE_ESC;
3291             charbuf[idx] = '0';
3292             new_chars += 2;
3293           }
3294       }
3295   cmp_status->state = COMPOSING_NO;
3296   return new_chars;
3297 }
3298
/* If a composition is in progress (cmp_status->state is not
   COMPOSING_NO), finish it with finish_composition and advance
   char_offset by the count that function returns.  `cmp_status',
   `charbuf' and `char_offset' are variables of the expansion site.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;            /* Nothing is being composed.  */       \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3305
/* Handle the composition start sequence ESC C1, where C1 is '0', '2',
   '3', or '4'.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1

   An ESC 0 that arrives while the alternate chars of an ESC 3 or
   ESC 4 composition are being read does not start a new composition:
   the marker [ -1 -1 ] is stored and the composed chars follow.

   Otherwise any composition in progress is finished as invalid, and
   ADD_COMPOSITION_DATA produces the annotation header of the new one:

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

   `cmp_status', `charbuf', `char_offset' and `coding' are variables
   of the expansion site.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    int alts_end_ = 0;                                                     \
                                                                           \
    if (c1 == '0')                                                         \
      {                                                                    \
        if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)               \
          alts_end_ = (cmp_status->state == COMPOSING_COMPONENT_CHAR);     \
        else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)     \
          alts_end_ = (cmp_status->state == COMPOSING_COMPONENT_RULE);     \
      }                                                                    \
                                                                           \
    if (alts_end_)                                                         \
      {                                                                    \
        /* End of the alternate chars.  */                                 \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        if (c1 == '0')                                                     \
          cmp_status->method = COMPOSITION_RELATIVE;                       \
        else if (c1 == '2')                                                \
          cmp_status->method = COMPOSITION_WITH_RULE;                      \
        else if (c1 == '3')                                                \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS;                  \
        else                                                               \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;             \
        /* ESC 3 and ESC 4 begin with alternate (component) chars.  */     \
        if (c1 <= '2')                                                     \
          cmp_status->state = COMPOSING_CHAR;                              \
        else                                                               \
          cmp_status->state = COMPOSING_COMPONENT_CHAR;                    \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->ncomps = 0;                                            \
        cmp_status->nchars = 0;                                            \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3346
3347
/* Handle the composition end sequence ESC 1.

   The sequence is invalid when no composed char has been stored, when
   a COMPOSITION_WITH_RULE composition is in state COMPOSING_CHAR, or
   when a composition of any other method is not in state
   COMPOSING_CHAR.  In that case finish the composition as invalid
   and goto invalid_code.

   Otherwise complete the annotation header, which starts
   cmp_status->length elements before charbuf: for the two ALTCHARS
   methods the first element (-LENGTH) is decreased by NCOMPS + 2 or
   NCOMPS * 3 respectively, and the third element is set to NCHARS.
   Then char_offset is advanced by NCHARS and the state becomes
   COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int in_char_state_ = (cmp_status->state == COMPOSING_CHAR);         \
    int rule_method_ = (cmp_status->method == COMPOSITION_WITH_RULE);   \
    int *header_;                                                       \
                                                                        \
    if (cmp_status->nchars == 0 || in_char_state_ == rule_method_)      \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    header_ = charbuf - cmp_status->length;                             \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      header_[0] -= cmp_status->ncomps + 2;                             \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      header_[0] -= cmp_status->ncomps * 3;                             \
    header_[2] = cmp_status->nchars;                                    \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3367
/* Append the pair [ -2 RULE ] to charbuf, add 2 to cmp_status->length
   and decrement cmp_status->state.  `charbuf' and `cmp_status' are
   variables of the expansion site.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = (rule);                \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    cmp_status->state--;                \
  } while (0)
3377
/* Append C, a composed char or a component (alternate) char, to
   charbuf and add 1 to cmp_status->length.  C counts as a composed
   char (NCHARS) when the state is COMPOSING_CHAR and as a component
   (NCOMPS) otherwise.

   The state is then incremented for a COMPOSITION_WITH_RULE
   composition, and for a COMPOSITION_WITH_RULE_ALTCHARS one that is
   in state COMPOSING_COMPONENT_CHAR -- presumably moving to the
   matching ..._RULE state, which STORE_COMPOSITION_RULE steps back
   from.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf = (c);                                                     \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps++;                                             \
    else                                                                \
      cmp_status->nchars++;                                             \
    if (cmp_status->method == COMPOSITION_WITH_RULE)                    \
      cmp_status->state++;                                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)          \
      cmp_status->state++;                                              \
  } while (0)
3394
3395
3396 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3397
3398 static void
3399 decode_coding_iso_2022 (struct coding_system *coding)
3400 {
3401   const unsigned char *src = coding->source + coding->consumed;
3402   const unsigned char *src_end = coding->source + coding->src_bytes;
3403   const unsigned char *src_base;
3404   int *charbuf = coding->charbuf + coding->charbuf_used;
3405   /* We may produce two annotations (charset and composition) in one
3406      loop and one more charset annotation at the end.  */
3407   int *charbuf_end
3408     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3409   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3410   bool multibytep = coding->src_multibyte;
3411   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3412   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3413   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3414   int charset_id_2, charset_id_3;
3415   struct charset *charset;
3416   int c;
3417   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3418   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3419   ptrdiff_t char_offset = coding->produced_char;
3420   ptrdiff_t last_offset = char_offset;
3421   int last_id = charset_ascii;
3422   bool eol_dos
3423     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3424   int byte_after_cr = -1;
3425   int i;
3426
3427   setup_iso_safe_charsets (attrs);
3428   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3429
3430   if (cmp_status->state != COMPOSING_NO)
3431     {
3432       if (charbuf_end - charbuf < cmp_status->length)
3433         emacs_abort ();
3434       for (i = 0; i < cmp_status->length; i++)
3435         *charbuf++ = cmp_status->carryover[i];
3436       coding->annotated = 1;
3437     }
3438
3439   while (1)
3440     {
3441       int c1, c2, c3;
3442
3443       src_base = src;
3444       consumed_chars_base = consumed_chars;
3445
3446       if (charbuf >= charbuf_end)
3447         {
3448           if (byte_after_cr >= 0)
3449             src_base--;
3450           break;
3451         }
3452
3453       if (byte_after_cr >= 0)
3454         c1 = byte_after_cr, byte_after_cr = -1;
3455       else
3456         ONE_MORE_BYTE (c1);
3457       if (c1 < 0)
3458         goto invalid_code;
3459
3460       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3461         {
3462           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3463           char_offset++;
3464           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3465           continue;
3466         }
3467
3468       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3469         {
3470           if (c1 == ISO_CODE_ESC)
3471             {
3472               if (src + 1 >= src_end)
3473                 goto no_more_source;
3474               *charbuf++ = ISO_CODE_ESC;
3475               char_offset++;
3476               if (src[0] == '%' && src[1] == '@')
3477                 {
3478                   src += 2;
3479                   consumed_chars += 2;
3480                   char_offset += 2;
3481                   /* We are sure charbuf can contain two more chars. */
3482                   *charbuf++ = '%';
3483                   *charbuf++ = '@';
3484                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3485                 }
3486             }
3487           else
3488             {
3489               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3490               char_offset++;
3491             }
3492           continue;
3493         }
3494
3495       if ((cmp_status->state == COMPOSING_RULE
3496            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3497           && c1 != ISO_CODE_ESC)
3498         {
3499           int rule;
3500
3501           DECODE_COMPOSITION_RULE (rule);
3502           STORE_COMPOSITION_RULE (rule);
3503           continue;
3504         }
3505
3506       /* We produce at most one character.  */
3507       switch (iso_code_class [c1])
3508         {
3509         case ISO_0x20_or_0x7F:
3510           if (charset_id_0 < 0
3511               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3512             /* This is SPACE or DEL.  */
3513             charset = CHARSET_FROM_ID (charset_ascii);
3514           else
3515             charset = CHARSET_FROM_ID (charset_id_0);
3516           break;
3517
3518         case ISO_graphic_plane_0:
3519           if (charset_id_0 < 0)
3520             charset = CHARSET_FROM_ID (charset_ascii);
3521           else
3522             charset = CHARSET_FROM_ID (charset_id_0);
3523           break;
3524
3525         case ISO_0xA0_or_0xFF:
3526           if (charset_id_1 < 0
3527               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3528               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3529             goto invalid_code;
3530           /* This is a graphic character, we fall down ... */
3531
3532         case ISO_graphic_plane_1:
3533           if (charset_id_1 < 0)
3534             goto invalid_code;
3535           charset = CHARSET_FROM_ID (charset_id_1);
3536           break;
3537
3538         case ISO_control_0:
3539           if (eol_dos && c1 == '\r')
3540             ONE_MORE_BYTE (byte_after_cr);
3541           MAYBE_FINISH_COMPOSITION ();
3542           charset = CHARSET_FROM_ID (charset_ascii);
3543           break;
3544
3545         case ISO_control_1:
3546           goto invalid_code;
3547
3548         case ISO_shift_out:
3549           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3550               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3551             goto invalid_code;
3552           CODING_ISO_INVOCATION (coding, 0) = 1;
3553           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3554           continue;
3555
3556         case ISO_shift_in:
3557           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3558             goto invalid_code;
3559           CODING_ISO_INVOCATION (coding, 0) = 0;
3560           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3561           continue;
3562
3563         case ISO_single_shift_2_7:
3564           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3565             goto invalid_code;
3566         case ISO_single_shift_2:
3567           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3568             goto invalid_code;
3569           /* SS2 is handled as an escape sequence of ESC 'N' */
3570           c1 = 'N';
3571           goto label_escape_sequence;
3572
3573         case ISO_single_shift_3:
3574           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3575             goto invalid_code;
3576           /* SS2 is handled as an escape sequence of ESC 'O' */
3577           c1 = 'O';
3578           goto label_escape_sequence;
3579
3580         case ISO_control_sequence_introducer:
3581           /* CSI is handled as an escape sequence of ESC '[' ...  */
3582           c1 = '[';
3583           goto label_escape_sequence;
3584
3585         case ISO_escape:
3586           ONE_MORE_BYTE (c1);
3587         label_escape_sequence:
3588           /* Escape sequences handled here are invocation,
3589              designation, direction specification, and character
3590              composition specification.  */
3591           switch (c1)
3592             {
3593             case '&':           /* revision of following character set */
3594               ONE_MORE_BYTE (c1);
3595               if (!(c1 >= '@' && c1 <= '~'))
3596                 goto invalid_code;
3597               ONE_MORE_BYTE (c1);
3598               if (c1 != ISO_CODE_ESC)
3599                 goto invalid_code;
3600               ONE_MORE_BYTE (c1);
3601               goto label_escape_sequence;
3602
3603             case '$':           /* designation of 2-byte character set */
3604               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3605                 goto invalid_code;
3606               {
3607                 int reg, chars96;
3608
3609                 ONE_MORE_BYTE (c1);
3610                 if (c1 >= '@' && c1 <= 'B')
3611                   {     /* designation of JISX0208.1978, GB2312.1980,
3612                            or JISX0208.1980 */
3613                     reg = 0, chars96 = 0;
3614                   }
3615                 else if (c1 >= 0x28 && c1 <= 0x2B)
3616                   { /* designation of DIMENSION2_CHARS94 character set */
3617                     reg = c1 - 0x28, chars96 = 0;
3618                     ONE_MORE_BYTE (c1);
3619                   }
3620                 else if (c1 >= 0x2C && c1 <= 0x2F)
3621                   { /* designation of DIMENSION2_CHARS96 character set */
3622                     reg = c1 - 0x2C, chars96 = 1;
3623                     ONE_MORE_BYTE (c1);
3624                   }
3625                 else
3626                   goto invalid_code;
3627                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3628                 /* We must update these variables now.  */
3629                 if (reg == 0)
3630                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3631                 else if (reg == 1)
3632                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3633                 if (chars96 < 0)
3634                   goto invalid_code;
3635               }
3636               continue;
3637
3638             case 'n':           /* invocation of locking-shift-2 */
3639               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3640                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3641                 goto invalid_code;
3642               CODING_ISO_INVOCATION (coding, 0) = 2;
3643               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3644               continue;
3645
3646             case 'o':           /* invocation of locking-shift-3 */
3647               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3648                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3649                 goto invalid_code;
3650               CODING_ISO_INVOCATION (coding, 0) = 3;
3651               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3652               continue;
3653
3654             case 'N':           /* invocation of single-shift-2 */
3655               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3656                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3657                 goto invalid_code;
3658               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3659               if (charset_id_2 < 0)
3660                 charset = CHARSET_FROM_ID (charset_ascii);
3661               else
3662                 charset = CHARSET_FROM_ID (charset_id_2);
3663               ONE_MORE_BYTE (c1);
3664               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3665                 goto invalid_code;
3666               break;
3667
3668             case 'O':           /* invocation of single-shift-3 */
3669               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3670                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3671                 goto invalid_code;
3672               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3673               if (charset_id_3 < 0)
3674                 charset = CHARSET_FROM_ID (charset_ascii);
3675               else
3676                 charset = CHARSET_FROM_ID (charset_id_3);
3677               ONE_MORE_BYTE (c1);
3678               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3679                 goto invalid_code;
3680               break;
3681
3682             case '0': case '2': case '3': case '4': /* start composition */
3683               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3684                 goto invalid_code;
3685               if (last_id != charset_ascii)
3686                 {
3687                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3688                   last_id = charset_ascii;
3689                   last_offset = char_offset;
3690                 }
3691               DECODE_COMPOSITION_START (c1);
3692               continue;
3693
3694             case '1':           /* end composition */
3695               if (cmp_status->state == COMPOSING_NO)
3696                 goto invalid_code;
3697               DECODE_COMPOSITION_END ();
3698               continue;
3699
3700             case '[':           /* specification of direction */
3701               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3702                 goto invalid_code;
3703               /* For the moment, nested direction is not supported.
3704                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3705                  left-to-right, and nonzero means right-to-left.  */
3706               ONE_MORE_BYTE (c1);
3707               switch (c1)
3708                 {
3709                 case ']':       /* end of the current direction */
3710                   coding->mode &= ~CODING_MODE_DIRECTION;
3711
3712                 case '0':       /* end of the current direction */
3713                 case '1':       /* start of left-to-right direction */
3714                   ONE_MORE_BYTE (c1);
3715                   if (c1 == ']')
3716                     coding->mode &= ~CODING_MODE_DIRECTION;
3717                   else
3718                     goto invalid_code;
3719                   break;
3720
3721                 case '2':       /* start of right-to-left direction */
3722                   ONE_MORE_BYTE (c1);
3723                   if (c1 == ']')
3724                     coding->mode |= CODING_MODE_DIRECTION;
3725                   else
3726                     goto invalid_code;
3727                   break;
3728
3729                 default:
3730                   goto invalid_code;
3731                 }
3732               continue;
3733
3734             case '%':
3735               ONE_MORE_BYTE (c1);
3736               if (c1 == '/')
3737                 {
3738                   /* CTEXT extended segment:
3739                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3740                      We keep these bytes as is for the moment.
3741                      They may be decoded by post-read-conversion.  */
3742                   int dim, M, L;
3743                   int size;
3744
3745                   ONE_MORE_BYTE (dim);
3746                   if (dim < '0' || dim > '4')
3747                     goto invalid_code;
3748                   ONE_MORE_BYTE (M);
3749                   if (M < 128)
3750                     goto invalid_code;
3751                   ONE_MORE_BYTE (L);
3752                   if (L < 128)
3753                     goto invalid_code;
3754                   size = ((M - 128) * 128) + (L - 128);
3755                   if (charbuf + 6 > charbuf_end)
3756                     goto break_loop;
3757                   *charbuf++ = ISO_CODE_ESC;
3758                   *charbuf++ = '%';
3759                   *charbuf++ = '/';
3760                   *charbuf++ = dim;
3761                   *charbuf++ = BYTE8_TO_CHAR (M);
3762                   *charbuf++ = BYTE8_TO_CHAR (L);
3763                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3764                 }
3765               else if (c1 == 'G')
3766                 {
3767                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3768                      ESC % G --UTF-8-BYTES-- ESC % @
3769                      We keep these bytes as is for the moment.
3770                      They may be decoded by post-read-conversion.  */
3771                   if (charbuf + 3 > charbuf_end)
3772                     goto break_loop;
3773                   *charbuf++ = ISO_CODE_ESC;
3774                   *charbuf++ = '%';
3775                   *charbuf++ = 'G';
3776                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3777                 }
3778               else
3779                 goto invalid_code;
3780               continue;
3781               break;
3782
3783             default:
3784               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3785                 goto invalid_code;
3786               {
3787                 int reg, chars96;
3788
3789                 if (c1 >= 0x28 && c1 <= 0x2B)
3790                   { /* designation of DIMENSION1_CHARS94 character set */
3791                     reg = c1 - 0x28, chars96 = 0;
3792                     ONE_MORE_BYTE (c1);
3793                   }
3794                 else if (c1 >= 0x2C && c1 <= 0x2F)
3795                   { /* designation of DIMENSION1_CHARS96 character set */
3796                     reg = c1 - 0x2C, chars96 = 1;
3797                     ONE_MORE_BYTE (c1);
3798                   }
3799                 else
3800                   goto invalid_code;
3801                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3802                 /* We must update these variables now.  */
3803                 if (reg == 0)
3804                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3805                 else if (reg == 1)
3806                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3807                 if (chars96 < 0)
3808                   goto invalid_code;
3809               }
3810               continue;
3811             }
3812           break;
3813
3814         default:
3815           emacs_abort ();
3816         }
3817
3818       if (cmp_status->state == COMPOSING_NO
3819           && charset->id != charset_ascii
3820           && last_id != charset->id)
3821         {
3822           if (last_id != charset_ascii)
3823             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3824           last_id = charset->id;
3825           last_offset = char_offset;
3826         }
3827
3828       /* Now we know CHARSET and 1st position code C1 of a character.
3829          Produce a decoded character while getting 2nd and 3rd
3830          position codes C2, C3 if necessary.  */
3831       if (CHARSET_DIMENSION (charset) > 1)
3832         {
3833           ONE_MORE_BYTE (c2);
3834           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3835               || ((c1 & 0x80) != (c2 & 0x80)))
3836             /* C2 is not in a valid range.  */
3837             goto invalid_code;
3838           if (CHARSET_DIMENSION (charset) == 2)
3839             c1 = (c1 << 8) | c2;
3840           else
3841             {
3842               ONE_MORE_BYTE (c3);
3843               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3844                   || ((c1 & 0x80) != (c3 & 0x80)))
3845                 /* C3 is not in a valid range.  */
3846                 goto invalid_code;
3847               c1 = (c1 << 16) | (c2 << 8) | c2;
3848             }
3849         }
3850       c1 &= 0x7F7F7F;
3851       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3852       if (c < 0)
3853         {
3854           MAYBE_FINISH_COMPOSITION ();
3855           for (; src_base < src; src_base++, char_offset++)
3856             {
3857               if (ASCII_BYTE_P (*src_base))
3858                 *charbuf++ = *src_base;
3859               else
3860                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3861             }
3862         }
3863       else if (cmp_status->state == COMPOSING_NO)
3864         {
3865           *charbuf++ = c;
3866           char_offset++;
3867         }
3868       else if ((cmp_status->state == COMPOSING_CHAR
3869                 ? cmp_status->nchars
3870                 : cmp_status->ncomps)
3871                >= MAX_COMPOSITION_COMPONENTS)
3872         {
3873           /* Too long composition.  */
3874           MAYBE_FINISH_COMPOSITION ();
3875           *charbuf++ = c;
3876           char_offset++;
3877         }
3878       else
3879         STORE_COMPOSITION_CHAR (c);
3880       continue;
3881
3882     invalid_code:
3883       MAYBE_FINISH_COMPOSITION ();
3884       src = src_base;
3885       consumed_chars = consumed_chars_base;
3886       ONE_MORE_BYTE (c);
3887       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3888       char_offset++;
3889       coding->errors++;
3890       continue;
3891
3892     break_loop:
3893       break;
3894     }
3895
3896  no_more_source:
3897   if (cmp_status->state != COMPOSING_NO)
3898     {
3899       if (coding->mode & CODING_MODE_LAST_BLOCK)
3900         MAYBE_FINISH_COMPOSITION ();
3901       else
3902         {
3903           charbuf -= cmp_status->length;
3904           for (i = 0; i < cmp_status->length; i++)
3905             cmp_status->carryover[i] = charbuf[i];
3906         }
3907     }
3908   else if (last_id != charset_ascii)
3909     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3910   coding->consumed_char += consumed_chars_base;
3911   coding->consumed = src_base - coding->source;
3912   coding->charbuf_used = charbuf - coding->charbuf;
3913 }
3914
3915
3916 /* ISO2022 encoding stuff.  */
3917
/*
   Saying just "ISO2022" is not enough to describe an encoding; more
   details must be specified.  In Emacs, each coding system of the
   ISO2022 variant has the following specifications:
        1. Initial designation to G0 through G3.
3923         2. Allows short-form designation?
3924         3. ASCII should be designated to G0 before control characters?
3925         4. ASCII should be designated to G0 at end of line?
3926         5. 7-bit environment or 8-bit environment?
3927         6. Use locking-shift?
3928         7. Use Single-shift?
3929    And the following two are only for Japanese:
3930         8. Use ASCII in place of JIS0201-1976-Roman?
3931         9. Use JISX0208-1983 in place of JISX0208-1978?
3932    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3933    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3934    details.
3935 */
3936
/* Emit at DST, advancing DST, the escape sequence that designates
   CHARSET to graphic register REG, then record that designation in
   CODING.

   When CODING has the REVISION flag and CHARSET carries a
   non-negative revision number, the sequence is prefixed with
   `ESC & <'@' + revision>'.  For a charset of dimension 1 the
   sequence is `ESC <intermediate> <final-char>'.  Otherwise it is
   `ESC $ [<intermediate>] <final-char>', where the intermediate
   character is left out (short form) only for a 94-charset
   designated to register 0 whose <final-char> is '@', 'A', or 'B',
   and only when CODING does not have the LONG_FORM flag.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate characters, indexed by graphic register.  */        \
    const char *const inter_94 = "()*+";                                \
    const char *const inter_96 = ",-./";                                \
    const char *const inter                                             \
      = (CHARSET_ISO_CHARS_96 (charset)) ? inter_96 : inter_94;         \
    unsigned char final_byte = CHARSET_ISO_FINAL (charset);             \
    int rev = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)   \
               ? CHARSET_ISO_REVISION (charset)                         \
               : -1);                                                   \
                                                                        \
    if (rev >= 0)                                                       \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + rev);                                      \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) == 1)                               \
      EMIT_ONE_ASCII_BYTE (inter[reg]);                                 \
    else                                                                \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* The intermediate character is skipped only in the            \
           short-form case described above.  */                         \
        if ((CHARSET_ISO_CHARS_96 (charset))                            \
            || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM    \
            || reg != 0                                                 \
            || final_byte < '@' || final_byte > 'B')                    \
          EMIT_ONE_ASCII_BYTE (inter[reg]);                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (final_byte);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3984
3985
/* ENCODE_SINGLE_SHIFT_2 and ENCODE_SINGLE_SHIFT_3 produce the code
   (a control character or an escape sequence) for the ISO2022
   single-shift functions single-shift-2 and single-shift-3.

   ENCODE_SINGLE_SHIFT_2 emits `ESC N' when CODING has the SEVEN_BITS
   flag and the single byte ISO_CODE_SS2 otherwise, then marks CODING
   as being in the single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3998
3999
/* Emit the single-shift-3 function: `ESC O' when CODING has the
   SEVEN_BITS flag, the single byte ISO_CODE_SS3 otherwise.  Then
   mark CODING as being in the single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4008
4009
/* ENCODE_SHIFT_IN, ENCODE_SHIFT_OUT, ENCODE_LOCKING_SHIFT_2, and
   ENCODE_LOCKING_SHIFT_3 produce the code (a control character or an
   escape sequence) for the ISO2022 locking-shift functions shift-in,
   shift-out, locking-shift-2, and locking-shift-3.  Each one also
   records in CODING which graphic register is now invoked to graphic
   plane 0.

   ENCODE_SHIFT_IN emits ISO_CODE_SI and records register 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4019
4020
/* Emit the shift-out control character ISO_CODE_SO and record that
   graphic register 1 is now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4026
4027
/* Emit the locking-shift-2 escape sequence `ESC n' and record that
   graphic register 2 is now invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4033
4034
/* Emit the locking-shift-3 escape sequence `ESC o' and record that
   graphic register 3 is now invoked to graphic plane 0.

   The final byte must be 'o': `ESC n' is locking-shift-2, so emitting
   it here would make a reader invoke G2 while CODING records G3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4040
4041
/* Emit the single byte for a DIMENSION1 character of CHARSET whose
   position-code is C1, first emitting any designation and invocation
   sequences that are needed.

   If CODING has the USE_ROMAN flag, ASCII is replaced by
   JISX0201-Roman (CHARSET itself is reassigned).  The macro loops:
   as long as CHARSET is neither single-shifted nor invoked to graphic
   plane 0 or 1, encode_invocation_designation is called and the
   checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_ascii                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN))     \
      {                                                                 \
        cs_id = charset_jisx0201_roman;                                 \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift was just emitted; this byte uses it up.  */   \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
        else                                                            \
          EMIT_ONE_BYTE (c1 | 0x80);                                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        /* Invoked to graphic plane 0: emit with the high bit off.  */  \
        EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        /* Invoked to graphic plane 1: emit with the high bit on.  */   \
        EMIT_ONE_BYTE (c1 | 0x80);                                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not yet invoked to any graphic plane.  Invoke it      \
       (designating it to a graphic register first if necessary),       \
       then go around the loop again to emit the character.  */         \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4084
4085
/* Emit the two bytes for a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, first emitting any designation and
   invocation sequences that are needed.

   If CODING has the USE_OLDJIS flag, JISX0208 is replaced by
   JISX0208-1978 (CHARSET itself is reassigned).  The macro loops:
   as long as CHARSET is neither single-shifted nor invoked to graphic
   plane 0 or 1, encode_invocation_designation is called and the
   checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift was just emitted; these bytes use it up.  */  \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        /* Invoked to graphic plane 0: emit with the high bits off.  */ \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        /* Invoked to graphic plane 1: emit with the high bits on.  */  \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not yet invoked to any graphic plane.  Invoke it      \
       (designating it to a graphic register first if necessary),       \
       then go around the loop again to emit the character.  */         \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4128
4129
/* Encode character C of CHARSET: obtain its code point with
   CODING_ENCODE_CHAR, then emit it through the DIMENSION1 macro when
   CHARSET has dimension 1, and through the DIMENSION2 macro (high
   part `code >> 8', low byte `code & 0xFF') otherwise.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8,             \
                                         code & 0xFF);                     \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                 \
      }                                                                    \
  } while (0)
4140
4141
4142 /* Produce designation and invocation codes at a place pointed by DST
4143    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4144    Return new DST.  */
4145
4146 static unsigned char *
4147 encode_invocation_designation (struct charset *charset,
4148                                struct coding_system *coding,
4149                                unsigned char *dst, ptrdiff_t *p_nchars)
4150 {
4151   bool multibytep = coding->dst_multibyte;
4152   ptrdiff_t produced_chars = *p_nchars;
4153   int reg;                      /* graphic register number */
4154   int id = CHARSET_ID (charset);
4155
4156   /* At first, check designations.  */
4157   for (reg = 0; reg < 4; reg++)
4158     if (id == CODING_ISO_DESIGNATION (coding, reg))
4159       break;
4160
4161   if (reg >= 4)
4162     {
4163       /* CHARSET is not yet designated to any graphic registers.  */
4164       /* At first check the requested designation.  */
4165       reg = CODING_ISO_REQUEST (coding, id);
4166       if (reg < 0)
4167         /* Since CHARSET requests no special designation, designate it
4168            to graphic register 0.  */
4169         reg = 0;
4170
4171       ENCODE_DESIGNATION (charset, reg, coding);
4172     }
4173
4174   if (CODING_ISO_INVOCATION (coding, 0) != reg
4175       && CODING_ISO_INVOCATION (coding, 1) != reg)
4176     {
4177       /* Since the graphic register REG is not invoked to any graphic
4178          planes, invoke it to graphic plane 0.  */
4179       switch (reg)
4180         {
4181         case 0:                 /* graphic register 0 */
4182           ENCODE_SHIFT_IN;
4183           break;
4184
4185         case 1:                 /* graphic register 1 */
4186           ENCODE_SHIFT_OUT;
4187           break;
4188
4189         case 2:                 /* graphic register 2 */
4190           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4191             ENCODE_SINGLE_SHIFT_2;
4192           else
4193             ENCODE_LOCKING_SHIFT_2;
4194           break;
4195
4196         case 3:                 /* graphic register 3 */
4197           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4198             ENCODE_SINGLE_SHIFT_3;
4199           else
4200             ENCODE_LOCKING_SHIFT_3;
4201           break;
4202         }
4203     }
4204
4205   *p_nchars = produced_chars;
4206   return dst;
4207 }
4208
4209
/* Emit the invocation and designation codes that bring the graphic
   planes and registers back to their initial state: a shift-in if
   graphic plane 0 does not currently invoke register 0, then a
   designation for every register whose initial charset is defined
   (non-negative) and differs from what is currently designated.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
    struct charset *charset;                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        int initial = CODING_ISO_INITIAL (coding, reg);                 \
                                                                        \
        if (initial < 0                                                 \
            || CODING_ISO_DESIGNATION (coding, reg) == initial)         \
          continue;                                                     \
        charset = CHARSET_FROM_ID (initial);                            \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4228
4229
4230 /* Produce designation sequences of charsets in the line started from
4231    CHARBUF to a place pointed by DST, and return the number of
4232    produced bytes.  DST should not directly point a buffer text area
4233    which may be relocated by char_charset call.
4234
4235    If the current block ends before any end-of-line, we may fail to
4236    find all the necessary designations.  */
4237
4238 static ptrdiff_t
4239 encode_designation_at_bol (struct coding_system *coding,
4240                            int *charbuf, int *charbuf_end,
4241                            unsigned char *dst)
4242 {
4243   unsigned char *orig = dst;
4244   struct charset *charset;
4245   /* Table of charsets to be designated to each graphic register.  */
4246   int r[4];
4247   int c, found = 0, reg;
4248   ptrdiff_t produced_chars = 0;
4249   bool multibytep = coding->dst_multibyte;
4250   Lisp_Object attrs;
4251   Lisp_Object charset_list;
4252
4253   attrs = CODING_ID_ATTRS (coding->id);
4254   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4255   if (EQ (charset_list, Qiso_2022))
4256     charset_list = Viso_2022_charset_list;
4257
4258   for (reg = 0; reg < 4; reg++)
4259     r[reg] = -1;
4260
4261   while (charbuf < charbuf_end && found < 4)
4262     {
4263       int id;
4264
4265       c = *charbuf++;
4266       if (c == '\n')
4267         break;
4268       charset = char_charset (c, charset_list, NULL);
4269       id = CHARSET_ID (charset);
4270       reg = CODING_ISO_REQUEST (coding, id);
4271       if (reg >= 0 && r[reg] < 0)
4272         {
4273           found++;
4274           r[reg] = id;
4275         }
4276     }
4277
4278   if (found)
4279     {
4280       for (reg = 0; reg < 4; reg++)
4281         if (r[reg] >= 0
4282             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4283           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4284     }
4285
4286   return dst - orig;
4287 }
4288
4289 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4290
4291 static bool
4292 encode_coding_iso_2022 (struct coding_system *coding)
4293 {
4294   bool multibytep = coding->dst_multibyte;
4295   int *charbuf = coding->charbuf;
4296   int *charbuf_end = charbuf + coding->charbuf_used;
4297   unsigned char *dst = coding->destination + coding->produced;
4298   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4299   int safe_room = 16;
4300   bool bol_designation
4301     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4302        && CODING_ISO_BOL (coding));
4303   ptrdiff_t produced_chars = 0;
4304   Lisp_Object attrs, eol_type, charset_list;
4305   bool ascii_compatible;
4306   int c;
4307   int preferred_charset_id = -1;
4308
4309   CODING_GET_INFO (coding, attrs, charset_list);
4310   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4311   if (VECTORP (eol_type))
4312     eol_type = Qunix;
4313
4314   setup_iso_safe_charsets (attrs);
4315   /* Charset list may have been changed.  */
4316   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4317   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4318
4319   ascii_compatible
4320     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4321        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4322                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4323
4324   while (charbuf < charbuf_end)
4325     {
4326       ASSURE_DESTINATION (safe_room);
4327
4328       if (bol_designation)
4329         {
4330           /* We have to produce designation sequences if any now.  */
4331           unsigned char desig_buf[16];
4332           int nbytes;
4333           ptrdiff_t offset;
4334
4335           charset_map_loaded = 0;
4336           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4337                                               desig_buf);
4338           if (charset_map_loaded
4339               && (offset = coding_change_destination (coding)))
4340             {
4341               dst += offset;
4342               dst_end += offset;
4343             }
4344           memcpy (dst, desig_buf, nbytes);
4345           dst += nbytes;
4346           /* We are sure that designation sequences are all ASCII bytes.  */
4347           produced_chars += nbytes;
4348           bol_designation = 0;
4349           ASSURE_DESTINATION (safe_room);
4350         }
4351
4352       c = *charbuf++;
4353
4354       if (c < 0)
4355         {
4356           /* Handle an annotation.  */
4357           switch (*charbuf)
4358             {
4359             case CODING_ANNOTATE_COMPOSITION_MASK:
4360               /* Not yet implemented.  */
4361               break;
4362             case CODING_ANNOTATE_CHARSET_MASK:
4363               preferred_charset_id = charbuf[2];
4364               if (preferred_charset_id >= 0
4365                   && NILP (Fmemq (make_number (preferred_charset_id),
4366                                   charset_list)))
4367                 preferred_charset_id = -1;
4368               break;
4369             default:
4370               emacs_abort ();
4371             }
4372           charbuf += -c - 1;
4373           continue;
4374         }
4375
4376       /* Now encode the character C.  */
4377       if (c < 0x20 || c == 0x7F)
4378         {
4379           if (c == '\n'
4380               || (c == '\r' && EQ (eol_type, Qmac)))
4381             {
4382               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4383                 ENCODE_RESET_PLANE_AND_REGISTER ();
4384               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4385                 {
4386                   int i;
4387
4388                   for (i = 0; i < 4; i++)
4389                     CODING_ISO_DESIGNATION (coding, i)
4390                       = CODING_ISO_INITIAL (coding, i);
4391                 }
4392               bol_designation = ((CODING_ISO_FLAGS (coding)
4393                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4394                                  != 0);
4395             }
4396           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4397             ENCODE_RESET_PLANE_AND_REGISTER ();
4398           EMIT_ONE_ASCII_BYTE (c);
4399         }
4400       else if (ASCII_CHAR_P (c))
4401         {
4402           if (ascii_compatible)
4403             EMIT_ONE_ASCII_BYTE (c);
4404           else
4405             {
4406               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4407               ENCODE_ISO_CHARACTER (charset, c);
4408             }
4409         }
4410       else if (CHAR_BYTE8_P (c))
4411         {
4412           c = CHAR_TO_BYTE8 (c);
4413           EMIT_ONE_BYTE (c);
4414         }
4415       else
4416         {
4417           struct charset *charset;
4418
4419           if (preferred_charset_id >= 0)
4420             {
4421               bool result;
4422
4423               charset = CHARSET_FROM_ID (preferred_charset_id);
4424               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4425               if (! result)
4426                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4427                                      NULL, charset);
4428             }
4429           else
4430             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4431                                  NULL, charset);
4432           if (!charset)
4433             {
4434               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4435                 {
4436                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4437                   charset = CHARSET_FROM_ID (charset_ascii);
4438                 }
4439               else
4440                 {
4441                   c = coding->default_char;
4442                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4443                                        charset_list, NULL, charset);
4444                 }
4445             }
4446           ENCODE_ISO_CHARACTER (charset, c);
4447         }
4448     }
4449
4450   if (coding->mode & CODING_MODE_LAST_BLOCK
4451       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4452     {
4453       ASSURE_DESTINATION (safe_room);
4454       ENCODE_RESET_PLANE_AND_REGISTER ();
4455     }
4456   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4457   CODING_ISO_BOL (coding) = bol_designation;
4458   coding->produced_char += produced_chars;
4459   coding->produced = dst - coding->destination;
4460   return 0;
4461 }
4462
4463 \f
/*** 8. SJIS and BIG5 handlers ***/
4465
4466 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4467    quite widely.  So, for the moment, Emacs supports them in the bare
4468    C code.  But, in the future, they may be supported only by CCL.  */
4469
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   as "position-code + 0x80".  A character of charset
   japanese-jisx0208 is encoded in two bytes, but its two
   position-codes are divided and shifted so that they fit in the
   ranges below.
4476
4477    --- CODE RANGE of SJIS ---
4478    (character set)      (range)
4479    ASCII                0x00 .. 0x7F
4480    KATAKANA-JISX0201    0xA0 .. 0xDF
4481    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4482             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4483    -------------------------------
4484
4485 */
4486
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set, and each of its characters is encoded in two bytes.
4490
4491    --- CODE RANGE of BIG5 ---
4492    (character set)      (range)
4493    ASCII                0x00 .. 0x7F
4494    Big5 (1st byte)      0xA1 .. 0xFE
4495         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4496    --------------------------
4497
4498   */
4499
4500 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4501    Return true if a text is encoded in SJIS.

        Scanning starts CODING->head_ascii bytes into the source.
        CATEGORY_MASK_SJIS is always added to DETECT_INFO->checked.  If a
        byte that does not fit the ranges tested below is met, or if the
        source runs out in the middle of a sequence while
        CODING_MODE_LAST_BLOCK is set, CATEGORY_MASK_SJIS is added to
        DETECT_INFO->rejected and false is returned.  Otherwise true is
        returned, and CATEGORY_MASK_SJIS is added to DETECT_INFO->found
        only when at least one non-ASCII sequence was accepted.  */
4502
4503 static bool
4504 detect_coding_sjis (struct coding_system *coding,
4505                     struct coding_detection_info *detect_info)
4506 {
       /* NOTE(review): src_end, multibytep and consumed_chars are not
          referenced directly below; presumably ONE_MORE_BYTE (defined
          elsewhere) uses them by name -- confirm before renaming.  */
4507   const unsigned char *src = coding->source, *src_base;
4508   const unsigned char *src_end = coding->source + coding->src_bytes;
4509   bool multibytep = coding->src_multibyte;
4510   ptrdiff_t consumed_chars = 0;
4511   int found = 0;
4512   int c;
4513   Lisp_Object attrs, charset_list;
4514   int max_first_byte_of_2_byte_code;
4515
4516   CODING_GET_INFO (coding, attrs, charset_list);
       /* A charset list longer than three entries widens the accepted
          lead-byte range from 0xE0..0xEF to 0xE0..0xFC.  */
4517   max_first_byte_of_2_byte_code
4518     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4519
4520   detect_info->checked |= CATEGORY_MASK_SJIS;
4521   /* A coding system of this category is always ASCII compatible.  */
4522   src += coding->head_ascii;
4523
4524   while (1)
4525     {
           /* Remember where this sequence starts so that the code at
              no_more_source can tell a clean end from a truncated
              sequence.  */
4526       src_base = src;
4527       ONE_MORE_BYTE (c);
4528       if (c < 0x80)
4529         continue;
4530       if ((c >= 0x81 && c <= 0x9F)
4531           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4532         {
               /* Lead byte of a two-byte code; check the trail byte.  */
4533           ONE_MORE_BYTE (c);
4534           if (c < 0x40 || c == 0x7F || c > 0xFC)
4535             break;
4536           found = CATEGORY_MASK_SJIS;
4537         }
4538       else if (c >= 0xA0 && c < 0xE0)
             /* Single-byte code in the range 0xA0..0xDF.  */
4539         found = CATEGORY_MASK_SJIS;
4540       else
4541         break;
4542     }
       /* Reached only through the `break's above: the text is not SJIS.  */
4543   detect_info->rejected |= CATEGORY_MASK_SJIS;
4544   return 0;
4545
       /* No explicit goto in this function targets this label; it is
          presumably where ONE_MORE_BYTE jumps when the source is
          exhausted -- confirm against the macro definition.  */
4546  no_more_source:
4547   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4548     {
           /* Part of a sequence was consumed and this is the last block,
              so the text ends with a truncated code.  */
4549       detect_info->rejected |= CATEGORY_MASK_SJIS;
4550       return 0;
4551     }
4552   detect_info->found |= found;
4553   return 1;
4554 }
4555
4556 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4557    Return true if a text is encoded in BIG5.

        Scanning starts CODING->head_ascii bytes into the source.
        CATEGORY_MASK_BIG5 is always added to DETECT_INFO->checked.  If a
        lead or trail byte falls outside the ranges tested below, or if
        the source runs out in the middle of a sequence while
        CODING_MODE_LAST_BLOCK is set, CATEGORY_MASK_BIG5 is added to
        DETECT_INFO->rejected and false is returned.  Otherwise true is
        returned, and CATEGORY_MASK_BIG5 is added to DETECT_INFO->found
        only when at least one two-byte sequence was accepted.  */
4558
4559 static bool
4560 detect_coding_big5 (struct coding_system *coding,
4561                     struct coding_detection_info *detect_info)
4562 {
       /* NOTE(review): src_end, multibytep and consumed_chars are not
          referenced directly below; presumably ONE_MORE_BYTE (defined
          elsewhere) uses them by name -- confirm before renaming.  */
4563   const unsigned char *src = coding->source, *src_base;
4564   const unsigned char *src_end = coding->source + coding->src_bytes;
4565   bool multibytep = coding->src_multibyte;
4566   ptrdiff_t consumed_chars = 0;
4567   int found = 0;
4568   int c;
4569
4570   detect_info->checked |= CATEGORY_MASK_BIG5;
4571   /* A coding system of this category is always ASCII compatible.  */
4572   src += coding->head_ascii;
4573
4574   while (1)
4575     {
           /* Remember where this sequence starts so that the code at
              no_more_source can tell a clean end from a truncated
              sequence.  */
4576       src_base = src;
4577       ONE_MORE_BYTE (c);
4578       if (c < 0x80)
4579         continue;
4580       if (c >= 0xA1)
4581         {
               /* Lead byte of a two-byte code; check the trail byte.  */
4582           ONE_MORE_BYTE (c);
4583           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
                 /* Leave through the common rejection code below so that
                    an invalid trail byte is recorded in
                    DETECT_INFO->rejected just like an invalid lead byte.
                    Returning directly here left the category neither
                    found nor rejected.  */
4584             break;
4585           found = CATEGORY_MASK_BIG5;
4586         }
4587       else
4588         break;
4589     }
       /* Reached only through the `break's above: the text is not BIG5.  */
4590   detect_info->rejected |= CATEGORY_MASK_BIG5;
4591   return 0;
4592
       /* No explicit goto in this function targets this label; it is
          presumably where ONE_MORE_BYTE jumps when the source is
          exhausted -- confirm against the macro definition.  */
4593  no_more_source:
4594   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4595     {
           /* Part of a sequence was consumed and this is the last block,
              so the text ends with a truncated code.  */
4596       detect_info->rejected |= CATEGORY_MASK_BIG5;
4597       return 0;
4598     }
4599   detect_info->found |= found;
4600   return 1;
4601 }
4602
4603 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Decode the SJIS bytes of CODING->source that follow
        CODING->consumed into character codes appended to CODING->charbuf.
        The coding system's charset list supplies, in order, the roman,
        kana and kanji charsets and an optional second kanji charset.  A
        byte that does not start a valid code is stored by itself (see the
        invalid_code label) and counted in CODING->errors.  On exit
        CODING->consumed_char, CODING->consumed and CODING->charbuf_used
        are updated.  */
4604
4605 static void
4606 decode_coding_sjis (struct coding_system *coding)
4607 {
4608   const unsigned char *src = coding->source + coding->consumed;
4609   const unsigned char *src_end = coding->source + coding->src_bytes;
4610   const unsigned char *src_base;
4611   int *charbuf = coding->charbuf + coding->charbuf_used;
4612   /* We may produce one charset annotation in one loop and one more at
4613      the end.  */
4614   int *charbuf_end
4615     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4616   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4617   bool multibytep = coding->src_multibyte;
4618   struct charset *charset_roman, *charset_kanji, *charset_kana;
4619   struct charset *charset_kanji2;
4620   Lisp_Object attrs, charset_list, val;
4621   ptrdiff_t char_offset = coding->produced_char;
4622   ptrdiff_t last_offset = char_offset;
4623   int last_id = charset_ascii;
4624   bool eol_dos
4625     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_DOS is set, or -1 when no
          such byte is pending.  */
4626   int byte_after_cr = -1;
4627
4628   CODING_GET_INFO (coding, attrs, charset_list);
4629
       /* Pick the charsets off the list in order; the fourth entry is
          optional.  */
4630   val = charset_list;
4631   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4632   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4633   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4634   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4635
4636   while (1)
4637     {
4638       int c, c1;
4639       struct charset *charset;
4640
           /* Start of the code being decoded.  CODING->consumed is
              computed from SRC_BASE on exit, so a code that was only
              partially read is left unconsumed.  */
4641       src_base = src;
4642       consumed_chars_base = consumed_chars;
4643
4644       if (charbuf >= charbuf_end)
4645         {
               /* Output space is exhausted.  A pending look-ahead byte was
                  read but not decoded, so step back over it.  */
4646           if (byte_after_cr >= 0)
4647             src_base--;
4648           break;
4649         }
4650
4651       if (byte_after_cr >= 0)
4652         c = byte_after_cr, byte_after_cr = -1;
4653       else
4654         ONE_MORE_BYTE (c);
4655       if (c < 0)
4656         goto invalid_code;
4657       if (c < 0x80)
4658         {
               /* With DOS EOL, read one byte past a CR right away; that
                  byte is decoded on the next iteration.  */
4659           if (eol_dos && c == '\r')
4660             ONE_MORE_BYTE (byte_after_cr);
4661           charset = charset_roman;
4662         }
4663       else if (c == 0x80 || c == 0xA0)
4664         goto invalid_code;
4665       else if (c >= 0xA1 && c <= 0xDF)
4666         {
4667           /* SJIS -> JISX0201-Kana */
4668           c &= 0x7F;
4669           charset = charset_kana;
4670         }
4671       else if (c <= 0xEF)
4672         {
               /* Here C is in 0x81..0x9F or 0xE0..0xEF.  */
4673           /* SJIS -> JISX0208 */
4674           ONE_MORE_BYTE (c1);
4675           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4676             goto invalid_code;
4677           c = (c << 8) | c1;
4678           SJIS_TO_JIS (c);
4679           charset = charset_kanji;
4680         }
4681       else if (c <= 0xFC && charset_kanji2)
4682         {
               /* Here C is in 0xF0..0xFC and a fourth charset exists.  */
4683           /* SJIS -> JISX0213-2 */
4684           ONE_MORE_BYTE (c1);
4685           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4686             goto invalid_code;
4687           c = (c << 8) | c1;
4688           SJIS_TO_JIS2 (c);
4689           charset = charset_kanji2;
4690         }
4691       else
4692         goto invalid_code;
           /* On a switch to a different non-ASCII charset, hand the
              length and id of the previous non-ASCII run to
              ADD_CHARSET_DATA (defined elsewhere) and start a new run.  */
4693       if (charset->id != charset_ascii
4694           && last_id != charset->id)
4695         {
4696           if (last_id != charset_ascii)
4697             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4698           last_id = charset->id;
4699           last_offset = char_offset;
4700         }
4701       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4702       *charbuf++ = c;
4703       char_offset++;
4704       continue;
4705
         /* Rewind to the start of the offending code and store only its
            first byte: a negative C is stored negated, any other value
            goes through BYTE8_TO_CHAR.  */
4706     invalid_code:
4707       src = src_base;
4708       consumed_chars = consumed_chars_base;
4709       ONE_MORE_BYTE (c);
4710       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4711       char_offset++;
4712       coding->errors++;
4713     }
4714
       /* Reached by falling out of the loop above and, presumably, from
          ONE_MORE_BYTE when the source is exhausted (no explicit goto
          targets this label) -- confirm against the macro.  */
4715  no_more_source:
4716   if (last_id != charset_ascii)
4717     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4718   coding->consumed_char += consumed_chars_base;
4719   coding->consumed = src_base - coding->source;
4720   coding->charbuf_used = charbuf - coding->charbuf;
4721 }
4722
4723 static void
4724 decode_coding_big5 (struct coding_system *coding)
4725 {
       /* Decode the BIG5 bytes of CODING->source that follow
          CODING->consumed into character codes appended to
          CODING->charbuf.  The coding system's charset list supplies the
          roman charset first and the Big5 charset second.  A byte that
          does not start a valid code is stored by itself (see the
          invalid_code label) and counted in CODING->errors.  On exit
          CODING->consumed_char, CODING->consumed and CODING->charbuf_used
          are updated.  */
4726   const unsigned char *src = coding->source + coding->consumed;
4727   const unsigned char *src_end = coding->source + coding->src_bytes;
4728   const unsigned char *src_base;
4729   int *charbuf = coding->charbuf + coding->charbuf_used;
4730   /* We may produce one charset annotation in one loop and one more at
4731      the end.  */
4732   int *charbuf_end
4733     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4734   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4735   bool multibytep = coding->src_multibyte;
4736   struct charset *charset_roman, *charset_big5;
4737   Lisp_Object attrs, charset_list, val;
4738   ptrdiff_t char_offset = coding->produced_char;
4739   ptrdiff_t last_offset = char_offset;
4740   int last_id = charset_ascii;
4741   bool eol_dos
4742     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_DOS is set, or -1 when no
          such byte is pending.  */
4743   int byte_after_cr = -1;
4744
4745   CODING_GET_INFO (coding, attrs, charset_list);
4746   val = charset_list;
4747   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4748   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4749
4750   while (1)
4751     {
4752       int c, c1;
4753       struct charset *charset;
4754
           /* Start of the code being decoded.  CODING->consumed is
              computed from SRC_BASE on exit, so a code that was only
              partially read is left unconsumed.  */
4755       src_base = src;
4756       consumed_chars_base = consumed_chars;
4757
4758       if (charbuf >= charbuf_end)
4759         {
               /* Output space is exhausted.  A pending look-ahead byte was
                  read but not decoded, so step back over it.  */
4760           if (byte_after_cr >= 0)
4761             src_base--;
4762           break;
4763         }
4764
4765       if (byte_after_cr >= 0)
4766         c = byte_after_cr, byte_after_cr = -1;
4767       else
4768         ONE_MORE_BYTE (c);
4769
4770       if (c < 0)
4771         goto invalid_code;
4772       if (c < 0x80)
4773         {
               /* With DOS EOL, read one byte past a CR right away; that
                  byte is decoded on the next iteration.  */
4774           if (eol_dos && c == '\r')
4775             ONE_MORE_BYTE (byte_after_cr);
4776           charset = charset_roman;
4777         }
4778       else
4779         {
4780           /* BIG5 -> Big5 */
               /* Lead byte must be 0xA1..0xFE; trail byte must be
                  0x40..0x7E or 0xA1..0xFE.  */
4781           if (c < 0xA1 || c > 0xFE)
4782             goto invalid_code;
4783           ONE_MORE_BYTE (c1);
4784           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4785             goto invalid_code;
4786           c = c << 8 | c1;
4787           charset = charset_big5;
4788         }
           /* On a switch to a different non-ASCII charset, hand the
              length and id of the previous non-ASCII run to
              ADD_CHARSET_DATA (defined elsewhere) and start a new run.  */
4789       if (charset->id != charset_ascii
4790           && last_id != charset->id)
4791         {
4792           if (last_id != charset_ascii)
4793             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4794           last_id = charset->id;
4795           last_offset = char_offset;
4796         }
4797       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4798       *charbuf++ = c;
4799       char_offset++;
4800       continue;
4801
         /* Rewind to the start of the offending code and store only its
            first byte: a negative C is stored negated, any other value
            goes through BYTE8_TO_CHAR.  */
4802     invalid_code:
4803       src = src_base;
4804       consumed_chars = consumed_chars_base;
4805       ONE_MORE_BYTE (c);
4806       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4807       char_offset++;
4808       coding->errors++;
4809     }
4810
       /* Reached by falling out of the loop above and, presumably, from
          ONE_MORE_BYTE when the source is exhausted (no explicit goto
          targets this label) -- confirm against the macro.  */
4811  no_more_source:
4812   if (last_id != charset_ascii)
4813     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4814   coding->consumed_char += consumed_chars_base;
4815   coding->consumed = src_base - coding->source;
4816   coding->charbuf_used = charbuf - coding->charbuf;
4817 }
4818
4819 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4820    This function can encode charsets `ascii', `katakana-jisx0201',
4821    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4822    are sure that all these charsets are registered as official charset
4823    (i.e. do not have extended leading-codes).  Characters of other
4824    charsets are produced without any encoding.

        NOTE(review): the charset names above do not obviously match the
        code below, which takes its kana and kanji charsets (and an
        optional second kanji charset) from the second and later entries
        of the coding system's charset list -- confirm and refresh.

        Encode the characters in CODING->charbuf as SJIS bytes appended to
        CODING->destination.  The visible exit path records
        CODING_RESULT_SUCCESS, updates CODING->produced_char and
        CODING->produced, and returns 0.  */
4825
4826 static bool
4827 encode_coding_sjis (struct coding_system *coding)
4828 {
4829   bool multibytep = coding->dst_multibyte;
4830   int *charbuf = coding->charbuf;
4831   int *charbuf_end = charbuf + coding->charbuf_used;
4832   unsigned char *dst = coding->destination + coding->produced;
4833   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4834   int safe_room = 4;
4835   ptrdiff_t produced_chars = 0;
4836   Lisp_Object attrs, charset_list, val;
4837   bool ascii_compatible;
4838   struct charset *charset_kanji, *charset_kana;
4839   struct charset *charset_kanji2;
4840   int c;
4841
4842   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first entry of the list; the following ones are kana,
          kanji and, optionally, a second kanji charset.  */
4843   val = XCDR (charset_list);
4844   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4845   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4846   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4847
4848   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4849
4850   while (charbuf < charbuf_end)
4851     {
4852       ASSURE_DESTINATION (safe_room);
4853       c = *charbuf++;
4854       /* Now encode the character C.  */
4855       if (ASCII_CHAR_P (c) && ascii_compatible)
4856         EMIT_ONE_ASCII_BYTE (c);
4857       else if (CHAR_BYTE8_P (c))
4858         {
               /* A raw-byte character is written back as its byte.  */
4859           c = CHAR_TO_BYTE8 (c);
4860           EMIT_ONE_BYTE (c);
4861         }
4862       else
4863         {
4864           unsigned code;
4865           struct charset *charset;
4866           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4867                                &code, charset);
4868
4869           if (!charset)
4870             {
                   /* C belongs to none of the charsets.  In safe-encoding
                      mode emit CODING_INHIBIT_CHARACTER_SUBSTITUTION as an
                      ASCII code; otherwise encode CODING->default_char
                      instead.  */
4871               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4872                 {
4873                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4874                   charset = CHARSET_FROM_ID (charset_ascii);
4875                 }
4876               else
4877                 {
4878                   c = coding->default_char;
4879                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4880                                        charset_list, &code, charset);
4881                 }
4882             }
4883           if (code == CHARSET_INVALID_CODE (charset))
4884             emacs_abort ();
4885           if (charset == charset_kanji)
4886             {
4887               int c1, c2;
4888               JIS_TO_SJIS (code);
4889               c1 = code >> 8, c2 = code & 0xFF;
4890               EMIT_TWO_BYTES (c1, c2);
4891             }
4892           else if (charset == charset_kana)
4893             EMIT_ONE_BYTE (code | 0x80);
4894           else if (charset_kanji2 && charset == charset_kanji2)
4895             {
4896               int c1, c2;
4897
                   /* Only codes whose high byte is one of the values
                      tested here are converted with JIS_TO_SJIS2; any
                      other code is emitted as a single 7-bit byte.  */
4898               c1 = code >> 8;
4899               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4900                   || c1 == 0x28
4901                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4902                 {
4903                   JIS_TO_SJIS2 (code);
4904                   c1 = code >> 8, c2 = code & 0xFF;
4905                   EMIT_TWO_BYTES (c1, c2);
4906                 }
4907               else
4908                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4909             }
4910           else
4911             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4912         }
4913     }
4914   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4915   coding->produced_char += produced_chars;
4916   coding->produced = dst - coding->destination;
4917   return 0;
4918 }
4919
4920 static bool
4921 encode_coding_big5 (struct coding_system *coding)
4922 {
       /* Encode the characters in CODING->charbuf as BIG5 bytes appended
          to CODING->destination.  The Big5 charset is the second entry of
          the coding system's charset list.  The visible exit path records
          CODING_RESULT_SUCCESS, updates CODING->produced_char and
          CODING->produced, and returns 0.  */
4923   bool multibytep = coding->dst_multibyte;
4924   int *charbuf = coding->charbuf;
4925   int *charbuf_end = charbuf + coding->charbuf_used;
4926   unsigned char *dst = coding->destination + coding->produced;
4927   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4928   int safe_room = 4;
4929   ptrdiff_t produced_chars = 0;
4930   Lisp_Object attrs, charset_list, val;
4931   bool ascii_compatible;
4932   struct charset *charset_big5;
4933   int c;
4934
4935   CODING_GET_INFO (coding, attrs, charset_list);
4936   val = XCDR (charset_list);
4937   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4938   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4939
4940   while (charbuf < charbuf_end)
4941     {
4942       ASSURE_DESTINATION (safe_room);
4943       c = *charbuf++;
4944       /* Now encode the character C.  */
4945       if (ASCII_CHAR_P (c) && ascii_compatible)
4946         EMIT_ONE_ASCII_BYTE (c);
4947       else if (CHAR_BYTE8_P (c))
4948         {
               /* A raw-byte character is written back as its byte.  */
4949           c = CHAR_TO_BYTE8 (c);
4950           EMIT_ONE_BYTE (c);
4951         }
4952       else
4953         {
4954           unsigned code;
4955           struct charset *charset;
4956           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4957                                &code, charset);
4958
4959           if (! charset)
4960             {
                   /* C belongs to none of the charsets.  In safe-encoding
                      mode emit CODING_INHIBIT_CHARACTER_SUBSTITUTION as an
                      ASCII code; otherwise encode CODING->default_char
                      instead.  */
4961               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4962                 {
4963                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4964                   charset = CHARSET_FROM_ID (charset_ascii);
4965                 }
4966               else
4967                 {
4968                   c = coding->default_char;
4969                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4970                                        charset_list, &code, charset);
4971                 }
4972             }
4973           if (code == CHARSET_INVALID_CODE (charset))
4974             emacs_abort ();
4975           if (charset == charset_big5)
4976             {
4977               int c1, c2;
4978
                   /* The 16-bit code is written high byte first.  */
4979               c1 = code >> 8, c2 = code & 0xFF;
4980               EMIT_TWO_BYTES (c1, c2);
4981             }
4982           else
4983             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4984         }
4985     }
4986   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4987   coding->produced_char += produced_chars;
4988   coding->produced = dst - coding->destination;
4989   return 0;
4990 }
4991
4992 \f
4993 /*** 9. CCL handlers ***/
4994
4995 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4996    Return true if a text is encoded in a coding system of which
4997    encoder/decoder are written in CCL program.

        The bytes come from the CODING passed in, but the table of valid
        bytes and the attributes come from
        coding_categories[coding_category_ccl].  CATEGORY_MASK_CCL is
        always added to DETECT_INFO->checked.  A byte that is negative or
        whose table entry is zero makes the function add
        CATEGORY_MASK_CCL to DETECT_INFO->rejected and return false.
        Otherwise true is returned, and CATEGORY_MASK_CCL is added to
        DETECT_INFO->found only if some byte had a table entry greater
        than 1.  */
4998
4999 static bool
5000 detect_coding_ccl (struct coding_system *coding,
5001                    struct coding_detection_info *detect_info)
5002 {
5003   const unsigned char *src = coding->source, *src_base;
5004   const unsigned char *src_end = coding->source + coding->src_bytes;
5005   bool multibytep = coding->src_multibyte;
5006   ptrdiff_t consumed_chars = 0;
5007   int found = 0;
5008   unsigned char *valids;
       /* Saved now because CODING is repointed below.  */
5009   ptrdiff_t head_ascii = coding->head_ascii;
5010   Lisp_Object attrs;
5011
5012   detect_info->checked |= CATEGORY_MASK_CCL;
5013
       /* From here on CODING refers to the CCL category's coding system,
          not to the caller's.  */
5014   coding = &coding_categories[coding_category_ccl];
5015   valids = CODING_CCL_VALIDS (coding);
5016   attrs = CODING_ID_ATTRS (coding->id);
       /* The leading bytes are skipped only when the coding system is
          marked ASCII compatible.  */
5017   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5018     src += head_ascii;
5019
5020   while (1)
5021     {
5022       int c;
5023
5024       src_base = src;
5025       ONE_MORE_BYTE (c);
5026       if (c < 0 || ! valids[c])
5027         break;
5028       if ((valids[c] > 1))
5029         found = CATEGORY_MASK_CCL;
5030     }
5031   detect_info->rejected |= CATEGORY_MASK_CCL;
5032   return 0;
5033
       /* No explicit goto in this function targets this label; it is
          presumably where ONE_MORE_BYTE jumps when the source is
          exhausted -- confirm against the macro definition.  */
5034  no_more_source:
5035   detect_info->found |= found;
5036   return 1;
5037 }
5038
5039 static void
5040 decode_coding_ccl (struct coding_system *coding)
5041 {
       /* Decode CODING->source, starting at CODING->consumed, by running
          the coding system's CCL program over it in chunks of at most
          1024 input elements, appending the output to CODING->charbuf.
          The final CCL status is translated into a conversion result, and
          CODING->consumed_char, CODING->consumed and CODING->charbuf_used
          are updated.  */
5042   const unsigned char *src = coding->source + coding->consumed;
5043   const unsigned char *src_end = coding->source + coding->src_bytes;
5044   int *charbuf = coding->charbuf + coding->charbuf_used;
5045   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5046   ptrdiff_t consumed_chars = 0;
5047   bool multibytep = coding->src_multibyte;
5048   struct ccl_program *ccl = &coding->spec.ccl->ccl;
       /* Input elements handed to ccl_driver for one chunk.  */
5049   int source_charbuf[1024];
       /* For multibyte sources, byte offset from SRC of each element of
          SOURCE_CHARBUF, plus one extra entry holding the offset just
          past the last element.  */
5050   int source_byteidx[1025];
5051   Lisp_Object attrs, charset_list;
5052
5053   CODING_GET_INFO (coding, attrs, charset_list);
5054
5055   while (1)
5056     {
5057       const unsigned char *p = src;
5058       ptrdiff_t offset;
5059       int i = 0;
5060
           /* Fill SOURCE_CHARBUF with characters (multibyte source) or
              with plain bytes (unibyte source).  */
5061       if (multibytep)
5062         {
5063           while (i < 1024 && p < src_end)
5064             {
5065               source_byteidx[i] = p - src;
5066               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5067             }
5068           source_byteidx[i] = p - src;
5069         }
5070       else
5071         while (i < 1024 && p < src_end)
5072           source_charbuf[i++] = *p++;
5073
           /* Tell the CCL program when this chunk reaches the end of the
              final block.  */
5074       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5075         ccl->last_block = 1;
5076       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5077       charset_map_loaded = 0;
5078       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5079                   charset_list);
           /* If a charset map was loaded and the source moved, shift all
              pointers into the source by the same offset.  */
5080       if (charset_map_loaded
5081           && (offset = coding_change_source (coding)))
5082         {
5083           p += offset;
5084           src += offset;
5085           src_end += offset;
5086         }
5087       charbuf += ccl->produced;
           /* Advance SRC by the bytes that correspond to the elements the
              CCL program consumed.  */
5088       if (multibytep)
5089         src += source_byteidx[ccl->consumed];
5090       else
5091         src += ccl->consumed;
5092       consumed_chars += ccl->consumed;
           /* Stop once all of the source has been handed over, or when
              the program stopped for a reason other than running out of
              input.  */
5093       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5094         break;
5095     }
5096
5097   switch (ccl->status)
5098     {
5099     case CCL_STAT_SUSPEND_BY_SRC:
5100       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5101       break;
5102     case CCL_STAT_SUSPEND_BY_DST:
5103       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5104       break;
5105     case CCL_STAT_QUIT:
5106     case CCL_STAT_INVALID_CMD:
5107       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5108       break;
5109     default:
5110       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5111       break;
5112     }
5113   coding->consumed_char += consumed_chars;
5114   coding->consumed = src - coding->source;
5115   coding->charbuf_used = charbuf - coding->charbuf;
5116 }
5117
5118 static bool
5119 encode_coding_ccl (struct coding_system *coding)
5120 {
       /* Encode the characters in CODING->charbuf by running the coding
          system's CCL program over them, at most 1024 output elements
          per call, and append the low 8 bits of each output element to
          CODING->destination.  The final CCL status is translated into a
          conversion result, CODING->produced_char and CODING->produced
          are updated, and the visible exit path returns 0.  */
5121   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5122   bool multibytep = coding->dst_multibyte;
5123   int *charbuf = coding->charbuf;
5124   int *charbuf_end = charbuf + coding->charbuf_used;
5125   unsigned char *dst = coding->destination + coding->produced;
5126   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Output of one ccl_driver call.  */
5127   int destination_charbuf[1024];
5128   ptrdiff_t produced_chars = 0;
5129   int i;
5130   Lisp_Object attrs, charset_list;
5131
5132   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program when this call handles the end of the
          final block.  */
5133   if (coding->consumed_char == coding->src_chars
5134       && coding->mode & CODING_MODE_LAST_BLOCK)
5135     ccl->last_block = 1;
5136
5137   do
5138     {
5139       ptrdiff_t offset;
5140
5141       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5142       charset_map_loaded = 0;
5143       ccl_driver (ccl, charbuf, destination_charbuf,
5144                   charbuf_end - charbuf, 1024, charset_list);
           /* If the destination moved, shift both the write pointer and
              the end pointer.  DST_END is compared against DST by the
              ASSURE_DESTINATION calls just below, so leaving it at the
              old location would make that check meaningless.  This
              mirrors how decode_coding_ccl shifts SRC_END.  */
5145       if (charset_map_loaded
5146           && (offset = coding_change_destination (coding)))
5147         dst += offset, dst_end += offset;
5148       if (multibytep)
5149         {
               /* Twice the element count is requested here; presumably a
                  byte may need two bytes in a multibyte destination --
                  confirm against EMIT_ONE_BYTE.  */
5150           ASSURE_DESTINATION (ccl->produced * 2);
5151           for (i = 0; i < ccl->produced; i++)
5152             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5153         }
5154       else
5155         {
5156           ASSURE_DESTINATION (ccl->produced);
5157           for (i = 0; i < ccl->produced; i++)
5158             *dst++ = destination_charbuf[i] & 0xFF;
5159           produced_chars += ccl->produced;
5160         }
5161       charbuf += ccl->consumed;
5162       if (ccl->status == CCL_STAT_QUIT
5163           || ccl->status == CCL_STAT_INVALID_CMD)
5164         break;
5165     }
5166   while (charbuf < charbuf_end);
5167
5168   switch (ccl->status)
5169     {
5170     case CCL_STAT_SUSPEND_BY_SRC:
5171       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5172       break;
5173     case CCL_STAT_SUSPEND_BY_DST:
5174       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5175       break;
5176     case CCL_STAT_QUIT:
5177     case CCL_STAT_INVALID_CMD:
5178       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5179       break;
5180     default:
5181       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5182       break;
5183     }
5184
5185   coding->produced_char += produced_chars;
5186   coding->produced = dst - coding->destination;
5187   return 0;
5188 }
5189
5190 \f
5191 /*** 10, 11. no-conversion handlers ***/
5192
5193 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Raw text is not converted here: the function only sets
        CODING->chars_at_source and reports the whole source as consumed.
        The one exception is a source ending in CR while DOS end-of-line
        conversion is active; that CR is left unconsumed and
        CODING_RESULT_INSUFFICIENT_SRC is recorded.  In every other case
        CODING_RESULT_SUCCESS is recorded.  */
5194
5195 static void
5196 decode_coding_raw_text (struct coding_system *coding)
5197 {
5198   bool eol_dos
5199     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5200
5201   coding->chars_at_source = 1;
5202   coding->consumed_char = coding->src_chars;
5203   coding->consumed = coding->src_bytes;
       /* Look at the last byte only if there is one: with an empty
          source, index src_bytes - 1 would be before the start of the
          source.  */
5204   if (eol_dos && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5205     {
5206       coding->consumed_char--;
5207       coding->consumed--;
5208       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5209     }
5210   else
5211     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5212 }
5213
5214 static bool
5215 encode_coding_raw_text (struct coding_system *coding)
5216 {
       /* Copy the elements of CODING->charbuf to CODING->destination
          without charset conversion.  There are four paths, selected by
          whether the destination and the source are multibyte.  The
          visible exit path records CODING_RESULT_SUCCESS, updates
          CODING->produced_char and CODING->produced, and returns 0.  */
5217   bool multibytep = coding->dst_multibyte;
5218   int *charbuf = coding->charbuf;
5219   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5220   unsigned char *dst = coding->destination + coding->produced;
5221   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5222   ptrdiff_t produced_chars = 0;
5223   int c;
5224
5225   if (multibytep)
5226     {
5227       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5228
5229       if (coding->src_multibyte)
             /* Multibyte source to multibyte destination.  */
5230         while (charbuf < charbuf_end)
5231           {
5232             ASSURE_DESTINATION (safe_room);
5233             c = *charbuf++;
5234             if (ASCII_CHAR_P (c))
5235               EMIT_ONE_ASCII_BYTE (c);
5236             else if (CHAR_BYTE8_P (c))
5237               {
5238                 c = CHAR_TO_BYTE8 (c);
5239                 EMIT_ONE_BYTE (c);
5240               }
5241             else
5242               {
                     /* Write the character's multibyte form into STR,
                        then emit each of its bytes in turn.  */
5243                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5244
5245                 CHAR_STRING_ADVANCE (c, p1);
5246                 do
5247                   {
5248                     EMIT_ONE_BYTE (*p0);
5249                     p0++;
5250                   }
5251                 while (p0 < p1);
5252               }
5253           }
5254       else
             /* Unibyte source to multibyte destination: every element is
                emitted as a byte.  */
5255         while (charbuf < charbuf_end)
5256           {
5257             ASSURE_DESTINATION (safe_room);
5258             c = *charbuf++;
5259             EMIT_ONE_BYTE (c);
5260           }
5261     }
5262   else
5263     {
5264       if (coding->src_multibyte)
5265         {
               /* Multibyte source to unibyte destination.  */
5266           int safe_room = MAX_MULTIBYTE_LENGTH;
5267
5268           while (charbuf < charbuf_end)
5269             {
5270               ASSURE_DESTINATION (safe_room);
5271               c = *charbuf++;
5272               if (ASCII_CHAR_P (c))
5273                 *dst++ = c;
5274               else if (CHAR_BYTE8_P (c))
5275                 *dst++ = CHAR_TO_BYTE8 (c);
5276               else
5277                 CHAR_STRING_ADVANCE (c, dst);
5278             }
5279         }
5280       else
5281         {
               /* Unibyte source to unibyte destination: ask for room for
                  all remaining elements at once, then copy one byte per
                  element.  */
5282           ASSURE_DESTINATION (charbuf_end - charbuf);
5283           while (charbuf < charbuf_end && dst < dst_end)
5284             *dst++ = *charbuf++;
5285         }
           /* For a unibyte destination the count produced is the number
              of bytes written by this call.  */
5286       produced_chars = dst - (coding->destination + coding->produced);
5287     }
5288   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5289   coding->produced_char += produced_chars;
5290   coding->produced = dst - coding->destination;
5291   return 0;
5292 }
5293
5294 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5295    Return true if a text is encoded in a charset-based coding system.

        The bytes come from the CODING passed in, but the attributes and
        the table of valid bytes come from
        coding_categories[coding_category_charset].  For each first byte
        the table entry is nil (byte not allowed), a charset id, or a list
        of charset ids; the following bytes are checked against the code
        space of the charset(s).  CATEGORY_MASK_CHARSET is always added to
        DETECT_INFO->checked.  When a check fails, or a multi-byte code is
        cut off by the end of the source, CATEGORY_MASK_CHARSET is added
        to DETECT_INFO->rejected and false is returned.  Otherwise true is
        returned, and CATEGORY_MASK_CHARSET is added to DETECT_INFO->found
        only if some accepted first byte was >= 0x80.  */
5296
5297 static bool
5298 detect_coding_charset (struct coding_system *coding,
5299                        struct coding_detection_info *detect_info)
5300 {
5301   const unsigned char *src = coding->source, *src_base;
5302   const unsigned char *src_end = coding->source + coding->src_bytes;
5303   bool multibytep = coding->src_multibyte;
5304   ptrdiff_t consumed_chars = 0;
5305   Lisp_Object attrs, valids, name;
5306   int found = 0;
       /* Saved now because CODING is repointed below.  */
5307   ptrdiff_t head_ascii = coding->head_ascii;
5308   bool check_latin_extra = 0;
5309
5310   detect_info->checked |= CATEGORY_MASK_CHARSET;
5311
       /* From here on CODING refers to the charset category's coding
          system, not to the caller's.  */
5312   coding = &coding_categories[coding_category_charset];
5313   attrs = CODING_ID_ATTRS (coding->id);
5314   valids = AREF (attrs, coding_attr_charset_valids);
5315   name = CODING_ID_NAME (coding->id);
       /* Coding systems named iso-8859-* or iso-latin-* get the extra
          check on bytes 0x80..0x9F below.  */
5316   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5317                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5318       || strncmp (SSDATA (SYMBOL_NAME (name)),
5319                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5320     check_latin_extra = 1;
5321
5322   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5323     src += head_ascii;
5324
5325   while (1)
5326     {
5327       int c;
5328       Lisp_Object val;
5329       struct charset *charset;
5330       int dim, idx;
5331
5332       src_base = src;
5333       ONE_MORE_BYTE (c);
5334       if (c < 0)
5335         continue;
5336       val = AREF (valids, c);
5337       if (NILP (val))
5338         break;
5339       if (c >= 0x80)
5340         {
               /* A byte in 0x80..0x9F is accepted for the Latin coding
                  systems only if Vlatin_extra_code_table is a vector
                  with a non-nil entry for it.  */
5341           if (c < 0xA0
5342               && check_latin_extra
5343               && (!VECTORP (Vlatin_extra_code_table)
5344                   || NILP (AREF (Vlatin_extra_code_table, c))))
5345             break;
5346           found = CATEGORY_MASK_CHARSET;
5347         }
5348       if (INTEGERP (val))
5349         {
               /* A single charset: each following byte must lie within
                  the bounds the charset's code space gives for it.  */
5350           charset = CHARSET_FROM_ID (XFASTINT (val));
5351           dim = CHARSET_DIMENSION (charset);
5352           for (idx = 1; idx < dim; idx++)
5353             {
5354               if (src == src_end)
5355                 goto too_short;
5356               ONE_MORE_BYTE (c);
5357               if (c < charset->code_space[(dim - 1 - idx) * 4]
5358                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5359                 break;
5360             }
5361           if (idx < dim)
5362             break;
5363         }
5364       else
5365         {
               /* Whether some charset of the list accepted the bytes.
                  This used to be signalled by setting VAL to nil and
                  then testing CONSP (VAL), but VAL is also a non-cons
                  when the list is simply exhausted, so the rejection
                  below could never trigger.  */
               bool matched = 0;

               /* A list of charsets: try them in order.  IDX is not
                  reset between charsets, so bytes already accepted are
                  not read again.  */
5366           idx = 1;
5367           for (; CONSP (val); val = XCDR (val))
5368             {
5369               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5370               dim = CHARSET_DIMENSION (charset);
5371               while (idx < dim)
5372                 {
5373                   if (src == src_end)
5374                     goto too_short;
5375                   ONE_MORE_BYTE (c);
5376                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5377                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5378                     break;
5379                   idx++;
5380                 }
5381               if (idx == dim)
5382                 {
5383                   matched = 1;
5384                   break;
5385                 }
5386             }
               /* No charset of the list fits: reject.  */
5387           if (! matched)
5388             break;
5389         }
5390     }
       /* Reached through the `break's above and through the explicit
          gotos for a code cut off by the end of the source.  */
5391  too_short:
5392   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5393   return 0;
5394
       /* No explicit goto in this function targets this label; it is
          presumably where ONE_MORE_BYTE jumps when the source is
          exhausted -- confirm against the macro definition.  */
5395  no_more_source:
5396   detect_info->found |= found;
5397   return 1;
5398 }
5399
5400 static void
5401 decode_coding_charset (struct coding_system *coding)
5402 {
5403   const unsigned char *src = coding->source + coding->consumed;
5404   const unsigned char *src_end = coding->source + coding->src_bytes;
5405   const unsigned char *src_base;
5406   int *charbuf = coding->charbuf + coding->charbuf_used;
5407   /* We may produce one charset annotation in one loop and one more at
5408      the end.  */
5409   int *charbuf_end
5410     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5411   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5412   bool multibytep = coding->src_multibyte;
5413   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5414   Lisp_Object valids;
5415   ptrdiff_t char_offset = coding->produced_char;
5416   ptrdiff_t last_offset = char_offset;
5417   int last_id = charset_ascii;
5418   bool eol_dos
5419     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5420   int byte_after_cr = -1;
5421
5422   valids = AREF (attrs, coding_attr_charset_valids);
5423
5424   while (1)
5425     {
5426       int c;
5427       Lisp_Object val;
5428       struct charset *charset;
5429       int dim;
5430       int len = 1;
5431       unsigned code;
5432
5433       src_base = src;
5434       consumed_chars_base = consumed_chars;
5435
5436       if (charbuf >= charbuf_end)
5437         {
5438           if (byte_after_cr >= 0)
5439             src_base--;
5440           break;
5441         }
5442
5443       if (byte_after_cr >= 0)
5444         {
5445           c = byte_after_cr;
5446           byte_after_cr = -1;
5447         }
5448       else
5449         {
5450           ONE_MORE_BYTE (c);
5451           if (eol_dos && c == '\r')
5452             ONE_MORE_BYTE (byte_after_cr);
5453         }
5454       if (c < 0)
5455         goto invalid_code;
5456       code = c;
5457
5458       val = AREF (valids, c);
5459       if (! INTEGERP (val) && ! CONSP (val))
5460         goto invalid_code;
5461       if (INTEGERP (val))
5462         {
5463           charset = CHARSET_FROM_ID (XFASTINT (val));
5464           dim = CHARSET_DIMENSION (charset);
5465           while (len < dim)
5466             {
5467               ONE_MORE_BYTE (c);
5468               code = (code << 8) | c;
5469               len++;
5470             }
5471           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5472                               charset, code, c);
5473         }
5474       else
5475         {
5476           /* VAL is a list of charset IDs.  It is assured that the
5477              list is sorted by charset dimensions (smaller one
5478              comes first).  */
5479           while (CONSP (val))
5480             {
5481               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5482               dim = CHARSET_DIMENSION (charset);
5483               while (len < dim)
5484                 {
5485                   ONE_MORE_BYTE (c);
5486                   code = (code << 8) | c;
5487                   len++;
5488                 }
5489               CODING_DECODE_CHAR (coding, src, src_base,
5490                                   src_end, charset, code, c);
5491               if (c >= 0)
5492                 break;
5493               val = XCDR (val);
5494             }
5495         }
5496       if (c < 0)
5497         goto invalid_code;
5498       if (charset->id != charset_ascii
5499           && last_id != charset->id)
5500         {
5501           if (last_id != charset_ascii)
5502             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5503           last_id = charset->id;
5504           last_offset = char_offset;
5505         }
5506
5507       *charbuf++ = c;
5508       char_offset++;
5509       continue;
5510
5511     invalid_code:
5512       src = src_base;
5513       consumed_chars = consumed_chars_base;
5514       ONE_MORE_BYTE (c);
5515       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5516       char_offset++;
5517       coding->errors++;
5518     }
5519
5520  no_more_source:
5521   if (last_id != charset_ascii)
5522     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5523   coding->consumed_char += consumed_chars_base;
5524   coding->consumed = src_base - coding->source;
5525   coding->charbuf_used = charbuf - coding->charbuf;
5526 }
5527
5528 static bool
5529 encode_coding_charset (struct coding_system *coding)
5530 {
5531   bool multibytep = coding->dst_multibyte;
5532   int *charbuf = coding->charbuf;
5533   int *charbuf_end = charbuf + coding->charbuf_used;
5534   unsigned char *dst = coding->destination + coding->produced;
5535   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5536   int safe_room = MAX_MULTIBYTE_LENGTH;
5537   ptrdiff_t produced_chars = 0;
5538   Lisp_Object attrs, charset_list;
5539   bool ascii_compatible;
5540   int c;
5541
5542   CODING_GET_INFO (coding, attrs, charset_list);
5543   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5544
5545   while (charbuf < charbuf_end)
5546     {
5547       struct charset *charset;
5548       unsigned code;
5549
5550       ASSURE_DESTINATION (safe_room);
5551       c = *charbuf++;
5552       if (ascii_compatible && ASCII_CHAR_P (c))
5553         EMIT_ONE_ASCII_BYTE (c);
5554       else if (CHAR_BYTE8_P (c))
5555         {
5556           c = CHAR_TO_BYTE8 (c);
5557           EMIT_ONE_BYTE (c);
5558         }
5559       else
5560         {
5561           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5562                                &code, charset);
5563
5564           if (charset)
5565             {
5566               if (CHARSET_DIMENSION (charset) == 1)
5567                 EMIT_ONE_BYTE (code);
5568               else if (CHARSET_DIMENSION (charset) == 2)
5569                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5570               else if (CHARSET_DIMENSION (charset) == 3)
5571                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5572               else
5573                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5574                                  (code >> 8) & 0xFF, code & 0xFF);
5575             }
5576           else
5577             {
5578               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5579                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5580               else
5581                 c = coding->default_char;
5582               EMIT_ONE_BYTE (c);
5583             }
5584         }
5585     }
5586
5587   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5588   coding->produced_char += produced_chars;
5589   coding->produced = dst - coding->destination;
5590   return 0;
5591 }
5592
5593 \f
5594 /*** 10. C library functions ***/
5595
5596 /* Setup coding context CODING from information about CODING_SYSTEM.
5597    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5598    CODING_SYSTEM is invalid, signal an error.  */
5599
5600 void
5601 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5602 {
5603   Lisp_Object attrs;
5604   Lisp_Object eol_type;
5605   Lisp_Object coding_type;
5606   Lisp_Object val;
5607
5608   if (NILP (coding_system))
5609     coding_system = Qundecided;
5610
5611   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5612
5613   attrs = CODING_ID_ATTRS (coding->id);
5614   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5615
5616   coding->mode = 0;
5617   coding->head_ascii = -1;
5618   if (VECTORP (eol_type))
5619     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5620                             | CODING_REQUIRE_DETECTION_MASK);
5621   else if (! EQ (eol_type, Qunix))
5622     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5623                             | CODING_REQUIRE_ENCODING_MASK);
5624   else
5625     coding->common_flags = 0;
5626   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5627     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5628   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5629     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5630   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5631     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5632
5633   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5634   coding->max_charset_id = SCHARS (val) - 1;
5635   coding->safe_charsets = SDATA (val);
5636   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5637   coding->carryover_bytes = 0;
5638
5639   coding_type = CODING_ATTR_TYPE (attrs);
5640   if (EQ (coding_type, Qundecided))
5641     {
5642       coding->detector = NULL;
5643       coding->decoder = decode_coding_raw_text;
5644       coding->encoder = encode_coding_raw_text;
5645       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5646     }
5647   else if (EQ (coding_type, Qiso_2022))
5648     {
5649       int i;
5650       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5651
5652       /* Invoke graphic register 0 to plane 0.  */
5653       CODING_ISO_INVOCATION (coding, 0) = 0;
5654       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5655       CODING_ISO_INVOCATION (coding, 1)
5656         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5657       /* Setup the initial status of designation.  */
5658       for (i = 0; i < 4; i++)
5659         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5660       /* Not single shifting initially.  */
5661       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5662       /* Beginning of buffer should also be regarded as bol. */
5663       CODING_ISO_BOL (coding) = 1;
5664       coding->detector = detect_coding_iso_2022;
5665       coding->decoder = decode_coding_iso_2022;
5666       coding->encoder = encode_coding_iso_2022;
5667       if (flags & CODING_ISO_FLAG_SAFE)
5668         coding->mode |= CODING_MODE_SAFE_ENCODING;
5669       coding->common_flags
5670         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5671             | CODING_REQUIRE_FLUSHING_MASK);
5672       if (flags & CODING_ISO_FLAG_COMPOSITION)
5673         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5674       if (flags & CODING_ISO_FLAG_DESIGNATION)
5675         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5676       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5677         {
5678           setup_iso_safe_charsets (attrs);
5679           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5680           coding->max_charset_id = SCHARS (val) - 1;
5681           coding->safe_charsets = SDATA (val);
5682         }
5683       CODING_ISO_FLAGS (coding) = flags;
5684       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5685       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5686       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5687       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5688     }
5689   else if (EQ (coding_type, Qcharset))
5690     {
5691       coding->detector = detect_coding_charset;
5692       coding->decoder = decode_coding_charset;
5693       coding->encoder = encode_coding_charset;
5694       coding->common_flags
5695         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5696     }
5697   else if (EQ (coding_type, Qutf_8))
5698     {
5699       val = AREF (attrs, coding_attr_utf_bom);
5700       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5701                                    : EQ (val, Qt) ? utf_with_bom
5702                                    : utf_without_bom);
5703       coding->detector = detect_coding_utf_8;
5704       coding->decoder = decode_coding_utf_8;
5705       coding->encoder = encode_coding_utf_8;
5706       coding->common_flags
5707         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5708       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5709         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5710     }
5711   else if (EQ (coding_type, Qutf_16))
5712     {
5713       val = AREF (attrs, coding_attr_utf_bom);
5714       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5715                                     : EQ (val, Qt) ? utf_with_bom
5716                                     : utf_without_bom);
5717       val = AREF (attrs, coding_attr_utf_16_endian);
5718       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5719                                        : utf_16_little_endian);
5720       CODING_UTF_16_SURROGATE (coding) = 0;
5721       coding->detector = detect_coding_utf_16;
5722       coding->decoder = decode_coding_utf_16;
5723       coding->encoder = encode_coding_utf_16;
5724       coding->common_flags
5725         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5726       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5727         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5728     }
5729   else if (EQ (coding_type, Qccl))
5730     {
5731       coding->detector = detect_coding_ccl;
5732       coding->decoder = decode_coding_ccl;
5733       coding->encoder = encode_coding_ccl;
5734       coding->common_flags
5735         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5736             | CODING_REQUIRE_FLUSHING_MASK);
5737     }
5738   else if (EQ (coding_type, Qemacs_mule))
5739     {
5740       coding->detector = detect_coding_emacs_mule;
5741       coding->decoder = decode_coding_emacs_mule;
5742       coding->encoder = encode_coding_emacs_mule;
5743       coding->common_flags
5744         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5745       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5746           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5747         {
5748           Lisp_Object tail, safe_charsets;
5749           int max_charset_id = 0;
5750
5751           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5752                tail = XCDR (tail))
5753             if (max_charset_id < XFASTINT (XCAR (tail)))
5754               max_charset_id = XFASTINT (XCAR (tail));
5755           safe_charsets = make_uninit_string (max_charset_id + 1);
5756           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5757           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5758                tail = XCDR (tail))
5759             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5760           coding->max_charset_id = max_charset_id;
5761           coding->safe_charsets = SDATA (safe_charsets);
5762         }
5763       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5764       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5765     }
5766   else if (EQ (coding_type, Qshift_jis))
5767     {
5768       coding->detector = detect_coding_sjis;
5769       coding->decoder = decode_coding_sjis;
5770       coding->encoder = encode_coding_sjis;
5771       coding->common_flags
5772         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5773     }
5774   else if (EQ (coding_type, Qbig5))
5775     {
5776       coding->detector = detect_coding_big5;
5777       coding->decoder = decode_coding_big5;
5778       coding->encoder = encode_coding_big5;
5779       coding->common_flags
5780         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5781     }
5782   else                          /* EQ (coding_type, Qraw_text) */
5783     {
5784       coding->detector = NULL;
5785       coding->decoder = decode_coding_raw_text;
5786       coding->encoder = encode_coding_raw_text;
5787       if (! EQ (eol_type, Qunix))
5788         {
5789           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5790           if (! VECTORP (eol_type))
5791             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5792         }
5793
5794     }
5795
5796   return;
5797 }
5798
5799 /* Return a list of charsets supported by CODING.  */
5800
5801 Lisp_Object
5802 coding_charset_list (struct coding_system *coding)
5803 {
5804   Lisp_Object attrs, charset_list;
5805
5806   CODING_GET_INFO (coding, attrs, charset_list);
5807   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5808     {
5809       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5810
5811       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5812         charset_list = Viso_2022_charset_list;
5813     }
5814   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5815     {
5816       charset_list = Vemacs_mule_charset_list;
5817     }
5818   return charset_list;
5819 }
5820
5821
5822 /* Return a list of charsets supported by CODING-SYSTEM.  */
5823
5824 Lisp_Object
5825 coding_system_charset_list (Lisp_Object coding_system)
5826 {
5827   ptrdiff_t id;
5828   Lisp_Object attrs, charset_list;
5829
5830   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5831   attrs = CODING_ID_ATTRS (id);
5832
5833   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5834     {
5835       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5836
5837       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5838         charset_list = Viso_2022_charset_list;
5839       else
5840         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5841     }
5842   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5843     {
5844       charset_list = Vemacs_mule_charset_list;
5845     }
5846   else
5847     {
5848       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5849     }
5850   return charset_list;
5851 }
5852
5853
5854 /* Return raw-text or one of its subsidiaries that has the same
5855    eol_type as CODING-SYSTEM.  */
5856
5857 Lisp_Object
5858 raw_text_coding_system (Lisp_Object coding_system)
5859 {
5860   Lisp_Object spec, attrs;
5861   Lisp_Object eol_type, raw_text_eol_type;
5862
5863   if (NILP (coding_system))
5864     return Qraw_text;
5865   spec = CODING_SYSTEM_SPEC (coding_system);
5866   attrs = AREF (spec, 0);
5867
5868   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5869     return coding_system;
5870
5871   eol_type = AREF (spec, 2);
5872   if (VECTORP (eol_type))
5873     return Qraw_text;
5874   spec = CODING_SYSTEM_SPEC (Qraw_text);
5875   raw_text_eol_type = AREF (spec, 2);
5876   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5877           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5878           : AREF (raw_text_eol_type, 2));
5879 }
5880
5881
5882 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5883    the subsidiary that has the same eol-spec as PARENT (if it is not
5884    nil and specifies end-of-line format) or the system's setting
5885    (system_eol_type).  */
5886
5887 Lisp_Object
5888 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5889 {
5890   Lisp_Object spec, eol_type;
5891
5892   if (NILP (coding_system))
5893     coding_system = Qraw_text;
5894   spec = CODING_SYSTEM_SPEC (coding_system);
5895   eol_type = AREF (spec, 2);
5896   if (VECTORP (eol_type))
5897     {
5898       Lisp_Object parent_eol_type;
5899
5900       if (! NILP (parent))
5901         {
5902           Lisp_Object parent_spec;
5903
5904           parent_spec = CODING_SYSTEM_SPEC (parent);
5905           parent_eol_type = AREF (parent_spec, 2);
5906           if (VECTORP (parent_eol_type))
5907             parent_eol_type = system_eol_type;
5908         }
5909       else
5910         parent_eol_type = system_eol_type;
5911       if (EQ (parent_eol_type, Qunix))
5912         coding_system = AREF (eol_type, 0);
5913       else if (EQ (parent_eol_type, Qdos))
5914         coding_system = AREF (eol_type, 1);
5915       else if (EQ (parent_eol_type, Qmac))
5916         coding_system = AREF (eol_type, 2);
5917     }
5918   return coding_system;
5919 }
5920
5921
5922 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5923    decided for writing to a process.  If not, complement them, and
5924    return a new coding system.  */
5925
5926 Lisp_Object
5927 complement_process_encoding_system (Lisp_Object coding_system)
5928 {
5929   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5930   Lisp_Object spec, attrs;
5931   int i;
5932
5933   for (i = 0; i < 3; i++)
5934     {
5935       if (i == 1)
5936         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5937       else if (i == 2)
5938         coding_system = preferred_coding_system ();
5939       spec = CODING_SYSTEM_SPEC (coding_system);
5940       if (NILP (spec))
5941         continue;
5942       attrs = AREF (spec, 0);
5943       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5944         coding_base = CODING_ATTR_BASE_NAME (attrs);
5945       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5946         eol_base = coding_system;
5947       if (! NILP (coding_base) && ! NILP (eol_base))
5948         break;
5949     }
5950
5951   if (i > 0)
5952     /* The original CODING_SYSTEM didn't specify text-conversion or
5953        eol-conversion.  Be sure that we return a fully complemented
5954        coding system.  */
5955     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5956   return coding_system;
5957 }
5958
5959
5960 /* Emacs has a mechanism to automatically detect a coding system if it
5961    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5962    it's impossible to distinguish some coding systems accurately
5963    because they use the same range of codes.  So, at first, coding
5964    systems are grouped into the following categories:
5965
5966    o coding-category-emacs-mule
5967
5968         The category for a coding system which has the same code range
5969         as Emacs' internal format.  Assigned the coding-system (Lisp
5970         symbol) `emacs-mule' by default.
5971
5972    o coding-category-sjis
5973
5974         The category for a coding system which has the same code range
5975         as SJIS.  Assigned the coding-system (Lisp
5976         symbol) `japanese-shift-jis' by default.
5977
5978    o coding-category-iso-7
5979
5980         The category for a coding system which has the same code range
5981         as ISO2022 of 7-bit environment.  This doesn't use any locking
5982         shift and single shift functions.  This can encode/decode all
5983         charsets.  Assigned the coding-system (Lisp symbol)
5984         `iso-2022-7bit' by default.
5985
5986    o coding-category-iso-7-tight
5987
5988         Same as coding-category-iso-7 except that this can
5989         encode/decode only the specified charsets.
5990
5991    o coding-category-iso-8-1
5992
5993         The category for a coding system which has the same code range
5994         as ISO2022 of 8-bit environment and graphic plane 1 used only
5995         for DIMENSION1 charset.  This doesn't use any locking shift
5996         and single shift functions.  Assigned the coding-system (Lisp
5997         symbol) `iso-latin-1' by default.
5998
5999    o coding-category-iso-8-2
6000
6001         The category for a coding system which has the same code range
6002         as ISO2022 of 8-bit environment and graphic plane 1 used only
6003         for DIMENSION2 charset.  This doesn't use any locking shift
6004         and single shift functions.  Assigned the coding-system (Lisp
6005         symbol) `japanese-iso-8bit' by default.
6006
6007    o coding-category-iso-7-else
6008
6009         The category for a coding system which has the same code range
6010         as ISO2022 of 7-bit environment but uses locking shift or
6011         single shift functions.  Assigned the coding-system (Lisp
6012         symbol) `iso-2022-7bit-lock' by default.
6013
6014    o coding-category-iso-8-else
6015
6016         The category for a coding system which has the same code range
6017         as ISO2022 of 8-bit environment but uses locking shift or
6018         single shift functions.  Assigned the coding-system (Lisp
6019         symbol) `iso-2022-8bit-ss2' by default.
6020
6021    o coding-category-big5
6022
6023         The category for a coding system which has the same code range
6024         as BIG5.  Assigned the coding-system (Lisp symbol)
6025         `cn-big5' by default.
6026
6027    o coding-category-utf-8
6028
6029         The category for a coding system which has the same code range
6030         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6031         symbol) `utf-8' by default.
6032
6033    o coding-category-utf-16-be
6034
6035         The category for a coding system in which a text has an
6036         Unicode signature (cf. Unicode Standard) in the order of BIG
6037         endian at the head.  Assigned the coding-system (Lisp symbol)
6038         `utf-16-be' by default.
6039
6040    o coding-category-utf-16-le
6041
6042         The category for a coding system in which a text has an
6043         Unicode signature (cf. Unicode Standard) in the order of
6044         LITTLE endian at the head.  Assigned the coding-system (Lisp
6045         symbol) `utf-16-le' by default.
6046
6047    o coding-category-ccl
6048
6049         The category for a coding system of which encoder/decoder is
6050         written in CCL programs.  The default value is nil, i.e., no
6051         coding system is assigned.
6052
6053    o coding-category-binary
6054
6055         The category for a coding system not categorized in any of the
6056         above.  Assigned the coding-system (Lisp symbol)
6057         `no-conversion' by default.
6058
6059    Each of them is a Lisp symbol whose value is the actual
6060    `coding-system' (also a Lisp symbol) assigned to it by the user.
6061    What Emacs actually does is detect the category of a coding system
6062    and then use the `coding-system' assigned to that category.  If
6063    Emacs cannot narrow the result down to a single category, it
6064    selects the candidate category of the highest priority.  The
6065    priorities are also set by the user, in `coding-category-list'.
6066
6067 */
6068
6069 #define EOL_SEEN_NONE   0
6070 #define EOL_SEEN_LF     1
6071 #define EOL_SEEN_CR     2
6072 #define EOL_SEEN_CRLF   4
6073
6074 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6075    SOURCE is encoded.  If CATEGORY is one of
6076    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6077    two-byte, else they are encoded by one-byte.
6078
6079    Return one of EOL_SEEN_XXX.  */
6080
6081 #define MAX_EOL_CHECK_COUNT 3
6082
6083 static int
6084 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6085             enum coding_category category)
6086 {
6087   const unsigned char *src = source, *src_end = src + src_bytes;
6088   unsigned char c;
6089   int total  = 0;
6090   int eol_seen = EOL_SEEN_NONE;
6091
6092   if ((1 << category) & CATEGORY_MASK_UTF_16)
6093     {
6094       bool msb = category == (coding_category_utf_16_le
6095                               | coding_category_utf_16_le_nosig);
6096       bool lsb = !msb;
6097
6098       while (src + 1 < src_end)
6099         {
6100           c = src[lsb];
6101           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6102             {
6103               int this_eol;
6104
6105               if (c == '\n')
6106                 this_eol = EOL_SEEN_LF;
6107               else if (src + 3 >= src_end
6108                        || src[msb + 2] != 0
6109                        || src[lsb + 2] != '\n')
6110                 this_eol = EOL_SEEN_CR;
6111               else
6112                 {
6113                   this_eol = EOL_SEEN_CRLF;
6114                   src += 2;
6115                 }
6116
6117               if (eol_seen == EOL_SEEN_NONE)
6118                 /* This is the first end-of-line.  */
6119                 eol_seen = this_eol;
6120               else if (eol_seen != this_eol)
6121                 {
6122                   /* The found type is different from what found before.
6123                      Allow for stray ^M characters in DOS EOL files.  */
6124                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6125                       || (eol_seen == EOL_SEEN_CRLF
6126                           && this_eol == EOL_SEEN_CR))
6127                     eol_seen = EOL_SEEN_CRLF;
6128                   else
6129                     {
6130                       eol_seen = EOL_SEEN_LF;
6131                       break;
6132                     }
6133                 }
6134               if (++total == MAX_EOL_CHECK_COUNT)
6135                 break;
6136             }
6137           src += 2;
6138         }
6139     }
6140   else
6141     while (src < src_end)
6142       {
6143         c = *src++;
6144         if (c == '\n' || c == '\r')
6145           {
6146             int this_eol;
6147
6148             if (c == '\n')
6149               this_eol = EOL_SEEN_LF;
6150             else if (src >= src_end || *src != '\n')
6151               this_eol = EOL_SEEN_CR;
6152             else
6153               this_eol = EOL_SEEN_CRLF, src++;
6154
6155             if (eol_seen == EOL_SEEN_NONE)
6156               /* This is the first end-of-line.  */
6157               eol_seen = this_eol;
6158             else if (eol_seen != this_eol)
6159               {
6160                 /* The found type is different from what found before.
6161                    Allow for stray ^M characters in DOS EOL files.  */
6162                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6163                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6164                   eol_seen = EOL_SEEN_CRLF;
6165                 else
6166                   {
6167                     eol_seen = EOL_SEEN_LF;
6168                     break;
6169                   }
6170               }
6171             if (++total == MAX_EOL_CHECK_COUNT)
6172               break;
6173           }
6174       }
6175   return eol_seen;
6176 }
6177
6178
6179 static Lisp_Object
6180 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6181 {
6182   Lisp_Object eol_type;
6183
6184   eol_type = CODING_ID_EOL_TYPE (coding->id);
6185   if (eol_seen & EOL_SEEN_LF)
6186     {
6187       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6188       eol_type = Qunix;
6189     }
6190   else if (eol_seen & EOL_SEEN_CRLF)
6191     {
6192       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6193       eol_type = Qdos;
6194     }
6195   else if (eol_seen & EOL_SEEN_CR)
6196     {
6197       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6198       eol_type = Qmac;
6199     }
6200   return eol_type;
6201 }
6202
6203 /* Detect how a text specified in CODING is encoded.  If a coding
6204    system is detected, update fields of CODING by the detected coding
6205    system.  */
6206
6207 static void
6208 detect_coding (struct coding_system *coding)
6209 {
6210   const unsigned char *src, *src_end;
6211   unsigned int saved_mode = coding->mode;
6212
6213   coding->consumed = coding->consumed_char = 0;
6214   coding->produced = coding->produced_char = 0;
6215   coding_set_source (coding);
6216
6217   src_end = coding->source + coding->src_bytes;
6218   coding->head_ascii = 0;
6219
6220   /* If we have not yet decided the text encoding type, detect it
6221      now.  */
6222   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6223     {
6224       int c, i;
6225       struct coding_detection_info detect_info;
6226       bool null_byte_found = 0, eight_bit_found = 0;
6227
6228       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6229       for (src = coding->source; src < src_end; src++)
6230         {
6231           c = *src;
6232           if (c & 0x80)
6233             {
6234               eight_bit_found = 1;
6235               if (null_byte_found)
6236                 break;
6237             }
6238           else if (c < 0x20)
6239             {
6240               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6241                   && ! inhibit_iso_escape_detection
6242                   && ! detect_info.checked)
6243                 {
6244                   if (detect_coding_iso_2022 (coding, &detect_info))
6245                     {
6246                       /* We have scanned the whole data.  */
6247                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6248                         {
6249                           /* We didn't find an 8-bit code.  We may
6250                              have found a null-byte, but it's very
6251                              rare that a binary file conforms to
6252                              ISO-2022.  */
6253                           src = src_end;
6254                           coding->head_ascii = src - coding->source;
6255                         }
6256                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6257                       break;
6258                     }
6259                 }
6260               else if (! c && !inhibit_null_byte_detection)
6261                 {
6262                   null_byte_found = 1;
6263                   if (eight_bit_found)
6264                     break;
6265                 }
6266               if (! eight_bit_found)
6267                 coding->head_ascii++;
6268             }
6269           else if (! eight_bit_found)
6270             coding->head_ascii++;
6271         }
6272
6273       if (null_byte_found || eight_bit_found
6274           || coding->head_ascii < coding->src_bytes
6275           || detect_info.found)
6276         {
6277           enum coding_category category;
6278           struct coding_system *this;
6279
6280           if (coding->head_ascii == coding->src_bytes)
6281             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6282             for (i = 0; i < coding_category_raw_text; i++)
6283               {
6284                 category = coding_priorities[i];
6285                 this = coding_categories + category;
6286                 if (detect_info.found & (1 << category))
6287                   break;
6288               }
6289           else
6290             {
6291               if (null_byte_found)
6292                 {
6293                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6294                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6295                 }
6296               for (i = 0; i < coding_category_raw_text; i++)
6297                 {
6298                   category = coding_priorities[i];
6299                   this = coding_categories + category;
6300                   /* Some of this->detector (e.g. detect_coding_sjis)
6301                      require this information.  */
6302                   coding->id = this->id;
6303                   if (this->id < 0)
6304                     {
6305                       /* No coding system of this category is defined.  */
6306                       detect_info.rejected |= (1 << category);
6307                     }
6308                   else if (category >= coding_category_raw_text)
6309                     continue;
6310                   else if (detect_info.checked & (1 << category))
6311                     {
6312                       if (detect_info.found & (1 << category))
6313                         break;
6314                     }
6315                   else if ((*(this->detector)) (coding, &detect_info)
6316                            && detect_info.found & (1 << category))
6317                     {
6318                       if (category == coding_category_utf_16_auto)
6319                         {
6320                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6321                             category = coding_category_utf_16_le;
6322                           else
6323                             category = coding_category_utf_16_be;
6324                         }
6325                       break;
6326                     }
6327                 }
6328             }
6329
6330           if (i < coding_category_raw_text)
6331             setup_coding_system (CODING_ID_NAME (this->id), coding);
6332           else if (null_byte_found)
6333             setup_coding_system (Qno_conversion, coding);
6334           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6335                    == CATEGORY_MASK_ANY)
6336             setup_coding_system (Qraw_text, coding);
6337           else if (detect_info.rejected)
6338             for (i = 0; i < coding_category_raw_text; i++)
6339               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6340                 {
6341                   this = coding_categories + coding_priorities[i];
6342                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6343                   break;
6344                 }
6345         }
6346     }
6347   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6348            == coding_category_utf_8_auto)
6349     {
6350       Lisp_Object coding_systems;
6351       struct coding_detection_info detect_info;
6352
6353       coding_systems
6354         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6355       detect_info.found = detect_info.rejected = 0;
6356       for (src = coding->source; src < src_end; src++)
6357         {
6358           if (*src & 0x80)
6359             break;
6360         }
6361       coding->head_ascii = src - coding->source;
6362       if (CONSP (coding_systems)
6363           && detect_coding_utf_8 (coding, &detect_info))
6364         {
6365           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6366             setup_coding_system (XCAR (coding_systems), coding);
6367           else
6368             setup_coding_system (XCDR (coding_systems), coding);
6369         }
6370     }
6371   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6372            == coding_category_utf_16_auto)
6373     {
6374       Lisp_Object coding_systems;
6375       struct coding_detection_info detect_info;
6376
6377       coding_systems
6378         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6379       detect_info.found = detect_info.rejected = 0;
6380       coding->head_ascii = 0;
6381       if (CONSP (coding_systems)
6382           && detect_coding_utf_16 (coding, &detect_info))
6383         {
6384           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6385             setup_coding_system (XCAR (coding_systems), coding);
6386           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6387             setup_coding_system (XCDR (coding_systems), coding);
6388         }
6389     }
6390   coding->mode = saved_mode;
6391 }
6392
6393
6394 static void
6395 decode_eol (struct coding_system *coding)
6396 {
6397   Lisp_Object eol_type;
6398   unsigned char *p, *pbeg, *pend;
6399
6400   eol_type = CODING_ID_EOL_TYPE (coding->id);
6401   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6402     return;
6403
6404   if (NILP (coding->dst_object))
6405     pbeg = coding->destination;
6406   else
6407     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6408   pend = pbeg + coding->produced;
6409
6410   if (VECTORP (eol_type))
6411     {
6412       int eol_seen = EOL_SEEN_NONE;
6413
6414       for (p = pbeg; p < pend; p++)
6415         {
6416           if (*p == '\n')
6417             eol_seen |= EOL_SEEN_LF;
6418           else if (*p == '\r')
6419             {
6420               if (p + 1 < pend && *(p + 1) == '\n')
6421                 {
6422                   eol_seen |= EOL_SEEN_CRLF;
6423                   p++;
6424                 }
6425               else
6426                 eol_seen |= EOL_SEEN_CR;
6427             }
6428         }
6429       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6430       if ((eol_seen & EOL_SEEN_CRLF) != 0
6431           && (eol_seen & EOL_SEEN_CR) != 0
6432           && (eol_seen & EOL_SEEN_LF) == 0)
6433         eol_seen = EOL_SEEN_CRLF;
6434       else if (eol_seen != EOL_SEEN_NONE
6435           && eol_seen != EOL_SEEN_LF
6436           && eol_seen != EOL_SEEN_CRLF
6437           && eol_seen != EOL_SEEN_CR)
6438         eol_seen = EOL_SEEN_LF;
6439       if (eol_seen != EOL_SEEN_NONE)
6440         eol_type = adjust_coding_eol_type (coding, eol_seen);
6441     }
6442
6443   if (EQ (eol_type, Qmac))
6444     {
6445       for (p = pbeg; p < pend; p++)
6446         if (*p == '\r')
6447           *p = '\n';
6448     }
6449   else if (EQ (eol_type, Qdos))
6450     {
6451       ptrdiff_t n = 0;
6452
6453       if (NILP (coding->dst_object))
6454         {
6455           /* Start deleting '\r' from the tail to minimize the memory
6456              movement.  */
6457           for (p = pend - 2; p >= pbeg; p--)
6458             if (*p == '\r')
6459               {
6460                 memmove (p, p + 1, pend-- - p - 1);
6461                 n++;
6462               }
6463         }
6464       else
6465         {
6466           ptrdiff_t pos_byte = coding->dst_pos_byte;
6467           ptrdiff_t pos = coding->dst_pos;
6468           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6469
6470           while (pos < pos_end)
6471             {
6472               p = BYTE_POS_ADDR (pos_byte);
6473               if (*p == '\r' && p[1] == '\n')
6474                 {
6475                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6476                   n++;
6477                   pos_end--;
6478                 }
6479               pos++;
6480               if (coding->dst_multibyte)
6481                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6482               else
6483                 pos_byte++;
6484             }
6485         }
6486       coding->produced -= n;
6487       coding->produced_char -= n;
6488     }
6489 }
6490
6491
6492 /* Return a translation table (or list of them) from coding system
6493    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6494    not ENCODEP). */
6495
6496 static Lisp_Object
6497 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6498 {
6499   Lisp_Object standard, translation_table;
6500   Lisp_Object val;
6501
6502   if (NILP (Venable_character_translation))
6503     {
6504       if (max_lookup)
6505         *max_lookup = 0;
6506       return Qnil;
6507     }
6508   if (encodep)
6509     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6510       standard = Vstandard_translation_table_for_encode;
6511   else
6512     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6513       standard = Vstandard_translation_table_for_decode;
6514   if (NILP (translation_table))
6515     translation_table = standard;
6516   else
6517     {
6518       if (SYMBOLP (translation_table))
6519         translation_table = Fget (translation_table, Qtranslation_table);
6520       else if (CONSP (translation_table))
6521         {
6522           translation_table = Fcopy_sequence (translation_table);
6523           for (val = translation_table; CONSP (val); val = XCDR (val))
6524             if (SYMBOLP (XCAR (val)))
6525               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6526         }
6527       if (CHAR_TABLE_P (standard))
6528         {
6529           if (CONSP (translation_table))
6530             translation_table = nconc2 (translation_table,
6531                                         Fcons (standard, Qnil));
6532           else
6533             translation_table = Fcons (translation_table,
6534                                        Fcons (standard, Qnil));
6535         }
6536     }
6537
6538   if (max_lookup)
6539     {
6540       *max_lookup = 1;
6541       if (CHAR_TABLE_P (translation_table)
6542           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6543         {
6544           val = XCHAR_TABLE (translation_table)->extras[1];
6545           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6546             *max_lookup = XFASTINT (val);
6547         }
6548       else if (CONSP (translation_table))
6549         {
6550           Lisp_Object tail;
6551
6552           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6553             if (CHAR_TABLE_P (XCAR (tail))
6554                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6555               {
6556                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6557                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6558                   *max_lookup = XFASTINT (tailval);
6559               }
6560         }
6561     }
6562   return translation_table;
6563 }
6564
     /* Look up character C in TABLE and store the result in TRANS.
        TABLE is a char-table or a list of char-tables; for any other
        value TRANS is simply set to Qnil.

        When the entry found for C is itself a character, C is replaced
        by that character (so C must be an lvalue) and TRANS is reset to
        Qnil; with a list of tables the lookup then goes on in the
        following tables using the new C.  When the entry is non-nil but
        not a character, it is left in TRANS and, for a list, the search
        stops there.  List elements that are not char-tables are
        skipped.

        The arguments are evaluated more than once.  */
6565 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6566   do {                                                          \
6567     trans = Qnil;                                               \
6568     if (CHAR_TABLE_P (table))                                   \
6569       {                                                         \
6570         trans = CHAR_TABLE_REF (table, c);                      \
6571         if (CHARACTERP (trans))                                 \
6572           c = XFASTINT (trans), trans = Qnil;                   \
6573       }                                                         \
6574     else if (CONSP (table))                                     \
6575       {                                                         \
6576         Lisp_Object tail;                                       \
6577                                                                 \
6578         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6579           if (CHAR_TABLE_P (XCAR (tail)))                       \
6580             {                                                   \
6581               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6582               if (CHARACTERP (trans))                           \
6583                 c = XFASTINT (trans), trans = Qnil;             \
6584               else if (! NILP (trans))                          \
6585                 break;                                          \
6586             }                                                   \
6587       }                                                         \
6588   } while (0)
6589
6590
6591 /* Return a translation of character(s) at BUF according to TRANS.
6592    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6593    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6594    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6595    translation is found, and Qnil if not found..
6596    If BUF is too short to lookup characters in FROM, return Qt.  */
6597
6598 static Lisp_Object
6599 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6600 {
6601
6602   if (INTEGERP (trans))
6603     return trans;
6604   for (; CONSP (trans); trans = XCDR (trans))
6605     {
6606       Lisp_Object val = XCAR (trans);
6607       Lisp_Object from = XCAR (val);
6608       ptrdiff_t len = ASIZE (from);
6609       ptrdiff_t i;
6610
6611       for (i = 0; i < len; i++)
6612         {
6613           if (buf + i == buf_end)
6614             return Qt;
6615           if (XINT (AREF (from, i)) != buf[i])
6616             break;
6617         }
6618       if (i == len)
6619         return val;
6620     }
6621   return Qnil;
6622 }
6623
6624
     /* Write the characters decoded by CODING as bytes starting at
        CODING->destination + CODING->produced, growing the destination
        with alloc_destination when it is too small.

        The characters are taken from CODING->charbuf
        (CODING->charbuf_used elements) and translated through
        TRANSLATION_TABLE, unless CODING->chars_at_source is set, in
        which case the CODING->consumed bytes at CODING->source are
        copied, converting between unibyte and multibyte as needed.

        LAST_BLOCK means no further characters will follow: a
        translation that would need more characters than remain in
        charbuf is then not deferred.

        CODING->produced and CODING->produced_char are advanced by what
        was written.  The value is the number of trailing elements of
        CODING->charbuf that were left unprocessed (always 0 in the
        chars_at_source case).  */
6625 static int
6626 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6627                bool last_block)
6628 {
6629   unsigned char *dst = coding->destination + coding->produced;
6630   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6631   ptrdiff_t produced;
6632   ptrdiff_t produced_chars = 0;
6633   int carryover = 0;
6634
6635   if (! coding->chars_at_source)
6636     {
6637       /* Source characters are in coding->charbuf.  */
6638       int *buf = coding->charbuf;
6639       int *buf_end = buf + coding->charbuf_used;
6640
            /* When decoding in place, the writable area ends where the
               already consumed source bytes end.  */
6641       if (EQ (coding->src_object, coding->dst_object))
6642         {
6643           coding_set_source (coding);
6644           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6645         }
6646
6647       while (buf < buf_end)
6648         {
6649           int c = *buf;
6650           ptrdiff_t i;
6651
6652           if (c >= 0)
6653             {
6654               ptrdiff_t from_nchars = 1, to_nchars = 1;
6655               Lisp_Object trans = Qnil;
6656
6657               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
                    /* TRANS is non-nil only when the table entry is not a
                       plain character; get_translation then matches it
                       against the characters that follow in BUF.  */
6658               if (! NILP (trans))
6659                 {
6660                   trans = get_translation (trans, buf, buf_end);
6661                   if (INTEGERP (trans))
6662                     c = XINT (trans);
6663                   else if (CONSP (trans))
6664                     {
6665                       from_nchars = ASIZE (XCAR (trans));
6666                       trans = XCDR (trans);
6667                       if (INTEGERP (trans))
6668                         c = XINT (trans);
6669                       else
6670                         {
6671                           to_nchars = ASIZE (trans);
6672                           c = XINT (AREF (trans, 0));
6673                         }
6674                     }
                        /* BUF is too short to decide; stop here and hand the
                           remaining elements back as carryover, unless no
                           more input will come.  */
6675                   else if (EQ (trans, Qt) && ! last_block)
6676                     break;
6677                 }
6678
                    /* Make sure there is room for TO_NCHARS characters of at
                       most MAX_MULTIBYTE_LENGTH bytes each.  */
6679               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6680                 {
                        /* Refuse a request whose size computation below would
                           overflow.  */
6681                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6682                        / MAX_MULTIBYTE_LENGTH)
6683                       < to_nchars)
6684                     memory_full (SIZE_MAX);
6685                   dst = alloc_destination (coding,
6686                                            buf_end - buf
6687                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6688                                            dst);
6689                   if (EQ (coding->src_object, coding->dst_object))
6690                     {
6691                       coding_set_source (coding);
6692                       dst_end = (((unsigned char *) coding->source)
6693                                  + coding->consumed);
6694                     }
6695                   else
6696                     dst_end = coding->destination + coding->dst_bytes;
6697                 }
6698
                    /* Emit C, then the remaining characters of a
                       multi-character translation.  */
6699               for (i = 0; i < to_nchars; i++)
6700                 {
6701                   if (i > 0)
6702                     c = XINT (AREF (trans, i));
6703                   if (coding->dst_multibyte
6704                       || ! CHAR_BYTE8_P (c))
6705                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6706                   else
6707                     *dst++ = CHAR_TO_BYTE8 (c);
6708                 }
6709               produced_chars += to_nchars;
6710               buf += from_nchars;
6711             }
6712           else
6713             /* This is an annotation datum.  (-C) is the length.  */
6714             buf += -c;
6715         }
6716       carryover = buf_end - buf;
6717     }
6718   else
6719     {
6720       /* Source characters are at coding->source.  */
6721       const unsigned char *src = coding->source;
6722       const unsigned char *src_end = src + coding->consumed;
6723
6724       if (EQ (coding->dst_object, coding->src_object))
6725         dst_end = (unsigned char *) src;
6726       if (coding->src_multibyte != coding->dst_multibyte)
6727         {
6728           if (coding->src_multibyte)
6729             {
                    /* Multibyte source, unibyte destination.
                       NOTE(review): ONE_MORE_BYTE is defined elsewhere in
                       this file; it presumably relies on the locals
                       multibytep, consumed_chars and src_base and jumps to
                       no_more_source when the source is exhausted --
                       confirm against its definition.  */
6730               bool multibytep = 1;
6731               ptrdiff_t consumed_chars = 0;
6732
6733               while (1)
6734                 {
6735                   const unsigned char *src_base = src;
6736                   int c;
6737
6738                   ONE_MORE_BYTE (c);
6739                   if (dst == dst_end)
6740                     {
6741                       if (EQ (coding->src_object, coding->dst_object))
6742                         dst_end = (unsigned char *) src;
6743                       if (dst == dst_end)
6744                         {
                                /* Remember SRC as an offset: the pointers are
                                   recomputed after the allocation.  */
6745                           ptrdiff_t offset = src - coding->source;
6746
6747                           dst = alloc_destination (coding, src_end - src + 1,
6748                                                    dst);
6749                           dst_end = coding->destination + coding->dst_bytes;
6750                           coding_set_source (coding);
6751                           src = coding->source + offset;
6752                           src_end = coding->source + coding->consumed;
6753                           if (EQ (coding->src_object, coding->dst_object))
6754                             dst_end = (unsigned char *) src;
6755                         }
6756                     }
6757                   *dst++ = c;
6758                   produced_chars++;
6759                 }
6760             no_more_source:
6761               ;
6762             }
6763           else
                  /* Unibyte source, multibyte destination.
                     NOTE(review): the `dst_end - 1' tests suggest that
                     EMIT_ONE_BYTE (defined elsewhere) may write up to two
                     bytes and that it updates produced_chars -- confirm.  */
6764             while (src < src_end)
6765               {
6766                 bool multibytep = 1;
6767                 int c = *src++;
6768
6769                 if (dst >= dst_end - 1)
6770                   {
6771                     if (EQ (coding->src_object, coding->dst_object))
6772                       dst_end = (unsigned char *) src;
6773                     if (dst >= dst_end - 1)
6774                       {
6775                         ptrdiff_t offset = src - coding->source;
6776                         ptrdiff_t more_bytes;
6777
6778                         if (EQ (coding->src_object, coding->dst_object))
6779                           more_bytes = ((src_end - src) / 2) + 2;
6780                         else
6781                           more_bytes = src_end - src + 2;
6782                         dst = alloc_destination (coding, more_bytes, dst);
6783                         dst_end = coding->destination + coding->dst_bytes;
6784                         coding_set_source (coding);
6785                         src = coding->source + offset;
6786                         src_end = coding->source + coding->consumed;
6787                         if (EQ (coding->src_object, coding->dst_object))
6788                           dst_end = (unsigned char *) src;
6789                       }
6790                   }
6791                 EMIT_ONE_BYTE (c);
6792               }
6793         }
6794       else
6795         {
                /* Source and destination have the same multibyteness: the
                   bytes are copied unchanged.  */
6796           if (!EQ (coding->src_object, coding->dst_object))
6797             {
6798               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6799
6800               if (require > 0)
6801                 {
6802                   ptrdiff_t offset = src - coding->source;
6803
6804                   dst = alloc_destination (coding, require, dst);
6805                   coding_set_source (coding);
6806                   src = coding->source + offset;
6807                   src_end = coding->source + coding->consumed;
6808                 }
6809             }
6810           produced_chars = coding->consumed_char;
6811           while (src < src_end)
6812             *dst++ = *src++;
6813         }
6814     }
6815
        /* Account for the bytes and characters written by this call.  */
6816   produced = dst - (coding->destination + coding->produced);
6817   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6818     insert_from_gap (produced_chars, produced);
6819   coding->produced += produced;
6820   coding->produced_char += produced_chars;
6821   return carryover;
6822 }
6823
6824 /* Compose text in CODING->object according to the annotation data at
6825    CHARBUF.  CHARBUF is an array:
6826      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6827  */
6828
6829 static void
6830 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6831 {
6832   int len;
6833   ptrdiff_t to;
6834   enum composition_method method;
6835   Lisp_Object components;
6836
6837   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6838   to = pos + charbuf[2];
6839   method = (enum composition_method) (charbuf[4]);
6840
6841   if (method == COMPOSITION_RELATIVE)
6842     components = Qnil;
6843   else
6844     {
6845       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6846       int i, j;
6847
6848       if (method == COMPOSITION_WITH_RULE)
6849         len = charbuf[2] * 3 - 2;
6850       charbuf += MAX_ANNOTATION_LENGTH;
6851       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6852       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6853         {
6854           if (charbuf[i] >= 0)
6855             args[j] = make_number (charbuf[i]);
6856           else
6857             {
6858               i++;
6859               args[j] = make_number (charbuf[i] % 0x100);
6860             }
6861         }
6862       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6863     }
6864   compose_text (pos, to, components, Qnil, coding->dst_object);
6865 }
6866
6867
6868 /* Put `charset' property on text in CODING->object according to
6869    the annotation data at CHARBUF.  CHARBUF is an array:
6870      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6871  */
6872
6873 static void
6874 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6875 {
6876   ptrdiff_t from = pos - charbuf[2];
6877   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6878
6879   Fput_text_property (make_number (from), make_number (pos),
6880                       Qcharset, CHARSET_NAME (charset),
6881                       coding->dst_object);
6882 }
6883
6884
     /* Number of `int' elements in the work buffer CODING->charbuf.  */
6885 #define CHARBUF_SIZE 0x4000
6886
     /* Allocate the work buffer CODING->charbuf (CHARBUF_SIZE ints) with
        SAFE_ALLOCA and record its size in CODING->charbuf_size.  The
        function using this must set up SAFE_ALLOCA with USE_SAFE_ALLOCA
        and release the buffer with SAFE_FREE, as decode_coding does.  */
6887 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
6888   do {                                                                  \
6889     coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));        \
6890     coding->charbuf_size = CHARBUF_SIZE;                                \
6891   } while (0)
6892
6893
6894 static void
6895 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6896 {
6897   int *charbuf = coding->charbuf;
6898   int *charbuf_end = charbuf + coding->charbuf_used;
6899
6900   if (NILP (coding->dst_object))
6901     return;
6902
6903   while (charbuf < charbuf_end)
6904     {
6905       if (*charbuf >= 0)
6906         pos++, charbuf++;
6907       else
6908         {
6909           int len = -*charbuf;
6910
6911           if (len > 2)
6912             switch (charbuf[1])
6913               {
6914               case CODING_ANNOTATE_COMPOSITION_MASK:
6915                 produce_composition (coding, charbuf, pos);
6916                 break;
6917               case CODING_ANNOTATE_CHARSET_MASK:
6918                 produce_charset (coding, charbuf, pos);
6919                 break;
6920               }
6921           charbuf += len;
6922         }
6923     }
6924 }
6925
6926 /* Decode the data at CODING->src_object into CODING->dst_object.
6927    CODING->src_object is a buffer, a string, or nil.
6928    CODING->dst_object is a buffer.
6929
6930    If CODING->src_object is a buffer, it must be the current buffer.
6931    In this case, if CODING->src_pos is positive, it is a position of
6932    the source text in the buffer, otherwise, the source text is in the
6933    gap area of the buffer, and CODING->src_pos specifies the offset of
6934    the text from GPT (which must be the same as PT).  If this is the
6935    same buffer as CODING->dst_object, CODING->src_pos must be
6936    negative.
6937
6938    If CODING->src_object is a string, CODING->src_pos is an index to
6939    that string.
6940
6941    If CODING->src_object is nil, CODING->source must already point to
6942    the non-relocatable memory area.  In this case, CODING->src_pos is
6943    an offset from CODING->source.
6944
6945    The decoded data is inserted at the current point of the buffer
6946    CODING->dst_object.
6947 */
6948
6949 static void
6950 decode_coding (struct coding_system *coding)
6951 {
6952   Lisp_Object attrs;
6953   Lisp_Object undo_list;
6954   Lisp_Object translation_table;
6955   struct ccl_spec cclspec;
6956   int carryover;
6957   int i;
6958
6959   USE_SAFE_ALLOCA;
6960
        /* If the source text in the buffer straddles the gap, move the
           gap to the beginning of the source text.  */
6961   if (BUFFERP (coding->src_object)
6962       && coding->src_pos > 0
6963       && coding->src_pos < GPT
6964       && coding->src_pos + coding->src_chars > GPT)
6965     move_gap_both (coding->src_pos, coding->src_pos_byte);
6966
6967   undo_list = Qt;
6968   if (BUFFERP (coding->dst_object))
6969     {
6970       set_buffer_internal (XBUFFER (coding->dst_object));
6971       if (GPT != PT)
6972         move_gap_both (PT, PT_BYTE);
6973
6974       /* We must disable undo_list in order to record the whole insert
6975          transaction via record_insert at the end.  But doing so also
6976          disables the recording of the first change to the undo_list.
6977          Therefore we check for first change here and record it via
6978          record_first_change if needed.  */
6979       if (MODIFF <= SAVE_MODIFF)
6980         record_first_change ();
6981
6982       undo_list = BVAR (current_buffer, undo_list);
6983       bset_undo_list (current_buffer, Qt);
6984     }
6985
        /* Reset the conversion counters and the result status.  */
6986   coding->consumed = coding->consumed_char = 0;
6987   coding->produced = coding->produced_char = 0;
6988   coding->chars_at_source = 0;
6989   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6990   coding->errors = 0;
6991
6992   ALLOC_CONVERSION_WORK_AREA (coding);
6993
6994   attrs = CODING_ID_ATTRS (coding->id);
6995   translation_table = get_translation_table (attrs, 0, NULL);
6996
6997   carryover = 0;
        /* The CCL decoder keeps its state in CCLSPEC, which lives on this
           function's stack.  */
6998   if (coding->decoder == decode_coding_ccl)
6999     {
7000       coding->spec.ccl = &cclspec;
7001       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7002     }
        /* Decode one chunk of the source into CODING->charbuf, write it
           to the destination, apply its annotations, and move the
           elements that produce_chars could not process yet to the head
           of CODING->charbuf for the next round.  Repeat while the
           destination was too small, or while source bytes remain and the
           last result was success or invalid source.  */
7003   do
7004     {
7005       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7006
7007       coding_set_source (coding);
7008       coding->annotated = 0;
7009       coding->charbuf_used = carryover;
7010       (*(coding->decoder)) (coding);
7011       coding_set_destination (coding);
7012       carryover = produce_chars (coding, translation_table, 0);
7013       if (coding->annotated)
7014         produce_annotation (coding, pos);
7015       for (i = 0; i < carryover; i++)
7016         coding->charbuf[i]
7017           = coding->charbuf[coding->charbuf_used - carryover + i];
7018     }
7019   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7020          || (coding->consumed < coding->src_bytes
7021              && (coding->result == CODING_RESULT_SUCCESS
7022                  || coding->result == CODING_RESULT_INVALID_SRC)));
7023
        /* Characters still held back for a multi-character translation
           are now written out as the last block.  */
7024   if (carryover > 0)
7025     {
7026       coding_set_destination (coding);
7027       coding->charbuf_used = carryover;
7028       produce_chars (coding, translation_table, 1);
7029     }
7030
        /* Deal with source bytes that the decoder did not consume.  */
7031   coding->carryover_bytes = 0;
7032   if (coding->consumed < coding->src_bytes)
7033     {
            /* NOTE(review): the difference is stored in an `int'; confirm
               that it cannot exceed INT_MAX here.  */
7034       int nbytes = coding->src_bytes - coding->consumed;
7035       const unsigned char *src;
7036
7037       coding_set_source (coding);
7038       coding_set_destination (coding);
7039       src = coding->source + coding->consumed;
7040
7041       if (coding->mode & CODING_MODE_LAST_BLOCK)
7042         {
7043           /* Flush out unprocessed data as binary chars.  We are sure
7044              that the number of data is less than the size of
7045              coding->charbuf.  */
7046           coding->charbuf_used = 0;
7047           coding->chars_at_source = 0;
7048
7049           while (nbytes-- > 0)
7050             {
7051               int c = *src++;
7052
7053               if (c & 0x80)
7054                 c = BYTE8_TO_CHAR (c);
7055               coding->charbuf[coding->charbuf_used++] = c;
7056             }
                /* No translation table is applied to these raw bytes.  */
7057           produce_chars (coding, Qnil, 1);
7058         }
7059       else
7060         {
7061           /* Record unprocessed bytes in coding->carryover.  We are
7062              sure that the number of data is less than the size of
7063              coding->carryover.  */
7064           unsigned char *p = coding->carryover;
7065
7066           if (nbytes > sizeof coding->carryover)
7067             nbytes = sizeof coding->carryover;
7068           coding->carryover_bytes = nbytes;
7069           while (nbytes-- > 0)
7070             *p++ = *src++;
7071         }
7072       coding->consumed = coding->src_bytes;
7073     }
7074
7075   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7076       && !inhibit_eol_conversion)
7077     decode_eol (coding);
        /* Restore the undo list saved above and record the whole
           insertion as a single change.  */
7078   if (BUFFERP (coding->dst_object))
7079     {
7080       bset_undo_list (current_buffer, undo_list);
7081       record_insert (coding->dst_pos, coding->produced_char);
7082     }
7083
7084   SAFE_FREE ();
7085 }
7086
7087
7088 /* Extract an annotation datum from a composition starting at POS and
7089    ending before LIMIT of CODING->src_object (buffer or string), store
7090    the data in BUF, set *STOP to a starting position of the next
7091    composition (if any) or to LIMIT, and return the address of the
7092    next element of BUF.
7093
7094    If such an annotation is not found, set *STOP to a starting
7095    position of a composition after POS (if any) or to LIMIT, and
7096    return BUF.  */
7097
7098 static int *
7099 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7100                                struct coding_system *coding, int *buf,
7101                                ptrdiff_t *stop)
7102 {
7103   ptrdiff_t start, end;
7104   Lisp_Object prop;
7105
       /* Nothing to annotate when find_composition fails, or when the
          composition it reports ends after LIMIT.  */
7106   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7107       || end > limit)
7108     *stop = limit;
       /* The composition reported begins after POS; have the caller come
          back here once it reaches START.  */
7109   else if (start > pos)
7110     *stop = start;
7111   else
7112     {
           /* Here START <= POS.  Annotation data is produced only when the
              composition begins exactly at POS.  */
7113       if (start == pos)
7114         {
7115           /* We found a composition.  Store the corresponding
7116              annotation data in BUF.  */
7117           int *head = buf;
7118           enum composition_method method = COMPOSITION_METHOD (prop);
7119           int nchars = COMPOSITION_LENGTH (prop);
7120
7121           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
               /* For every method other than COMPOSITION_RELATIVE the
                  component codes are appended after the header, one int
                  per component.  */
7122           if (method != COMPOSITION_RELATIVE)
7123             {
7124               Lisp_Object components;
7125               ptrdiff_t i, len, i_byte;
7126
7127               components = COMPOSITION_COMPONENTS (prop);
                   /* COMPONENTS may be a vector, a string, a single
                      integer, or a list; any other type aborts.  LEN ends
                      up as the number of ints appended to BUF.  */
7128               if (VECTORP (components))
7129                 {
7130                   len = ASIZE (components);
7131                   for (i = 0; i < len; i++)
7132                     *buf++ = XINT (AREF (components, i));
7133                 }
7134               else if (STRINGP (components))
7135                 {
7136                   len = SCHARS (components);
7137                   i = i_byte = 0;
7138                   while (i < len)
7139                     {
7140                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7141                       buf++;
7142                     }
7143                 }
7144               else if (INTEGERP (components))
7145                 {
7146                   len = 1;
7147                   *buf++ = XINT (components);
7148                 }
7149               else if (CONSP (components))
7150                 {
7151                   for (len = 0; CONSP (components);
7152                        len++, components = XCDR (components))
7153                     *buf++ = XINT (XCAR (components));
7154                 }
7155               else
7156                 emacs_abort ();
                   /* NOTE(review): HEAD is the first element written by
                      ADD_COMPOSITION_DATA.  Subtracting LEN presumably
                      grows a negated annotation length stored there so that
                      it also covers the LEN component codes appended
                      above -- confirm against the macro definition.  */
7157               *head -= len;
7158             }
7159         }
7160
           /* Search again from END (the end of the composition just
              handled) to find where the next composition starts.  */
7161       if (find_composition (end, limit, &start, &end, &prop,
7162                             coding->src_object)
7163           && end <= limit)
7164         *stop = start;
7165       else
7166         *stop = limit;
7167     }
7168   return buf;
7169 }
7170
7171
7172 /* Extract an annotation datum from a text property `charset' at POS of
7173    CODING->src_object (buffer or string), store the data in BUF, set
7174    *STOP to the position where the value of `charset' property changes
7175    (limiting by LIMIT), and return the address of the next element of
7176    BUF.
7177
7178    If the property value is nil, set *STOP to the position where the
7179    property value is non-nil (limiting by LIMIT), and return BUF.  */
7180
7181 static int *
7182 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7183                            struct coding_system *coding, int *buf,
7184                            ptrdiff_t *stop)
7185 {
7186   Lisp_Object val, next;
7187   int id;
7188
7189   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
       /* ID is the charset's numeric id, or -1 when the property is nil or
          is not a charset.  */
7190   if (! NILP (val) && CHARSETP (val))
7191     id = XINT (CHARSET_SYMBOL_ID (val));
7192   else
7193     id = -1;
       /* NOTE(review): ADD_CHARSET_DATA is invoked unconditionally, i.e.
          also with ID == -1.  Whether BUF is left unchanged in that case,
          as the header comment says for a nil property, depends on the
          macro -- confirm.  */
7194   ADD_CHARSET_DATA (buf, 0, id);
       /* The next stop is wherever the `charset' property next changes,
          bounded by LIMIT.  */
7195   next = Fnext_single_property_change (make_number (pos), Qcharset,
7196                                        coding->src_object,
7197                                        make_number (limit));
7198   *stop = XINT (next);
7199   return buf;
7200 }
7201
7202
     /* Read characters from the source text of CODING, starting just after
        the part already counted in CODING->consumed, and store them as
        character codes in CODING->charbuf for the encoder to process.

        While reading, convert end-of-line characters according to the
        coding system's eol-type (unless inhibit_eol_conversion is set),
        apply TRANSLATION_TABLE if it is non-nil, and, when the source is a
        Lisp object, insert composition/charset annotation data at the
        positions where they apply.  MAX_LOOKUP is the number of ints
        allocated for the look-ahead buffer used by multi-character
        translations.

        The loop stops when the source is exhausted or when the work
        buffer is nearly full.  On return CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used are updated and
        CODING->chars_at_source is cleared.  */
7203 static void
7204 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7205                int max_lookup)
7206 {
7207   int *buf = coding->charbuf;
7208   int *buf_end = coding->charbuf + coding->charbuf_size;
7209   const unsigned char *src = coding->source + coding->consumed;
7210   const unsigned char *src_end = coding->source + coding->src_bytes;
7211   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7212   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7213   bool multibytep = coding->src_multibyte;
7214   Lisp_Object eol_type;
7215   int c;
7216   ptrdiff_t stop, stop_composition, stop_charset;
7217   int *lookup_buf = NULL;
7218
       /* The look-ahead buffer is needed only when translating.  */
7219   if (! NILP (translation_table))
7220     lookup_buf = alloca (sizeof (int) * max_lookup);
7221
       /* A vector eol-type is handled like Qunix, i.e. no EOL conversion
          (presumably the vector form means "not yet decided" -- confirm).  */
7222   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7223   if (VECTORP (eol_type))
7224     eol_type = Qunix;
7225
7226   /* Note: composition handling is not yet implemented.  */
       /* Clearing the flag here means the composition test a few lines
          below always takes its `else' arm.  */
7227   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7228
       /* STOP is the next position at which annotations must be examined.
          Without a source object there is nothing to annotate, so all
          stops are placed at the end of the text.  */
7229   if (NILP (coding->src_object))
7230     stop = stop_composition = stop_charset = end_pos;
7231   else
7232     {
7233       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7234         stop = stop_composition = pos;
7235       else
7236         stop = stop_composition = end_pos;
7237       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7238         stop = stop_charset = pos;
7239       else
7240         stop_charset = end_pos;
7241     }
7242
7243   /* Compensate for CRLF and conversion.  */
       /* Keeping this margin lets one iteration store an extra '\r' plus
          one annotation without a bounds check.  */
7244   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7245   while (buf < buf_end)
7246     {
7247       Lisp_Object trans;
7248
7249       if (pos == stop)
7250         {
7251           if (pos == end_pos)
7252             break;
7253           if (pos == stop_composition)
7254             buf = handle_composition_annotation (pos, end_pos, coding,
7255                                                  buf, &stop_composition);
7256           if (pos == stop_charset)
7257             buf = handle_charset_annotation (pos, end_pos, coding,
7258                                              buf, &stop_charset);
               /* Next stop is the nearer of the two annotation stops.  */
7259           stop = (stop_composition < stop_charset
7260                   ? stop_composition : stop_charset);
7261         }
7262
           /* Fetch the next character C and advance SRC and POS.  */
7263       if (! multibytep)
7264         {
7265           int bytes;
7266
               /* Unibyte source.  The raw-text and CCL encoders take each
                  byte as is.  Otherwise a valid multibyte sequence is read
                  as one character, with POS advanced by its byte length,
                  and any other byte is mapped through BYTE8_TO_CHAR.  */
7267           if (coding->encoder == encode_coding_raw_text
7268               || coding->encoder == encode_coding_ccl)
7269             c = *src++, pos++;
7270           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7271             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7272           else
7273             c = BYTE8_TO_CHAR (*src), src++, pos++;
7274         }
7275       else
7276         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
           /* In selective-display mode a '\r' in the source is treated as a
              newline, and is therefore subject to the EOL conversion
              below.  */
7277       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7278         c = '\n';
7279       if (! EQ (eol_type, Qunix))
7280         {
7281           if (c == '\n')
7282             {
                   /* Qdos: store '\r' now; the '\n' itself is stored
                      below.  Any other eol-type: replace the newline by
                      '\r'.  */
7283               if (EQ (eol_type, Qdos))
7284                 *buf++ = '\r';
7285               else
7286                 c = '\r';
7287             }
7288         }
7289
7290       trans = Qnil;
7291       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7292       if (NILP (trans))
7293         *buf++ = c;
7294       else
7295         {
7296           ptrdiff_t from_nchars = 1, to_nchars = 1;
7297           int *lookup_buf_end;
7298           const unsigned char *p = src;
7299           int i;
7300
               /* Collect C plus up to MAX_LOOKUP - 1 following characters.
                  The look-ahead goes through the local pointer P, so SRC is
                  not advanced here.  */
7301           lookup_buf[0] = c;
7302           for (i = 1; i < max_lookup && p < src_end; i++)
7303             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7304           lookup_buf_end = lookup_buf + i;
7305           trans = get_translation (trans, lookup_buf, lookup_buf_end);
               /* The result is either a single character, or a cons whose
                  car is the vector of source characters matched and whose
                  cdr is a character or a vector of replacement characters.
                  NOTE(review): on either `break' below, SRC and POS have
                  already been advanced past C although C has not been
                  stored in BUF -- confirm that this is intended.  */
7306           if (INTEGERP (trans))
7307             c = XINT (trans);
7308           else if (CONSP (trans))
7309             {
7310               from_nchars = ASIZE (XCAR (trans));
7311               trans = XCDR (trans);
7312               if (INTEGERP (trans))
7313                 c = XINT (trans);
7314               else
7315                 {
7316                   to_nchars = ASIZE (trans);
7317                   if (buf_end - buf < to_nchars)
7318                     break;
7319                   c = XINT (AREF (trans, 0));
7320                 }
7321             }
7322           else
7323             break;
7324           *buf++ = c;
7325           for (i = 1; i < to_nchars; i++)
7326             *buf++ = XINT (AREF (trans, i));
               /* Skip the remaining FROM_NCHARS - 1 source characters that
                  took part in the translation.  */
7327           for (i = 1; i < from_nchars; i++, pos++)
7328             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7329         }
7330     }
7331
       /* Publish the progress made in this call.  */
7332   coding->consumed = src - coding->source;
7333   coding->consumed_char = pos - coding->src_pos;
7334   coding->charbuf_used = buf - coding->charbuf;
7335   coding->chars_at_source = 0;
7336 }
7337
7338
7339 /* Encode the text at CODING->src_object into CODING->dst_object.
7340    CODING->src_object is a buffer or a string.
7341    CODING->dst_object is a buffer or nil.
7342
7343    If CODING->src_object is a buffer, it must be the current buffer.
7344    In this case, if CODING->src_pos is positive, it is a position of
7345    the source text in the buffer, otherwise, the source text is in the
7346    gap area of the buffer, and coding->src_pos specifies the offset of
7347    the text from GPT (which must be the same as PT).  If this is the
7348    same buffer as CODING->dst_object, CODING->src_pos must be
7349    negative and CODING should not have `pre-write-conversion'.
7350
7351    If CODING->src_object is a string, CODING should not have
7352    `pre-write-conversion'.
7353
7354    If CODING->dst_object is a buffer, the encoded data is inserted at
7355    the current point of that buffer.
7356
7357    If CODING->dst_object is nil, the encoded data is placed at the
7358    memory area specified by CODING->destination.  */
7359
7360 static void
7361 encode_coding (struct coding_system *coding)
7362 {
7363   Lisp_Object attrs;
7364   Lisp_Object translation_table;
7365   int max_lookup;
7366   struct ccl_spec cclspec;
7367
7368   USE_SAFE_ALLOCA;
7369
7370   attrs = CODING_ID_ATTRS (coding->id);
       /* The raw-text encoder uses no translation table.  */
7371   if (coding->encoder == encode_coding_raw_text)
7372     translation_table = Qnil, max_lookup = 0;
7373   else
7374     translation_table = get_translation_table (attrs, 1, &max_lookup);
7375
       /* When encoding into a buffer, make it current and take the
          destination multibyteness from it.  */
7376   if (BUFFERP (coding->dst_object))
7377     {
7378       set_buffer_internal (XBUFFER (coding->dst_object));
7379       coding->dst_multibyte
7380         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7381     }
7382
       /* Reset the progress counters and the result status.  */
7383   coding->consumed = coding->consumed_char = 0;
7384   coding->produced = coding->produced_char = 0;
7385   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7386   coding->errors = 0;
7387
7388   ALLOC_CONVERSION_WORK_AREA (coding);
7389
       /* The CCL encoder keeps its state in CCLSPEC, which lives on this
          function's stack for the duration of the call.  */
7390   if (coding->encoder == encode_coding_ccl)
7391     {
7392       coding->spec.ccl = &cclspec;
7393       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7394     }
       /* Alternate between filling the work buffer from the source and
          running the encoder on it until every source character has been
          consumed.  The source and destination are set anew on each
          round.  */
7395   do {
7396     coding_set_source (coding);
7397     consume_chars (coding, translation_table, max_lookup);
7398     coding_set_destination (coding);
7399     (*(coding->encoder)) (coding);
7400   } while (coding->consumed_char < coding->src_chars);
7401
       /* For a buffer destination, make the produced text part of the
          buffer via insert_from_gap.  */
7402   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7403     insert_from_gap (coding->produced_char, coding->produced);
7404
7405   SAFE_FREE ();
7406 }
7407
7408
7409 /* Name (or base name) of work buffer for code conversion.  */
7410 static Lisp_Object Vcode_conversion_workbuf_name;
7411
7412 /* A working buffer used by the top level conversion.  Once it is
7413    created, it is never destroyed.  It has the name
7414    Vcode_conversion_workbuf_name.  The other working buffers are
7415    destroyed after the use is finished, and their names are modified
7416    versions of Vcode_conversion_workbuf_name.  */
7417 static Lisp_Object Vcode_conversion_reused_workbuf;
7418
7419 /* True iff Vcode_conversion_reused_workbuf is already in use.  Set by
        make_conversion_work_buffer when it hands out the reusable buffer,
        and cleared by code_conversion_restore.  */
7420 static bool reused_workbuf_in_use;
7421
7422
7423 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7424    multibyteness of returning buffer.  */
7425
7426 static Lisp_Object
7427 make_conversion_work_buffer (bool multibyte)
7428 {
7429   Lisp_Object name, workbuf;
7430   struct buffer *current;
7431
       /* If the reusable buffer is already taken, create a fresh buffer
          under a newly generated name; otherwise claim the reusable one,
          creating it again if it is no longer live.  */
7432   if (reused_workbuf_in_use)
7433     {
7434       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7435       workbuf = Fget_buffer_create (name);
7436     }
7437   else
7438     {
7439       reused_workbuf_in_use = 1;
7440       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7441         Vcode_conversion_reused_workbuf
7442           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7443       workbuf = Vcode_conversion_reused_workbuf;
7444     }
       /* Prepare the work buffer while it is temporarily current, then
          switch back to the caller's buffer.  */
7445   current = current_buffer;
7446   set_buffer_internal (XBUFFER (workbuf));
7447   /* We can't allow modification hooks to run in the work buffer.  For
7448      instance, directory_files_internal assumes that file decoding
7449      doesn't compile new regexps.  */
7450   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7451   Ferase_buffer ();
       /* Set the undo list to t and apply the requested multibyteness.  */
7452   bset_undo_list (current_buffer, Qt);
7453   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7454   set_buffer_internal (current);
7455   return workbuf;
7456 }
7457
7458
     /* Unwind-protect handler registered by code_conversion_save.  ARG is
        a cons (BUFFER . WORKBUF): BUFFER is the buffer that was current
        when code_conversion_save was called and WORKBUF is the work buffer
        it created, or nil.  Release WORKBUF (mark the reusable buffer as
        free, or kill any other work buffer) and make BUFFER current
        again.  Return nil.  */
7459 static Lisp_Object
7460 code_conversion_restore (Lisp_Object arg)
7461 {
7462   Lisp_Object current, workbuf;
7463   struct gcpro gcpro1;
7464
7465   GCPRO1 (arg);
7466   current = XCAR (arg);
7467   workbuf = XCDR (arg);
7468   if (! NILP (workbuf))
7469     {
           /* The reusable buffer is never killed, only marked free.  */
7470       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7471         reused_workbuf_in_use = 0;
7472       else
7473         Fkill_buffer (workbuf);
7474     }
7475   set_buffer_internal (XBUFFER (current));
7476   UNGCPRO;
7477   return Qnil;
7478 }
7479
     /* Register code_conversion_restore as an unwind-protect handler so
        that the current buffer is restored when the caller unbinds.  If
        WITH_WORK_BUF is true, also create a work buffer whose
        multibyteness is given by MULTIBYTE and hand it to the handler for
        release.  Return the work buffer, or nil if none was created.  */
7480 Lisp_Object
7481 code_conversion_save (bool with_work_buf, bool multibyte)
7482 {
7483   Lisp_Object workbuf = Qnil;
7484
7485   if (with_work_buf)
7486     workbuf = make_conversion_work_buffer (multibyte);
7487   record_unwind_protect (code_conversion_restore,
7488                          Fcons (Fcurrent_buffer (), workbuf));
7489   return workbuf;
7490 }
7491
     /* Decode, by coding context CODING, CHARS characters (BYTES bytes) of
        source text that is held in the gap of the current buffer.  The
        current buffer is both the source and the destination, and PT is
        the destination position.

        When the coding system is ASCII compatible, needs no post-read
        conversion, translation table or EOL conversion, and every source
        byte is ASCII, the data in the gap is used as is without running
        the decoder.

        If the coding system has a post-read-conversion function, it is
        called on the decoded text, and CODING->produced and
        CODING->produced_char are adjusted by the resulting change of the
        buffer size.  */
7492 void
7493 decode_coding_gap (struct coding_system *coding,
7494                    ptrdiff_t chars, ptrdiff_t bytes)
7495 {
7496   ptrdiff_t count = SPECPDL_INDEX ();
7497   Lisp_Object attrs;
7498
       /* Negative source positions: see the comment on encode_coding for
          this convention of designating text in the gap.  */
7499   coding->src_object = Fcurrent_buffer ();
7500   coding->src_chars = chars;
7501   coding->src_bytes = bytes;
7502   coding->src_pos = -chars;
7503   coding->src_pos_byte = -bytes;
       /* More bytes than characters means the source is multibyte.  */
7504   coding->src_multibyte = chars < bytes;
7505   coding->dst_object = coding->src_object;
7506   coding->dst_pos = PT;
7507   coding->dst_pos_byte = PT_BYTE;
7508   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7509
7510   if (CODING_REQUIRE_DETECTION (coding))
7511     detect_coding (coding);
7512   attrs = CODING_ID_ATTRS (coding->id);
7513 #ifndef CODING_DISABLE_ASCII_OPTIMIZATION
7514   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7515       && NILP (CODING_ATTR_POST_READ (attrs))
7516       && NILP (get_translation_table (attrs, 0, NULL))
7517       && (inhibit_eol_conversion
7518           || EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)))
7519     {
7520       /* We can skip the conversion if all source bytes are ASCII.  */
7521       if (coding->head_ascii < 0)
7522         {
7523           /* We have not yet counted the number of ASCII bytes at the
7524              head of the source.  Do it now.  */
7525           const unsigned char *src, *src_end;
7526
7527           coding_set_source (coding);
7528           src_end = coding->source + coding->src_bytes;
               /* Stop at the first byte that has its high bit set.  */
7529           for (src = coding->source; src < src_end; src++)
7530             {
7531               if (*src & 0x80)
7532                 break;
7533             }
7534           coding->head_ascii = src - coding->source;
7535         }
7536       if (coding->src_bytes == coding->head_ascii)
7537         {
7538           /* No need of conversion.  Use the data in the gap as is.  */
7539           coding->produced_char = chars;
7540           coding->produced = bytes;
7541           adjust_after_replace (PT, PT_BYTE, Qnil, chars, bytes, 1);
               /* Nothing has been pushed on the specpdl since COUNT was
                  taken, so returning without unbind_to is fine here.  */
7542           return;
7543         }
7544     }
7545 #endif  /* not CODING_DISABLE_ASCII_OPTIMIZATION */
       /* No work buffer is needed; this only arranges for the current
          buffer to be restored by unbind_to below.  */
7546   code_conversion_save (0, 0);
7547
       /* The whole source is given at once, so it is also the last block.
          inhibit_shrinking is set only for the duration of the decoding.  */
7548   coding->mode |= CODING_MODE_LAST_BLOCK;
7549   current_buffer->text->inhibit_shrinking = 1;
7550   decode_coding (coding);
7551   current_buffer->text->inhibit_shrinking = 0;
7552
7553   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7554     {
7555       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7556       Lisp_Object val;
7557
           /* Call the post-read-conversion function with point at the
              start of the decoded text and the number of decoded
              characters as argument.  Its return value is only checked to
              be a natural number; the produced counts are corrected from
              the change in buffer size.  */
7558       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7559       val = call1 (CODING_ATTR_POST_READ (attrs),
7560                    make_number (coding->produced_char));
7561       CHECK_NATNUM (val);
7562       coding->produced_char += Z - prev_Z;
7563       coding->produced += Z_BYTE - prev_Z_BYTE;
7564     }
7565
7566   unbind_to (count, Qnil);
7567 }
7568
7569
7570 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7571    SRC_OBJECT into DST_OBJECT by coding context CODING.
7572
7573    SRC_OBJECT is a buffer, a string, or Qnil.
7574
7575    If it is a buffer, the text is at point of the buffer.  FROM and TO
7576    are positions in the buffer.
7577
7578    If it is a string, the text is at the beginning of the string.
7579    FROM and TO are indices to the string.
7580
7581    If it is nil, the text is at coding->source.  FROM and TO are
7582    indices to coding->source.
7583
7584    DST_OBJECT is a buffer, Qt, or Qnil.
7585
7586    If it is a buffer, the decoded text is inserted at point of the
7587    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7588    is deleted.
7589
7590    If it is Qt, a string is made from the decoded text, and
7591    set in CODING->dst_object.
7592
7593    If it is Qnil, the decoded text is stored at CODING->destination.
7594    The caller must allocate CODING->dst_bytes bytes at
7595    CODING->destination by xmalloc.  If the decoded text is longer than
7596    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7597  */
7598
7599 void
7600 decode_coding_object (struct coding_system *coding,
7601                       Lisp_Object src_object,
7602                       ptrdiff_t from, ptrdiff_t from_byte,
7603                       ptrdiff_t to, ptrdiff_t to_byte,
7604                       Lisp_Object dst_object)
7605 {
7606   ptrdiff_t count = SPECPDL_INDEX ();
7607   unsigned char *destination IF_LINT (= NULL);
7608   ptrdiff_t dst_bytes IF_LINT (= 0);
7609   ptrdiff_t chars = to - from;
7610   ptrdiff_t bytes = to_byte - from_byte;
7611   Lisp_Object attrs;
7612   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7613   bool need_marker_adjustment = 0;
7614   Lisp_Object old_deactivate_mark;
7615
       /* Vdeactivate_mark is saved here and restored at the end of this
          function.  */
7616   old_deactivate_mark = Vdeactivate_mark;
7617
       /* Remember the caller-supplied memory area; DESTINATION and
          DST_BYTES are used again after decoding only in this case.  */
7618   if (NILP (dst_object))
7619     {
7620       destination = coding->destination;
7621       dst_bytes = coding->dst_bytes;
7622     }
7623
7624   coding->src_object = src_object;
7625   coding->src_chars = chars;
7626   coding->src_bytes = bytes;
7627   coding->src_multibyte = chars < bytes;
7628
7629   if (STRINGP (src_object))
7630     {
7631       coding->src_pos = from;
7632       coding->src_pos_byte = from_byte;
7633     }
7634   else if (BUFFERP (src_object))
7635     {
7636       set_buffer_internal (XBUFFER (src_object));
7637       if (from != GPT)
7638         move_gap_both (from, from_byte);
7639       if (EQ (src_object, dst_object))
7640         {
7641           struct Lisp_Marker *tail;
7642
               /* Flag the markers sitting exactly at the boundary of the
                  region (FROM for markers with insertion_type set, TO for
                  the others); their positions are fixed up once the
                  decoded text is in place.  */
7643           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7644             {
7645               tail->need_adjustment
7646                 = tail->charpos == (tail->insertion_type ? from : to);
7647               need_marker_adjustment |= tail->need_adjustment;
7648             }
               /* Decoding in place: save point, delete the source text
                  while shrinking is inhibited, and designate the source by
                  negative positions (see the comment on encode_coding for
                  this convention of text in the gap).  */
7649           saved_pt = PT, saved_pt_byte = PT_BYTE;
7650           TEMP_SET_PT_BOTH (from, from_byte);
7651           current_buffer->text->inhibit_shrinking = 1;
7652           del_range_both (from, from_byte, to, to_byte, 1);
7653           coding->src_pos = -chars;
7654           coding->src_pos_byte = -bytes;
7655         }
7656       else
7657         {
7658           coding->src_pos = from;
7659           coding->src_pos_byte = from_byte;
7660         }
7661     }
7662
7663   if (CODING_REQUIRE_DETECTION (coding))
7664     detect_coding (coding);
7665   attrs = CODING_ID_ATTRS (coding->id);
7666
       /* Decode into a work buffer when a string is wanted, or when the
          result goes to memory but a post-read-conversion function must
          first be run on it.  */
7667   if (EQ (dst_object, Qt)
7668       || (! NILP (CODING_ATTR_POST_READ (attrs))
7669           && NILP (dst_object)))
7670     {
7671       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7672       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7673       coding->dst_pos = BEG;
7674       coding->dst_pos_byte = BEG_BYTE;
7675     }
7676   else if (BUFFERP (dst_object))
7677     {
7678       code_conversion_save (0, 0);
7679       coding->dst_object = dst_object;
7680       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7681       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7682       coding->dst_multibyte
7683         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7684     }
7685   else
7686     {
7687       code_conversion_save (0, 0);
7688       coding->dst_object = Qnil;
7689       /* Most callers presume this will return a multibyte result, and they
7690          won't use `binary' or `raw-text' anyway, so let's not worry about
7691          CODING_FOR_UNIBYTE.  */
7692       coding->dst_multibyte = 1;
7693     }
7694
7695   decode_coding (coding);
7696
7697   if (BUFFERP (coding->dst_object))
7698     set_buffer_internal (XBUFFER (coding->dst_object));
7699
7700   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7701     {
7702       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7703       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7704       Lisp_Object val;
7705
           /* Call the post-read-conversion function with point at the
              start of the decoded text and the number of decoded
              characters as argument.  The produced counts are corrected
              from the change in buffer size, not from the return value.  */
7706       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7707       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7708               old_deactivate_mark);
7709       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7710                         make_number (coding->produced_char));
7711       UNGCPRO;
7712       CHECK_NATNUM (val);
7713       coding->produced_char += Z - prev_Z;
7714       coding->produced += Z_BYTE - prev_Z_BYTE;
7715     }
7716
7717   if (EQ (dst_object, Qt))
7718     {
7719       coding->dst_object = Fbuffer_string ();
7720     }
7721   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7722     {
           /* The text was decoded into a work buffer because of the
              post-read-conversion function; copy it out to memory.
              NOTE(review): the copy is made only when the caller's area is
              smaller than the result (DST_BYTES < CODING->produced);
              otherwise nothing is copied here -- confirm that this is
              intended.  */
7723       set_buffer_internal (XBUFFER (coding->dst_object));
7724       if (dst_bytes < coding->produced)
7725         {
7726           eassert (coding->produced > 0);
7727           destination = xrealloc (destination, coding->produced);
               /* Move the gap out of the text to be copied, so that it can
                  be copied with a single memcpy from BEGV_ADDR.  */
7728           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7729             move_gap_both (BEGV, BEGV_BYTE);
7730           memcpy (destination, BEGV_ADDR, coding->produced);
7731           coding->destination = destination;
7732         }
7733     }
7734
7735   if (saved_pt >= 0)
7736     {
7737       /* This is the case of:
7738          (BUFFERP (src_object) && EQ (src_object, dst_object))
7739          As we have moved PT while replacing the original buffer
7740          contents, we must recover it now.  */
7741       set_buffer_internal (XBUFFER (src_object));
7742       current_buffer->text->inhibit_shrinking = 0;
           /* Point before the region is unchanged; point inside the region
              goes to FROM; point after the region is shifted by the change
              in size.  In a unibyte buffer the byte difference is used for
              the character position as well.  */
7743       if (saved_pt < from)
7744         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7745       else if (saved_pt < from + chars)
7746         TEMP_SET_PT_BOTH (from, from_byte);
7747       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7748         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7749                           saved_pt_byte + (coding->produced - bytes));
7750       else
7751         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7752                           saved_pt_byte + (coding->produced - bytes));
7753
7754       if (need_marker_adjustment)
7755         {
7756           struct Lisp_Marker *tail;
7757
               /* Markers flagged before the deletion are put at the start
                  of the decoded text if their insertion_type is set, and
                  at its end otherwise.  */
7758           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7759             if (tail->need_adjustment)
7760               {
7761                 tail->need_adjustment = 0;
7762                 if (tail->insertion_type)
7763                   {
7764                     tail->bytepos = from_byte;
7765                     tail->charpos = from;
7766                   }
7767                 else
7768                   {
7769                     tail->bytepos = from_byte + coding->produced;
7770                     tail->charpos
7771                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7772                          ? tail->bytepos : from + coding->produced_char);
7773                   }
7774               }
7775         }
7776     }
7777
7778   Vdeactivate_mark = old_deactivate_mark;
       /* Unbinding runs the handler registered by code_conversion_save.  */
7779   unbind_to (count, coding->dst_object);
7780 }
7781
7782
     /* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
        SRC_OBJECT into DST_OBJECT by coding context CODING.

        SRC_OBJECT is a string, a buffer, or something else.  In the last
        case the source text is taken from CODING->source + FROM when the
        coding system has a pre-write-conversion function.

        If the coding system has a pre-write-conversion function, the
        source text is first copied into a work buffer, the function is
        called there with BEG and Z as arguments, and the contents of the
        buffer that is current afterwards are what gets encoded.

        DST_OBJECT is a buffer, Qt, or something else.

        If it is a buffer, the encoded text goes into that buffer: at FROM,
        replacing the source text, when it is the same as SRC_OBJECT, and
        at point of the buffer otherwise.

        If it is Qt, a unibyte string is made from the encoded text and set
        in CODING->dst_object.

        Otherwise CODING->dst_object is set to nil; see the comment on
        encode_coding for where the encoded data is placed then.  */
7783 void
7784 encode_coding_object (struct coding_system *coding,
7785                       Lisp_Object src_object,
7786                       ptrdiff_t from, ptrdiff_t from_byte,
7787                       ptrdiff_t to, ptrdiff_t to_byte,
7788                       Lisp_Object dst_object)
7789 {
7790   ptrdiff_t count = SPECPDL_INDEX ();
7791   ptrdiff_t chars = to - from;
7792   ptrdiff_t bytes = to_byte - from_byte;
7793   Lisp_Object attrs;
7794   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7795   bool need_marker_adjustment = 0;
7796   bool kill_src_buffer = 0;
7797   Lisp_Object old_deactivate_mark;
7798
       /* Vdeactivate_mark is saved here and restored at the end of this
          function.  */
7799   old_deactivate_mark = Vdeactivate_mark;
7800
7801   coding->src_object = src_object;
7802   coding->src_chars = chars;
7803   coding->src_bytes = bytes;
7804   coding->src_multibyte = chars < bytes;
7805
7806   attrs = CODING_ID_ATTRS (coding->id);
7807
7808   if (EQ (src_object, dst_object))
7809     {
7810       struct Lisp_Marker *tail;
7811
           /* Flag the markers sitting exactly at the boundary of the
              region (FROM for markers with insertion_type set, TO for the
              others); they are fixed up after encoding.
              NOTE(review): the markers scanned are those of
              current_buffer, before any buffer switch -- this presumably
              relies on SRC_OBJECT being the current buffer; confirm
              against callers.  */
7812       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7813         {
7814           tail->need_adjustment
7815             = tail->charpos == (tail->insertion_type ? from : to);
7816           need_marker_adjustment |= tail->need_adjustment;
7817         }
7818     }
7819
7820   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7821     {
           /* Copy the source text into a work buffer for the
              pre-write-conversion function to work on.  */
7822       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7823       set_buffer_internal (XBUFFER (coding->src_object));
7824       if (STRINGP (src_object))
7825         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7826       else if (BUFFERP (src_object))
7827         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7828       else
7829         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7830
           /* Encoding in place: the source text has been copied, so it can
              be deleted from the original buffer now.  */
7831       if (EQ (src_object, dst_object))
7832         {
7833           set_buffer_internal (XBUFFER (src_object));
7834           saved_pt = PT, saved_pt_byte = PT_BYTE;
7835           del_range_both (from, from_byte, to, to_byte, 1);
7836           set_buffer_internal (XBUFFER (coding->src_object));
7837         }
7838
7839       {
7840         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7841
7842         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7843                 old_deactivate_mark);
7844         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7845                     make_number (BEG), make_number (Z));
7846         UNGCPRO;
7847       }
           /* If the function left a different buffer current, that buffer
              becomes the source and is killed at the end of this
              function.  */
7848       if (XBUFFER (coding->src_object) != current_buffer)
7849         kill_src_buffer = 1;
7850       coding->src_object = Fcurrent_buffer ();
           /* The whole of the now-current buffer is the source; the gap is
              moved to BEG first.  */
7851       if (BEG != GPT)
7852         move_gap_both (BEG, BEG_BYTE);
7853       coding->src_chars = Z - BEG;
7854       coding->src_bytes = Z_BYTE - BEG_BYTE;
7855       coding->src_pos = BEG;
7856       coding->src_pos_byte = BEG_BYTE;
7857       coding->src_multibyte = Z < Z_BYTE;
7858     }
7859   else if (STRINGP (src_object))
7860     {
7861       code_conversion_save (0, 0);
7862       coding->src_pos = from;
7863       coding->src_pos_byte = from_byte;
7864     }
7865   else if (BUFFERP (src_object))
7866     {
7867       code_conversion_save (0, 0);
7868       set_buffer_internal (XBUFFER (src_object));
7869       if (EQ (src_object, dst_object))
7870         {
               /* Encoding in place: whatever del_range_1 returns for the
                  deleted region (presumably the deleted text as a string
                  -- confirm) becomes the source, read from position 0.  */
7871           saved_pt = PT, saved_pt_byte = PT_BYTE;
7872           coding->src_object = del_range_1 (from, to, 1, 1);
7873           coding->src_pos = 0;
7874           coding->src_pos_byte = 0;
7875         }
7876       else
7877         {
               /* Move the gap to FROM when it lies within the region, at
                  or before TO.  */
7878           if (from < GPT && to >= GPT)
7879             move_gap_both (from, from_byte);
7880           coding->src_pos = from;
7881           coding->src_pos_byte = from_byte;
7882         }
7883     }
7884   else
7885     code_conversion_save (0, 0);
7886
7887   if (BUFFERP (dst_object))
7888     {
7889       coding->dst_object = dst_object;
7890       if (EQ (src_object, dst_object))
7891         {
7892           coding->dst_pos = from;
7893           coding->dst_pos_byte = from_byte;
7894         }
7895       else
7896         {
7897           struct buffer *current = current_buffer;
7898
               /* Take the destination position from point of DST_OBJECT
                  and move its gap there.  */
7899           set_buffer_temp (XBUFFER (dst_object));
7900           coding->dst_pos = PT;
7901           coding->dst_pos_byte = PT_BYTE;
7902           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7903           set_buffer_temp (current);
7904         }
7905       coding->dst_multibyte
7906         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7907     }
7908   else if (EQ (dst_object, Qt))
7909     {
           /* Start with one byte per source character (at least one
              byte); the area is turned into a string and freed below.  */
7910       ptrdiff_t dst_bytes = max (1, coding->src_chars);
7911       coding->dst_object = Qnil;
7912       coding->destination = xmalloc (dst_bytes);
7913       coding->dst_bytes = dst_bytes;
7914       coding->dst_multibyte = 0;
7915     }
7916   else
7917     {
7918       coding->dst_object = Qnil;
7919       coding->dst_multibyte = 0;
7920     }
7921
7922   encode_coding (coding);
7923
7924   if (EQ (dst_object, Qt))
7925     {
7926       if (BUFFERP (coding->dst_object))
7927         coding->dst_object = Fbuffer_string ();
7928       else
7929         {
7930           coding->dst_object
7931             = make_unibyte_string ((char *) coding->destination,
7932                                    coding->produced);
7933           xfree (coding->destination);
7934         }
7935     }
7936
7937   if (saved_pt >= 0)
7938     {
7939       /* This is the case of:
7940          (BUFFERP (src_object) && EQ (src_object, dst_object))
7941          As we have moved PT while replacing the original buffer
7942          contents, we must recover it now.  */
7943       set_buffer_internal (XBUFFER (src_object));
           /* Point before the region is unchanged; point inside the region
              goes to FROM; point after the region is shifted by the change
              in size.  In a unibyte buffer the byte difference is used for
              the character position as well.  */
7944       if (saved_pt < from)
7945         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7946       else if (saved_pt < from + chars)
7947         TEMP_SET_PT_BOTH (from, from_byte);
7948       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7949         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7950                           saved_pt_byte + (coding->produced - bytes));
7951       else
7952         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7953                           saved_pt_byte + (coding->produced - bytes));
7954
7955       if (need_marker_adjustment)
7956         {
7957           struct Lisp_Marker *tail;
7958
               /* Markers flagged at the top of this function are put at
                  the start of the encoded text if their insertion_type is
                  set, and at its end otherwise.  */
7959           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7960             if (tail->need_adjustment)
7961               {
7962                 tail->need_adjustment = 0;
7963                 if (tail->insertion_type)
7964                   {
7965                     tail->bytepos = from_byte;
7966                     tail->charpos = from;
7967                   }
7968                 else
7969                   {
7970                     tail->bytepos = from_byte + coding->produced;
7971                     tail->charpos
7972                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7973                          ? tail->bytepos : from + coding->produced_char);
7974                   }
7975               }
7976         }
7977     }
7978
       /* Kill the buffer that the pre-write-conversion function left
          current, if it was not the work buffer.  */
7979   if (kill_src_buffer)
7980     Fkill_buffer (coding->src_object);
7981
7982   Vdeactivate_mark = old_deactivate_mark;
       /* Unbinding runs the handler registered by code_conversion_save.  */
7983   unbind_to (count, Qnil);
7984 }
7985
7986
     /* Return the name of the coding system recorded for the coding
        category that comes first in coding_priorities (presumably the
        category of highest priority -- confirm).  */
7987 Lisp_Object
7988 preferred_coding_system (void)
7989 {
7990   int id = coding_categories[coding_priorities[0]].id;
7991
7992   return CODING_ID_NAME (id);
7993 }
7994
7995 #if defined (WINDOWSNT) || defined (CYGWIN)
7996
     /* Convert the string STR with the utf-16le coding system by
        code_convert_string_norecord, passing 0 as its last argument
        (presumably "decode", the opposite of to_unicode, which passes 1
        -- confirm), and return the result.  Signal an error if STR is not
        a string.  A unibyte STR with an odd number of bytes has its last
        byte dropped first.  */
7997 Lisp_Object
7998 from_unicode (Lisp_Object str)
7999 {
8000   CHECK_STRING (str);
       /* An odd byte count cannot be a whole number of 2-byte units, so
          drop the final byte.  */
8001   if (!STRING_MULTIBYTE (str) &&
8002       SBYTES (str) & 1)
8003     {
8004       str = Fsubstring (str, make_number (0), make_number (-1));
8005     }
8006
8007   return code_convert_string_norecord (str, Qutf_16le, 0);
8008 }
8009
8010 Lisp_Object
8011 from_unicode_buffer (const wchar_t* wstr)
8012 {
8013     return from_unicode (
8014         make_unibyte_string (
8015             (char*) wstr,
8016             /* we get one of the two final 0 bytes for free. */
8017             1 + sizeof (wchar_t) * wcslen (wstr)));
8018 }
8019
8020 wchar_t *
8021 to_unicode (Lisp_Object str, Lisp_Object *buf)
8022 {
8023   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8024   /* We need to make another copy (in addition to the one made by
8025      code_convert_string_norecord) to ensure that the final string is
8026      _doubly_ zero terminated --- that is, that the string is
8027      terminated by two zero bytes and one utf-16le null character.
8028      Because strings are already terminated with a single zero byte,
8029      we just add one additional zero. */
8030   str = make_uninit_string (SBYTES (*buf) + 1);
8031   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8032   SDATA (str) [SBYTES (*buf)] = '\0';
8033   *buf = str;
8034   return WCSDATA (*buf);
8035 }
8036
8037 #endif /* WINDOWSNT || CYGWIN */
8038
8039 \f
8040 #ifdef emacs
/*** 11. Emacs Lisp library functions ***/
8042
8043 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8044        doc: /* Return t if OBJECT is nil or a coding-system.
8045 See the documentation of `define-coding-system' for information
8046 about coding-system objects.  */)
8047   (Lisp_Object object)
8048 {
8049   if (NILP (object)
8050       || CODING_SYSTEM_ID (object) >= 0)
8051     return Qt;
8052   if (! SYMBOLP (object)
8053       || NILP (Fget (object, Qcoding_system_define_form)))
8054     return Qnil;
8055   return Qt;
8056 }
8057
8058 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8059        Sread_non_nil_coding_system, 1, 1, 0,
8060        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8061   (Lisp_Object prompt)
8062 {
8063   Lisp_Object val;
8064   do
8065     {
8066       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8067                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8068     }
8069   while (SCHARS (val) == 0);
8070   return (Fintern (val, Qnil));
8071 }
8072
8073 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8074        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8075 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8076 Ignores case when completing coding systems (all Emacs coding systems
8077 are lower-case).  */)
8078   (Lisp_Object prompt, Lisp_Object default_coding_system)
8079 {
8080   Lisp_Object val;
8081   ptrdiff_t count = SPECPDL_INDEX ();
8082
8083   if (SYMBOLP (default_coding_system))
8084     default_coding_system = SYMBOL_NAME (default_coding_system);
8085   specbind (Qcompletion_ignore_case, Qt);
8086   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8087                           Qt, Qnil, Qcoding_system_history,
8088                           default_coding_system, Qnil);
8089   unbind_to (count, Qnil);
8090   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8091 }
8092
8093 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8094        1, 1, 0,
8095        doc: /* Check validity of CODING-SYSTEM.
8096 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8097 It is valid if it is nil or a symbol defined as a coding system by the
8098 function `define-coding-system'.  */)
8099   (Lisp_Object coding_system)
8100 {
8101   Lisp_Object define_form;
8102
8103   define_form = Fget (coding_system, Qcoding_system_define_form);
8104   if (! NILP (define_form))
8105     {
8106       Fput (coding_system, Qcoding_system_define_form, Qnil);
8107       safe_eval (define_form);
8108     }
8109   if (!NILP (Fcoding_system_p (coding_system)))
8110     return coding_system;
8111   xsignal1 (Qcoding_system_error, coding_system);
8112 }
8113
8114 \f
8115 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8116    HIGHEST, return the coding system of the highest
8117    priority among the detected coding systems.  Otherwise return a
8118    list of detected coding systems sorted by their priorities.  If
8119    MULTIBYTEP, it is assumed that the bytes are in correct
8120    multibyte form but contains only ASCII and eight-bit chars.
8121    Otherwise, the bytes are raw bytes.
8122
8123    CODING-SYSTEM controls the detection as below:
8124
8125    If it is nil, detect both text-format and eol-format.  If the
8126    text-format part of CODING-SYSTEM is already specified
8127    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8128    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8129    detect only text-format.  */
8130
8131 Lisp_Object
8132 detect_coding_system (const unsigned char *src,
8133                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8134                       bool highest, bool multibytep,
8135                       Lisp_Object coding_system)
8136 {
8137   const unsigned char *src_end = src + src_bytes;
8138   Lisp_Object attrs, eol_type;
8139   Lisp_Object val = Qnil;
8140   struct coding_system coding;
8141   ptrdiff_t id;
8142   struct coding_detection_info detect_info;
8143   enum coding_category base_category;
8144   bool null_byte_found = 0, eight_bit_found = 0;
8145
8146   if (NILP (coding_system))
8147     coding_system = Qundecided;
8148   setup_coding_system (coding_system, &coding);
8149   attrs = CODING_ID_ATTRS (coding.id);
8150   eol_type = CODING_ID_EOL_TYPE (coding.id);
8151   coding_system = CODING_ATTR_BASE_NAME (attrs);
8152
8153   coding.source = src;
8154   coding.src_chars = src_chars;
8155   coding.src_bytes = src_bytes;
8156   coding.src_multibyte = multibytep;
8157   coding.consumed = 0;
8158   coding.mode |= CODING_MODE_LAST_BLOCK;
8159   coding.head_ascii = 0;
8160
8161   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8162
8163   /* At first, detect text-format if necessary.  */
8164   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8165   if (base_category == coding_category_undecided)
8166     {
8167       enum coding_category category IF_LINT (= 0);
8168       struct coding_system *this IF_LINT (= NULL);
8169       int c, i;
8170
8171       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8172       for (; src < src_end; src++)
8173         {
8174           c = *src;
8175           if (c & 0x80)
8176             {
8177               eight_bit_found = 1;
8178               if (null_byte_found)
8179                 break;
8180             }
8181           else if (c < 0x20)
8182             {
8183               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8184                   && ! inhibit_iso_escape_detection
8185                   && ! detect_info.checked)
8186                 {
8187                   if (detect_coding_iso_2022 (&coding, &detect_info))
8188                     {
8189                       /* We have scanned the whole data.  */
8190                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8191                         {
8192                           /* We didn't find an 8-bit code.  We may
8193                              have found a null-byte, but it's very
8194                              rare that a binary file confirm to
8195                              ISO-2022.  */
8196                           src = src_end;
8197                           coding.head_ascii = src - coding.source;
8198                         }
8199                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8200                       break;
8201                     }
8202                 }
8203               else if (! c && !inhibit_null_byte_detection)
8204                 {
8205                   null_byte_found = 1;
8206                   if (eight_bit_found)
8207                     break;
8208                 }
8209               if (! eight_bit_found)
8210                 coding.head_ascii++;
8211             }
8212           else if (! eight_bit_found)
8213             coding.head_ascii++;
8214         }
8215
8216       if (null_byte_found || eight_bit_found
8217           || coding.head_ascii < coding.src_bytes
8218           || detect_info.found)
8219         {
8220           if (coding.head_ascii == coding.src_bytes)
8221             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8222             for (i = 0; i < coding_category_raw_text; i++)
8223               {
8224                 category = coding_priorities[i];
8225                 this = coding_categories + category;
8226                 if (detect_info.found & (1 << category))
8227                   break;
8228               }
8229           else
8230             {
8231               if (null_byte_found)
8232                 {
8233                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8234                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8235                 }
8236               for (i = 0; i < coding_category_raw_text; i++)
8237                 {
8238                   category = coding_priorities[i];
8239                   this = coding_categories + category;
8240
8241                   if (this->id < 0)
8242                     {
8243                       /* No coding system of this category is defined.  */
8244                       detect_info.rejected |= (1 << category);
8245                     }
8246                   else if (category >= coding_category_raw_text)
8247                     continue;
8248                   else if (detect_info.checked & (1 << category))
8249                     {
8250                       if (highest
8251                           && (detect_info.found & (1 << category)))
8252                         break;
8253                     }
8254                   else if ((*(this->detector)) (&coding, &detect_info)
8255                            && highest
8256                            && (detect_info.found & (1 << category)))
8257                     {
8258                       if (category == coding_category_utf_16_auto)
8259                         {
8260                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8261                             category = coding_category_utf_16_le;
8262                           else
8263                             category = coding_category_utf_16_be;
8264                         }
8265                       break;
8266                     }
8267                 }
8268             }
8269         }
8270
8271       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8272           || null_byte_found)
8273         {
8274           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8275           id = CODING_SYSTEM_ID (Qno_conversion);
8276           val = Fcons (make_number (id), Qnil);
8277         }
8278       else if (! detect_info.rejected && ! detect_info.found)
8279         {
8280           detect_info.found = CATEGORY_MASK_ANY;
8281           id = coding_categories[coding_category_undecided].id;
8282           val = Fcons (make_number (id), Qnil);
8283         }
8284       else if (highest)
8285         {
8286           if (detect_info.found)
8287             {
8288               detect_info.found = 1 << category;
8289               val = Fcons (make_number (this->id), Qnil);
8290             }
8291           else
8292             for (i = 0; i < coding_category_raw_text; i++)
8293               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8294                 {
8295                   detect_info.found = 1 << coding_priorities[i];
8296                   id = coding_categories[coding_priorities[i]].id;
8297                   val = Fcons (make_number (id), Qnil);
8298                   break;
8299                 }
8300         }
8301       else
8302         {
8303           int mask = detect_info.rejected | detect_info.found;
8304           int found = 0;
8305
8306           for (i = coding_category_raw_text - 1; i >= 0; i--)
8307             {
8308               category = coding_priorities[i];
8309               if (! (mask & (1 << category)))
8310                 {
8311                   found |= 1 << category;
8312                   id = coding_categories[category].id;
8313                   if (id >= 0)
8314                     val = Fcons (make_number (id), val);
8315                 }
8316             }
8317           for (i = coding_category_raw_text - 1; i >= 0; i--)
8318             {
8319               category = coding_priorities[i];
8320               if (detect_info.found & (1 << category))
8321                 {
8322                   id = coding_categories[category].id;
8323                   val = Fcons (make_number (id), val);
8324                 }
8325             }
8326           detect_info.found |= found;
8327         }
8328     }
8329   else if (base_category == coding_category_utf_8_auto)
8330     {
8331       if (detect_coding_utf_8 (&coding, &detect_info))
8332         {
8333           struct coding_system *this;
8334
8335           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8336             this = coding_categories + coding_category_utf_8_sig;
8337           else
8338             this = coding_categories + coding_category_utf_8_nosig;
8339           val = Fcons (make_number (this->id), Qnil);
8340         }
8341     }
8342   else if (base_category == coding_category_utf_16_auto)
8343     {
8344       if (detect_coding_utf_16 (&coding, &detect_info))
8345         {
8346           struct coding_system *this;
8347
8348           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8349             this = coding_categories + coding_category_utf_16_le;
8350           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8351             this = coding_categories + coding_category_utf_16_be;
8352           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8353             this = coding_categories + coding_category_utf_16_be_nosig;
8354           else
8355             this = coding_categories + coding_category_utf_16_le_nosig;
8356           val = Fcons (make_number (this->id), Qnil);
8357         }
8358     }
8359   else
8360     {
8361       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8362       val = Fcons (make_number (coding.id), Qnil);
8363     }
8364
8365   /* Then, detect eol-format if necessary.  */
8366   {
8367     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8368     Lisp_Object tail;
8369
8370     if (VECTORP (eol_type))
8371       {
8372         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8373           {
8374             if (null_byte_found)
8375               normal_eol = EOL_SEEN_LF;
8376             else
8377               normal_eol = detect_eol (coding.source, src_bytes,
8378                                        coding_category_raw_text);
8379           }
8380         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8381                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8382           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8383                                       coding_category_utf_16_be);
8384         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8385                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8386           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8387                                       coding_category_utf_16_le);
8388       }
8389     else
8390       {
8391         if (EQ (eol_type, Qunix))
8392           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8393         else if (EQ (eol_type, Qdos))
8394           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8395         else
8396           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8397       }
8398
8399     for (tail = val; CONSP (tail); tail = XCDR (tail))
8400       {
8401         enum coding_category category;
8402         int this_eol;
8403
8404         id = XINT (XCAR (tail));
8405         attrs = CODING_ID_ATTRS (id);
8406         category = XINT (CODING_ATTR_CATEGORY (attrs));
8407         eol_type = CODING_ID_EOL_TYPE (id);
8408         if (VECTORP (eol_type))
8409           {
8410             if (category == coding_category_utf_16_be
8411                 || category == coding_category_utf_16_be_nosig)
8412               this_eol = utf_16_be_eol;
8413             else if (category == coding_category_utf_16_le
8414                      || category == coding_category_utf_16_le_nosig)
8415               this_eol = utf_16_le_eol;
8416             else
8417               this_eol = normal_eol;
8418
8419             if (this_eol == EOL_SEEN_LF)
8420               XSETCAR (tail, AREF (eol_type, 0));
8421             else if (this_eol == EOL_SEEN_CRLF)
8422               XSETCAR (tail, AREF (eol_type, 1));
8423             else if (this_eol == EOL_SEEN_CR)
8424               XSETCAR (tail, AREF (eol_type, 2));
8425             else
8426               XSETCAR (tail, CODING_ID_NAME (id));
8427           }
8428         else
8429           XSETCAR (tail, CODING_ID_NAME (id));
8430       }
8431   }
8432
8433   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8434 }
8435
8436
8437 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8438        2, 3, 0,
8439        doc: /* Detect coding system of the text in the region between START and END.
8440 Return a list of possible coding systems ordered by priority.
8441 The coding systems to try and their priorities follows what
8442 the function `coding-system-priority-list' (which see) returns.
8443
8444 If only ASCII characters are found (except for such ISO-2022 control
8445 characters as ESC), it returns a list of single element `undecided'
8446 or its subsidiary coding system according to a detected end-of-line
8447 format.
8448
8449 If optional argument HIGHEST is non-nil, return the coding system of
8450 highest priority.  */)
8451   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8452 {
8453   ptrdiff_t from, to;
8454   ptrdiff_t from_byte, to_byte;
8455
8456   validate_region (&start, &end);
8457   from = XINT (start), to = XINT (end);
8458   from_byte = CHAR_TO_BYTE (from);
8459   to_byte = CHAR_TO_BYTE (to);
8460
8461   if (from < GPT && to >= GPT)
8462     move_gap_both (to, to_byte);
8463
8464   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8465                                to - from, to_byte - from_byte,
8466                                !NILP (highest),
8467                                !NILP (BVAR (current_buffer
8468                                       , enable_multibyte_characters)),
8469                                Qnil);
8470 }
8471
8472 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8473        1, 2, 0,
8474        doc: /* Detect coding system of the text in STRING.
8475 Return a list of possible coding systems ordered by priority.
8476 The coding systems to try and their priorities follows what
8477 the function `coding-system-priority-list' (which see) returns.
8478
8479 If only ASCII characters are found (except for such ISO-2022 control
8480 characters as ESC), it returns a list of single element `undecided'
8481 or its subsidiary coding system according to a detected end-of-line
8482 format.
8483
8484 If optional argument HIGHEST is non-nil, return the coding system of
8485 highest priority.  */)
8486   (Lisp_Object string, Lisp_Object highest)
8487 {
8488   CHECK_STRING (string);
8489
8490   return detect_coding_system (SDATA (string),
8491                                SCHARS (string), SBYTES (string),
8492                                !NILP (highest), STRING_MULTIBYTE (string),
8493                                Qnil);
8494 }
8495
8496
8497 static bool
8498 char_encodable_p (int c, Lisp_Object attrs)
8499 {
8500   Lisp_Object tail;
8501   struct charset *charset;
8502   Lisp_Object translation_table;
8503
8504   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8505   if (! NILP (translation_table))
8506     c = translate_char (translation_table, c);
8507   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8508        CONSP (tail); tail = XCDR (tail))
8509     {
8510       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8511       if (CHAR_CHARSET_P (c, charset))
8512         break;
8513     }
8514   return (! NILP (tail));
8515 }
8516
8517
8518 /* Return a list of coding systems that safely encode the text between
8519    START and END.  If EXCLUDE is non-nil, it is a list of coding
8520    systems not to check.  The returned list doesn't contain any such
8521    coding systems.  In any case, if the text contains only ASCII or is
8522    unibyte, return t.  */
8523
8524 DEFUN ("find-coding-systems-region-internal",
8525        Ffind_coding_systems_region_internal,
8526        Sfind_coding_systems_region_internal, 2, 3, 0,
8527        doc: /* Internal use only.  */)
8528   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8529 {
8530   Lisp_Object coding_attrs_list, safe_codings;
8531   ptrdiff_t start_byte, end_byte;
8532   const unsigned char *p, *pbeg, *pend;
8533   int c;
8534   Lisp_Object tail, elt, work_table;
8535
8536   if (STRINGP (start))
8537     {
8538       if (!STRING_MULTIBYTE (start)
8539           || SCHARS (start) == SBYTES (start))
8540         return Qt;
8541       start_byte = 0;
8542       end_byte = SBYTES (start);
8543     }
8544   else
8545     {
8546       CHECK_NUMBER_COERCE_MARKER (start);
8547       CHECK_NUMBER_COERCE_MARKER (end);
8548       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8549         args_out_of_range (start, end);
8550       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8551         return Qt;
8552       start_byte = CHAR_TO_BYTE (XINT (start));
8553       end_byte = CHAR_TO_BYTE (XINT (end));
8554       if (XINT (end) - XINT (start) == end_byte - start_byte)
8555         return Qt;
8556
8557       if (XINT (start) < GPT && XINT (end) > GPT)
8558         {
8559           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8560             move_gap_both (XINT (start), start_byte);
8561           else
8562             move_gap_both (XINT (end), end_byte);
8563         }
8564     }
8565
8566   coding_attrs_list = Qnil;
8567   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8568     if (NILP (exclude)
8569         || NILP (Fmemq (XCAR (tail), exclude)))
8570       {
8571         Lisp_Object attrs;
8572
8573         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8574         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8575             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8576           {
8577             ASET (attrs, coding_attr_trans_tbl,
8578                   get_translation_table (attrs, 1, NULL));
8579             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8580           }
8581       }
8582
8583   if (STRINGP (start))
8584     p = pbeg = SDATA (start);
8585   else
8586     p = pbeg = BYTE_POS_ADDR (start_byte);
8587   pend = p + (end_byte - start_byte);
8588
8589   while (p < pend && ASCII_BYTE_P (*p)) p++;
8590   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8591
8592   work_table = Fmake_char_table (Qnil, Qnil);
8593   while (p < pend)
8594     {
8595       if (ASCII_BYTE_P (*p))
8596         p++;
8597       else
8598         {
8599           c = STRING_CHAR_ADVANCE (p);
8600           if (!NILP (char_table_ref (work_table, c)))
8601             /* This character was already checked.  Ignore it.  */
8602             continue;
8603
8604           charset_map_loaded = 0;
8605           for (tail = coding_attrs_list; CONSP (tail);)
8606             {
8607               elt = XCAR (tail);
8608               if (NILP (elt))
8609                 tail = XCDR (tail);
8610               else if (char_encodable_p (c, elt))
8611                 tail = XCDR (tail);
8612               else if (CONSP (XCDR (tail)))
8613                 {
8614                   XSETCAR (tail, XCAR (XCDR (tail)));
8615                   XSETCDR (tail, XCDR (XCDR (tail)));
8616                 }
8617               else
8618                 {
8619                   XSETCAR (tail, Qnil);
8620                   tail = XCDR (tail);
8621                 }
8622             }
8623           if (charset_map_loaded)
8624             {
8625               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8626
8627               if (STRINGP (start))
8628                 pbeg = SDATA (start);
8629               else
8630                 pbeg = BYTE_POS_ADDR (start_byte);
8631               p = pbeg + p_offset;
8632               pend = pbeg + pend_offset;
8633             }
8634           char_table_set (work_table, c, Qt);
8635         }
8636     }
8637
8638   safe_codings = list2 (Qraw_text, Qno_conversion);
8639   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8640     if (! NILP (XCAR (tail)))
8641       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8642
8643   return safe_codings;
8644 }
8645
8646
8647 DEFUN ("unencodable-char-position", Funencodable_char_position,
8648        Sunencodable_char_position, 3, 5, 0,
8649        doc: /*
8650 Return position of first un-encodable character in a region.
8651 START and END specify the region and CODING-SYSTEM specifies the
8652 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8653
8654 If optional 4th argument COUNT is non-nil, it specifies at most how
8655 many un-encodable characters to search.  In this case, the value is a
8656 list of positions.
8657
8658 If optional 5th argument STRING is non-nil, it is a string to search
8659 for un-encodable characters.  In that case, START and END are indexes
8660 to the string.  */)
8661   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8662 {
8663   EMACS_INT n;
8664   struct coding_system coding;
8665   Lisp_Object attrs, charset_list, translation_table;
8666   Lisp_Object positions;
8667   ptrdiff_t from, to;
8668   const unsigned char *p, *stop, *pend;
8669   bool ascii_compatible;
8670
8671   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8672   attrs = CODING_ID_ATTRS (coding.id);
8673   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8674     return Qnil;
8675   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8676   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8677   translation_table = get_translation_table (attrs, 1, NULL);
8678
8679   if (NILP (string))
8680     {
8681       validate_region (&start, &end);
8682       from = XINT (start);
8683       to = XINT (end);
8684       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8685           || (ascii_compatible
8686               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8687         return Qnil;
8688       p = CHAR_POS_ADDR (from);
8689       pend = CHAR_POS_ADDR (to);
8690       if (from < GPT && to >= GPT)
8691         stop = GPT_ADDR;
8692       else
8693         stop = pend;
8694     }
8695   else
8696     {
8697       CHECK_STRING (string);
8698       CHECK_NATNUM (start);
8699       CHECK_NATNUM (end);
8700       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8701         args_out_of_range_3 (string, start, end);
8702       from = XINT (start);
8703       to = XINT (end);
8704       if (! STRING_MULTIBYTE (string))
8705         return Qnil;
8706       p = SDATA (string) + string_char_to_byte (string, from);
8707       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8708       if (ascii_compatible && (to - from) == (pend - p))
8709         return Qnil;
8710     }
8711
8712   if (NILP (count))
8713     n = 1;
8714   else
8715     {
8716       CHECK_NATNUM (count);
8717       n = XINT (count);
8718     }
8719
8720   positions = Qnil;
8721   charset_map_loaded = 0;
8722   while (1)
8723     {
8724       int c;
8725
8726       if (ascii_compatible)
8727         while (p < stop && ASCII_BYTE_P (*p))
8728           p++, from++;
8729       if (p >= stop)
8730         {
8731           if (p >= pend)
8732             break;
8733           stop = pend;
8734           p = GAP_END_ADDR;
8735         }
8736
8737       c = STRING_CHAR_ADVANCE (p);
8738       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8739           && ! char_charset (translate_char (translation_table, c),
8740                              charset_list, NULL))
8741         {
8742           positions = Fcons (make_number (from), positions);
8743           n--;
8744           if (n == 0)
8745             break;
8746         }
8747
8748       from++;
8749       if (charset_map_loaded && NILP (string))
8750         {
8751           p = CHAR_POS_ADDR (from);
8752           pend = CHAR_POS_ADDR (to);
8753           if (from < GPT && to >= GPT)
8754             stop = GPT_ADDR;
8755           else
8756             stop = pend;
8757           charset_map_loaded = 0;
8758         }
8759     }
8760
8761   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8762 }
8763
8764
8765 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8766        Scheck_coding_systems_region, 3, 3, 0,
8767        doc: /* Check if the region is encodable by coding systems.
8768
8769 START and END are buffer positions specifying the region.
8770 CODING-SYSTEM-LIST is a list of coding systems to check.
8771
8772 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8773 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8774 whole region, POS0, POS1, ... are buffer positions where non-encodable
8775 characters are found.
8776
8777 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8778 value is nil.
8779
8780 START may be a string.  In that case, check if the string is
8781 encodable, and the value contains indices to the string instead of
8782 buffer positions.  END is ignored.
8783
8784 If the current buffer (or START if it is a string) is unibyte, the value
8785 is nil.  */)
8786   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8787 {
8788   Lisp_Object list;
8789   ptrdiff_t start_byte, end_byte;
8790   ptrdiff_t pos;
8791   const unsigned char *p, *pbeg, *pend;
8792   int c;
8793   Lisp_Object tail, elt, attrs;
8794
8795   if (STRINGP (start))
8796     {
8797       if (!STRING_MULTIBYTE (start)
8798           || SCHARS (start) == SBYTES (start))
8799         return Qnil;
8800       start_byte = 0;
8801       end_byte = SBYTES (start);
8802       pos = 0;
8803     }
8804   else
8805     {
8806       CHECK_NUMBER_COERCE_MARKER (start);
8807       CHECK_NUMBER_COERCE_MARKER (end);
8808       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8809         args_out_of_range (start, end);
8810       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8811         return Qnil;
8812       start_byte = CHAR_TO_BYTE (XINT (start));
8813       end_byte = CHAR_TO_BYTE (XINT (end));
8814       if (XINT (end) - XINT (start) == end_byte - start_byte)
8815         return Qnil;
8816
8817       if (XINT (start) < GPT && XINT (end) > GPT)
8818         {
8819           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8820             move_gap_both (XINT (start), start_byte);
8821           else
8822             move_gap_both (XINT (end), end_byte);
8823         }
8824       pos = XINT (start);
8825     }
8826
8827   list = Qnil;
8828   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8829     {
8830       elt = XCAR (tail);
8831       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8832       ASET (attrs, coding_attr_trans_tbl,
8833             get_translation_table (attrs, 1, NULL));
8834       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8835     }
8836
8837   if (STRINGP (start))
8838     p = pbeg = SDATA (start);
8839   else
8840     p = pbeg = BYTE_POS_ADDR (start_byte);
8841   pend = p + (end_byte - start_byte);
8842
8843   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8844   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8845
8846   while (p < pend)
8847     {
8848       if (ASCII_BYTE_P (*p))
8849         p++;
8850       else
8851         {
8852           c = STRING_CHAR_ADVANCE (p);
8853
8854           charset_map_loaded = 0;
8855           for (tail = list; CONSP (tail); tail = XCDR (tail))
8856             {
8857               elt = XCDR (XCAR (tail));
8858               if (! char_encodable_p (c, XCAR (elt)))
8859                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8860             }
8861           if (charset_map_loaded)
8862             {
8863               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8864
8865               if (STRINGP (start))
8866                 pbeg = SDATA (start);
8867               else
8868                 pbeg = BYTE_POS_ADDR (start_byte);
8869               p = pbeg + p_offset;
8870               pend = pbeg + pend_offset;
8871             }
8872         }
8873       pos++;
8874     }
8875
8876   tail = list;
8877   list = Qnil;
8878   for (; CONSP (tail); tail = XCDR (tail))
8879     {
8880       elt = XCAR (tail);
8881       if (CONSP (XCDR (XCDR (elt))))
8882         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8883                       list);
8884     }
8885
8886   return list;
8887 }
8888
8889
8890 static Lisp_Object
8891 code_convert_region (Lisp_Object start, Lisp_Object end,
8892                      Lisp_Object coding_system, Lisp_Object dst_object,
8893                      bool encodep, bool norecord)
8894 {
8895   struct coding_system coding;
8896   ptrdiff_t from, from_byte, to, to_byte;
8897   Lisp_Object src_object;
8898
8899   if (NILP (coding_system))
8900     coding_system = Qno_conversion;
8901   else
8902     CHECK_CODING_SYSTEM (coding_system);
8903   src_object = Fcurrent_buffer ();
8904   if (NILP (dst_object))
8905     dst_object = src_object;
8906   else if (! EQ (dst_object, Qt))
8907     CHECK_BUFFER (dst_object);
8908
8909   validate_region (&start, &end);
8910   from = XFASTINT (start);
8911   from_byte = CHAR_TO_BYTE (from);
8912   to = XFASTINT (end);
8913   to_byte = CHAR_TO_BYTE (to);
8914
8915   setup_coding_system (coding_system, &coding);
8916   coding.mode |= CODING_MODE_LAST_BLOCK;
8917
8918   if (encodep)
8919     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8920                           dst_object);
8921   else
8922     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8923                           dst_object);
8924   if (! norecord)
8925     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8926
8927   return (BUFFERP (dst_object)
8928           ? make_number (coding.produced_char)
8929           : coding.dst_object);
8930 }
8931
8932
8933 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8934        3, 4, "r\nzCoding system: ",
8935        doc: /* Decode the current region from the specified coding system.
8936 When called from a program, takes four arguments:
8937         START, END, CODING-SYSTEM, and DESTINATION.
8938 START and END are buffer positions.
8939
8940 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8941 If nil, the region between START and END is replaced by the decoded text.
8942 If buffer, the decoded text is inserted in that buffer after point (point
8943 does not move).
8944 In those cases, the length of the decoded text is returned.
8945 If DESTINATION is t, the decoded text is returned.
8946
8947 This function sets `last-coding-system-used' to the precise coding system
8948 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8949 not fully specified.)  */)
8950   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8951 {
8952   return code_convert_region (start, end, coding_system, destination, 0, 0);
8953 }
8954
8955 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8956        3, 4, "r\nzCoding system: ",
8957        doc: /* Encode the current region by specified coding system.
8958 When called from a program, takes four arguments:
8959         START, END, CODING-SYSTEM and DESTINATION.
8960 START and END are buffer positions.
8961
8962 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8963 If nil, the region between START and END is replace by the encoded text.
8964 If buffer, the encoded text is inserted in that buffer after point (point
8965 does not move).
8966 In those cases, the length of the encoded text is returned.
8967 If DESTINATION is t, the encoded text is returned.
8968
8969 This function sets `last-coding-system-used' to the precise coding system
8970 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8971 not fully specified.)  */)
8972   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8973 {
8974   return code_convert_region (start, end, coding_system, destination, 1, 0);
8975 }
8976
8977 Lisp_Object
8978 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8979                      Lisp_Object dst_object, bool encodep, bool nocopy,
8980                      bool norecord)
8981 {
8982   struct coding_system coding;
8983   ptrdiff_t chars, bytes;
8984
8985   CHECK_STRING (string);
8986   if (NILP (coding_system))
8987     {
8988       if (! norecord)
8989         Vlast_coding_system_used = Qno_conversion;
8990       if (NILP (dst_object))
8991         return (nocopy ? Fcopy_sequence (string) : string);
8992     }
8993
8994   if (NILP (coding_system))
8995     coding_system = Qno_conversion;
8996   else
8997     CHECK_CODING_SYSTEM (coding_system);
8998   if (NILP (dst_object))
8999     dst_object = Qt;
9000   else if (! EQ (dst_object, Qt))
9001     CHECK_BUFFER (dst_object);
9002
9003   setup_coding_system (coding_system, &coding);
9004   coding.mode |= CODING_MODE_LAST_BLOCK;
9005   chars = SCHARS (string);
9006   bytes = SBYTES (string);
9007   if (encodep)
9008     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9009   else
9010     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9011   if (! norecord)
9012     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9013
9014   return (BUFFERP (dst_object)
9015           ? make_number (coding.produced_char)
9016           : coding.dst_object);
9017 }
9018
9019
9020 /* Encode or decode STRING according to CODING_SYSTEM.
9021    Do not set Vlast_coding_system_used.
9022
9023    This function is called only from macros DECODE_FILE and
9024    ENCODE_FILE, thus we ignore character composition.  */
9025
9026 Lisp_Object
9027 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9028                               bool encodep)
9029 {
9030   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9031 }
9032
9033
9034 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9035        2, 4, 0,
9036        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9037
9038 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9039 if the decoding operation is trivial.
9040
9041 Optional fourth arg BUFFER non-nil means that the decoded text is
9042 inserted in that buffer after point (point does not move).  In this
9043 case, the return value is the length of the decoded text.
9044
9045 This function sets `last-coding-system-used' to the precise coding system
9046 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9047 not fully specified.)  */)
9048   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9049 {
9050   return code_convert_string (string, coding_system, buffer,
9051                               0, ! NILP (nocopy), 0);
9052 }
9053
9054 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9055        2, 4, 0,
9056        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9057
9058 Optional third arg NOCOPY non-nil means it is OK to return STRING
9059 itself if the encoding operation is trivial.
9060
9061 Optional fourth arg BUFFER non-nil means that the encoded text is
9062 inserted in that buffer after point (point does not move).  In this
9063 case, the return value is the length of the encoded text.
9064
9065 This function sets `last-coding-system-used' to the precise coding system
9066 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9067 not fully specified.)  */)
9068   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9069 {
9070   return code_convert_string (string, coding_system, buffer,
9071                               1, ! NILP (nocopy), 0);
9072 }
9073
9074 \f
9075 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9076        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9077 Return the corresponding character.  */)
9078   (Lisp_Object code)
9079 {
9080   Lisp_Object spec, attrs, val;
9081   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9082   EMACS_INT ch;
9083   int c;
9084
9085   CHECK_NATNUM (code);
9086   ch = XFASTINT (code);
9087   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9088   attrs = AREF (spec, 0);
9089
9090   if (ASCII_BYTE_P (ch)
9091       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9092     return code;
9093
9094   val = CODING_ATTR_CHARSET_LIST (attrs);
9095   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9096   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9097   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9098
9099   if (ch <= 0x7F)
9100     {
9101       c = ch;
9102       charset = charset_roman;
9103     }
9104   else if (ch >= 0xA0 && ch < 0xDF)
9105     {
9106       c = ch - 0x80;
9107       charset = charset_kana;
9108     }
9109   else
9110     {
9111       EMACS_INT c1 = ch >> 8;
9112       int c2 = ch & 0xFF;
9113
9114       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9115           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9116         error ("Invalid code: %"pI"d", ch);
9117       c = ch;
9118       SJIS_TO_JIS (c);
9119       charset = charset_kanji;
9120     }
9121   c = DECODE_CHAR (charset, c);
9122   if (c < 0)
9123     error ("Invalid code: %"pI"d", ch);
9124   return make_number (c);
9125 }
9126
9127
9128 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9129        doc: /* Encode a Japanese character CH to shift_jis encoding.
9130 Return the corresponding code in SJIS.  */)
9131   (Lisp_Object ch)
9132 {
9133   Lisp_Object spec, attrs, charset_list;
9134   int c;
9135   struct charset *charset;
9136   unsigned code;
9137
9138   CHECK_CHARACTER (ch);
9139   c = XFASTINT (ch);
9140   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9141   attrs = AREF (spec, 0);
9142
9143   if (ASCII_CHAR_P (c)
9144       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9145     return ch;
9146
9147   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9148   charset = char_charset (c, charset_list, &code);
9149   if (code == CHARSET_INVALID_CODE (charset))
9150     error ("Can't encode by shift_jis encoding: %c", c);
9151   JIS_TO_SJIS (code);
9152
9153   return make_number (code);
9154 }
9155
9156 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9157        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9158 Return the corresponding character.  */)
9159   (Lisp_Object code)
9160 {
9161   Lisp_Object spec, attrs, val;
9162   struct charset *charset_roman, *charset_big5, *charset;
9163   EMACS_INT ch;
9164   int c;
9165
9166   CHECK_NATNUM (code);
9167   ch = XFASTINT (code);
9168   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9169   attrs = AREF (spec, 0);
9170
9171   if (ASCII_BYTE_P (ch)
9172       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9173     return code;
9174
9175   val = CODING_ATTR_CHARSET_LIST (attrs);
9176   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9177   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9178
9179   if (ch <= 0x7F)
9180     {
9181       c = ch;
9182       charset = charset_roman;
9183     }
9184   else
9185     {
9186       EMACS_INT b1 = ch >> 8;
9187       int b2 = ch & 0x7F;
9188       if (b1 < 0xA1 || b1 > 0xFE
9189           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9190         error ("Invalid code: %"pI"d", ch);
9191       c = ch;
9192       charset = charset_big5;
9193     }
9194   c = DECODE_CHAR (charset, c);
9195   if (c < 0)
9196     error ("Invalid code: %"pI"d", ch);
9197   return make_number (c);
9198 }
9199
9200 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9201        doc: /* Encode the Big5 character CH to BIG5 coding system.
9202 Return the corresponding character code in Big5.  */)
9203   (Lisp_Object ch)
9204 {
9205   Lisp_Object spec, attrs, charset_list;
9206   struct charset *charset;
9207   int c;
9208   unsigned code;
9209
9210   CHECK_CHARACTER (ch);
9211   c = XFASTINT (ch);
9212   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9213   attrs = AREF (spec, 0);
9214   if (ASCII_CHAR_P (c)
9215       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9216     return ch;
9217
9218   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9219   charset = char_charset (c, charset_list, &code);
9220   if (code == CHARSET_INVALID_CODE (charset))
9221     error ("Can't encode by Big5 encoding: %c", c);
9222
9223   return make_number (code);
9224 }
9225
9226 \f
9227 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9228        Sset_terminal_coding_system_internal, 1, 2, 0,
9229        doc: /* Internal use only.  */)
9230   (Lisp_Object coding_system, Lisp_Object terminal)
9231 {
9232   struct terminal *term = get_terminal (terminal, 1);
9233   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9234   CHECK_SYMBOL (coding_system);
9235   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9236   /* We had better not send unsafe characters to terminal.  */
9237   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9238   /* Character composition should be disabled.  */
9239   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9240   terminal_coding->src_multibyte = 1;
9241   terminal_coding->dst_multibyte = 0;
9242   tset_charset_list
9243     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9244             ? coding_charset_list (terminal_coding)
9245             : Fcons (make_number (charset_ascii), Qnil)));
9246   return Qnil;
9247 }
9248
9249 DEFUN ("set-safe-terminal-coding-system-internal",
9250        Fset_safe_terminal_coding_system_internal,
9251        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9252        doc: /* Internal use only.  */)
9253   (Lisp_Object coding_system)
9254 {
9255   CHECK_SYMBOL (coding_system);
9256   setup_coding_system (Fcheck_coding_system (coding_system),
9257                        &safe_terminal_coding);
9258   /* Character composition should be disabled.  */
9259   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9260   safe_terminal_coding.src_multibyte = 1;
9261   safe_terminal_coding.dst_multibyte = 0;
9262   return Qnil;
9263 }
9264
9265 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9266        Sterminal_coding_system, 0, 1, 0,
9267        doc: /* Return coding system specified for terminal output on the given terminal.
9268 TERMINAL may be a terminal object, a frame, or nil for the selected
9269 frame's terminal device.  */)
9270   (Lisp_Object terminal)
9271 {
9272   struct coding_system *terminal_coding
9273     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9274   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9275
9276   /* For backward compatibility, return nil if it is `undecided'.  */
9277   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9278 }
9279
9280 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9281        Sset_keyboard_coding_system_internal, 1, 2, 0,
9282        doc: /* Internal use only.  */)
9283   (Lisp_Object coding_system, Lisp_Object terminal)
9284 {
9285   struct terminal *t = get_terminal (terminal, 1);
9286   CHECK_SYMBOL (coding_system);
9287   if (NILP (coding_system))
9288     coding_system = Qno_conversion;
9289   else
9290     Fcheck_coding_system (coding_system);
9291   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9292   /* Character composition should be disabled.  */
9293   TERMINAL_KEYBOARD_CODING (t)->common_flags
9294     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9295   return Qnil;
9296 }
9297
9298 DEFUN ("keyboard-coding-system",
9299        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9300        doc: /* Return coding system specified for decoding keyboard input.  */)
9301   (Lisp_Object terminal)
9302 {
9303   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9304                          (get_terminal (terminal, 1))->id);
9305 }
9306
9307 \f
9308 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9309        Sfind_operation_coding_system,  1, MANY, 0,
9310        doc: /* Choose a coding system for an operation based on the target name.
9311 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9312 DECODING-SYSTEM is the coding system to use for decoding
9313 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9314 for encoding (in case OPERATION does encoding).
9315
9316 The first argument OPERATION specifies an I/O primitive:
9317   For file I/O, `insert-file-contents' or `write-region'.
9318   For process I/O, `call-process', `call-process-region', or `start-process'.
9319   For network I/O, `open-network-stream'.
9320
9321 The remaining arguments should be the same arguments that were passed
9322 to the primitive.  Depending on which primitive, one of those arguments
9323 is selected as the TARGET.  For example, if OPERATION does file I/O,
9324 whichever argument specifies the file name is TARGET.
9325
9326 TARGET has a meaning which depends on OPERATION:
9327   For file I/O, TARGET is a file name (except for the special case below).
9328   For process I/O, TARGET is a process name.
9329   For network I/O, TARGET is a service name or a port number.
9330
9331 This function looks up what is specified for TARGET in
9332 `file-coding-system-alist', `process-coding-system-alist',
9333 or `network-coding-system-alist' depending on OPERATION.
9334 They may specify a coding system, a cons of coding systems,
9335 or a function symbol to call.
9336 In the last case, we call the function with one argument,
9337 which is a list of all the arguments given to this function.
9338 If the function can't decide a coding system, it can return
9339 `undecided' so that the normal code-detection is performed.
9340
9341 If OPERATION is `insert-file-contents', the argument corresponding to
9342 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9343 file name to look up, and BUFFER is a buffer that contains the file's
9344 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9345 function to call for FILENAME, that function should examine the
9346 contents of BUFFER instead of reading the file.
9347
9348 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9349   (ptrdiff_t nargs, Lisp_Object *args)
9350 {
9351   Lisp_Object operation, target_idx, target, val;
9352   register Lisp_Object chain;
9353
9354   if (nargs < 2)
9355     error ("Too few arguments");
9356   operation = args[0];
9357   if (!SYMBOLP (operation)
9358       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9359     error ("Invalid first argument");
9360   if (nargs <= 1 + XFASTINT (target_idx))
9361     error ("Too few arguments for operation `%s'",
9362            SDATA (SYMBOL_NAME (operation)));
9363   target = args[XFASTINT (target_idx) + 1];
9364   if (!(STRINGP (target)
9365         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9366             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9367         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9368     error ("Invalid argument %"pI"d of operation `%s'",
9369            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9370   if (CONSP (target))
9371     target = XCAR (target);
9372
9373   chain = ((EQ (operation, Qinsert_file_contents)
9374             || EQ (operation, Qwrite_region))
9375            ? Vfile_coding_system_alist
9376            : (EQ (operation, Qopen_network_stream)
9377               ? Vnetwork_coding_system_alist
9378               : Vprocess_coding_system_alist));
9379   if (NILP (chain))
9380     return Qnil;
9381
9382   for (; CONSP (chain); chain = XCDR (chain))
9383     {
9384       Lisp_Object elt;
9385
9386       elt = XCAR (chain);
9387       if (CONSP (elt)
9388           && ((STRINGP (target)
9389                && STRINGP (XCAR (elt))
9390                && fast_string_match (XCAR (elt), target) >= 0)
9391               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9392         {
9393           val = XCDR (elt);
9394           /* Here, if VAL is both a valid coding system and a valid
9395              function symbol, we return VAL as a coding system.  */
9396           if (CONSP (val))
9397             return val;
9398           if (! SYMBOLP (val))
9399             return Qnil;
9400           if (! NILP (Fcoding_system_p (val)))
9401             return Fcons (val, val);
9402           if (! NILP (Ffboundp (val)))
9403             {
9404               /* We use call1 rather than safe_call1
9405                  so as to get bug reports about functions called here
9406                  which don't handle the current interface.  */
9407               val = call1 (val, Flist (nargs, args));
9408               if (CONSP (val))
9409                 return val;
9410               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9411                 return Fcons (val, val);
9412             }
9413           return Qnil;
9414         }
9415     }
9416   return Qnil;
9417 }
9418
9419 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9420        Sset_coding_system_priority, 0, MANY, 0,
9421        doc: /* Assign higher priority to the coding systems given as arguments.
9422 If multiple coding systems belong to the same category,
9423 all but the first one are ignored.
9424
9425 usage: (set-coding-system-priority &rest coding-systems)  */)
9426   (ptrdiff_t nargs, Lisp_Object *args)
9427 {
9428   ptrdiff_t i, j;
9429   bool changed[coding_category_max];
9430   enum coding_category priorities[coding_category_max];
9431
9432   memset (changed, 0, sizeof changed);
9433
9434   for (i = j = 0; i < nargs; i++)
9435     {
9436       enum coding_category category;
9437       Lisp_Object spec, attrs;
9438
9439       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9440       attrs = AREF (spec, 0);
9441       category = XINT (CODING_ATTR_CATEGORY (attrs));
9442       if (changed[category])
9443         /* Ignore this coding system because a coding system of the
9444            same category already had a higher priority.  */
9445         continue;
9446       changed[category] = 1;
9447       priorities[j++] = category;
9448       if (coding_categories[category].id >= 0
9449           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9450         setup_coding_system (args[i], &coding_categories[category]);
9451       Fset (AREF (Vcoding_category_table, category), args[i]);
9452     }
9453
9454   /* Now we have decided top J priorities.  Reflect the order of the
9455      original priorities to the remaining priorities.  */
9456
9457   for (i = j, j = 0; i < coding_category_max; i++, j++)
9458     {
9459       while (j < coding_category_max
9460              && changed[coding_priorities[j]])
9461         j++;
9462       if (j == coding_category_max)
9463         emacs_abort ();
9464       priorities[i] = coding_priorities[j];
9465     }
9466
9467   memcpy (coding_priorities, priorities, sizeof priorities);
9468
9469   /* Update `coding-category-list'.  */
9470   Vcoding_category_list = Qnil;
9471   for (i = coding_category_max; i-- > 0; )
9472     Vcoding_category_list
9473       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9474                Vcoding_category_list);
9475
9476   return Qnil;
9477 }
9478
9479 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9480        Scoding_system_priority_list, 0, 1, 0,
9481        doc: /* Return a list of coding systems ordered by their priorities.
9482 The list contains a subset of coding systems; i.e. coding systems
9483 assigned to each coding category (see `coding-category-list').
9484
9485 HIGHESTP non-nil means just return the highest priority one.  */)
9486   (Lisp_Object highestp)
9487 {
9488   int i;
9489   Lisp_Object val;
9490
9491   for (i = 0, val = Qnil; i < coding_category_max; i++)
9492     {
9493       enum coding_category category = coding_priorities[i];
9494       int id = coding_categories[category].id;
9495       Lisp_Object attrs;
9496
9497       if (id < 0)
9498         continue;
9499       attrs = CODING_ID_ATTRS (id);
9500       if (! NILP (highestp))
9501         return CODING_ATTR_BASE_NAME (attrs);
9502       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9503     }
9504   return Fnreverse (val);
9505 }
9506
9507 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9508
9509 static Lisp_Object
9510 make_subsidiaries (Lisp_Object base)
9511 {
9512   Lisp_Object subsidiaries;
9513   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9514   char *buf = alloca (base_name_len + 6);
9515   int i;
9516
9517   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9518   subsidiaries = make_uninit_vector (3);
9519   for (i = 0; i < 3; i++)
9520     {
9521       strcpy (buf + base_name_len, suffixes[i]);
9522       ASET (subsidiaries, i, intern (buf));
9523     }
9524   return subsidiaries;
9525 }
9526
9527
9528 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9529        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9530        doc: /* For internal use only.
9531 usage: (define-coding-system-internal ...)  */)
9532   (ptrdiff_t nargs, Lisp_Object *args)
9533 {
9534   Lisp_Object name;
9535   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9536   Lisp_Object attrs;            /* Vector of attributes.  */
9537   Lisp_Object eol_type;
9538   Lisp_Object aliases;
9539   Lisp_Object coding_type, charset_list, safe_charsets;
9540   enum coding_category category;
9541   Lisp_Object tail, val;
9542   int max_charset_id = 0;
9543   int i;
9544
9545   if (nargs < coding_arg_max)
9546     goto short_args;
9547
9548   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9549
9550   name = args[coding_arg_name];
9551   CHECK_SYMBOL (name);
9552   ASET (attrs, coding_attr_base_name, name);
9553
9554   val = args[coding_arg_mnemonic];
9555   if (! STRINGP (val))
9556     CHECK_CHARACTER (val);
9557   ASET (attrs, coding_attr_mnemonic, val);
9558
9559   coding_type = args[coding_arg_coding_type];
9560   CHECK_SYMBOL (coding_type);
9561   ASET (attrs, coding_attr_type, coding_type);
9562
9563   charset_list = args[coding_arg_charset_list];
9564   if (SYMBOLP (charset_list))
9565     {
9566       if (EQ (charset_list, Qiso_2022))
9567         {
9568           if (! EQ (coding_type, Qiso_2022))
9569             error ("Invalid charset-list");
9570           charset_list = Viso_2022_charset_list;
9571         }
9572       else if (EQ (charset_list, Qemacs_mule))
9573         {
9574           if (! EQ (coding_type, Qemacs_mule))
9575             error ("Invalid charset-list");
9576           charset_list = Vemacs_mule_charset_list;
9577         }
9578       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9579         {
9580           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9581             error ("Invalid charset-list");
9582           if (max_charset_id < XFASTINT (XCAR (tail)))
9583             max_charset_id = XFASTINT (XCAR (tail));
9584         }
9585     }
9586   else
9587     {
9588       charset_list = Fcopy_sequence (charset_list);
9589       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9590         {
9591           struct charset *charset;
9592
9593           val = XCAR (tail);
9594           CHECK_CHARSET_GET_CHARSET (val, charset);
9595           if (EQ (coding_type, Qiso_2022)
9596               ? CHARSET_ISO_FINAL (charset) < 0
9597               : EQ (coding_type, Qemacs_mule)
9598               ? CHARSET_EMACS_MULE_ID (charset) < 0
9599               : 0)
9600             error ("Can't handle charset `%s'",
9601                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9602
9603           XSETCAR (tail, make_number (charset->id));
9604           if (max_charset_id < charset->id)
9605             max_charset_id = charset->id;
9606         }
9607     }
9608   ASET (attrs, coding_attr_charset_list, charset_list);
9609
9610   safe_charsets = make_uninit_string (max_charset_id + 1);
9611   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9612   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9613     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9614   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9615
9616   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9617
9618   val = args[coding_arg_decode_translation_table];
9619   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9620     CHECK_SYMBOL (val);
9621   ASET (attrs, coding_attr_decode_tbl, val);
9622
9623   val = args[coding_arg_encode_translation_table];
9624   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9625     CHECK_SYMBOL (val);
9626   ASET (attrs, coding_attr_encode_tbl, val);
9627
9628   val = args[coding_arg_post_read_conversion];
9629   CHECK_SYMBOL (val);
9630   ASET (attrs, coding_attr_post_read, val);
9631
9632   val = args[coding_arg_pre_write_conversion];
9633   CHECK_SYMBOL (val);
9634   ASET (attrs, coding_attr_pre_write, val);
9635
9636   val = args[coding_arg_default_char];
9637   if (NILP (val))
9638     ASET (attrs, coding_attr_default_char, make_number (' '));
9639   else
9640     {
9641       CHECK_CHARACTER (val);
9642       ASET (attrs, coding_attr_default_char, val);
9643     }
9644
9645   val = args[coding_arg_for_unibyte];
9646   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9647
9648   val = args[coding_arg_plist];
9649   CHECK_LIST (val);
9650   ASET (attrs, coding_attr_plist, val);
9651
9652   if (EQ (coding_type, Qcharset))
9653     {
9654       /* Generate a lisp vector of 256 elements.  Each element is nil,
9655          integer, or a list of charset IDs.
9656
9657          If Nth element is nil, the byte code N is invalid in this
9658          coding system.
9659
9660          If Nth element is a number NUM, N is the first byte of a
9661          charset whose ID is NUM.
9662
9663          If Nth element is a list of charset IDs, N is the first byte
9664          of one of them.  The list is sorted by dimensions of the
9665          charsets.  A charset of smaller dimension comes first. */
9666       val = Fmake_vector (make_number (256), Qnil);
9667
9668       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9669         {
9670           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9671           int dim = CHARSET_DIMENSION (charset);
9672           int idx = (dim - 1) * 4;
9673
9674           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9675             ASET (attrs, coding_attr_ascii_compat, Qt);
9676
9677           for (i = charset->code_space[idx];
9678                i <= charset->code_space[idx + 1]; i++)
9679             {
9680               Lisp_Object tmp, tmp2;
9681               int dim2;
9682
9683               tmp = AREF (val, i);
9684               if (NILP (tmp))
9685                 tmp = XCAR (tail);
9686               else if (NUMBERP (tmp))
9687                 {
9688                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9689                   if (dim < dim2)
9690                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9691                   else
9692                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9693                 }
9694               else
9695                 {
9696                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9697                     {
9698                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9699                       if (dim < dim2)
9700                         break;
9701                     }
9702                   if (NILP (tmp2))
9703                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9704                   else
9705                     {
9706                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9707                       XSETCAR (tmp2, XCAR (tail));
9708                     }
9709                 }
9710               ASET (val, i, tmp);
9711             }
9712         }
9713       ASET (attrs, coding_attr_charset_valids, val);
9714       category = coding_category_charset;
9715     }
9716   else if (EQ (coding_type, Qccl))
9717     {
9718       Lisp_Object valids;
9719
9720       if (nargs < coding_arg_ccl_max)
9721         goto short_args;
9722
9723       val = args[coding_arg_ccl_decoder];
9724       CHECK_CCL_PROGRAM (val);
9725       if (VECTORP (val))
9726         val = Fcopy_sequence (val);
9727       ASET (attrs, coding_attr_ccl_decoder, val);
9728
9729       val = args[coding_arg_ccl_encoder];
9730       CHECK_CCL_PROGRAM (val);
9731       if (VECTORP (val))
9732         val = Fcopy_sequence (val);
9733       ASET (attrs, coding_attr_ccl_encoder, val);
9734
9735       val = args[coding_arg_ccl_valids];
9736       valids = Fmake_string (make_number (256), make_number (0));
9737       for (tail = val; CONSP (tail); tail = XCDR (tail))
9738         {
9739           int from, to;
9740
9741           val = XCAR (tail);
9742           if (INTEGERP (val))
9743             {
9744               if (! (0 <= XINT (val) && XINT (val) <= 255))
9745                 args_out_of_range_3 (val, make_number (0), make_number (255));
9746               from = to = XINT (val);
9747             }
9748           else
9749             {
9750               CHECK_CONS (val);
9751               CHECK_NATNUM_CAR (val);
9752               CHECK_NUMBER_CDR (val);
9753               if (XINT (XCAR (val)) > 255)
9754                 args_out_of_range_3 (XCAR (val),
9755                                      make_number (0), make_number (255));
9756               from = XINT (XCAR (val));
9757               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9758                 args_out_of_range_3 (XCDR (val),
9759                                      XCAR (val), make_number (255));
9760               to = XINT (XCDR (val));
9761             }
9762           for (i = from; i <= to; i++)
9763             SSET (valids, i, 1);
9764         }
9765       ASET (attrs, coding_attr_ccl_valids, valids);
9766
9767       category = coding_category_ccl;
9768     }
9769   else if (EQ (coding_type, Qutf_16))
9770     {
9771       Lisp_Object bom, endian;
9772
9773       ASET (attrs, coding_attr_ascii_compat, Qnil);
9774
9775       if (nargs < coding_arg_utf16_max)
9776         goto short_args;
9777
9778       bom = args[coding_arg_utf16_bom];
9779       if (! NILP (bom) && ! EQ (bom, Qt))
9780         {
9781           CHECK_CONS (bom);
9782           val = XCAR (bom);
9783           CHECK_CODING_SYSTEM (val);
9784           val = XCDR (bom);
9785           CHECK_CODING_SYSTEM (val);
9786         }
9787       ASET (attrs, coding_attr_utf_bom, bom);
9788
9789       endian = args[coding_arg_utf16_endian];
9790       CHECK_SYMBOL (endian);
9791       if (NILP (endian))
9792         endian = Qbig;
9793       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9794         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9795       ASET (attrs, coding_attr_utf_16_endian, endian);
9796
9797       category = (CONSP (bom)
9798                   ? coding_category_utf_16_auto
9799                   : NILP (bom)
9800                   ? (EQ (endian, Qbig)
9801                      ? coding_category_utf_16_be_nosig
9802                      : coding_category_utf_16_le_nosig)
9803                   : (EQ (endian, Qbig)
9804                      ? coding_category_utf_16_be
9805                      : coding_category_utf_16_le));
9806     }
9807   else if (EQ (coding_type, Qiso_2022))
9808     {
9809       Lisp_Object initial, reg_usage, request, flags;
9810
9811       if (nargs < coding_arg_iso2022_max)
9812         goto short_args;
9813
9814       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9815       CHECK_VECTOR (initial);
9816       for (i = 0; i < 4; i++)
9817         {
9818           val = AREF (initial, i);
9819           if (! NILP (val))
9820             {
9821               struct charset *charset;
9822
9823               CHECK_CHARSET_GET_CHARSET (val, charset);
9824               ASET (initial, i, make_number (CHARSET_ID (charset)));
9825               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9826                 ASET (attrs, coding_attr_ascii_compat, Qt);
9827             }
9828           else
9829             ASET (initial, i, make_number (-1));
9830         }
9831
9832       reg_usage = args[coding_arg_iso2022_reg_usage];
9833       CHECK_CONS (reg_usage);
9834       CHECK_NUMBER_CAR (reg_usage);
9835       CHECK_NUMBER_CDR (reg_usage);
9836
9837       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9838       for (tail = request; CONSP (tail); tail = XCDR (tail))
9839         {
9840           int id;
9841           Lisp_Object tmp1;
9842
9843           val = XCAR (tail);
9844           CHECK_CONS (val);
9845           tmp1 = XCAR (val);
9846           CHECK_CHARSET_GET_ID (tmp1, id);
9847           CHECK_NATNUM_CDR (val);
9848           if (XINT (XCDR (val)) >= 4)
9849             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9850           XSETCAR (val, make_number (id));
9851         }
9852
9853       flags = args[coding_arg_iso2022_flags];
9854       CHECK_NATNUM (flags);
9855       i = XINT (flags) & INT_MAX;
9856       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9857         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9858       flags = make_number (i);
9859
9860       ASET (attrs, coding_attr_iso_initial, initial);
9861       ASET (attrs, coding_attr_iso_usage, reg_usage);
9862       ASET (attrs, coding_attr_iso_request, request);
9863       ASET (attrs, coding_attr_iso_flags, flags);
9864       setup_iso_safe_charsets (attrs);
9865
9866       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9867         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9868                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9869                     ? coding_category_iso_7_else
9870                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9871                     ? coding_category_iso_7
9872                     : coding_category_iso_7_tight);
9873       else
9874         {
9875           int id = XINT (AREF (initial, 1));
9876
9877           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9878                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9879                        || id < 0)
9880                       ? coding_category_iso_8_else
9881                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9882                       ? coding_category_iso_8_1
9883                       : coding_category_iso_8_2);
9884         }
9885       if (category != coding_category_iso_8_1
9886           && category != coding_category_iso_8_2)
9887         ASET (attrs, coding_attr_ascii_compat, Qnil);
9888     }
9889   else if (EQ (coding_type, Qemacs_mule))
9890     {
9891       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9892         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9893       ASET (attrs, coding_attr_ascii_compat, Qt);
9894       category = coding_category_emacs_mule;
9895     }
9896   else if (EQ (coding_type, Qshift_jis))
9897     {
9898
9899       struct charset *charset;
9900
9901       if (XINT (Flength (charset_list)) != 3
9902           && XINT (Flength (charset_list)) != 4)
9903         error ("There should be three or four charsets");
9904
9905       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9906       if (CHARSET_DIMENSION (charset) != 1)
9907         error ("Dimension of charset %s is not one",
9908                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9909       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9910         ASET (attrs, coding_attr_ascii_compat, Qt);
9911
9912       charset_list = XCDR (charset_list);
9913       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9914       if (CHARSET_DIMENSION (charset) != 1)
9915         error ("Dimension of charset %s is not one",
9916                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9917
9918       charset_list = XCDR (charset_list);
9919       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9920       if (CHARSET_DIMENSION (charset) != 2)
9921         error ("Dimension of charset %s is not two",
9922                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9923
9924       charset_list = XCDR (charset_list);
9925       if (! NILP (charset_list))
9926         {
9927           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9928           if (CHARSET_DIMENSION (charset) != 2)
9929             error ("Dimension of charset %s is not two",
9930                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9931         }
9932
9933       category = coding_category_sjis;
9934       Vsjis_coding_system = name;
9935     }
9936   else if (EQ (coding_type, Qbig5))
9937     {
9938       struct charset *charset;
9939
9940       if (XINT (Flength (charset_list)) != 2)
9941         error ("There should be just two charsets");
9942
9943       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9944       if (CHARSET_DIMENSION (charset) != 1)
9945         error ("Dimension of charset %s is not one",
9946                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9947       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9948         ASET (attrs, coding_attr_ascii_compat, Qt);
9949
9950       charset_list = XCDR (charset_list);
9951       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9952       if (CHARSET_DIMENSION (charset) != 2)
9953         error ("Dimension of charset %s is not two",
9954                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9955
9956       category = coding_category_big5;
9957       Vbig5_coding_system = name;
9958     }
9959   else if (EQ (coding_type, Qraw_text))
9960     {
9961       category = coding_category_raw_text;
9962       ASET (attrs, coding_attr_ascii_compat, Qt);
9963     }
9964   else if (EQ (coding_type, Qutf_8))
9965     {
9966       Lisp_Object bom;
9967
9968       if (nargs < coding_arg_utf8_max)
9969         goto short_args;
9970
9971       bom = args[coding_arg_utf8_bom];
9972       if (! NILP (bom) && ! EQ (bom, Qt))
9973         {
9974           CHECK_CONS (bom);
9975           val = XCAR (bom);
9976           CHECK_CODING_SYSTEM (val);
9977           val = XCDR (bom);
9978           CHECK_CODING_SYSTEM (val);
9979         }
9980       ASET (attrs, coding_attr_utf_bom, bom);
9981       if (NILP (bom))
9982         ASET (attrs, coding_attr_ascii_compat, Qt);
9983
9984       category = (CONSP (bom) ? coding_category_utf_8_auto
9985                   : NILP (bom) ? coding_category_utf_8_nosig
9986                   : coding_category_utf_8_sig);
9987     }
9988   else if (EQ (coding_type, Qundecided))
9989     category = coding_category_undecided;
9990   else
9991     error ("Invalid coding system type: %s",
9992            SDATA (SYMBOL_NAME (coding_type)));
9993
9994   ASET (attrs, coding_attr_category, make_number (category));
9995   ASET (attrs, coding_attr_plist,
9996         Fcons (QCcategory,
9997                Fcons (AREF (Vcoding_category_table, category),
9998                       CODING_ATTR_PLIST (attrs))));
9999   ASET (attrs, coding_attr_plist,
10000         Fcons (QCascii_compatible_p,
10001                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10002                       CODING_ATTR_PLIST (attrs))));
10003
10004   eol_type = args[coding_arg_eol_type];
10005   if (! NILP (eol_type)
10006       && ! EQ (eol_type, Qunix)
10007       && ! EQ (eol_type, Qdos)
10008       && ! EQ (eol_type, Qmac))
10009     error ("Invalid eol-type");
10010
10011   aliases = Fcons (name, Qnil);
10012
10013   if (NILP (eol_type))
10014     {
10015       eol_type = make_subsidiaries (name);
10016       for (i = 0; i < 3; i++)
10017         {
10018           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10019
10020           this_name = AREF (eol_type, i);
10021           this_aliases = Fcons (this_name, Qnil);
10022           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10023           this_spec = make_uninit_vector (3);
10024           ASET (this_spec, 0, attrs);
10025           ASET (this_spec, 1, this_aliases);
10026           ASET (this_spec, 2, this_eol_type);
10027           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10028           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10029           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10030           if (NILP (val))
10031             Vcoding_system_alist
10032               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10033                        Vcoding_system_alist);
10034         }
10035     }
10036
10037   spec_vec = make_uninit_vector (3);
10038   ASET (spec_vec, 0, attrs);
10039   ASET (spec_vec, 1, aliases);
10040   ASET (spec_vec, 2, eol_type);
10041
10042   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10043   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10044   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10045   if (NILP (val))
10046     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10047                                   Vcoding_system_alist);
10048
10049   {
10050     int id = coding_categories[category].id;
10051
10052     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10053       setup_coding_system (name, &coding_categories[category]);
10054   }
10055
10056   return Qnil;
10057
10058  short_args:
10059   return Fsignal (Qwrong_number_of_arguments,
10060                   Fcons (intern ("define-coding-system-internal"),
10061                          make_number (nargs)));
10062 }
10063
10064
10065 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10066        3, 3, 0,
10067        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10068   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10069 {
10070   Lisp_Object spec, attrs;
10071
10072   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10073   attrs = AREF (spec, 0);
10074   if (EQ (prop, QCmnemonic))
10075     {
10076       if (! STRINGP (val))
10077         CHECK_CHARACTER (val);
10078       ASET (attrs, coding_attr_mnemonic, val);
10079     }
10080   else if (EQ (prop, QCdefault_char))
10081     {
10082       if (NILP (val))
10083         val = make_number (' ');
10084       else
10085         CHECK_CHARACTER (val);
10086       ASET (attrs, coding_attr_default_char, val);
10087     }
10088   else if (EQ (prop, QCdecode_translation_table))
10089     {
10090       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10091         CHECK_SYMBOL (val);
10092       ASET (attrs, coding_attr_decode_tbl, val);
10093     }
10094   else if (EQ (prop, QCencode_translation_table))
10095     {
10096       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10097         CHECK_SYMBOL (val);
10098       ASET (attrs, coding_attr_encode_tbl, val);
10099     }
10100   else if (EQ (prop, QCpost_read_conversion))
10101     {
10102       CHECK_SYMBOL (val);
10103       ASET (attrs, coding_attr_post_read, val);
10104     }
10105   else if (EQ (prop, QCpre_write_conversion))
10106     {
10107       CHECK_SYMBOL (val);
10108       ASET (attrs, coding_attr_pre_write, val);
10109     }
10110   else if (EQ (prop, QCascii_compatible_p))
10111     {
10112       ASET (attrs, coding_attr_ascii_compat, val);
10113     }
10114
10115   ASET (attrs, coding_attr_plist,
10116         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10117   return val;
10118 }
10119
10120
10121 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10122        Sdefine_coding_system_alias, 2, 2, 0,
10123        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10124   (Lisp_Object alias, Lisp_Object coding_system)
10125 {
10126   Lisp_Object spec, aliases, eol_type, val;
10127
10128   CHECK_SYMBOL (alias);
10129   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10130   aliases = AREF (spec, 1);
10131   /* ALIASES should be a list of length more than zero, and the first
10132      element is a base coding system.  Append ALIAS at the tail of the
10133      list.  */
10134   while (!NILP (XCDR (aliases)))
10135     aliases = XCDR (aliases);
10136   XSETCDR (aliases, Fcons (alias, Qnil));
10137
10138   eol_type = AREF (spec, 2);
10139   if (VECTORP (eol_type))
10140     {
10141       Lisp_Object subsidiaries;
10142       int i;
10143
10144       subsidiaries = make_subsidiaries (alias);
10145       for (i = 0; i < 3; i++)
10146         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10147                                      AREF (eol_type, i));
10148     }
10149
10150   Fputhash (alias, spec, Vcoding_system_hash_table);
10151   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10152   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10153   if (NILP (val))
10154     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10155                                   Vcoding_system_alist);
10156
10157   return Qnil;
10158 }
10159
10160 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10161        1, 1, 0,
10162        doc: /* Return the base of CODING-SYSTEM.
10163 Any alias or subsidiary coding system is not a base coding system.  */)
10164   (Lisp_Object coding_system)
10165 {
10166   Lisp_Object spec, attrs;
10167
10168   if (NILP (coding_system))
10169     return (Qno_conversion);
10170   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10171   attrs = AREF (spec, 0);
10172   return CODING_ATTR_BASE_NAME (attrs);
10173 }
10174
10175 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10176        1, 1, 0,
10177        doc: "Return the property list of CODING-SYSTEM.")
10178   (Lisp_Object coding_system)
10179 {
10180   Lisp_Object spec, attrs;
10181
10182   if (NILP (coding_system))
10183     coding_system = Qno_conversion;
10184   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10185   attrs = AREF (spec, 0);
10186   return CODING_ATTR_PLIST (attrs);
10187 }
10188
10189
10190 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10191        1, 1, 0,
10192        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10193   (Lisp_Object coding_system)
10194 {
10195   Lisp_Object spec;
10196
10197   if (NILP (coding_system))
10198     coding_system = Qno_conversion;
10199   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10200   return AREF (spec, 1);
10201 }
10202
10203 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10204        Scoding_system_eol_type, 1, 1, 0,
10205        doc: /* Return eol-type of CODING-SYSTEM.
10206 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10207
10208 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10209 and CR respectively.
10210
10211 A vector value indicates that a format of end-of-line should be
10212 detected automatically.  Nth element of the vector is the subsidiary
10213 coding system whose eol-type is N.  */)
10214   (Lisp_Object coding_system)
10215 {
10216   Lisp_Object spec, eol_type;
10217   int n;
10218
10219   if (NILP (coding_system))
10220     coding_system = Qno_conversion;
10221   if (! CODING_SYSTEM_P (coding_system))
10222     return Qnil;
10223   spec = CODING_SYSTEM_SPEC (coding_system);
10224   eol_type = AREF (spec, 2);
10225   if (VECTORP (eol_type))
10226     return Fcopy_sequence (eol_type);
10227   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10228   return make_number (n);
10229 }
10230
10231 #endif /* emacs */
10232
10233 \f
/*** 12. Postamble ***/
10235
10236 void
10237 init_coding_once (void)
10238 {
10239   int i;
10240
10241   for (i = 0; i < coding_category_max; i++)
10242     {
10243       coding_categories[i].id = -1;
10244       coding_priorities[i] = i;
10245     }
10246
10247   /* ISO2022 specific initialize routine.  */
10248   for (i = 0; i < 0x20; i++)
10249     iso_code_class[i] = ISO_control_0;
10250   for (i = 0x21; i < 0x7F; i++)
10251     iso_code_class[i] = ISO_graphic_plane_0;
10252   for (i = 0x80; i < 0xA0; i++)
10253     iso_code_class[i] = ISO_control_1;
10254   for (i = 0xA1; i < 0xFF; i++)
10255     iso_code_class[i] = ISO_graphic_plane_1;
10256   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10257   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10258   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10259   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10260   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10261   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10262   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10263   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10264   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10265
10266   for (i = 0; i < 256; i++)
10267     {
10268       emacs_mule_bytes[i] = 1;
10269     }
10270   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10271   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10272   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10273   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10274 }
10275
10276 #ifdef emacs
10277
10278 void
10279 syms_of_coding (void)
10280 {
10281   staticpro (&Vcoding_system_hash_table);
10282   {
10283     Lisp_Object args[2];
10284     args[0] = QCtest;
10285     args[1] = Qeq;
10286     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10287   }
10288
10289   staticpro (&Vsjis_coding_system);
10290   Vsjis_coding_system = Qnil;
10291
10292   staticpro (&Vbig5_coding_system);
10293   Vbig5_coding_system = Qnil;
10294
10295   staticpro (&Vcode_conversion_reused_workbuf);
10296   Vcode_conversion_reused_workbuf = Qnil;
10297
10298   staticpro (&Vcode_conversion_workbuf_name);
10299   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10300
10301   reused_workbuf_in_use = 0;
10302
10303   DEFSYM (Qcharset, "charset");
10304   DEFSYM (Qtarget_idx, "target-idx");
10305   DEFSYM (Qcoding_system_history, "coding-system-history");
10306   Fset (Qcoding_system_history, Qnil);
10307
10308   /* Target FILENAME is the first argument.  */
10309   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10310   /* Target FILENAME is the third argument.  */
10311   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10312
10313   DEFSYM (Qcall_process, "call-process");
10314   /* Target PROGRAM is the first argument.  */
10315   Fput (Qcall_process, Qtarget_idx, make_number (0));
10316
10317   DEFSYM (Qcall_process_region, "call-process-region");
10318   /* Target PROGRAM is the third argument.  */
10319   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10320
10321   DEFSYM (Qstart_process, "start-process");
10322   /* Target PROGRAM is the third argument.  */
10323   Fput (Qstart_process, Qtarget_idx, make_number (2));
10324
10325   DEFSYM (Qopen_network_stream, "open-network-stream");
10326   /* Target SERVICE is the fourth argument.  */
10327   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10328
10329   DEFSYM (Qcoding_system, "coding-system");
10330   DEFSYM (Qcoding_aliases, "coding-aliases");
10331
10332   DEFSYM (Qeol_type, "eol-type");
10333   DEFSYM (Qunix, "unix");
10334   DEFSYM (Qdos, "dos");
10335   DEFSYM (Qmac, "mac");
10336
10337   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10338   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10339   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10340   DEFSYM (Qdefault_char, "default-char");
10341   DEFSYM (Qundecided, "undecided");
10342   DEFSYM (Qno_conversion, "no-conversion");
10343   DEFSYM (Qraw_text, "raw-text");
10344
10345   DEFSYM (Qiso_2022, "iso-2022");
10346
10347   DEFSYM (Qutf_8, "utf-8");
10348   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10349
10350 #if defined (WINDOWSNT) || defined (CYGWIN)
10351   /* No, not utf-16-le: that one has a BOM.  */
10352   DEFSYM (Qutf_16le, "utf-16le");
10353 #endif
10354
10355   DEFSYM (Qutf_16, "utf-16");
10356   DEFSYM (Qbig, "big");
10357   DEFSYM (Qlittle, "little");
10358
10359   DEFSYM (Qshift_jis, "shift-jis");
10360   DEFSYM (Qbig5, "big5");
10361
10362   DEFSYM (Qcoding_system_p, "coding-system-p");
10363
10364   DEFSYM (Qcoding_system_error, "coding-system-error");
10365   Fput (Qcoding_system_error, Qerror_conditions,
10366         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10367   Fput (Qcoding_system_error, Qerror_message,
10368         build_pure_c_string ("Invalid coding system"));
10369
10370   /* Intern this now in case it isn't already done.
10371      Setting this variable twice is harmless.
10372      But don't staticpro it here--that is done in alloc.c.  */
10373   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10374
10375   DEFSYM (Qtranslation_table, "translation-table");
10376   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10377   DEFSYM (Qtranslation_table_id, "translation-table-id");
10378   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10379   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10380
10381   DEFSYM (Qvalid_codes, "valid-codes");
10382
10383   DEFSYM (Qemacs_mule, "emacs-mule");
10384
10385   DEFSYM (QCcategory, ":category");
10386   DEFSYM (QCmnemonic, ":mnemonic");
10387   DEFSYM (QCdefault_char, ":default-char");
10388   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10389   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10390   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10391   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10392   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10393
10394   Vcoding_category_table
10395     = Fmake_vector (make_number (coding_category_max), Qnil);
10396   staticpro (&Vcoding_category_table);
10397   /* Followings are target of code detection.  */
10398   ASET (Vcoding_category_table, coding_category_iso_7,
10399         intern_c_string ("coding-category-iso-7"));
10400   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10401         intern_c_string ("coding-category-iso-7-tight"));
10402   ASET (Vcoding_category_table, coding_category_iso_8_1,
10403         intern_c_string ("coding-category-iso-8-1"));
10404   ASET (Vcoding_category_table, coding_category_iso_8_2,
10405         intern_c_string ("coding-category-iso-8-2"));
10406   ASET (Vcoding_category_table, coding_category_iso_7_else,
10407         intern_c_string ("coding-category-iso-7-else"));
10408   ASET (Vcoding_category_table, coding_category_iso_8_else,
10409         intern_c_string ("coding-category-iso-8-else"));
10410   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10411         intern_c_string ("coding-category-utf-8-auto"));
10412   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10413         intern_c_string ("coding-category-utf-8"));
10414   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10415         intern_c_string ("coding-category-utf-8-sig"));
10416   ASET (Vcoding_category_table, coding_category_utf_16_be,
10417         intern_c_string ("coding-category-utf-16-be"));
10418   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10419         intern_c_string ("coding-category-utf-16-auto"));
10420   ASET (Vcoding_category_table, coding_category_utf_16_le,
10421         intern_c_string ("coding-category-utf-16-le"));
10422   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10423         intern_c_string ("coding-category-utf-16-be-nosig"));
10424   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10425         intern_c_string ("coding-category-utf-16-le-nosig"));
10426   ASET (Vcoding_category_table, coding_category_charset,
10427         intern_c_string ("coding-category-charset"));
10428   ASET (Vcoding_category_table, coding_category_sjis,
10429         intern_c_string ("coding-category-sjis"));
10430   ASET (Vcoding_category_table, coding_category_big5,
10431         intern_c_string ("coding-category-big5"));
10432   ASET (Vcoding_category_table, coding_category_ccl,
10433         intern_c_string ("coding-category-ccl"));
10434   ASET (Vcoding_category_table, coding_category_emacs_mule,
10435         intern_c_string ("coding-category-emacs-mule"));
10436   /* Followings are NOT target of code detection.  */
10437   ASET (Vcoding_category_table, coding_category_raw_text,
10438         intern_c_string ("coding-category-raw-text"));
10439   ASET (Vcoding_category_table, coding_category_undecided,
10440         intern_c_string ("coding-category-undecided"));
10441
10442   DEFSYM (Qinsufficient_source, "insufficient-source");
10443   DEFSYM (Qinvalid_source, "invalid-source");
10444   DEFSYM (Qinterrupted, "interrupted");
10445   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10446
10447   defsubr (&Scoding_system_p);
10448   defsubr (&Sread_coding_system);
10449   defsubr (&Sread_non_nil_coding_system);
10450   defsubr (&Scheck_coding_system);
10451   defsubr (&Sdetect_coding_region);
10452   defsubr (&Sdetect_coding_string);
10453   defsubr (&Sfind_coding_systems_region_internal);
10454   defsubr (&Sunencodable_char_position);
10455   defsubr (&Scheck_coding_systems_region);
10456   defsubr (&Sdecode_coding_region);
10457   defsubr (&Sencode_coding_region);
10458   defsubr (&Sdecode_coding_string);
10459   defsubr (&Sencode_coding_string);
10460   defsubr (&Sdecode_sjis_char);
10461   defsubr (&Sencode_sjis_char);
10462   defsubr (&Sdecode_big5_char);
10463   defsubr (&Sencode_big5_char);
10464   defsubr (&Sset_terminal_coding_system_internal);
10465   defsubr (&Sset_safe_terminal_coding_system_internal);
10466   defsubr (&Sterminal_coding_system);
10467   defsubr (&Sset_keyboard_coding_system_internal);
10468   defsubr (&Skeyboard_coding_system);
10469   defsubr (&Sfind_operation_coding_system);
10470   defsubr (&Sset_coding_system_priority);
10471   defsubr (&Sdefine_coding_system_internal);
10472   defsubr (&Sdefine_coding_system_alias);
10473   defsubr (&Scoding_system_put);
10474   defsubr (&Scoding_system_base);
10475   defsubr (&Scoding_system_plist);
10476   defsubr (&Scoding_system_aliases);
10477   defsubr (&Scoding_system_eol_type);
10478   defsubr (&Scoding_system_priority_list);
10479
10480   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10481                doc: /* List of coding systems.
10482
10483 Do not alter the value of this variable manually.  This variable should be
10484 updated by the functions `define-coding-system' and
10485 `define-coding-system-alias'.  */);
10486   Vcoding_system_list = Qnil;
10487
10488   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10489                doc: /* Alist of coding system names.
10490 Each element is one element list of coding system name.
10491 This variable is given to `completing-read' as COLLECTION argument.
10492
10493 Do not alter the value of this variable manually.  This variable should be
10494 updated by the functions `make-coding-system' and
10495 `define-coding-system-alias'.  */);
10496   Vcoding_system_alist = Qnil;
10497
10498   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10499                doc: /* List of coding-categories (symbols) ordered by priority.
10500
10501 On detecting a coding system, Emacs tries code detection algorithms
10502 associated with each coding-category one by one in this order.  When
10503 one algorithm agrees with a byte sequence of source text, the coding
10504 system bound to the corresponding coding-category is selected.
10505
10506 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10507   {
10508     int i;
10509
10510     Vcoding_category_list = Qnil;
10511     for (i = coding_category_max - 1; i >= 0; i--)
10512       Vcoding_category_list
10513         = Fcons (AREF (Vcoding_category_table, i),
10514                  Vcoding_category_list);
10515   }
10516
10517   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10518                doc: /* Specify the coding system for read operations.
10519 It is useful to bind this variable with `let', but do not set it globally.
10520 If the value is a coding system, it is used for decoding on read operation.
10521 If not, an appropriate element is used from one of the coding system alists.
10522 There are three such tables: `file-coding-system-alist',
10523 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10524   Vcoding_system_for_read = Qnil;
10525
10526   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10527                doc: /* Specify the coding system for write operations.
10528 Programs bind this variable with `let', but you should not set it globally.
10529 If the value is a coding system, it is used for encoding of output,
10530 when writing it to a file and when sending it to a file or subprocess.
10531
10532 If this does not specify a coding system, an appropriate element
10533 is used from one of the coding system alists.
10534 There are three such tables: `file-coding-system-alist',
10535 `process-coding-system-alist', and `network-coding-system-alist'.
10536 For output to files, if the above procedure does not specify a coding system,
10537 the value of `buffer-file-coding-system' is used.  */);
10538   Vcoding_system_for_write = Qnil;
10539
10540   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10541                doc: /*
10542 Coding system used in the latest file or process I/O.  */);
10543   Vlast_coding_system_used = Qnil;
10544
10545   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10546                doc: /*
10547 Error status of the last code conversion.
10548
10549 When an error was detected in the last code conversion, this variable
10550 is set to one of the following symbols.
10551   `insufficient-source'
10552   `inconsistent-eol'
10553   `invalid-source'
10554   `interrupted'
10555   `insufficient-memory'
10556 When no error was detected, the value doesn't change.  So, to check
10557 the error status of a code conversion by this variable, you must
10558 explicitly set this variable to nil before performing code
10559 conversion.  */);
10560   Vlast_code_conversion_error = Qnil;
10561
10562   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10563                doc: /*
10564 *Non-nil means always inhibit code conversion of end-of-line format.
10565 See info node `Coding Systems' and info node `Text and Binary' concerning
10566 such conversion.  */);
10567   inhibit_eol_conversion = 0;
10568
10569   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10570                doc: /*
10571 Non-nil means process buffer inherits coding system of process output.
10572 Bind it to t if the process output is to be treated as if it were a file
10573 read from some filesystem.  */);
10574   inherit_process_coding_system = 0;
10575
10576   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10577                doc: /*
10578 Alist to decide a coding system to use for a file I/O operation.
10579 The format is ((PATTERN . VAL) ...),
10580 where PATTERN is a regular expression matching a file name,
10581 VAL is a coding system, a cons of coding systems, or a function symbol.
10582 If VAL is a coding system, it is used for both decoding and encoding
10583 the file contents.
10584 If VAL is a cons of coding systems, the car part is used for decoding,
10585 and the cdr part is used for encoding.
10586 If VAL is a function symbol, the function must return a coding system
10587 or a cons of coding systems which are used as above.  The function is
10588 called with an argument that is a list of the arguments with which
10589 `find-operation-coding-system' was called.  If the function can't decide
10590 a coding system, it can return `undecided' so that the normal
10591 code-detection is performed.
10592
10593 See also the function `find-operation-coding-system'
10594 and the variable `auto-coding-alist'.  */);
10595   Vfile_coding_system_alist = Qnil;
10596
10597   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10598                doc: /*
10599 Alist to decide a coding system to use for a process I/O operation.
10600 The format is ((PATTERN . VAL) ...),
10601 where PATTERN is a regular expression matching a program name,
10602 VAL is a coding system, a cons of coding systems, or a function symbol.
10603 If VAL is a coding system, it is used for both decoding what received
10604 from the program and encoding what sent to the program.
10605 If VAL is a cons of coding systems, the car part is used for decoding,
10606 and the cdr part is used for encoding.
10607 If VAL is a function symbol, the function must return a coding system
10608 or a cons of coding systems which are used as above.
10609
10610 See also the function `find-operation-coding-system'.  */);
10611   Vprocess_coding_system_alist = Qnil;
10612
10613   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10614                doc: /*
10615 Alist to decide a coding system to use for a network I/O operation.
10616 The format is ((PATTERN . VAL) ...),
10617 where PATTERN is a regular expression matching a network service name
10618 or is a port number to connect to,
10619 VAL is a coding system, a cons of coding systems, or a function symbol.
10620 If VAL is a coding system, it is used for both decoding what received
10621 from the network stream and encoding what sent to the network stream.
10622 If VAL is a cons of coding systems, the car part is used for decoding,
10623 and the cdr part is used for encoding.
10624 If VAL is a function symbol, the function must return a coding system
10625 or a cons of coding systems which are used as above.
10626
10627 See also the function `find-operation-coding-system'.  */);
10628   Vnetwork_coding_system_alist = Qnil;
10629
10630   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10631                doc: /* Coding system to use with system messages.
10632 Also used for decoding keyboard input on X Window system.  */);
10633   Vlocale_coding_system = Qnil;
10634
10635   /* The eol mnemonics are reset in startup.el system-dependently.  */
10636   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10637                doc: /*
10638 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10639   eol_mnemonic_unix = build_pure_c_string (":");
10640
10641   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10642                doc: /*
10643 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10644   eol_mnemonic_dos = build_pure_c_string ("\\");
10645
10646   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10647                doc: /*
10648 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10649   eol_mnemonic_mac = build_pure_c_string ("/");
10650
10651   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10652                doc: /*
10653 *String displayed in mode line when end-of-line format is not yet determined.  */);
10654   eol_mnemonic_undecided = build_pure_c_string (":");
10655
10656   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10657                doc: /*
10658 *Non-nil enables character translation while encoding and decoding.  */);
10659   Venable_character_translation = Qt;
10660
10661   DEFVAR_LISP ("standard-translation-table-for-decode",
10662                Vstandard_translation_table_for_decode,
10663                doc: /* Table for translating characters while decoding.  */);
10664   Vstandard_translation_table_for_decode = Qnil;
10665
10666   DEFVAR_LISP ("standard-translation-table-for-encode",
10667                Vstandard_translation_table_for_encode,
10668                doc: /* Table for translating characters while encoding.  */);
10669   Vstandard_translation_table_for_encode = Qnil;
10670
10671   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10672                doc: /* Alist of charsets vs revision numbers.
10673 While encoding, if a charset (car part of an element) is found,
10674 designate it with the escape sequence identifying revision (cdr part
10675 of the element).  */);
10676   Vcharset_revision_table = Qnil;
10677
10678   DEFVAR_LISP ("default-process-coding-system",
10679                Vdefault_process_coding_system,
10680                doc: /* Cons of coding systems used for process I/O by default.
10681 The car part is used for decoding a process output,
10682 the cdr part is used for encoding a text to be sent to a process.  */);
10683   Vdefault_process_coding_system = Qnil;
10684
10685   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10686                doc: /*
10687 Table of extra Latin codes in the range 128..159 (inclusive).
10688 This is a vector of length 256.
10689 If Nth element is non-nil, the existence of code N in a file
10690 \(or output of subprocess) doesn't prevent it to be detected as
10691 a coding system of ISO 2022 variant which has a flag
10692 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10693 or reading output of a subprocess.
10694 Only 128th through 159th elements have a meaning.  */);
10695   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10696
10697   DEFVAR_LISP ("select-safe-coding-system-function",
10698                Vselect_safe_coding_system_function,
10699                doc: /*
10700 Function to call to select safe coding system for encoding a text.
10701
10702 If set, this function is called to force a user to select a proper
10703 coding system which can encode the text in the case that a default
10704 coding system used in each operation can't encode the text.  The
10705 function should take care that the buffer is not modified while
10706 the coding system is being selected.
10707
10708 The default value is `select-safe-coding-system' (which see).  */);
10709   Vselect_safe_coding_system_function = Qnil;
10710
10711   DEFVAR_BOOL ("coding-system-require-warning",
10712                coding_system_require_warning,
10713                doc: /* Internal use only.
10714 If non-nil, on writing a file, `select-safe-coding-system-function' is
10715 called even if `coding-system-for-write' is non-nil.  The command
10716 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10717   coding_system_require_warning = 0;
10718
10719
10720   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10721                inhibit_iso_escape_detection,
10722                doc: /*
10723 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10724
10725 When Emacs reads text, it tries to detect how the text is encoded.
10726 This code detection is sensitive to escape sequences.  If Emacs sees
10727 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10728 of the ISO2022 encodings, and decodes text by the corresponding coding
10729 system (e.g. `iso-2022-7bit').
10730
10731 However, there may be a case that you want to read escape sequences in
10732 a file as is.  In such a case, you can set this variable to non-nil.
10733 Then the code detection will ignore any escape sequences, and no text is
10734 detected as encoded in some ISO-2022 encoding.  The result is that all
10735 escape sequences become visible in a buffer.
10736
10737 The default value is nil, and it is strongly recommended not to change
10738 it.  That is because many Emacs Lisp source files that contain
10739 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10740 in Emacs's distribution, and they won't be decoded correctly on
10741 reading if you suppress escape sequence detection.
10742
10743 The other way to read escape sequences in a file without decoding is
10744 to explicitly specify some coding system that doesn't use ISO-2022
10745 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
10746   inhibit_iso_escape_detection = 0;
10747
10748   DEFVAR_BOOL ("inhibit-null-byte-detection",
10749                inhibit_null_byte_detection,
10750                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10751 By default, Emacs treats it as binary data, and does not attempt to
10752 decode it.  The effect is as if you specified `no-conversion' for
10753 reading that text.
10754
10755 Set this to non-nil when a regular text happens to include null bytes.
10756 Examples are Index nodes of Info files and null-byte delimited output
10757 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10758 decode text as usual.  */);
10759   inhibit_null_byte_detection = 0;
10760
10761   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10762                doc: /* Char table for translating self-inserting characters.
10763 This is applied to the result of input methods, not their input.
10764 See also `keyboard-translate-table'.
10765
10766 Use of this variable for character code unification was rendered
10767 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10768 internal character representation.  */);
10769     Vtranslation_table_for_input = Qnil;
10770
10771   {
10772     Lisp_Object args[coding_arg_max];
10773     Lisp_Object plist[16];
10774     int i;
10775
10776     for (i = 0; i < coding_arg_max; i++)
10777       args[i] = Qnil;
10778
10779     plist[0] = intern_c_string (":name");
10780     plist[1] = args[coding_arg_name] = Qno_conversion;
10781     plist[2] = intern_c_string (":mnemonic");
10782     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10783     plist[4] = intern_c_string (":coding-type");
10784     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10785     plist[6] = intern_c_string (":ascii-compatible-p");
10786     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10787     plist[8] = intern_c_string (":default-char");
10788     plist[9] = args[coding_arg_default_char] = make_number (0);
10789     plist[10] = intern_c_string (":for-unibyte");
10790     plist[11] = args[coding_arg_for_unibyte] = Qt;
10791     plist[12] = intern_c_string (":docstring");
10792     plist[13] = build_pure_c_string ("Do no conversion.\n\
10793 \n\
10794 When you visit a file with this coding, the file is read into a\n\
10795 unibyte buffer as is, thus each byte of a file is treated as a\n\
10796 character.");
10797     plist[14] = intern_c_string (":eol-type");
10798     plist[15] = args[coding_arg_eol_type] = Qunix;
10799     args[coding_arg_plist] = Flist (16, plist);
10800     Fdefine_coding_system_internal (coding_arg_max, args);
10801
10802     plist[1] = args[coding_arg_name] = Qundecided;
10803     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10804     plist[5] = args[coding_arg_coding_type] = Qundecided;
10805     /* This is already set.
10806        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10807     plist[8] = intern_c_string (":charset-list");
10808     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10809     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10810     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10811     plist[15] = args[coding_arg_eol_type] = Qnil;
10812     args[coding_arg_plist] = Flist (16, plist);
10813     Fdefine_coding_system_internal (coding_arg_max, args);
10814   }
10815
10816   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10817
10818   {
10819     int i;
10820
10821     for (i = 0; i < coding_category_max; i++)
10822       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10823   }
10824 #if defined (DOS_NT)
10825   system_eol_type = Qdos;
10826 #else
10827   system_eol_type = Qunix;
10828 #endif
10829   staticpro (&system_eol_type);
10830 }
10831
10832 char *
10833 emacs_strerror (int error_number)
10834 {
10835   char *str;
10836
10837   synchronize_system_messages_locale ();
10838   str = strerror (error_number);
10839
10840   if (! NILP (Vlocale_coding_system))
10841     {
10842       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10843                                                       Vlocale_coding_system,
10844                                                       0);
10845       str = SSDATA (dec);
10846     }
10847
10848   return str;
10849 }
10850
10851 #endif /* emacs */