src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2013 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
   60   stored in the hash table Vcoding_system_hash_table.  The conversion
   61   from coding system symbol to attributes vector is done by looking up
   62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce a encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #include "lisp.h"
 290 #include "character.h"
 291 #include "buffer.h"
 292 #include "charset.h"
 293 #include "ccl.h"
 294 #include "composite.h"
 295 #include "coding.h"
 296 #include "window.h"
 297 #include "frame.h"
 298 #include "termhooks.h"
 299
  300 Lisp_Object Vcoding_system_hash_table;
  301 /* Lisp_Object variables; those declared `static' are private to this file.  */
  302 static Lisp_Object Qcoding_system, Qeol_type;
  303 static Lisp_Object Qcoding_aliases;
  304 Lisp_Object Qunix, Qdos;
  305 static Lisp_Object Qmac;
  306 Lisp_Object Qbuffer_file_coding_system;
  307 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  308 static Lisp_Object Qdefault_char;
  309 Lisp_Object Qno_conversion, Qundecided;
  310 Lisp_Object Qcharset, Qutf_8;
  311 static Lisp_Object Qiso_2022;
  312 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
  313 static Lisp_Object Qbig, Qlittle;
  314 static Lisp_Object Qcoding_system_history;
  315 static Lisp_Object Qvalid_codes;
  316 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
  317 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
  318 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
  319 static Lisp_Object QCascii_compatible_p;
  320
  321 Lisp_Object Qcall_process, Qcall_process_region;
  322 Lisp_Object Qstart_process, Qopen_network_stream;
  323 static Lisp_Object Qtarget_idx;
  324
  325 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
  326 static Lisp_Object Qinterrupted, Qinsufficient_memory;
  327
  328 /* If a symbol has this property, evaluate the value to define the
  329    symbol as a coding system.  */
  330 static Lisp_Object Qcoding_system_define_form;
  331
  332 /* Format of end-of-line decided by system.  This is Qunix on
  333    Unix and Mac, Qdos on DOS/Windows.
  334    This has an effect only for external encoding (i.e. for output to
  335    file and process), not for in-buffer or Lisp string encoding.  */
  336 static Lisp_Object system_eol_type;
  337
  338 #ifdef emacs
  339
  340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
  341
  342 /* Coding system emacs-mule and raw-text are for converting only
  343    end-of-line format.  */
  344 Lisp_Object Qemacs_mule, Qraw_text;
  345 Lisp_Object Qutf_8_emacs;
  346
  347 #if defined (WINDOWSNT) || defined (CYGWIN)
  348 static Lisp_Object Qutf_16le;
  349 #endif
  350
  351 /* Coding-systems are handed between Emacs Lisp programs and C internal
  352    routines by variables; only safe_terminal_coding is defined here.  */
  353 /* Coding system to be used to encode text for terminal display when
  354    terminal coding system is nil.  */
  355 struct coding_system safe_terminal_coding;
  356
  357 #endif /* emacs */
  358
  359 Lisp_Object Qtranslation_table;
  360 Lisp_Object Qtranslation_table_id;
  361 static Lisp_Object Qtranslation_table_for_decode;
  362 static Lisp_Object Qtranslation_table_for_encode;
  363
  364 /* Two special coding systems (sjis and big5).  */
  365 static Lisp_Object Vsjis_coding_system;
  366 static Lisp_Object Vbig5_coding_system;
 367
  368 /* ISO2022 section */
  369 /* Integer at index REG of CODING's coding_attr_iso_initial attribute.  */
  370 #define CODING_ISO_INITIAL(coding, reg)                 \
  371   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
  372                      coding_attr_iso_initial),          \
  373                reg)))
  374 /* Value of CODING->safe_charsets[CHARSET_ID], or -1 if CHARSET_ID is
  375    above CODING->max_charset_id or that entry is 255.  */
  376 #define CODING_ISO_REQUEST(coding, charset_id)          \
  377   (((charset_id) <= (coding)->max_charset_id            \
  378     ? ((coding)->safe_charsets[charset_id] != 255       \
  379        ? (coding)->safe_charsets[charset_id]            \
  380        : -1)                                            \
  381     : -1))
  382 /* Accessors for the members of CODING->spec.iso_2022;
  383    CODING_ISO_CMP_STATUS yields the address of cmp_status.  */
  384 #define CODING_ISO_FLAGS(coding)        \
  385   ((coding)->spec.iso_2022.flags)
  386 #define CODING_ISO_DESIGNATION(coding, reg)     \
  387   ((coding)->spec.iso_2022.current_designation[reg])
  388 #define CODING_ISO_INVOCATION(coding, plane)    \
  389   ((coding)->spec.iso_2022.current_invocation[plane])
  390 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
  391   ((coding)->spec.iso_2022.single_shifting)
  392 #define CODING_ISO_BOL(coding)  \
  393   ((coding)->spec.iso_2022.bol)
  394 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  395   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
  396 #define CODING_ISO_CMP_STATUS(coding)   \
  397   (&(coding)->spec.iso_2022.cmp_status)
  398 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
  399   ((coding)->spec.iso_2022.ctext_extended_segment_len)
  400 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
  401   ((coding)->spec.iso_2022.embedded_utf_8)
 402
  403 /* Control characters of ISO2022, as one-byte code values.  */
  404                         /* code */      /* function */
  405 #define ISO_CODE_SO     0x0E            /* shift-out */
  406 #define ISO_CODE_SI     0x0F            /* shift-in */
  407 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
  408 #define ISO_CODE_ESC    0x1B            /* escape */
  409 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
  410 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
  411 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 412
  413 /* Every 1-byte code of ISO2022 is classified into one of the
  414    following classes.  */
  415 enum iso_code_class_type
  416   {
  417     ISO_control_0,              /* Control codes in the range
  418                                    0x00..0x1F and 0x7F, except for the
  419                                    following 4 codes.  */
  420     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  421     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  422     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  423     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  424     ISO_control_1,              /* Control codes in the range
  425                                    0x80..0x9F, except for the
  426                                    following 3 codes.  */
  427     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  428     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  429     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  430     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  431     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  432     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  433     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  434   };
 435
  436 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
  437     `iso-flags' attribute of an iso2022 coding system.  */
  438
  439 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
  440    instead of the correct short-form sequence (e.g. ESC $ A).  */
  441 #define CODING_ISO_FLAG_LONG_FORM       0x0001
  442
  443 /* If set, reset graphic planes and registers at end-of-line to the
  444    initial state.  */
  445 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
  446
  447 /* If set, reset graphic planes and registers before any control
  448    characters to the initial state.  */
  449 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
  450
  451 /* If set, encode by 7-bit environment.  */
  452 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
  453
  454 /* If set, use locking-shift function.  */
  455 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
  456
  457 /* If set, use single-shift function.  Overrides
  458    CODING_ISO_FLAG_LOCKING_SHIFT.  */
  459 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
  460
  461 /* If set, use designation escape sequence.  */
  462 #define CODING_ISO_FLAG_DESIGNATION     0x0040
  463
  464 /* If set, produce revision number sequence.  */
  465 #define CODING_ISO_FLAG_REVISION        0x0080
  466
  467 /* If set, produce ISO6429's direction specifying sequence.  */
  468 #define CODING_ISO_FLAG_DIRECTION       0x0100
  469
  470 /* If set, assume designation states are reset at beginning of line on
  471    output.  */
  472 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
  473
  474 /* If set, designation sequence should be placed at beginning of line
  475    on output.  */
  476 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
  477
  478 /* If set, do not encode unsafe characters on output.  */
  479 #define CODING_ISO_FLAG_SAFE            0x0800
  480
  481 /* If set, extra latin codes (128..159) are accepted as a valid code
  482    on input.  */
  483 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
  484 /* NOTE(review): the flags below have no description here -- document them.  */
  485 #define CODING_ISO_FLAG_COMPOSITION     0x2000
  486 /* Bit 0x4000 is currently unassigned; its definition is commented out.  */
  487 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
  488
  489 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
  490
  491 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
  492
  493 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
  494
  495 /* A character to be produced on output if encoding of the original
  496    character is prohibited by CODING_ISO_FLAG_SAFE.  */
  497 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 498
  499 /* UTF-8 section: accessor for CODING's spec.utf_8_bom member.  */
  500 #define CODING_UTF_8_BOM(coding)        \
  501   ((coding)->spec.utf_8_bom)
  502
  503 /* UTF-16 section: accessors for the members of CODING's spec.utf_16.  */
  504 #define CODING_UTF_16_BOM(coding)       \
  505   ((coding)->spec.utf_16.bom)
  506
  507 #define CODING_UTF_16_ENDIAN(coding)    \
  508   ((coding)->spec.utf_16.endian)
  509
  510 #define CODING_UTF_16_SURROGATE(coding) \
  511   ((coding)->spec.utf_16.surrogate)
 512
 513
  514 /* CCL section: fetch the CCL-related elements of CODING's attributes.  */
  515 #define CODING_CCL_DECODER(coding)      \
  516   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
  517 #define CODING_CCL_ENCODER(coding)      \
  518   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
  519 #define CODING_CCL_VALIDS(coding)                                          \
  520   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 521
  522 /* Index for each coding category in `coding_categories'.  Values up to
  523    coding_category_raw_text double as CATEGORY_MASK_XXX bit positions.  */
  524 enum coding_category
  525   {
  526     coding_category_iso_7,
  527     coding_category_iso_7_tight,
  528     coding_category_iso_8_1,
  529     coding_category_iso_8_2,
  530     coding_category_iso_7_else,
  531     coding_category_iso_8_else,
  532     coding_category_utf_8_auto,
  533     coding_category_utf_8_nosig,
  534     coding_category_utf_8_sig,
  535     coding_category_utf_16_auto,
  536     coding_category_utf_16_be,
  537     coding_category_utf_16_le,
  538     coding_category_utf_16_be_nosig,
  539     coding_category_utf_16_le_nosig,
  540     coding_category_charset,
  541     coding_category_sjis,
  542     coding_category_big5,
  543     coding_category_ccl,
  544     coding_category_emacs_mule,
  545     /* All above are targets of code detection.  */
  546     coding_category_raw_text,
  547     coding_category_undecided,
  548     coding_category_max
  549   };
 550
  551 /* Definitions of flag bits used in detect_coding_XXXX.  */
  552 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
  553 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
  554 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
  555 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
  556 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
  557 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
  558 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
  559 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
  560 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
  561 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
  562 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
  563 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
  564 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
  565 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
  566 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
  567 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
  568 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
  569 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
  570 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
  571 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
  572
  573 /* This value is returned if detect_coding_mask () finds nothing other
  574    than ASCII characters.  Every mask except RAW_TEXT is included.  */
  575 #define CATEGORY_MASK_ANY               \
  576   (CATEGORY_MASK_ISO_7                  \
  577    | CATEGORY_MASK_ISO_7_TIGHT          \
  578    | CATEGORY_MASK_ISO_8_1              \
  579    | CATEGORY_MASK_ISO_8_2              \
  580    | CATEGORY_MASK_ISO_7_ELSE           \
  581    | CATEGORY_MASK_ISO_8_ELSE           \
  582    | CATEGORY_MASK_UTF_8_AUTO           \
  583    | CATEGORY_MASK_UTF_8_NOSIG          \
  584    | CATEGORY_MASK_UTF_8_SIG            \
  585    | CATEGORY_MASK_UTF_16_AUTO          \
  586    | CATEGORY_MASK_UTF_16_BE            \
  587    | CATEGORY_MASK_UTF_16_LE            \
  588    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  589    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
  590    | CATEGORY_MASK_CHARSET              \
  591    | CATEGORY_MASK_SJIS                 \
  592    | CATEGORY_MASK_BIG5                 \
  593    | CATEGORY_MASK_CCL                  \
  594    | CATEGORY_MASK_EMACS_MULE)
  595
  596 /* Convenience unions of the single-category masks defined above.  */
  597 #define CATEGORY_MASK_ISO_7BIT \
  598   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
  599
  600 #define CATEGORY_MASK_ISO_8BIT \
  601   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
  602
  603 #define CATEGORY_MASK_ISO_ELSE \
  604   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
  605
  606 #define CATEGORY_MASK_ISO_ESCAPE        \
  607   (CATEGORY_MASK_ISO_7                  \
  608    | CATEGORY_MASK_ISO_7_TIGHT          \
  609    | CATEGORY_MASK_ISO_7_ELSE           \
  610    | CATEGORY_MASK_ISO_8_ELSE)
  611
  612 #define CATEGORY_MASK_ISO       \
  613   (  CATEGORY_MASK_ISO_7BIT     \
  614      | CATEGORY_MASK_ISO_8BIT   \
  615      | CATEGORY_MASK_ISO_ELSE)
  616
  617 #define CATEGORY_MASK_UTF_16            \
  618   (CATEGORY_MASK_UTF_16_AUTO            \
  619    | CATEGORY_MASK_UTF_16_BE            \
  620    | CATEGORY_MASK_UTF_16_LE            \
  621    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  622    | CATEGORY_MASK_UTF_16_LE_NOSIG)
  623
  624 #define CATEGORY_MASK_UTF_8     \
  625   (CATEGORY_MASK_UTF_8_AUTO     \
  626    | CATEGORY_MASK_UTF_8_NOSIG  \
  627    | CATEGORY_MASK_UTF_8_SIG)
 628
  629 /* Table of coding categories (Lisp symbols).  This variable is for
  630    internal use only.  */
  631 static Lisp_Object Vcoding_category_table;
  632
  633 /* Table of coding-categories ordered by priority (coding_category_max slots).  */
  634 static enum coding_category coding_priorities[coding_category_max];
  635
  636 /* Nth element is a coding context for the coding system bound to the
  637    Nth coding category.  */
  638 static struct coding_system coding_categories[coding_category_max];
 639
  640 /*** Commonly used macros and functions ***/
  641 /* Fallback min/max; each argument may be evaluated twice.  */
  642 #ifndef min
  643 #define min(a, b) ((a) < (b) ? (a) : (b))
  644 #endif
  645 #ifndef max
  646 #define max(a, b) ((a) > (b) ? (a) : (b))
  647 #endif
  648 /* Set ATTRS to CODING's attributes and CHARSET_LIST to their charset list.  */
  649 #define CODING_GET_INFO(coding, attrs, charset_list)    \
  650   do {                                                  \
  651     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
  652     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  653   } while (0)
 654
 655
  656 /* Safely get one byte from the source text pointed by SRC which ends
  657    at SRC_END, and set C to that byte.  If there are not enough bytes
  658    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP, and
  659    a multibyte character (lead byte other than 0xC0 or 0xC1) is
  660    found at SRC, set C to the negative value of the character code
  661    and record CODING_RESULT_INVALID_SRC.  The caller should declare:
  662         src, src_base, src_end, multibytep, coding, consumed_chars */
  663
  664 #define ONE_MORE_BYTE(c)                                \
  665   do {                                                  \
  666     if (src == src_end)                                 \
  667       {                                                 \
  668         if (src_base < src)                             \
  669           record_conversion_result                      \
  670             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  671         goto no_more_source;                            \
  672       }                                                 \
  673     c = *src++;                                         \
  674     if (multibytep && (c & 0x80))                       \
  675       {                                                 \
  676         if ((c & 0xFE) == 0xC0)                         \
  677           c = ((c & 1) << 6) | *src++;                  \
  678         else                                            \
  679           {                                             \
  680             src--;                                      \
  681             c = - string_char (src, &src, NULL);        \
  682             record_conversion_result                    \
  683               (coding, CODING_RESULT_INVALID_SRC);      \
  684           }                                             \
  685       }                                                 \
  686     consumed_chars++;                                   \
  687   } while (0)
 688
  689 /* Safely get two bytes from the source text pointed by SRC which ends
  690    at SRC_END, and set C1 and C2 to those bytes while skipping the
  691    heading multibyte characters.  If there are not enough bytes in the
  692    source, it jumps to 'no_more_source'.  If MULTIBYTEP and a
  693    multibyte character (lead byte other than 0xC0 or 0xC1) is found
  694    for C2, set C2 to -1.  The caller should declare and set these
  695    variables appropriately in advance:
  696         src, src_end, multibytep
  697    It is intended that this macro is used in detect_coding_utf_16.  */
  698
  699 #define TWO_MORE_BYTES(c1, c2)                          \
  700   do {                                                  \
  701     do {                                                \
  702       if (src == src_end)                               \
  703         goto no_more_source;                            \
  704       c1 = *src++;                                      \
  705       if (multibytep && (c1 & 0x80))                    \
  706         {                                               \
  707           if ((c1 & 0xFE) == 0xC0)                      \
  708             c1 = ((c1 & 1) << 6) | *src++;              \
  709           else                                          \
  710             {                                           \
  711               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
  712               c1 = -1;                                  \
  713             }                                           \
  714         }                                               \
  715     } while (c1 < 0);                                   \
  716     if (src == src_end)                                 \
  717       goto no_more_source;                              \
  718     c2 = *src++;                                        \
  719     if (multibytep && (c2 & 0x80))                      \
  720       {                                                 \
  721         if ((c2 & 0xFE) == 0xC0)                        \
  722           c2 = ((c2 & 1) << 6) | *src++;                \
  723         else                                            \
  724           c2 = -1;                                      \
  725       }                                                 \
  726   } while (0)
 727
 728
  729 /* Store a byte C in the place pointed by DST and increment DST to the
  730    next free point, and increment PRODUCED_CHARS.  The caller should
  731    assure that C is 0..127, and declare and set the variables `dst'
  732    and `produced_chars' appropriately in advance.
  733 */
  734
  735
  736 #define EMIT_ONE_ASCII_BYTE(c)  \
  737   do {                          \
  738     produced_chars++;           \
  739     *dst++ = (c);               \
  740   } while (0)
 741
 742
  743 /* Like EMIT_ONE_ASCII_BYTE but store two bytes, C1 then C2, and add 2
  744    to PRODUCED_CHARS.  */
  745 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
  746   do {                                  \
  747     produced_chars += 2;                \
  748     *dst++ = (c1), *dst++ = (c2);       \
  749   } while (0)
 750
 751
  752 /* Store a byte C in the place pointed by DST and increment DST to the
  753    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
  754    store in multibyte form, mapping C >= 0x80 through BYTE8_TO_CHAR.
  755    The caller should declare and set the variables `dst',
  756    `multibytep' and `produced_chars' appropriately in advance.  */
  757
  758 #define EMIT_ONE_BYTE(c)                \
  759   do {                                  \
  760     produced_chars++;                   \
  761     if (multibytep)                     \
  762       {                                 \
  763         unsigned ch = (c);              \
  764         if (ch >= 0x80)                 \
  765           ch = BYTE8_TO_CHAR (ch);      \
  766         CHAR_STRING_ADVANCE (ch, dst);  \
  767       }                                 \
  768     else                                \
  769       *dst++ = (c);                     \
  770   } while (0)
 771
 772
 773 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 774
 775 #define EMIT_TWO_BYTES(c1, c2)          \
 776   do {                                  \
 777     produced_chars += 2;                \
 778     if (multibytep)                     \
 779       {                                 \
 780         unsigned ch;                    \
 781                                         \
 782         ch = (c1);                      \
 783         if (ch >= 0x80)                 \
 784           ch = BYTE8_TO_CHAR (ch);      \
 785         CHAR_STRING_ADVANCE (ch, dst);  \
 786         ch = (c2);                      \
 787         if (ch >= 0x80)                 \
 788           ch = BYTE8_TO_CHAR (ch);      \
 789         CHAR_STRING_ADVANCE (ch, dst);  \
 790       }                                 \
 791     else                                \
 792       {                                 \
 793         *dst++ = (c1);                  \
 794         *dst++ = (c2);                  \
 795       }                                 \
 796   } while (0)
 797
 798
 799 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 800   do {                                  \
 801     EMIT_ONE_BYTE (c1);                 \
 802     EMIT_TWO_BYTES (c2, c3);            \
 803   } while (0)
 804
 805
/* Emit the four bytes C1, C2, C3 and C4, in that order, as two
   EMIT_TWO_BYTES pairs.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_TWO_BYTES (c1, c2);                    \
    EMIT_TWO_BYTES (c3, c4);                    \
  } while (0)
 811
 812
 813 static void
 814 record_conversion_result (struct coding_system *coding,
 815                           enum coding_result_code result)
 816 {
 817   coding->result = result;
 818   switch (result)
 819     {
 820     case CODING_RESULT_INSUFFICIENT_SRC:
 821       Vlast_code_conversion_error = Qinsufficient_source;
 822       break;
 823     case CODING_RESULT_INCONSISTENT_EOL:
 824       Vlast_code_conversion_error = Qinconsistent_eol;
 825       break;
 826     case CODING_RESULT_INVALID_SRC:
 827       Vlast_code_conversion_error = Qinvalid_source;
 828       break;
 829     case CODING_RESULT_INTERRUPT:
 830       Vlast_code_conversion_error = Qinterrupted;
 831       break;
 832     case CODING_RESULT_INSUFFICIENT_MEM:
 833       Vlast_code_conversion_error = Qinsufficient_memory;
 834       break;
 835     case CODING_RESULT_INSUFFICIENT_DST:
 836       /* Don't record this error in Vlast_code_conversion_error
 837          because it happens just temporarily and is resolved when the
 838          whole conversion is finished.  */
 839       break;
 840     case CODING_RESULT_SUCCESS:
 841       break;
 842     default:
 843       Vlast_code_conversion_error = intern ("Unknown error");
 844     }
 845 }
 846
 847 /* These wrapper macros are used to preserve validity of pointers into
 848    buffer text across calls to decode_char, encode_char, etc, which
 849    could cause relocation of buffers if it loads a charset map,
 850    because loading a charset map allocates large structures.  */
 851
 852 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 853   do {                                                                       \
 854     ptrdiff_t offset;                                                        \
 855                                                                              \
 856     charset_map_loaded = 0;                                                  \
 857     c = DECODE_CHAR (charset, code);                                         \
 858     if (charset_map_loaded                                                   \
 859         && (offset = coding_change_source (coding)))                         \
 860       {                                                                      \
 861         src += offset;                                                       \
 862         src_base += offset;                                                  \
 863         src_end += offset;                                                   \
 864       }                                                                      \
 865   } while (0)
 866
 867 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 868   do {                                                                  \
 869     ptrdiff_t offset;                                                   \
 870                                                                         \
 871     charset_map_loaded = 0;                                             \
 872     code = ENCODE_CHAR (charset, c);                                    \
 873     if (charset_map_loaded                                              \
 874         && (offset = coding_change_destination (coding)))               \
 875       {                                                                 \
 876         dst += offset;                                                  \
 877         dst_end += offset;                                              \
 878       }                                                                 \
 879   } while (0)
 880
 881 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 882   do {                                                                  \
 883     ptrdiff_t offset;                                                   \
 884                                                                         \
 885     charset_map_loaded = 0;                                             \
 886     charset = char_charset (c, charset_list, code_return);              \
 887     if (charset_map_loaded                                              \
 888         && (offset = coding_change_destination (coding)))               \
 889       {                                                                 \
 890         dst += offset;                                                  \
 891         dst_end += offset;                                              \
 892       }                                                                 \
 893   } while (0)
 894
 895 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 896   do {                                                                  \
 897     ptrdiff_t offset;                                                   \
 898                                                                         \
 899     charset_map_loaded = 0;                                             \
 900     result = CHAR_CHARSET_P (c, charset);                               \
 901     if (charset_map_loaded                                              \
 902         && (offset = coding_change_destination (coding)))               \
 903       {                                                                 \
 904         dst += offset;                                                  \
 905         dst_end += offset;                                              \
 906       }                                                                 \
 907   } while (0)
 908
 909
 910 /* If there are at least BYTES length of room at dst, allocate memory
 911    for coding->destination and update dst and dst_end.  We don't have
 912    to take care of coding->source which will be relocated.  It is
 913    handled by calling coding_set_source in encode_coding.  */
 914
 915 #define ASSURE_DESTINATION(bytes)                               \
 916   do {                                                          \
 917     if (dst + (bytes) >= dst_end)                               \
 918       {                                                         \
 919         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 920                                                                 \
 921         dst = alloc_destination (coding, more_bytes, dst);      \
 922         dst_end = coding->destination + coding->dst_bytes;      \
 923       }                                                         \
 924   } while (0)
 925
 926
 927 /* Store multibyte form of the character C in P, and advance P to the
 928    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 929    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 930    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 931
 932 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 933
 934 /* Return the character code of character whose multibyte form is at
 935    P, and advance P to the end of the multibyte form.  This used to be
 936    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 937    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 938
 939 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 940
 941 /* Set coding->source from coding->src_object.  */
 942
 943 static void
 944 coding_set_source (struct coding_system *coding)
 945 {
 946   if (BUFFERP (coding->src_object))
 947     {
 948       struct buffer *buf = XBUFFER (coding->src_object);
 949
 950       if (coding->src_pos < 0)
 951         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 952       else
 953         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 954     }
 955   else if (STRINGP (coding->src_object))
 956     {
 957       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 958     }
 959   else
 960     {
 961       /* Otherwise, the source is C string and is never relocated
 962          automatically.  Thus we don't have to update anything.  */
 963     }
 964 }
 965
 966
 967 /* Set coding->source from coding->src_object, and return how many
 968    bytes coding->source was changed.  */
 969
 970 static ptrdiff_t
 971 coding_change_source (struct coding_system *coding)
 972 {
 973   const unsigned char *orig = coding->source;
 974   coding_set_source (coding);
 975   return coding->source - orig;
 976 }
 977
 978
 979 /* Set coding->destination from coding->dst_object.  */
 980
 981 static void
 982 coding_set_destination (struct coding_system *coding)
 983 {
 984   if (BUFFERP (coding->dst_object))
 985     {
 986       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 987         {
 988           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 989           coding->dst_bytes = (GAP_END_ADDR
 990                                - (coding->src_bytes - coding->consumed)
 991                                - coding->destination);
 992         }
 993       else
 994         {
 995           /* We are sure that coding->dst_pos_byte is before the gap
 996              of the buffer. */
 997           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 998                                  + coding->dst_pos_byte - BEG_BYTE);
 999           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1000                                - coding->destination);
1001         }
1002     }
1003   else
1004     {
1005       /* Otherwise, the destination is C string and is never relocated
1006          automatically.  Thus we don't have to update anything.  */
1007     }
1008 }
1009
1010
1011 /* Set coding->destination from coding->dst_object, and return how
1012    many bytes coding->destination was changed.  */
1013
1014 static ptrdiff_t
1015 coding_change_destination (struct coding_system *coding)
1016 {
1017   const unsigned char *orig = coding->destination;
1018   coding_set_destination (coding);
1019   return coding->destination - orig;
1020 }
1021
1022
1023 static void
1024 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1025 {
1026   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1027     string_overflow ();
1028   coding->destination = xrealloc (coding->destination,
1029                                   coding->dst_bytes + bytes);
1030   coding->dst_bytes += bytes;
1031 }
1032
1033 static void
1034 coding_alloc_by_making_gap (struct coding_system *coding,
1035                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1036 {
1037   if (EQ (coding->src_object, coding->dst_object))
1038     {
1039       /* The gap may contain the produced data at the head and not-yet
1040          consumed data at the tail.  To preserve those data, we at
1041          first make the gap size to zero, then increase the gap
1042          size.  */
1043       ptrdiff_t add = GAP_SIZE;
1044
1045       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1046       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1047       make_gap (bytes);
1048       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1049       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1050     }
1051   else
1052     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1053 }
1054
1055
1056 static unsigned char *
1057 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1058                    unsigned char *dst)
1059 {
1060   ptrdiff_t offset = dst - coding->destination;
1061
1062   if (BUFFERP (coding->dst_object))
1063     {
1064       struct buffer *buf = XBUFFER (coding->dst_object);
1065
1066       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1067     }
1068   else
1069     coding_alloc_by_realloc (coding, nbytes);
1070   coding_set_destination (coding);
1071   dst = coding->destination + offset;
1072   return dst;
1073 }
1074
1075 /** Macros for annotations.  */
1076
1077 /* An annotation data is stored in the array coding->charbuf in this
1078    format:
1079      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1080    LENGTH is the number of elements in the annotation.
1081    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1082    NCHARS is the number of characters in the text annotated.
1083
   The format of the following elements depends on ANNOTATION_MASK.
1085
   In the case of CODING_ANNOTATE_COMPOSITION_MASK, the following
   elements come next:
     ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1089
1090    NBYTES is the number of bytes specified in the header part of
1091    old-style emacs-mule encoding, or 0 for the other kind of
1092    composition.
1093
1094    METHOD is one of enum composition_method.
1095
1096    Optional COMPOSITION-COMPONENTS are characters and composition
1097    rules.
1098
1099    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1100    follows.
1101
1102    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1103    recover from an invalid annotation, and should be skipped by
1104    produce_annotation.  */
1105
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three leading elements of an annotation at BUF, advancing
   BUF past them: -LEN, then MASK, then NCHARS.  Also set
   coding->annotated to 1, so the caller must have `coding' in scope.

   The expansion deliberately has no trailing semicolon; the caller
   supplies it.  That keeps the macro usable as a single statement,
   e.g. as the body of an unbraced `if' that is followed by `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1116
/* Store the header of a composition annotation at BUF, advancing BUF
   past it: the common annotation header (length 5, mask
   CODING_ANNOTATE_COMPOSITION_MASK, NCHARS) followed by NBYTES and
   METHOD.  Every parameter is parenthesized, so BUF is advanced the
   same way here as inside ADD_ANNOTATION_DATA.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1123
1124
/* Store a charset annotation at BUF, advancing BUF past it: the
   common annotation header (length 4, mask
   CODING_ANNOTATE_CHARSET_MASK, NCHARS) followed by the charset ID.
   Every parameter is parenthesized, so BUF is advanced the same way
   here as inside ADD_ANNOTATION_DATA.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1130
1131 \f
1132 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1133
1134
1135
1136 \f
1137 /*** 3. UTF-8 ***/
1138
1139 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1140    Return true if a text is encoded in UTF-8.  */
1141
/* Classify the octet C by its high-order bits:
     0xxxxxxx  a complete single-octet character
     10xxxxxx  an extra (trailing) octet
     110xxxxx  the leading octet of a 2-octet sequence
     1110xxxx  the leading octet of a 3-octet sequence
     11110xxx  the leading octet of a 4-octet sequence
     111110xx  the leading octet of a 5-octet sequence  */
#define UTF_8_1_OCTET_P(c)         ((c) <= 0x7F)
#define UTF_8_EXTRA_OCTET_P(c)     ((0xC0 & (c)) == 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) ((0xE0 & (c)) == 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) ((0xF0 & (c)) == 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) ((0xF8 & (c)) == 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) ((0xFC & (c)) == 0xF8)
1148
/* The three octets of the UTF-8 signature (BOM): EF BB BF, which is
   the 3-octet UTF-8 form of the character U+FEFF.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1152
/* Check whether the source bytes of CODING look like UTF-8 and record
   the verdict in DETECT_INFO.  The first coding->head_ascii bytes are
   skipped.  Return 0, with CATEGORY_MASK_UTF_8 added to
   DETECT_INFO->rejected, if a malformed sequence is seen or if the
   last block ends in the middle of a sequence.  Otherwise return 1,
   with DETECT_INFO->found and ->rejected updated according to whether
   a BOM and/or any multi-octet sequence was seen.

   NOTE(review): ONE_MORE_BYTE is defined outside this part of the
   file; it presumably jumps to `no_more_source' at the end of the
   input and uses `src_end', `multibytep' and `consumed_chars' --
   confirm against its definition.  */

static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  /* Set when the very first three source bytes are EF BB BF.  */
  bool bom_found = 0;
  /* Set when at least one well-formed multi-octet sequence is seen.  */
  bool found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where the current sequence starts.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      /* C is 0x80 or above: every following octet must be an extra
         octet, and C must be the leading octet for the length
         reached; anything else leaves the loop and rejects UTF-8.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a sequence at the very start of the source can be a
             BOM.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      break;
    }
  /* The loop was left by `break': a malformed sequence was found.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* src_base < src means the input ended inside a sequence; that is
     an error only if no further block will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* The first character 0xFEFF doesn't necessarily mean a BOM.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1235
1236
/* Decode UTF-8 bytes from the source of CODING, starting at offset
   coding->consumed, into character codes appended to coding->charbuf
   from index coding->charbuf_used on.  Decoding stops when charbuf is
   full or the source runs out.  On return coding->consumed,
   coding->consumed_char and coding->charbuf_used are updated so that
   they cover only completely decoded sequences.

   A malformed, overlong or surrogate sequence is not decoded: only
   its first byte is consumed and stored (as is when it is ASCII,
   otherwise through BYTE8_TO_CHAR), and coding->errors is
   incremented.

   NOTE(review): ONE_MORE_BYTE is defined outside this part of the
   file; it presumably jumps to `no_more_source' at the end of the
   input, advances `consumed_chars', and yields a negative value for a
   multibyte character that is not a raw byte -- confirm against its
   definition.  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_DOS, or -1 if none.  */
  int byte_after_cr = -1;

  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      /* Skip a leading EF BB BF.  If the first three bytes are
         anything else, rewind SRC so that they are decoded as
         ordinary text.
         NOTE(review): SRC is rewound below but `consumed_chars' is
         not; if ONE_MORE_BYTE advances `consumed_chars', the count
         may need resetting too -- confirm.  */
      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever was found, don't look for a BOM again on a later call.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Everything before SRC_BASE is completely decoded; the
         bookkeeping at `no_more_source' uses these two values.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left.  A byte read ahead after a CR has not been
             decoded yet, so leave it to be read again next time.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value carries a character code negated; store
             that character.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS end-of-line conversion, read the byte after a CR
             ahead of time and keep it for the next iteration.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Besides overlong forms, reject codes
                             above MAX_CHAR.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the bad sequence and consume just
         its first byte.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Count only what was completely decoded; a sequence cut short by
     the end of the source stays unconsumed.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1386
1387
1388 static bool
1389 encode_coding_utf_8 (struct coding_system *coding)
1390 {
1391   bool multibytep = coding->dst_multibyte;
1392   int *charbuf = coding->charbuf;
1393   int *charbuf_end = charbuf + coding->charbuf_used;
1394   unsigned char *dst = coding->destination + coding->produced;
1395   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1396   ptrdiff_t produced_chars = 0;
1397   int c;
1398
1399   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1400     {
1401       ASSURE_DESTINATION (3);
1402       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1403       CODING_UTF_8_BOM (coding) = utf_without_bom;
1404     }
1405
1406   if (multibytep)
1407     {
1408       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1409
1410       while (charbuf < charbuf_end)
1411         {
1412           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1413
1414           ASSURE_DESTINATION (safe_room);
1415           c = *charbuf++;
1416           if (CHAR_BYTE8_P (c))
1417             {
1418               c = CHAR_TO_BYTE8 (c);
1419               EMIT_ONE_BYTE (c);
1420             }
1421           else
1422             {
1423               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1424               for (p = str; p < pend; p++)
1425                 EMIT_ONE_BYTE (*p);
1426             }
1427         }
1428     }
1429   else
1430     {
1431       int safe_room = MAX_MULTIBYTE_LENGTH;
1432
1433       while (charbuf < charbuf_end)
1434         {
1435           ASSURE_DESTINATION (safe_room);
1436           c = *charbuf++;
1437           if (CHAR_BYTE8_P (c))
1438             *dst++ = CHAR_TO_BYTE8 (c);
1439           else
1440             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1441           produced_chars++;
1442         }
1443     }
1444   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1445   coding->produced_char += produced_chars;
1446   coding->produced = dst - coding->destination;
1447   return 0;
1448 }
1449
1450
1451 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1452    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1453
/* True if the top six bits of the 16-bit code unit VAL are 110110,
   i.e. VAL is in the range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) ((0xFC00 & (val)) == 0xD800)

/* True if the top six bits of the 16-bit code unit VAL are 110111,
   i.e. VAL is in the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) ((0xFC00 & (val)) == 0xDC00)
1459
1460
/* Check whether the source bytes of CODING look like one of the
   UTF-16 based coding systems and record the verdict in DETECT_INFO.
   If the first two bytes are FF FE or FE FF, the matching
   with-signature categories are added to DETECT_INFO->found, the
   others are rejected, and 1 is returned.  Without such a signature
   the with-signature categories are rejected, the rest of the source
   is scanned to decide whether the two no-signature categories must
   be rejected too, and 0 is returned.  0 is also returned, with all
   of CATEGORY_MASK_UTF_16 rejected, when the last block has an odd
   coding->src_chars or when the second byte read is negative.

   NOTE(review): TWO_MORE_BYTES is defined before this part of the
   file; it jumps to `no_more_source' when the source runs out, which
   makes this function return 1.  */

static bool
detect_coding_utf_16 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  /* UTF-16 text consists of two-byte units, so a last block with an
     odd count cannot be UTF-16.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* Little-endian signature.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* Big-endian signature.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* We check the dispersion of Eth and Oth bytes where E is even and
         O is odd.  If both are high, we assume binary data.*/
      /* e[V] and o[V] are nonzero once the value V has been seen as
         the first or the second byte of a pair respectively; e_num
         and o_num count the distinct values seen so far.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
                                |CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_LE);

      /* Scan until every UTF-16 category has been rejected or the
         source runs out.  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
             != CATEGORY_MASK_UTF_16)
        {
          TWO_MORE_BYTES (c1, c2);
          if (c2 < 0)
            break;
          if (! e[c1])
            {
              e[c1] = 1;
              e_num++;
              /* 128 or more distinct first bytes: reject big-endian
                 without signature.  */
              if (e_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
            }
          if (! o[c2])
            {
              o[c2] = 1;
              o_num++;
              /* 128 or more distinct second bytes: reject
                 little-endian without signature.  */
              if (o_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
            }
        }
      return 0;
    }

 no_more_source:
  return 1;
}
1543
/* Decode the UTF-16 bytes of CODING->source, starting at offset
   CODING->consumed, and append the resulting code points to
   CODING->charbuf.  A BOM is checked first when one is expected; the
   main loop then reads the source two bytes at a time, assembles a
   16-bit unit according to the configured endianness, and combines
   high/low surrogate pairs.  Before returning, CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used are updated.

   NOTE(review): ONE_MORE_BYTE is defined outside this chunk; it
   presumably jumps to `no_more_source' when the input is exhausted and
   uses `src', `src_end', `multibytep' and `consumed_chars' by name --
   confirm against the macro.  */
1544 static void
1545 decode_coding_utf_16 (struct coding_system *coding)
1546 {
1547   const unsigned char *src = coding->source + coding->consumed;
1548   const unsigned char *src_end = coding->source + coding->src_bytes;
1549   const unsigned char *src_base;
1550   int *charbuf = coding->charbuf + coding->charbuf_used;
1551   /* We may produce at most 3 chars in one loop (the two bytes of an
          unpaired surrogate plus one more character).  */
1552   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1553   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1554   bool multibytep = coding->src_multibyte;
1555   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1556   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1557   int surrogate = CODING_UTF_16_SURROGATE (coding);
1558   bool eol_dos
1559     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The two bytes read ahead after a CR (see the end of the main
          loop); -1 means that nothing is pending.  */
1560   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1561
1562   if (bom == utf_with_bom)
1563     {
1564       int c, c1, c2;
1565
1566       src_base = src;
1567       ONE_MORE_BYTE (c1);
1568       ONE_MORE_BYTE (c2);
1569       c = (c1 << 8) | c2;
1570
1571       if (endian == utf_16_big_endian
1572           ? c != 0xFEFF : c != 0xFFFE)
1573         {
1574           /* The first two bytes are not BOM.  Treat them as bytes
1575              for a normal character.  */
1576           src = src_base;
1577           coding->errors++;
1578         }
           /* The BOM question is settled; record that in CODING so that
              it is not checked again.  */
1579       CODING_UTF_16_BOM (coding) = utf_without_bom;
1580     }
1581   else if (bom == utf_detect_bom)
1582     {
1583       /* We have already tried to detect BOM and failed in
1584          detect_coding.  */
1585       CODING_UTF_16_BOM (coding) = utf_without_bom;
1586     }
1587
1588   while (1)
1589     {
1590       int c, c1, c2;
1591
           /* Remember where this iteration started; these values are
              what gets reported as consumed at `no_more_source'.  */
1592       src_base = src;
1593       consumed_chars_base = consumed_chars;
1594
1595       if (charbuf >= charbuf_end)
1596         {
               /* No room left for a worst-case iteration.  If two bytes
                  were read ahead after a CR, back up so that they are
                  reported as not yet consumed.  */
1597           if (byte_after_cr1 >= 0)
1598             src_base -= 2;
1599           break;
1600         }
1601
           /* Take the first byte of the unit from the read-ahead, if
              any, else from the source.  */
1602       if (byte_after_cr1 >= 0)
1603         c1 = byte_after_cr1, byte_after_cr1 = -1;
1604       else
1605         ONE_MORE_BYTE (c1);
           /* NOTE(review): a negative value presumably marks a source
              element that ONE_MORE_BYTE could not deliver as a plain
              byte; its negation is stored unchanged -- confirm against
              the macro.  */
1606       if (c1 < 0)
1607         {
1608           *charbuf++ = -c1;
1609           continue;
1610         }
1611       if (byte_after_cr2 >= 0)
1612         c2 = byte_after_cr2, byte_after_cr2 = -1;
1613       else
1614         ONE_MORE_BYTE (c2);
1615       if (c2 < 0)
1616         {
               /* The unit cannot be completed: emit the lone first byte
                  (as a raw 8-bit character unless it is ASCII), then
                  the negated second value.  */
1617           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1618           *charbuf++ = -c2;
1619           continue;
1620         }
           /* Assemble the 16-bit unit in the configured byte order.  */
1621       c = (endian == utf_16_big_endian
1622            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1623
1624       if (surrogate)
1625         {
               /* A high surrogate is pending from a previous unit.  */
1626           if (! UTF_16_LOW_SURROGATE_P (c))
1627             {
                   /* The pending surrogate is unpaired: emit its two
                      bytes in source byte order and count an error.
                      The current unit either becomes the new pending
                      high surrogate or is emitted as is.  */
1628               if (endian == utf_16_big_endian)
1629                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1630               else
1631                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1632               *charbuf++ = c1;
1633               *charbuf++ = c2;
1634               coding->errors++;
1635               if (UTF_16_HIGH_SURROGATE_P (c))
1636                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1637               else
1638                 *charbuf++ = c;
1639             }
1640           else
1641             {
                   /* Valid pair: combine both halves into one code
                      point at or above 0x10000 and clear the pending
                      state.  */
1642               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1643               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1644               *charbuf++ = 0x10000 + c;
1645             }
1646         }
1647       else
1648         {
               /* A high surrogate is only remembered (in CODING as
                  well); nothing is emitted until the next unit is
                  seen.  */
1649           if (UTF_16_HIGH_SURROGATE_P (c))
1650             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1651           else
1652             {
1653               if (eol_dos && c == '\r')
1654                 {
                       /* With DOS-style EOL, read the unit that follows
                          a CR right away; the next iteration takes it
                          from byte_after_cr1/2.  NOTE(review):
                          presumably this keeps a CR from being emitted
                          when its following unit is not yet available
                          -- confirm.  */
1655                   ONE_MORE_BYTE (byte_after_cr1);
1656                   ONE_MORE_BYTE (byte_after_cr2);
1657                 }
1658               *charbuf++ = c;
1659             }
1660         }
1661     }
1662
1663  no_more_source:
       /* Report progress up to the start of the last, possibly
          unfinished, iteration.  */
1664   coding->consumed_char += consumed_chars_base;
1665   coding->consumed = src_base - coding->source;
1666   coding->charbuf_used = charbuf - coding->charbuf;
1667 }
1668
/* Encode the code points in CODING->charbuf (CODING->charbuf_used
   elements) as UTF-16 and store the bytes in CODING->destination,
   starting at offset CODING->produced.  A BOM is emitted first unless
   the coding system is marked utf_without_bom.  A code point above
   MAX_UNICODE_CHAR is replaced by CODING->default_char; a code point
   of 0x10000 or more is written as a surrogate pair.  On normal
   completion, CODING->produced and CODING->produced_char are updated
   and 0 is returned.

   NOTE(review): ASSURE_DESTINATION and EMIT_TWO_BYTES/EMIT_FOUR_BYTES
   are defined outside this chunk; presumably they use `dst',
   `dst_end', `multibytep' and `produced_chars' by name, and
   ASSURE_DESTINATION may leave the function when the destination
   cannot hold `safe_room' more bytes -- confirm against the
   macros.  */
1669 static bool
1670 encode_coding_utf_16 (struct coding_system *coding)
1671 {
1672   bool multibytep = coding->dst_multibyte;
1673   int *charbuf = coding->charbuf;
1674   int *charbuf_end = charbuf + coding->charbuf_used;
1675   unsigned char *dst = coding->destination + coding->produced;
1676   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested before each emission below; one iteration emits
          at most four bytes through EMIT_FOUR_BYTES.  */
1677   int safe_room = 8;
1678   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1679   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1680   ptrdiff_t produced_chars = 0;
1681   int c;
1682
1683   if (bom != utf_without_bom)
1684     {
1685       ASSURE_DESTINATION (safe_room);
1686       if (big_endian)
1687         EMIT_TWO_BYTES (0xFE, 0xFF);
1688       else
1689         EMIT_TWO_BYTES (0xFF, 0xFE);
           /* Record in CODING that the BOM has been written.  */
1690       CODING_UTF_16_BOM (coding) = utf_without_bom;
1691     }
1692
1693   while (charbuf < charbuf_end)
1694     {
1695       ASSURE_DESTINATION (safe_room);
1696       c = *charbuf++;
1697       if (c > MAX_UNICODE_CHAR)
1698         c = coding->default_char;
1699
1700       if (c < 0x10000)
1701         {
               /* One 16-bit unit, in the configured byte order.  */
1702           if (big_endian)
1703             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1704           else
1705             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1706         }
1707       else
1708         {
1709           int c1, c2;
1710
               /* Split into a high surrogate (C1, upper 10 bits) and a
                  low surrogate (C2, lower 10 bits).  */
1711           c -= 0x10000;
1712           c1 = (c >> 10) + 0xD800;
1713           c2 = (c & 0x3FF) + 0xDC00;
1714           if (big_endian)
1715             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1716           else
1717             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1718         }
1719     }
1720   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1721   coding->produced = dst - coding->destination;
1722   coding->produced_char += produced_chars;
1723   return 0;
1724 }
1725
1726 \f
1727 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1728
1729 /* Emacs' internal format for representation of multiple character
1730    sets is a kind of multi-byte encoding, i.e. characters are
1731    represented by variable-length sequences of one-byte codes.
1732
1733    ASCII characters and control characters (e.g. `tab', `newline') are
1734    represented by one-byte sequences which are their ASCII codes, in
1735    the range 0x00 through 0x7F.
1736
1737    8-bit characters of the range 0x80..0x9F are represented by
1738    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1739    code + 0x20).
1740
1741    8-bit characters of the range 0xA0..0xFF are represented by
1742    one-byte sequences which are their 8-bit code.
1743
1744    The other characters are represented by a sequence of `base
1745    leading-code', optional `extended leading-code', and one or two
1746    `position-code's.  The length of the sequence is determined by the
1747    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1748    whereas extended leading-code and position-code take the range 0xA0
1749    through 0xFF.  See `charset.h' for more details about leading-code
1750    and position-code.
1751
1752    --- CODE RANGE of Emacs' internal format ---
1753    character set        range
1754    -------------        -----
1755    ascii                0x00..0x7F
1756    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1757    eight-bit-graphic    0xA0..0xBF
1758    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1759    ---------------------------------------------
1760
1761    As this is the internal character representation, the format is
1762    usually not used externally (i.e. in a file or in a data sent to a
1763    process).  But, it is possible to have a text externally in this
1764    format (i.e. by encoding by the coding system `emacs-mule').
1765
1766    In that case, a sequence of one-byte codes has a slightly different
1767    form.
1768
1769    At first, all characters in eight-bit-control are represented by
1770    one-byte sequences which are their 8-bit code.
1771
1772    Next, character composition data are represented by the byte
1773    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1774    where,
1775         METHOD is 0xF2 plus one of composition method (enum
1776         composition_method),
1777
1778         BYTES is 0xA0 plus a byte length of this composition data,
1779
1780         CHARS is 0xA0 plus a number of characters composed by this
1781         data,
1782
1783         COMPONENTs are characters of multibyte form or composition
1784         rules encoded as two bytes of ASCII codes.
1785
1786    In addition, for backward compatibility, the following formats are
1787    also recognized as composition data on decoding.
1788
1789    0x80 MSEQ ...
1790    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1791
1792    Here,
1793         MSEQ is a multibyte form but in this special format:
1794           ASCII: 0xA0 ASCII_CODE+0x80,
1795           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1796         RULE is a one byte code of the range 0xA0..0xF0 that
1797         represents a composition rule.
1798   */
1799
/* Table indexed by a byte value, giving the total length in bytes of
   the emacs-mule sequence that the byte starts.
   detect_coding_emacs_mule expects `emacs_mule_bytes[c] - 1' further
   bytes after a byte C >= 0x80, and emacs_mule_char handles the values
   1 through 4 (aborting on anything else).  NOTE(review): the table is
   filled in outside this chunk.  */
1800 char emacs_mule_bytes[256];
1801
1802
1803 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1804    Return true if a text is encoded in 'emacs-mule'.

        The source of CODING is scanned after skipping CODING->head_ascii
        leading bytes.  CATEGORY_MASK_EMACS_MULE is always added to
        DETECT_INFO->checked.  When an invalid sequence, or one of ESC,
        SI and SO, is met, the category is added to
        DETECT_INFO->rejected and 0 is returned.  When the source is
        exhausted, 1 is returned and DETECT_INFO->found receives the
        category if at least one multibyte or composition sequence was
        seen; but if the source ends inside an unfinished sequence and
        this is the last block, the category is rejected and 0 is
        returned.

        NOTE(review): ONE_MORE_BYTE is defined outside this chunk; it
        presumably jumps to `no_more_source' when the input is
        exhausted -- confirm against the macro.  */
1805
1806 static bool
1807 detect_coding_emacs_mule (struct coding_system *coding,
1808                           struct coding_detection_info *detect_info)
1809 {
1810   const unsigned char *src = coding->source, *src_base;
1811   const unsigned char *src_end = coding->source + coding->src_bytes;
1812   bool multibytep = coding->src_multibyte;
1813   ptrdiff_t consumed_chars = 0;
1814   int c;
       /* Becomes CATEGORY_MASK_EMACS_MULE once a valid multibyte or
          composition sequence has been seen.  */
1815   int found = 0;
1816
1817   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1818   /* A coding system of this category is always ASCII compatible.  */
1819   src += coding->head_ascii;
1820
1821   while (1)
1822     {
           /* SRC_BASE marks the start of the sequence being examined;
              it is compared with SRC at `no_more_source'.  */
1823       src_base = src;
1824       ONE_MORE_BYTE (c);
           /* NOTE(review): negative values are simply skipped;
              presumably they stand for source elements that are not
              plain bytes -- confirm against ONE_MORE_BYTE.  */
1825       if (c < 0)
1826         continue;
1827       if (c == 0x80)
1828         {
1829           /* Perhaps the start of composite character.  We simply skip
1830              it because analyzing it is too heavy for detecting.  But,
1831              at least, we check that the composite character
1832              constitutes of more than 4 bytes.  */
1833           const unsigned char *src_start;
1834
1835         repeat:
1836           src_start = src;
               /* Skip the run of bytes >= 0xA0; the byte that ends the
                  run is left in C.  */
1837           do
1838             {
1839               ONE_MORE_BYTE (c);
1840             }
1841           while (c >= 0xA0);
1842
               /* Too short for composition data: leave the loop, which
                  rejects the category.  */
1843           if (src - src_start <= 4)
1844             break;
1845           found = CATEGORY_MASK_EMACS_MULE;
               /* Another composition follows immediately.  */
1846           if (c == 0x80)
1847             goto repeat;
1848         }
1849
1850       if (c < 0x80)
1851         {
               /* ESC, SI and SO make the text unacceptable for this
                  category; any other byte below 0x80 is accepted.  */
1852           if (c < 0x20
1853               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1854             break;
1855         }
1856       else
1857         {
               /* C starts a multibyte sequence; the table gives its
                  total length, so that many bytes minus one, each
                  >= 0xA0, must follow.  */
1858           int more_bytes = emacs_mule_bytes[c] - 1;
1859
1860           while (more_bytes > 0)
1861             {
1862               ONE_MORE_BYTE (c);
1863               if (c < 0xA0)
1864                 {
1865                   src--;        /* Unread the last byte.  */
1866                   break;
1867                 }
1868               more_bytes--;
1869             }
               /* The sequence ended early (or the table entry was not
                  positive): reject.  */
1870           if (more_bytes != 0)
1871             break;
1872           found = CATEGORY_MASK_EMACS_MULE;
1873         }
1874     }
       /* Reached only through a `break' above.  */
1875   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1876   return 0;
1877
1878  no_more_source:
       /* Bytes of an unfinished sequence had been read when the source
          ran out; in the last block nothing more can complete it.  */
1879   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1880     {
1881       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1882       return 0;
1883     }
1884   detect_info->found |= found;
1885   return 1;
1886 }
1887
1888
1889 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1890    character.  If CMP_STATUS indicates that we must expect MSEQ or
1891    RULE described above, decode it and return the negative value of
1892    the decoded character or rule.  If an invalid byte is found, return
1893    -1.  If SRC is too short, return -2.

        On a successful return, *NBYTES is the number of source bytes
        parsed and *NCHARS is the value of the local `consumed_chars'
        counter (NOTE(review): presumably advanced by ONE_MORE_BYTE,
        which is defined outside this chunk -- confirm).  If ID is
        non-null, *ID receives the charset ID of the character; it is
        not set on the path that returns a negated old-style RULE byte.
        None of the output parameters is set when -1 or -2 is
        returned.  */
1894
1895 static int
1896 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1897                  int *nbytes, int *nchars, int *id,
1898                  struct composition_status *cmp_status)
1899 {
1900   const unsigned char *src_end = coding->source + coding->src_bytes;
1901   const unsigned char *src_base = src;
1902   bool multibytep = coding->src_multibyte;
1903   int charset_ID;
1904   unsigned code;
1905   int c;
1906   int consumed_chars = 0;
       /* Set when the sequence was read in the old-style MSEQ form; the
          result is then returned negated.  */
1907   bool mseq_found = 0;
1908
1909   ONE_MORE_BYTE (c);
1910   if (c < 0)
1911     {
           /* A negative value is taken as an already decoded character:
              negate it back and use the charset of table entry 0.  */
1912       c = -c;
1913       charset_ID = emacs_mule_charset[0];
1914     }
1915   else
1916     {
1917       if (c >= 0xA0)
1918         {
               /* A first byte >= 0xA0 is acceptable only inside an
                  old-style composition.  */
1919           if (cmp_status->state != COMPOSING_NO
1920               && cmp_status->old_form)
1921             {
1922               if (cmp_status->state == COMPOSING_CHAR)
1923                 {
                       /* MSEQ: 0xA0 followed by (ASCII code + 0x80), or
                          a leading code shifted up by 0x20.  Map the
                          byte back to its normal value.  */
1924                   if (c == 0xA0)
1925                     {
1926                       ONE_MORE_BYTE (c);
1927                       c -= 0x80;
1928                       if (c < 0)
1929                         goto invalid_code;
1930                     }
1931                   else
1932                     c -= 0x20;
1933                   mseq_found = 1;
1934                 }
1935               else
1936                 {
                       /* Any other composing state: hand the byte back
                          negated, as a rule, without decoding it.  */
1937                   *nbytes = src - src_base;
1938                   *nchars = consumed_chars;
1939                   return -c;
1940                 }
1941             }
1942           else
1943             goto invalid_code;
1944         }
1945
           /* Dispatch on the total sequence length that the leading
              byte implies.  Every following byte must be >= 0xA0; its
              low 7 bits contribute to CODE.  */
1946       switch (emacs_mule_bytes[c])
1947         {
1948         case 2:
               /* Leading code, one position byte.  */
1949           if ((charset_ID = emacs_mule_charset[c]) < 0)
1950             goto invalid_code;
1951           ONE_MORE_BYTE (c);
1952           if (c < 0xA0)
1953             goto invalid_code;
1954           code = c & 0x7F;
1955           break;
1956
1957         case 3:
1958           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1959               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1960             {
                   /* Private leading code, a byte that selects the
                      charset, one position byte.  */
1961               ONE_MORE_BYTE (c);
1962               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1963                 goto invalid_code;
1964               ONE_MORE_BYTE (c);
1965               if (c < 0xA0)
1966                 goto invalid_code;
1967               code = c & 0x7F;
1968             }
1969           else
1970             {
                   /* Leading code, two position bytes.  */
1971               if ((charset_ID = emacs_mule_charset[c]) < 0)
1972                 goto invalid_code;
1973               ONE_MORE_BYTE (c);
1974               if (c < 0xA0)
1975                 goto invalid_code;
1976               code = (c & 0x7F) << 8;
1977               ONE_MORE_BYTE (c);
1978               if (c < 0xA0)
1979                 goto invalid_code;
1980               code |= c & 0x7F;
1981             }
1982           break;
1983
1984         case 4:
               /* Leading code, a byte that selects the charset, two
                  position bytes.  */
1985           ONE_MORE_BYTE (c);
1986           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1987             goto invalid_code;
1988           ONE_MORE_BYTE (c);
1989           if (c < 0xA0)
1990             goto invalid_code;
1991           code = (c & 0x7F) << 8;
1992           ONE_MORE_BYTE (c);
1993           if (c < 0xA0)
1994             goto invalid_code;
1995           code |= c & 0x7F;
1996           break;
1997
1998         case 1:
               /* A single byte: ASCII, or else an eight-bit byte.  */
1999           code = c;
2000           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2001           break;
2002
2003         default:
2004           emacs_abort ();
2005         }
           /* Convert (charset, code) to a character in C; a negative
              result is treated as an invalid code.  */
2006       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2007                           CHARSET_FROM_ID (charset_ID), code, c);
2008       if (c < 0)
2009         goto invalid_code;
2010     }
2011   *nbytes = src - src_base;
2012   *nchars = consumed_chars;
2013   if (id)
2014     *id = charset_ID;
2015   return (mseq_found ? -c : c);
2016
2017  no_more_source:
2018   return -2;
2019
2020  invalid_code:
2021   return -1;
2022 }
2023
2024
2025 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2026
2027 /* Handle these composition sequences ('|': the end of header elements,
2028    BYTES and CHARS >= 0xA0):
2029
2030    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2031    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2032    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2033
2034    and these old form:
2035
2036    (4) relative composition: 0x80 | MSEQ ... MSEQ
2037    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2038
2039    When the starter 0x80 and the following header elements are found,
2040    this annotation header is produced.
2041
2042         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2043
2044    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2045    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2046
2047    Then, upon reading the following elements, these codes are produced
2048    until the composition end is found:
2049
2050    (1) CHAR ... CHAR
2051    (2) ALT ... ALT CHAR ... CHAR
2052    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2053    (4) CHAR ... CHAR
2054    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2055
2056    When the composition end is found, LENGTH and NCHARS in the
2057    annotation header is updated as below:
2058
2059    (1) LENGTH: unchanged, NCHARS: unchanged
2060    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2061    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2062    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2063    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2064
2065    If an error is found while composing, the annotation header is
2066    changed to the original composition header (plus filler -1s) as
2067    below:
2068
2069    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2070    (5)          [ 0x80 0xFF -1 -1 -1 ]
2071
2072    and the sequence [ -2 DECODED-RULE ] is changed to the original
2073    byte sequence as below:
2074         o the original byte sequence is B: [ B -1 ]
2075         o the original byte sequence is B1 B2: [ B1 B2 ]
2076
2077    Most of the routines are implemented by macros because many
2078    variables and labels in the caller decode_coding_emacs_mule must be
2079    accessible, and they are usually called just once (thus they don't
2080    increase the size of the compiled object).  */
2081
2082 /* Decode a composition rule represented by C as a component of
2083    composition sequence of Emacs 20 style.  Set RULE to the decoded
2084    rule.

        C is modified: 0xA0 is subtracted from it, and the result must
        be in the range 0..80, otherwise control jumps to the label
        `invalid_code' of the enclosing function.  The value is split
        into GREF = C / 9 and NREF = C % 9; a value of 4 in either is
        replaced by 10 before the two are combined with
        COMPOSITION_ENCODE_RULE.  */
2085
2086 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2087   do {                                                  \
2088     int gref, nref;                                     \
2089                                                         \
2090     c -= 0xA0;                                          \
2091     if (c < 0 || c >= 81)                               \
2092       goto invalid_code;                                \
2093     gref = c / 9, nref = c % 9;                         \
2094     if (gref == 4) gref = 10;                           \
2095     if (nref == 4) nref = 10;                           \
2096     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2097   } while (0)
2098
2099
2100 /* Decode a composition rule represented by C and the following byte
2101    at SRC as a component of composition sequence of Emacs 21 style.
2102    Set RULE to the decoded rule.

        GREF is C - 0x20, and NREF is the next source byte (read with
        ONE_MORE_BYTE, which overwrites C) minus 0x20.  Each must be in
        the range 0..80, otherwise control jumps to the label
        `invalid_code' of the enclosing function.  */
2103
2104 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2105   do {                                                  \
2106     int gref, nref;                                     \
2107                                                         \
2108     gref = c - 0x20;                                    \
2109     if (gref < 0 || gref >= 81)                         \
2110       goto invalid_code;                                \
2111     ONE_MORE_BYTE (c);                                  \
2112     nref = c - 0x20;                                    \
2113     if (nref < 0 || nref >= 81)                         \
2114       goto invalid_code;                                \
2115     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2116   } while (0)
2117
2118
2119 /* Start of Emacs 21 style format.  On entry, the variable `c' of the
2120    enclosing function holds the METHOD byte (0xF2 + method).  The next
2121    two bytes read from SRC are BYTES (0xA0 + the byte length of this
2122    composition information) and CHARS (0xA0 + the number of characters
        composed by this composition).

        Control jumps to the label `invalid_code' unless the decoded
        byte length is at least 3 (exactly 4 for COMPOSITION_RELATIVE)
        and the decoded character count is positive and less than
        MAX_COMPOSITION_COMPONENTS.  Otherwise CMP_STATUS is set up
        (state COMPOSING_CHAR for a relative composition,
        COMPOSING_COMPONENT_CHAR for the other methods; ncomps is the
        byte length minus 4) and the annotation header is added with
        ADD_COMPOSITION_DATA.  */
2123
2124 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2125   do {                                                                  \
2126     enum composition_method method = c - 0xF2;                          \
2127     int nbytes, nchars;                                                 \
2128                                                                         \
2129     ONE_MORE_BYTE (c);                                                  \
2130     if (c < 0)                                                          \
2131       goto invalid_code;                                                \
2132     nbytes = c - 0xA0;                                                  \
2133     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2134       goto invalid_code;                                                \
2135     ONE_MORE_BYTE (c);                                                  \
2136     nchars = c - 0xA0;                                                  \
2137     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2138       goto invalid_code;                                                \
2139     cmp_status->old_form = 0;                                           \
2140     cmp_status->method = method;                                        \
2141     if (method == COMPOSITION_RELATIVE)                                 \
2142       cmp_status->state = COMPOSING_CHAR;                               \
2143     else                                                                \
2144       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2145     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2146     cmp_status->nchars = nchars;                                        \
2147     cmp_status->ncomps = nbytes - 4;                                    \
2148     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2149   } while (0)
2150
2151
2152 /* Start of Emacs 20 style format for relative composition.
        Mark CMP_STATUS as an old-form relative composition in state
        COMPOSING_CHAR, with both counters (nchars, ncomps) at zero and
        length MAX_ANNOTATION_LENGTH, and add the annotation header
        with zero NCHARS and NBYTES through ADD_COMPOSITION_DATA.  */
2153
2154 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2155   do {                                                          \
2156     cmp_status->old_form = 1;                                   \
2157     cmp_status->method = COMPOSITION_RELATIVE;                  \
2158     cmp_status->state = COMPOSING_CHAR;                         \
2159     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2160     cmp_status->nchars = cmp_status->ncomps = 0;                \
2161     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2162   } while (0)
2163
2164
2165 /* Start of Emacs 20 style format for rule-base composition.
        Mark CMP_STATUS as an old-form COMPOSITION_WITH_RULE
        composition in state COMPOSING_CHAR, with both counters
        (nchars, ncomps) at zero and length MAX_ANNOTATION_LENGTH, and
        add the annotation header with zero NCHARS and NBYTES through
        ADD_COMPOSITION_DATA.  */
2166
2167 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2168   do {                                                          \
2169     cmp_status->old_form = 1;                                   \
2170     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2171     cmp_status->state = COMPOSING_CHAR;                         \
2172     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2173     cmp_status->nchars = cmp_status->ncomps = 0;                \
2174     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2175   } while (0)
2176
2177
/* Start a composition; used by decode_coding_emacs_mule when the
   starter byte 0x80 has been read.  Read the next byte into `c' and
   select the format from it:

     0xF2 + (COMPOSITION_RELATIVE .. COMPOSITION_WITH_RULE_ALTCHARS):
       Emacs 21 style header;
     0xA0 .. 0xBF: Emacs 20 style relative composition; the byte is
       itself the first component, so SRC is moved back to re-read it;
     0xFF: Emacs 20 style rule-base composition.

   A negative value or any other byte jumps to the label
   `invalid_code' of the enclosing function.  */
2178 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2179   do {                                                  \
2180     const unsigned char *current_src = src;             \
2181                                                         \
2182     ONE_MORE_BYTE (c);                                  \
2183     if (c < 0)                                          \
2184       goto invalid_code;                                \
2185     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2186         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2187       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2188     else if (c < 0xA0)                                  \
2189       goto invalid_code;                                \
2190     else if (c < 0xC0)                                  \
2191       {                                                 \
2192         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2193         /* Re-read C as a composition component.  */    \
2194         src = current_src;                              \
2195       }                                                 \
2196     else if (c == 0xFF)                                 \
2197       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2198     else                                                \
2199       goto invalid_code;                                \
2200   } while (0)
2201
/* Finish the current composition normally.  The annotation header
   starts CMP_STATUS->length elements before CHARBUF.  For the old
   form, store the number of composed characters in header slot 2
   (NCHARS).  For the new form with a method above
   COMPOSITION_RELATIVE, set header slot 0 (LENGTH) to NCHARS minus the
   accumulated length, i.e. the negated length of the whole sequence
   minus NCHARS.  Then reset the state to COMPOSING_NO.  */
2202 #define EMACS_MULE_COMPOSITION_END()                            \
2203   do {                                                          \
2204     int idx = - cmp_status->length;                             \
2205                                                                 \
2206     if (cmp_status->old_form)                                   \
2207       charbuf[idx + 2] = cmp_status->nchars;                    \
2208     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2209       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2210     cmp_status->state = COMPOSING_NO;                           \
2211   } while (0)
2212
2213
/* Terminate a composition that is still in progress (it is invoked
   through EMACS_MULE_MAYBE_FINISH_COMPOSITION when the state is not
   COMPOSING_NO).  CHARBUF points just past the last element stored;
   the annotation header starts CMP_STATUS->length elements before it.

   If the composition is in the old form and has at least one
   character, it is kept: header slot 2 (NCHARS) receives the character
   count.  Otherwise the header is overwritten with the bytes of the
   original composition header, stored as raw 8-bit characters.

   Return the number of characters the caller must add to its character
   offset, and reset the state to COMPOSING_NO.  */
2214 static int
2215 emacs_mule_finish_composition (int *charbuf,
2216                                struct composition_status *cmp_status)
2217 {
2218   int idx = - cmp_status->length;
2219   int new_chars;
2220
2221   if (cmp_status->old_form && cmp_status->nchars > 0)
2222     {
2223       charbuf[idx + 2] = cmp_status->nchars;
2224       new_chars = 0;
           /* In a rule-base composition, state COMPOSING_CHAR with
              nchars > 0 is reached right after a [ -2 RULE ] pair was
              stored, so those are the last two elements.  */
2225       if (cmp_status->method == COMPOSITION_WITH_RULE
2226           && cmp_status->state == COMPOSING_CHAR)
2227         {
2228           /* The last rule was invalid.  */
2229           int rule = charbuf[-1] + 0xA0;
2230
               /* Replace the pair by one raw 8-bit character plus a -1
                  filler.  */
2231           charbuf[-2] = BYTE8_TO_CHAR (rule);
2232           charbuf[-1] = -1;
2233           new_chars = 1;
2234         }
2235     }
2236   else
2237     {
           /* Turn the header back into the original bytes, beginning
              with the starter 0x80.  */
2238       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2239
2240       if (cmp_status->method == COMPOSITION_WITH_RULE)
2241         {
2242           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2243           charbuf[idx++] = -3;
2244           charbuf[idx++] = 0;
2245           new_chars = 1;
2246         }
2247       else
2248         {
               /* IDX now indexes header slot 1, so these read slots 2
                  (NCHARS) and 3 (NBYTES).  Both must be fetched before
                  the stores below overwrite them.  */
2249           int nchars = charbuf[idx + 1] + 0xA0;
2250           int nbytes = charbuf[idx + 2] + 0xA0;
2251
2252           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2253           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2254           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2255           charbuf[idx++] = -1;
2256           new_chars = 4;
2257         }
2258     }
2259   cmp_status->state = COMPOSING_NO;
2260   return new_chars;
2261 }
2262
/* If a composition is in progress, terminate it with
   emacs_mule_finish_composition and add the value it returns to the
   variable `char_offset' of the enclosing function.  */
2263 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2264   do {                                                                    \
2265     if (cmp_status->state != COMPOSING_NO)                                \
2266       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2267   } while (0)
2268
2269
2270 static void
2271 decode_coding_emacs_mule (struct coding_system *coding)
2272 {
2273   const unsigned char *src = coding->source + coding->consumed;
2274   const unsigned char *src_end = coding->source + coding->src_bytes;
2275   const unsigned char *src_base;
2276   int *charbuf = coding->charbuf + coding->charbuf_used;
2277   /* We may produce two annotations (charset and composition) in one
2278      loop and one more charset annotation at the end.  */
2279   int *charbuf_end
2280     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2281       /* We can produce up to 2 characters in a loop.  */
2282       - 1;
2283   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2284   bool multibytep = coding->src_multibyte;
2285   ptrdiff_t char_offset = coding->produced_char;
2286   ptrdiff_t last_offset = char_offset;
2287   int last_id = charset_ascii;
2288   bool eol_dos
2289     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2290   int byte_after_cr = -1;
2291   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2292
2293   if (cmp_status->state != COMPOSING_NO)
2294     {
2295       int i;
2296
2297       if (charbuf_end - charbuf < cmp_status->length)
2298         emacs_abort ();
2299       for (i = 0; i < cmp_status->length; i++)
2300         *charbuf++ = cmp_status->carryover[i];
2301       coding->annotated = 1;
2302     }
2303
2304   while (1)
2305     {
2306       int c, id IF_LINT (= 0);
2307
2308       src_base = src;
2309       consumed_chars_base = consumed_chars;
2310
2311       if (charbuf >= charbuf_end)
2312         {
2313           if (byte_after_cr >= 0)
2314             src_base--;
2315           break;
2316         }
2317
2318       if (byte_after_cr >= 0)
2319         c = byte_after_cr, byte_after_cr = -1;
2320       else
2321         ONE_MORE_BYTE (c);
2322
2323       if (c < 0 || c == 0x80)
2324         {
2325           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2326           if (c < 0)
2327             {
2328               *charbuf++ = -c;
2329               char_offset++;
2330             }
2331           else
2332             DECODE_EMACS_MULE_COMPOSITION_START ();
2333           continue;
2334         }
2335
2336       if (c < 0x80)
2337         {
2338           if (eol_dos && c == '\r')
2339             ONE_MORE_BYTE (byte_after_cr);
2340           id = charset_ascii;
2341           if (cmp_status->state != COMPOSING_NO)
2342             {
2343               if (cmp_status->old_form)
2344                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2345               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2346                 cmp_status->ncomps--;
2347             }
2348         }
2349       else
2350         {
2351           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2352           /* emacs_mule_char can load a charset map from a file, which
2353              allocates a large structure and might cause buffer text
2354              to be relocated as result.  Thus, we need to remember the
2355              original pointer to buffer text, and fix up all related
2356              pointers after the call.  */
2357           const unsigned char *orig = coding->source;
2358           ptrdiff_t offset;
2359
2360           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2361                                cmp_status);
2362           offset = coding->source - orig;
2363           if (offset)
2364             {
2365               src += offset;
2366               src_base += offset;
2367               src_end += offset;
2368             }
2369           if (c < 0)
2370             {
2371               if (c == -1)
2372                 goto invalid_code;
2373               if (c == -2)
2374                 break;
2375             }
2376           src = src_base + nbytes;
2377           consumed_chars = consumed_chars_base + nchars;
2378           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2379             cmp_status->ncomps -= nchars;
2380         }
2381
2382       /* Now if C >= 0, we found a normally encoded character, if C <
2383          0, we found an old-style composition component character or
2384          rule.  */
2385
2386       if (cmp_status->state == COMPOSING_NO)
2387         {
2388           if (last_id != id)
2389             {
2390               if (last_id != charset_ascii)
2391                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2392                                   last_id);
2393               last_id = id;
2394               last_offset = char_offset;
2395             }
2396           *charbuf++ = c;
2397           char_offset++;
2398         }
2399       else if (cmp_status->state == COMPOSING_CHAR)
2400         {
2401           if (cmp_status->old_form)
2402             {
2403               if (c >= 0)
2404                 {
2405                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2406                   *charbuf++ = c;
2407                   char_offset++;
2408                 }
2409               else
2410                 {
2411                   *charbuf++ = -c;
2412                   cmp_status->nchars++;
2413                   cmp_status->length++;
2414                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2415                     EMACS_MULE_COMPOSITION_END ();
2416                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2417                     cmp_status->state = COMPOSING_RULE;
2418                 }
2419             }
2420           else
2421             {
2422               *charbuf++ = c;
2423               cmp_status->length++;
2424               cmp_status->nchars--;
2425               if (cmp_status->nchars == 0)
2426                 EMACS_MULE_COMPOSITION_END ();
2427             }
2428         }
2429       else if (cmp_status->state == COMPOSING_RULE)
2430         {
2431           int rule;
2432
2433           if (c >= 0)
2434             {
2435               EMACS_MULE_COMPOSITION_END ();
2436               *charbuf++ = c;
2437               char_offset++;
2438             }
2439           else
2440             {
2441               c = -c;
2442               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2443               if (rule < 0)
2444                 goto invalid_code;
2445               *charbuf++ = -2;
2446               *charbuf++ = rule;
2447               cmp_status->length += 2;
2448               cmp_status->state = COMPOSING_CHAR;
2449             }
2450         }
2451       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2452         {
2453           *charbuf++ = c;
2454           cmp_status->length++;
2455           if (cmp_status->ncomps == 0)
2456             cmp_status->state = COMPOSING_CHAR;
2457           else if (cmp_status->ncomps > 0)
2458             {
2459               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2460                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2461             }
2462           else
2463             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2464         }
2465       else                      /* COMPOSING_COMPONENT_RULE */
2466         {
2467           int rule;
2468
2469           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2470           if (rule < 0)
2471             goto invalid_code;
2472           *charbuf++ = -2;
2473           *charbuf++ = rule;
2474           cmp_status->length += 2;
2475           cmp_status->ncomps--;
2476           if (cmp_status->ncomps > 0)
2477             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2478           else
2479             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2480         }
2481       continue;
2482
2483     invalid_code:
2484       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2485       src = src_base;
2486       consumed_chars = consumed_chars_base;
2487       ONE_MORE_BYTE (c);
2488       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2489       char_offset++;
2490       coding->errors++;
2491     }
2492
2493  no_more_source:
2494   if (cmp_status->state != COMPOSING_NO)
2495     {
2496       if (coding->mode & CODING_MODE_LAST_BLOCK)
2497         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2498       else
2499         {
2500           int i;
2501
2502           charbuf -= cmp_status->length;
2503           for (i = 0; i < cmp_status->length; i++)
2504             cmp_status->carryover[i] = charbuf[i];
2505         }
2506     }
2507   if (last_id != charset_ascii)
2508     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2509   coding->consumed_char += consumed_chars_base;
2510   coding->consumed = src_base - coding->source;
2511   coding->charbuf_used = charbuf - coding->charbuf;
2512 }
2513
2514
/* Store in CODES (an array of at least two bytes) the leading code(s)
   that introduce a character of the charset whose emacs-mule id is ID.

   If ID is less than 0xA0, ID itself is the only leading code: it is
   stored in CODES[0] and CODES[1] is set to 0, meaning "no second
   leading code".  Otherwise CODES[0] is one of the prefix bytes 0x9A,
   0x9B, 0x9C or 0x9D, chosen by the range ID falls in (below 0xE0,
   below 0xF0, below 0xF5, anything else), and CODES[1] is ID.

   ID is evaluated exactly once.  The expansion is a single statement
   with no trailing semicolon, so the macro can be used as the body of
   an unbraced `if' that has an `else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)                     \
  do {                                                          \
    int emacs_mule_lc_id_ = (id);                               \
                                                                \
    if (emacs_mule_lc_id_ < 0xA0)                               \
      (codes)[0] = emacs_mule_lc_id_, (codes)[1] = 0;           \
    else if (emacs_mule_lc_id_ < 0xE0)                          \
      (codes)[0] = 0x9A, (codes)[1] = emacs_mule_lc_id_;        \
    else if (emacs_mule_lc_id_ < 0xF0)                          \
      (codes)[0] = 0x9B, (codes)[1] = emacs_mule_lc_id_;        \
    else if (emacs_mule_lc_id_ < 0xF5)                          \
      (codes)[0] = 0x9C, (codes)[1] = emacs_mule_lc_id_;        \
    else                                                        \
      (codes)[0] = 0x9D, (codes)[1] = emacs_mule_lc_id_;        \
  } while (0)
2528
2529
2530 static bool
2531 encode_coding_emacs_mule (struct coding_system *coding)
2532 {
2533   bool multibytep = coding->dst_multibyte;
2534   int *charbuf = coding->charbuf;
2535   int *charbuf_end = charbuf + coding->charbuf_used;
2536   unsigned char *dst = coding->destination + coding->produced;
2537   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2538   int safe_room = 8;
2539   ptrdiff_t produced_chars = 0;
2540   Lisp_Object attrs, charset_list;
2541   int c;
2542   int preferred_charset_id = -1;
2543
2544   CODING_GET_INFO (coding, attrs, charset_list);
2545   if (! EQ (charset_list, Vemacs_mule_charset_list))
2546     {
2547       charset_list = Vemacs_mule_charset_list;
2548       ASET (attrs, coding_attr_charset_list, charset_list);
2549     }
2550
2551   while (charbuf < charbuf_end)
2552     {
2553       ASSURE_DESTINATION (safe_room);
2554       c = *charbuf++;
2555
2556       if (c < 0)
2557         {
2558           /* Handle an annotation.  */
2559           switch (*charbuf)
2560             {
2561             case CODING_ANNOTATE_COMPOSITION_MASK:
2562               /* Not yet implemented.  */
2563               break;
2564             case CODING_ANNOTATE_CHARSET_MASK:
2565               preferred_charset_id = charbuf[3];
2566               if (preferred_charset_id >= 0
2567                   && NILP (Fmemq (make_number (preferred_charset_id),
2568                                   charset_list)))
2569                 preferred_charset_id = -1;
2570               break;
2571             default:
2572               emacs_abort ();
2573             }
2574           charbuf += -c - 1;
2575           continue;
2576         }
2577
2578       if (ASCII_CHAR_P (c))
2579         EMIT_ONE_ASCII_BYTE (c);
2580       else if (CHAR_BYTE8_P (c))
2581         {
2582           c = CHAR_TO_BYTE8 (c);
2583           EMIT_ONE_BYTE (c);
2584         }
2585       else
2586         {
2587           struct charset *charset;
2588           unsigned code;
2589           int dimension;
2590           int emacs_mule_id;
2591           unsigned char leading_codes[2];
2592
2593           if (preferred_charset_id >= 0)
2594             {
2595               bool result;
2596
2597               charset = CHARSET_FROM_ID (preferred_charset_id);
2598               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2599               if (result)
2600                 code = ENCODE_CHAR (charset, c);
2601               else
2602                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2603                                      &code, charset);
2604             }
2605           else
2606             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2607                                  &code, charset);
2608           if (! charset)
2609             {
2610               c = coding->default_char;
2611               if (ASCII_CHAR_P (c))
2612                 {
2613                   EMIT_ONE_ASCII_BYTE (c);
2614                   continue;
2615                 }
2616               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2617                                    &code, charset);
2618             }
2619           dimension = CHARSET_DIMENSION (charset);
2620           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2621           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2622           EMIT_ONE_BYTE (leading_codes[0]);
2623           if (leading_codes[1])
2624             EMIT_ONE_BYTE (leading_codes[1]);
2625           if (dimension == 1)
2626             EMIT_ONE_BYTE (code | 0x80);
2627           else
2628             {
2629               code |= 0x8080;
2630               EMIT_ONE_BYTE (code >> 8);
2631               EMIT_ONE_BYTE (code & 0xFF);
2632             }
2633         }
2634     }
2635   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2636   coding->produced_char += produced_chars;
2637   coding->produced = dst - coding->destination;
2638   return 0;
2639 }
2640
2641 \f
2642 /*** 7. ISO2022 handlers ***/
2643
2644 /* The following note describes the coding system ISO2022 briefly.
2645    Since the intention of this note is to help understand the
2646    functions in this file, some parts are NOT ACCURATE or are OVERLY
2647    SIMPLIFIED.  For thorough understanding, please refer to the
2648    original document of ISO2022.  This is equivalent to the standard
2649    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2650
2651    ISO2022 provides many mechanisms to encode several character sets
2652    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2653    is encoded using bytes less than 128.  This may make the encoded
2654    text a little bit longer, but the text passes more easily through
2655    several types of gateway, some of which strip off the MSB (Most
2656    Significant Bit).
2657
2658    There are two kinds of character sets: control character sets and
2659    graphic character sets.  The former contain control characters such
2660    as `newline' and `escape' to provide control functions (control
2661    functions are also provided by escape sequences).  The latter
2662    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2663    two control character sets and many graphic character sets.
2664
2665    Graphic character sets are classified into one of the following
2666    four classes, according to the number of bytes (DIMENSION) and
2667    number of characters in one dimension (CHARS) of the set:
2668    - DIMENSION1_CHARS94
2669    - DIMENSION1_CHARS96
2670    - DIMENSION2_CHARS94
2671    - DIMENSION2_CHARS96
2672
2673    In addition, each character set is assigned an identification tag,
2674    unique for each set, called the "final character" (denoted as <F>
2675    hereafter).  The <F> of each character set is decided by ECMA(*)
2676    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2677    (0x30..0x3F are for private use only).
2678
2679    Note (*): ECMA = European Computer Manufacturers Association
2680
2681    Here are examples of graphic character sets [NAME(<F>)]:
2682         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2683         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2684         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2685         o DIMENSION2_CHARS96 -- none for the moment
2686
2687    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2688         C0 [0x00..0x1F] -- control character plane 0
2689         GL [0x20..0x7F] -- graphic character plane 0
2690         C1 [0x80..0x9F] -- control character plane 1
2691         GR [0xA0..0xFF] -- graphic character plane 1
2692
2693    A control character set is directly designated and invoked to C0 or
2694    C1 by an escape sequence.  The most common case is that:
2695    - ISO646's  control character set is designated/invoked to C0, and
2696    - ISO6429's control character set is designated/invoked to C1,
2697    and usually these designations/invocations are omitted in encoded
2698    text.  In a 7-bit environment, only C0 can be used, and a control
2699    character for C1 is encoded by an appropriate escape sequence to
2700    fit into the environment.  All control characters for C1 are
2701    defined to have corresponding escape sequences.
2702
2703    A graphic character set is at first designated to one of four
2704    graphic registers (G0 through G3), then these graphic registers are
2705    invoked to GL or GR.  These designations and invocations can be
2706    done independently.  The most common case is that G0 is invoked to
2707    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2708    these invocations and designations are omitted in encoded text.
2709    In a 7-bit environment, only GL can be used.
2710
2711    When a graphic character set of CHARS94 is invoked to GL, codes
2712    0x20 and 0x7F of the GL area work as control characters SPACE and
2713    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2714    be used.
2715
2716    There are two ways of invocation: locking-shift and single-shift.
2717    With locking-shift, the invocation lasts until the next different
2718    invocation, whereas with single-shift, the invocation affects the
2719    following character only and doesn't affect the locking-shift
2720    state.  Invocations are done by the following control characters or
2721    escape sequences:
2722
2723    ----------------------------------------------------------------------
2724    abbrev  function                  cntrl escape seq   description
2725    ----------------------------------------------------------------------
2726    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2727    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2728    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2729    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2730    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2731    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2732    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2733    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2734    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2735    ----------------------------------------------------------------------
2736    (*) These are not used by any known coding system.
2737
2738    Control characters for these functions are defined by macros
2739    ISO_CODE_XXX in `coding.h'.
2740
2741    Designations are done by the following escape sequences:
2742    ----------------------------------------------------------------------
2743    escape sequence      description
2744    ----------------------------------------------------------------------
2745    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2746    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2747    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2748    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2749    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2750    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2751    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2752    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2753    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2754    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2755    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2756    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2757    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2758    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2759    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2760    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2761    ----------------------------------------------------------------------
2762
2763    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2764    of dimension 1, chars 94, and final character <F>, etc...
2765
2766    Note (*): Although these designations are not allowed in ISO2022,
2767    Emacs accepts them on decoding, and produces them on encoding
2768    CHARS96 character sets in a coding system which is characterized as
2769    7-bit environment, non-locking-shift, and non-single-shift.
2770
2771    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2772    '(' must be omitted.  We refer to this as "short-form" hereafter.
2773
2774    Now you may notice that there are a lot of ways of encoding the
2775    same multilingual text in ISO2022.  Actually, there exist many
2776    coding systems such as Compound Text (used in X11's inter-client
2777    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2778    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2779    localized platforms), and all of these are variants of ISO2022.
2780
2781    In addition to the above, Emacs handles two more kinds of escape
2782    sequences: ISO6429's direction specification and Emacs' private
2783    sequence for specifying character composition.
2784
2785    ISO6429's direction specification takes the following form:
2786         o CSI ']'      -- end of the current direction
2787         o CSI '0' ']'  -- end of the current direction
2788         o CSI '1' ']'  -- start of left-to-right text
2789         o CSI '2' ']'  -- start of right-to-left text
2790    The control character CSI (0x9B: control sequence introducer) is
2791    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2792
2793    Character composition specification takes the following form:
2794         o ESC '0' -- start relative composition
2795         o ESC '1' -- end composition
2796         o ESC '2' -- start rule-base composition (*)
2797         o ESC '3' -- start relative composition with alternate chars  (**)
2798         o ESC '4' -- start rule-base composition with alternate chars  (**)
2799   Since these are not standard escape sequences of any ISO standard,
2800   the use of them with these meanings is restricted to Emacs only.
2801
2802   (*) This form is used only in Emacs 20.7 and older versions,
2803   but newer versions can safely decode it.
2804   (**) This form is used only in Emacs 21.1 and newer versions,
2805   and older versions can't decode it.
2806
2807   Here's a list of example usages of these composition escape
2808   sequences (categorized by `enum composition_method').
2809
2810   COMPOSITION_RELATIVE:
2811         ESC 0 CHAR [ CHAR ] ESC 1
2812   COMPOSITION_WITH_RULE:
2813         ESC 2 CHAR [ RULE CHAR ] ESC 1
2814   COMPOSITION_WITH_ALTCHARS:
2815         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2816   COMPOSITION_WITH_RULE_ALTCHARS:
2817         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2818
2819 static enum iso_code_class_type iso_code_class[256];
     /* The table declared just above has 256 slots of type
        `enum iso_code_class_type'.  NOTE(review): presumably one slot
        per byte value, giving the ISO-2022 code class of that byte,
        and filled in elsewhere; it is not referenced in this part of
        the file -- confirm.  */
2820
/* Return nonzero if the charset whose id is ID is safe for CODING,
   i.e. ID is within 0..CODING->max_charset_id and the byte at index
   ID of CODING->safe_charsets is not 255.

   A negative ID (the value used for "no such charset") is never safe;
   it is rejected before it can be used as an index.  ID may be
   evaluated more than once, so it must not have side effects.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2824
2825 static void
2826 setup_iso_safe_charsets (Lisp_Object attrs)
2827 {
2828   Lisp_Object charset_list, safe_charsets;
2829   Lisp_Object request;
2830   Lisp_Object reg_usage;
2831   Lisp_Object tail;
2832   EMACS_INT reg94, reg96;
2833   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2834   int max_charset_id;
2835
2836   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2837   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2838       && ! EQ (charset_list, Viso_2022_charset_list))
2839     {
2840       charset_list = Viso_2022_charset_list;
2841       ASET (attrs, coding_attr_charset_list, charset_list);
2842       ASET (attrs, coding_attr_safe_charsets, Qnil);
2843     }
2844
2845   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2846     return;
2847
2848   max_charset_id = 0;
2849   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2850     {
2851       int id = XINT (XCAR (tail));
2852       if (max_charset_id < id)
2853         max_charset_id = id;
2854     }
2855
2856   safe_charsets = make_uninit_string (max_charset_id + 1);
2857   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2858   request = AREF (attrs, coding_attr_iso_request);
2859   reg_usage = AREF (attrs, coding_attr_iso_usage);
2860   reg94 = XINT (XCAR (reg_usage));
2861   reg96 = XINT (XCDR (reg_usage));
2862
2863   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2864     {
2865       Lisp_Object id;
2866       Lisp_Object reg;
2867       struct charset *charset;
2868
2869       id = XCAR (tail);
2870       charset = CHARSET_FROM_ID (XINT (id));
2871       reg = Fcdr (Fassq (id, request));
2872       if (! NILP (reg))
2873         SSET (safe_charsets, XINT (id), XINT (reg));
2874       else if (charset->iso_chars_96)
2875         {
2876           if (reg96 < 4)
2877             SSET (safe_charsets, XINT (id), reg96);
2878         }
2879       else
2880         {
2881           if (reg94 < 4)
2882             SSET (safe_charsets, XINT (id), reg94);
2883         }
2884     }
2885   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2886 }
2887
2888
2889 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2890    Return true if a text is encoded in one of ISO-2022 based coding
2891    systems.

        The bytes examined are those of CODING->source, CODING->src_bytes
        long, starting after the first CODING->head_ascii bytes.  The
        verdict is accumulated into DETECT_INFO: CATEGORY_MASK_ISO is
        always added to `checked', and `found' and `rejected' receive
        the ISO categories that the byte sequences seen support and
        rule out respectively.

        Return 0, with every ISO category marked as rejected, as soon
        as the scan has rejected all of them.  Return 1 when control
        reaches the label `no_more_source' (presumably jumped to by
        ONE_MORE_BYTE once the source is exhausted; that jump is not
        visible in this function).  */
2892
2893 static bool
2894 detect_coding_iso_2022 (struct coding_system *coding,
2895                         struct coding_detection_info *detect_info)
2896 {
2897   const unsigned char *src = coding->source, *src_base = src;
2898   const unsigned char *src_end = coding->source + coding->src_bytes;
2899   bool multibytep = coding->src_multibyte;
2900   bool single_shifting = 0;
2901   int id;
2902   int c, c1;
2903   ptrdiff_t consumed_chars = 0;
2904   int i;
2905   int rejected = 0;
2906   int found = 0;
       /* Negative while outside a composition; otherwise the number of
          characters counted since the composition start sequence.  */
2907   int composition_count = -1;
2908
2909   detect_info->checked |= CATEGORY_MASK_ISO;
2910
       /* For each ISO category coding system that is set up (id >= 0),
          refresh its max_charset_id and safe_charsets members from the
          safe-charsets string of its attributes.  */
2911   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2912     {
2913       struct coding_system *this = &(coding_categories[i]);
2914       Lisp_Object attrs, val;
2915
2916       if (this->id < 0)
2917         continue;
2918       attrs = CODING_ID_ATTRS (this->id);
2919       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2920           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2921         setup_iso_safe_charsets (attrs);
2922       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2923       this->max_charset_id = SCHARS (val) - 1;
2924       this->safe_charsets = SDATA (val);
2925     }
2926
2927   /* A coding system of this category is always ASCII compatible.  */
2928   src += coding->head_ascii;
2929
       /* Scan until every ISO category has been rejected.  Note that a
          `break' directly inside the switch below only ends the
          handling of the current byte.  */
2930   while (rejected != CATEGORY_MASK_ISO)
2931     {
2932       src_base = src;
2933       ONE_MORE_BYTE (c);
2934       switch (c)
2935         {
2936         case ISO_CODE_ESC:
2937           if (inhibit_iso_escape_detection)
2938             break;
2939           single_shifting = 0;
2940           ONE_MORE_BYTE (c);
2941           if (c == 'N' || c == 'O')
2942             {
2943               /* ESC <Fe> for SS2 or SS3.  */
2944               single_shifting = 1;
2945               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2946             }
2947           else if (c == '1')
2948             {
2949               /* End of composition.  */
2950               if (composition_count < 0
2951                   || composition_count > MAX_COMPOSITION_COMPONENTS)
2952                 /* Invalid */
2953                 break;
2954               composition_count = -1;
2955               found |= CATEGORY_MASK_ISO;
2956             }
2957           else if (c >= '0' && c <= '4')
2958             {
2959               /* ESC <Fp> for start of composition (ESC '1', the end
                      of composition, is handled above).  */
2960               composition_count = 0;
2961             }
2962           else
2963             {
2964               if (c >= '(' && c <= '/')
2965                 {
2966                   /* Designation sequence for a charset of dimension 1.  */
2967                   ONE_MORE_BYTE (c1);
2968                   if (c1 < ' ' || c1 >= 0x80
2969                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2970                     /* Invalid designation sequence.  Just ignore.  */
2971                     break;
2972                 }
2973               else if (c == '$')
2974                 {
2975                   /* Designation sequence for a charset of dimension 2.  */
2976                   ONE_MORE_BYTE (c);
2977                   if (c >= '@' && c <= 'B')
2978                     /* Designation for JISX0208.1978, GB2312, or JISX0208.
                            NOTE(review): unlike the two lookups that are
                            guarded by a `< 0' test, this table value
                            reaches SAFE_CHARSET_P unchecked -- confirm
                            that it cannot be negative.  */
2979                     id = iso_charset_table[1][0][c];
2980                   else if (c >= '(' && c <= '/')
2981                     {
2982                       ONE_MORE_BYTE (c1);
2983                       if (c1 < ' ' || c1 >= 0x80
2984                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2985                         /* Invalid designation sequence.  Just ignore.  */
2986                         break;
2987                     }
2988                   else
2989                     /* Invalid designation sequence.  Just ignore it.  */
2990                     break;
2991                 }
2992               else
2993                 {
2994                   /* Invalid escape sequence.  Just ignore it.  */
2995                   break;
2996                 }
2997
2998               /* We found a valid designation sequence for CHARSET.  */
2999               rejected |= CATEGORY_MASK_ISO_8BIT;
3000               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3001                                   id))
3002                 found |= CATEGORY_MASK_ISO_7;
3003               else
3004                 rejected |= CATEGORY_MASK_ISO_7;
3005               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3006                                   id))
3007                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3008               else
3009                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3010               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3011                                   id))
3012                 found |= CATEGORY_MASK_ISO_7_ELSE;
3013               else
3014                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3015               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3016                                   id))
3017                 found |= CATEGORY_MASK_ISO_8_ELSE;
3018               else
3019                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3020             }
3021           break;
3022
3023         case ISO_CODE_SO:
3024         case ISO_CODE_SI:
3025           /* Locking shift out/in.  */
3026           if (inhibit_iso_escape_detection)
3027             break;
3028           single_shifting = 0;
3029           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3030           break;
3031
3032         case ISO_CODE_CSI:
3033           /* Control sequence introducer.  */
3034           single_shifting = 0;
3035           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3036           found |= CATEGORY_MASK_ISO_8_ELSE;
3037           goto check_extra_latin;
3038
3039         case ISO_CODE_SS2:
3040         case ISO_CODE_SS3:
3041           /* Single shift.   */
3042           if (inhibit_iso_escape_detection)
3043             break;
3044           single_shifting = 0;
3045           rejected |= CATEGORY_MASK_ISO_7BIT;
3046           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3047               & CODING_ISO_FLAG_SINGLE_SHIFT)
3048             {
3049               found |= CATEGORY_MASK_ISO_8_1;
3050               single_shifting = 1;
3051             }
3052           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3053               & CODING_ISO_FLAG_SINGLE_SHIFT)
3054             {
3055               found |= CATEGORY_MASK_ISO_8_2;
3056               single_shifting = 1;
3057             }
3058           if (single_shifting)
3059             break;
               /* Neither 8-bit category uses single shift, so treat the
                  byte like any other byte of the 0x80..0x9F range.  */
3060           goto check_extra_latin;
3061
3062         default:
3063           if (c < 0)
3064             continue;
3065           if (c < 0x80)
3066             {
3067               if (composition_count >= 0)
3068                 composition_count++;
3069               single_shifting = 0;
3070               break;
3071             }
3072           if (c >= 0xA0)
3073             {
3074               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3075               found |= CATEGORY_MASK_ISO_8_1;
3076               /* Check the length of succeeding codes of the range
3077                  0xA0..0xFF.  If the byte length is even, we include
3078                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3079                  only when we are not single shifting.  */
3080               if (! single_shifting
3081                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3082                 {
3083                   int len = 1;
3084                   while (src < src_end)
3085                     {
3086                       src_base = src;
3087                       ONE_MORE_BYTE (c);
3088                       if (c < 0xA0)
3089                         {
                               /* Put back the byte that ended the run.  */
3090                           src = src_base;
3091                           break;
3092                         }
3093                       len++;
3094                     }
3095
                       /* An odd-length run counts against ISO_8_2 only if
                          it was ended by a byte below 0xA0 rather than by
                          the end of the source.  */
3096                   if (len & 1 && src < src_end)
3097                     {
3098                       rejected |= CATEGORY_MASK_ISO_8_2;
3099                       if (composition_count >= 0)
3100                         composition_count += len;
3101                     }
3102                   else
3103                     {
3104                       found |= CATEGORY_MASK_ISO_8_2;
3105                       if (composition_count >= 0)
3106                         composition_count += len / 2;
3107                     }
3108                 }
3109               break;
3110             }
3111         check_extra_latin:
               /* C is ISO_CODE_CSI, ISO_CODE_SS2 or ISO_CODE_SS3 (when
                  jumped to from the cases above), or a byte in
                  0x80..0x9F that fell through the tests of the default
                  case.  It is acceptable only if Vlatin_extra_code_table
                  is a vector with a non-nil entry for it; otherwise all
                  ISO categories are rejected.  */
3112           if (! VECTORP (Vlatin_extra_code_table)
3113               || NILP (AREF (Vlatin_extra_code_table, c)))
3114             {
3115               rejected = CATEGORY_MASK_ISO;
3116               break;
3117             }
3118           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3119               & CODING_ISO_FLAG_LATIN_EXTRA)
3120             found |= CATEGORY_MASK_ISO_8_1;
3121           else
3122             rejected |= CATEGORY_MASK_ISO_8_1;
3123           rejected |= CATEGORY_MASK_ISO_8_2;
3124           break;
3125         }
3126     }
       /* The loop ended because every ISO category was rejected.  */
3127   detect_info->rejected |= CATEGORY_MASK_ISO;
3128   return 0;
3129
3130  no_more_source:
       /* Report as found only the categories that were not also
          rejected.  */
3131   detect_info->rejected |= rejected;
3132   detect_info->found |= (found & ~rejected);
3133   return 1;
3134 }
3135
3136
/* Record in CODING that the charset identified by DIM, CHARS_96 and
   the final byte FINAL is designated to graphic register REG.

   If FINAL is outside '0'..127, or no charset is registered for the
   combination, or the charset is not safe for CODING, the designation
   of REG is set to -2 and CHARS_96 is set to -1.  Otherwise the
   charset id is stored as the designation of REG, after replacing
   charset_jisx0201_roman by charset_ascii when CODING has
   CODING_ISO_FLAG_USE_ROMAN, and charset_jisx0208_1978 by
   charset_jisx0208 when it has CODING_ISO_FLAG_USE_OLDJIS.

   CHARS_96 must be an lvalue; it is set to -1 whenever the escape
   sequence should be kept.  The macro refers to a variable named
   `coding' in the scope where it is expanded.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int id = -1;                                                        \
                                                                        \
    if (final >= '0' && final < 128)                                    \
      id = ISO_CHARSET_TABLE (dim, chars_96, final);                    \
    if (id >= 0 && SAFE_CHARSET_P (coding, id))                         \
      {                                                                 \
        int prev = CODING_ISO_DESIGNATION (coding, reg);                \
                                                                        \
        if (id == charset_jisx0201_roman)                               \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              id = charset_ascii;                                       \
          }                                                             \
        else if (id == charset_jisx0208_1978)                           \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              id = charset_jisx0208;                                    \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = id;                      \
        /* An ASCII designation that follows an invalid designation    \
           to the same register must be kept as well.  */               \
        if (prev == -2 && id == charset_ascii)                          \
          chars_96 = -1;                                                \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Invalid or unsafe designation.  */                           \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
  } while (0)
3169
3170
3171 /* Handle these composition sequences (ALT: alternate char):
3172
3173    (1) relative composition: ESC 0 CHAR ... ESC 1
3174    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3175    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3176    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3177
3178    When the start sequence (ESC 0/2/3/4) is found, this annotation
3179    header is produced.
3180
3181         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3182
3183    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3184    produced until the end sequence (ESC 1) is found:
3185
3186    (1) CHAR ... CHAR
3187    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3188    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3189    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3190
3191    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3192    annotation header are updated as below:
3193
3194    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3195    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3196    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3197    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3198
3199    If an error is found while composing, the annotation header is
3200    changed to:
3201
3202         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3203
3204    and the sequence [ -2 DECODED-RULE ] is changed to the original
3205    byte sequence as below:
3206         o the original byte sequence is B: [ B -1 ]
3207         o the original byte sequence is B1 B2: [ B1 B2 ]
3208    and the sequence [ -1 -1 ] is changed to the original byte
3209    sequence:
3210         [ ESC '0' ]
3211 */
3212
/* Decode the composition rule whose first byte is C1 (taken from the
   enclosing scope) and store the encoded rule in RULE.  A first byte
   of 32 + 81 or more starts a two-byte rule, in which case one more
   byte is fetched from the source with ONE_MORE_BYTE.  Jump to
   invalid_code if the rule is invalid.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int second;                                                     \
                                                                        \
        ONE_MORE_BYTE (second);                                         \
        if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, second - 32))   \
          goto invalid_code;                                            \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, second - 32);        \
        /* The 0x100 offset tells it apart from the old format.  */     \
        rule += 0x100;                                                  \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (rule) / 9;                                          \
        int nref = (rule) % 9;                                          \
                                                                        \
        /* In this format, reference point 4 stands for 10.  */         \
        gref = (gref == 4 ? 10 : gref);                                 \
        nref = (nref == 4 ? 10 : nref);                                 \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
  } while (0)
3241
/* Turn the decoded composition rule RULE back into source bytes,
   stored in the two slots charbuf[idx] and charbuf[idx + 1], and add
   the number of bytes stored to new_chars.  CHARBUF, IDX and
   NEW_CHARS are taken from the enclosing scope.

   A rule below 0x100 is in the old one-byte format: the byte goes to
   the first slot and -1 to the second.  Any other rule is in the new
   two-byte format and fills both slots.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int packed = (rule) % 0x100;                                \
    int gref = packed / 12;                                     \
    int nref = packed % 12;                                     \
                                                                \
    if (rule >= 0x100)          /* new format */                \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
    else                        /* old format */                \
      {                                                         \
        /* Reference point 10 is written as 4 in this format.  */ \
        gref = (gref == 10 ? 4 : gref);                         \
        nref = (nref == 10 ? 4 : nref);                         \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
  } while (0)
3261
3262 /* Finish the current composition as invalid.  */
3263
3264 static int
3265 finish_composition (int *charbuf, struct composition_status *cmp_status)
3266 {
3267   int idx = - cmp_status->length;
3268   int new_chars;
3269
3270   /* Recover the original ESC sequence */
3271   charbuf[idx++] = ISO_CODE_ESC;
3272   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3273                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3274                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3275                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3276                     : '4');
3277   charbuf[idx++] = -2;
3278   charbuf[idx++] = 0;
3279   charbuf[idx++] = -1;
3280   new_chars = cmp_status->nchars;
3281   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3282     for (; idx < 0; idx++)
3283       {
3284         int elt = charbuf[idx];
3285
3286         if (elt == -2)
3287           {
3288             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3289             idx++;
3290           }
3291         else if (elt == -1)
3292           {
3293             charbuf[idx++] = ISO_CODE_ESC;
3294             charbuf[idx] = '0';
3295             new_chars += 2;
3296           }
3297       }
3298   cmp_status->state = COMPOSING_NO;
3299   return new_chars;
3300 }
3301
/* Do nothing unless a composition is in progress; otherwise finish it
   with finish_composition and advance char_offset by the value that
   function returns.  CMP_STATUS, CHARBUF and CHAR_OFFSET are taken
   from the enclosing scope.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3308
/* Handle the escape sequence ESC C1 where C1 is '0', '2', '3' or '4'.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 met while the components of an ESC 3 or ESC 4 composition
   are being read only closes the component part: [ -1 -1 ] is stored
   and the state becomes COMPOSING_CHAR.

   In any other case a composition that may still be in progress is
   finished as invalid, a new one is started, and this annotation
   sequence is produced now:

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
*/

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    int alts_done                                                          \
      = (c1 == '0'                                                         \
         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                \
              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)          \
             || (cmp_status->state == COMPOSING_COMPONENT_RULE             \
                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))); \
                                                                           \
    if (! alts_done)                                                       \
      {                                                                    \
        /* This must come first: it still needs the old method.  */        \
        MAYBE_FINISH_COMPOSITION ();                                       \
        switch (c1)                                                        \
          {                                                                \
          case '0':                                                        \
            cmp_status->method = COMPOSITION_RELATIVE;                     \
            break;                                                         \
          case '2':                                                        \
            cmp_status->method = COMPOSITION_WITH_RULE;                    \
            break;                                                         \
          case '3':                                                        \
            cmp_status->method = COMPOSITION_WITH_ALTCHARS;                \
            break;                                                         \
          default:                                                         \
            cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;           \
            break;                                                         \
          }                                                                \
        if (c1 <= '2')                                                     \
          cmp_status->state = COMPOSING_CHAR;                              \
        else                                                               \
          cmp_status->state = COMPOSING_COMPONENT_CHAR;                    \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->ncomps = 0;                                            \
        cmp_status->nchars = 0;                                            \
        coding->annotated = 1;                                             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->length += 2;                                           \
        cmp_status->state = COMPOSING_CHAR;                                \
      }                                                                    \
  } while (0)
3349
3350
/* Handle composition end sequence ESC 1.

   If no character has been composed yet, or if the current state
   does not allow the composition to end here, finish the composition
   as invalid and jump to invalid_code.  Otherwise update the
   annotation header stored cmp_status->length elements before
   CHARBUF: the stored (negative) length is extended to cover the
   component data of the altchar methods, the NCHARS slot is filled
   in, char_offset is advanced by NCHARS, and the state is reset to
   COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *header;                                                        \
    int in_char_state = (cmp_status->state == COMPOSING_CHAR);          \
    int rule_method = (cmp_status->method == COMPOSITION_WITH_RULE);    \
                                                                        \
    /* A rulebase composition must not end in COMPOSING_CHAR state;     \
       every other method must end in it.  */                           \
    if (cmp_status->nchars == 0 || in_char_state == rule_method)        \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    header = charbuf - cmp_status->length;                              \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      header[0] -= cmp_status->ncomps + 2;                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      header[0] -= cmp_status->ncomps * 3;                              \
    header[2] = cmp_status->nchars;                                     \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3370
/* Append the pair [ -2 RULE ] to charbuf for the composition rule
   RULE, count the two elements in cmp_status->length, and step
   cmp_status->state back by one.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = rule;                  \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    --cmp_status->state;                \
  } while (0)
3380
/* Append C, a composed char or a component char, to charbuf and
   update cmp_status: the length grows by one; nchars is counted in
   COMPOSING_CHAR state and ncomps in any other state; and the state
   advances by one when a rule is expected next, i.e. for a rulebase
   composition, or for an alt&rule composition while in
   COMPOSING_COMPONENT_CHAR state.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf = (c);                                                     \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps++;                                             \
    else                                                                \
      cmp_status->nchars++;                                             \
    if (cmp_status->method == COMPOSITION_WITH_RULE)                    \
      cmp_status->state++;                                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)          \
      cmp_status->state++;                                              \
  } while (0)
3397
3398
3399 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3400
3401 static void
3402 decode_coding_iso_2022 (struct coding_system *coding)
3403 {
3404   const unsigned char *src = coding->source + coding->consumed;
3405   const unsigned char *src_end = coding->source + coding->src_bytes;
3406   const unsigned char *src_base;
3407   int *charbuf = coding->charbuf + coding->charbuf_used;
3408   /* We may produce two annotations (charset and composition) in one
3409      loop and one more charset annotation at the end.  */
3410   int *charbuf_end
3411     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3412   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3413   bool multibytep = coding->src_multibyte;
3414   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3415   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3416   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3417   int charset_id_2, charset_id_3;
3418   struct charset *charset;
3419   int c;
3420   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3421   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3422   ptrdiff_t char_offset = coding->produced_char;
3423   ptrdiff_t last_offset = char_offset;
3424   int last_id = charset_ascii;
3425   bool eol_dos
3426     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3427   int byte_after_cr = -1;
3428   int i;
3429
3430   setup_iso_safe_charsets (attrs);
3431   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3432
3433   if (cmp_status->state != COMPOSING_NO)
3434     {
3435       if (charbuf_end - charbuf < cmp_status->length)
3436         emacs_abort ();
3437       for (i = 0; i < cmp_status->length; i++)
3438         *charbuf++ = cmp_status->carryover[i];
3439       coding->annotated = 1;
3440     }
3441
3442   while (1)
3443     {
3444       int c1, c2, c3;
3445
3446       src_base = src;
3447       consumed_chars_base = consumed_chars;
3448
3449       if (charbuf >= charbuf_end)
3450         {
3451           if (byte_after_cr >= 0)
3452             src_base--;
3453           break;
3454         }
3455
3456       if (byte_after_cr >= 0)
3457         c1 = byte_after_cr, byte_after_cr = -1;
3458       else
3459         ONE_MORE_BYTE (c1);
3460       if (c1 < 0)
3461         goto invalid_code;
3462
3463       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3464         {
3465           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3466           char_offset++;
3467           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3468           continue;
3469         }
3470
3471       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3472         {
3473           if (c1 == ISO_CODE_ESC)
3474             {
3475               if (src + 1 >= src_end)
3476                 goto no_more_source;
3477               *charbuf++ = ISO_CODE_ESC;
3478               char_offset++;
3479               if (src[0] == '%' && src[1] == '@')
3480                 {
3481                   src += 2;
3482                   consumed_chars += 2;
3483                   char_offset += 2;
3484                   /* We are sure charbuf can contain two more chars. */
3485                   *charbuf++ = '%';
3486                   *charbuf++ = '@';
3487                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3488                 }
3489             }
3490           else
3491             {
3492               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3493               char_offset++;
3494             }
3495           continue;
3496         }
3497
3498       if ((cmp_status->state == COMPOSING_RULE
3499            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3500           && c1 != ISO_CODE_ESC)
3501         {
3502           int rule;
3503
3504           DECODE_COMPOSITION_RULE (rule);
3505           STORE_COMPOSITION_RULE (rule);
3506           continue;
3507         }
3508
3509       /* We produce at most one character.  */
3510       switch (iso_code_class [c1])
3511         {
3512         case ISO_0x20_or_0x7F:
3513           if (charset_id_0 < 0
3514               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3515             /* This is SPACE or DEL.  */
3516             charset = CHARSET_FROM_ID (charset_ascii);
3517           else
3518             charset = CHARSET_FROM_ID (charset_id_0);
3519           break;
3520
3521         case ISO_graphic_plane_0:
3522           if (charset_id_0 < 0)
3523             charset = CHARSET_FROM_ID (charset_ascii);
3524           else
3525             charset = CHARSET_FROM_ID (charset_id_0);
3526           break;
3527
3528         case ISO_0xA0_or_0xFF:
3529           if (charset_id_1 < 0
3530               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3531               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3532             goto invalid_code;
3533           /* This is a graphic character, we fall down ... */
3534
3535         case ISO_graphic_plane_1:
3536           if (charset_id_1 < 0)
3537             goto invalid_code;
3538           charset = CHARSET_FROM_ID (charset_id_1);
3539           break;
3540
3541         case ISO_control_0:
3542           if (eol_dos && c1 == '\r')
3543             ONE_MORE_BYTE (byte_after_cr);
3544           MAYBE_FINISH_COMPOSITION ();
3545           charset = CHARSET_FROM_ID (charset_ascii);
3546           break;
3547
3548         case ISO_control_1:
3549           goto invalid_code;
3550
3551         case ISO_shift_out:
3552           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3553               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3554             goto invalid_code;
3555           CODING_ISO_INVOCATION (coding, 0) = 1;
3556           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3557           continue;
3558
3559         case ISO_shift_in:
3560           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3561             goto invalid_code;
3562           CODING_ISO_INVOCATION (coding, 0) = 0;
3563           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3564           continue;
3565
3566         case ISO_single_shift_2_7:
3567           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3568             goto invalid_code;
3569         case ISO_single_shift_2:
3570           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3571             goto invalid_code;
3572           /* SS2 is handled as an escape sequence of ESC 'N' */
3573           c1 = 'N';
3574           goto label_escape_sequence;
3575
3576         case ISO_single_shift_3:
3577           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3578             goto invalid_code;
3579           /* SS3 is handled as an escape sequence of ESC 'O' */
3580           c1 = 'O';
3581           goto label_escape_sequence;
3582
3583         case ISO_control_sequence_introducer:
3584           /* CSI is handled as an escape sequence of ESC '[' ...  */
3585           c1 = '[';
3586           goto label_escape_sequence;
3587
3588         case ISO_escape:
3589           ONE_MORE_BYTE (c1);
3590         label_escape_sequence:
3591           /* Escape sequences handled here are invocation,
3592              designation, direction specification, and character
3593              composition specification.  */
3594           switch (c1)
3595             {
3596             case '&':           /* revision of following character set */
3597               ONE_MORE_BYTE (c1);
3598               if (!(c1 >= '@' && c1 <= '~'))
3599                 goto invalid_code;
3600               ONE_MORE_BYTE (c1);
3601               if (c1 != ISO_CODE_ESC)
3602                 goto invalid_code;
3603               ONE_MORE_BYTE (c1);
3604               goto label_escape_sequence;
3605
3606             case '$':           /* designation of 2-byte character set */
3607               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3608                 goto invalid_code;
3609               {
3610                 int reg, chars96;
3611
3612                 ONE_MORE_BYTE (c1);
3613                 if (c1 >= '@' && c1 <= 'B')
3614                   {     /* designation of JISX0208.1978, GB2312.1980,
3615                            or JISX0208.1980 */
3616                     reg = 0, chars96 = 0;
3617                   }
3618                 else if (c1 >= 0x28 && c1 <= 0x2B)
3619                   { /* designation of DIMENSION2_CHARS94 character set */
3620                     reg = c1 - 0x28, chars96 = 0;
3621                     ONE_MORE_BYTE (c1);
3622                   }
3623                 else if (c1 >= 0x2C && c1 <= 0x2F)
3624                   { /* designation of DIMENSION2_CHARS96 character set */
3625                     reg = c1 - 0x2C, chars96 = 1;
3626                     ONE_MORE_BYTE (c1);
3627                   }
3628                 else
3629                   goto invalid_code;
3630                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3631                 /* We must update these variables now.  */
3632                 if (reg == 0)
3633                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3634                 else if (reg == 1)
3635                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3636                 if (chars96 < 0)
3637                   goto invalid_code;
3638               }
3639               continue;
3640
3641             case 'n':           /* invocation of locking-shift-2 */
3642               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3643                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3644                 goto invalid_code;
3645               CODING_ISO_INVOCATION (coding, 0) = 2;
3646               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3647               continue;
3648
3649             case 'o':           /* invocation of locking-shift-3 */
3650               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3651                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3652                 goto invalid_code;
3653               CODING_ISO_INVOCATION (coding, 0) = 3;
3654               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3655               continue;
3656
3657             case 'N':           /* invocation of single-shift-2 */
3658               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3659                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3660                 goto invalid_code;
3661               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3662               if (charset_id_2 < 0)
3663                 charset = CHARSET_FROM_ID (charset_ascii);
3664               else
3665                 charset = CHARSET_FROM_ID (charset_id_2);
3666               ONE_MORE_BYTE (c1);
3667               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3668                 goto invalid_code;
3669               break;
3670
3671             case 'O':           /* invocation of single-shift-3 */
3672               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3673                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3674                 goto invalid_code;
3675               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3676               if (charset_id_3 < 0)
3677                 charset = CHARSET_FROM_ID (charset_ascii);
3678               else
3679                 charset = CHARSET_FROM_ID (charset_id_3);
3680               ONE_MORE_BYTE (c1);
3681               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3682                 goto invalid_code;
3683               break;
3684
3685             case '0': case '2': case '3': case '4': /* start composition */
3686               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3687                 goto invalid_code;
3688               if (last_id != charset_ascii)
3689                 {
3690                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3691                   last_id = charset_ascii;
3692                   last_offset = char_offset;
3693                 }
3694               DECODE_COMPOSITION_START (c1);
3695               continue;
3696
3697             case '1':           /* end composition */
3698               if (cmp_status->state == COMPOSING_NO)
3699                 goto invalid_code;
3700               DECODE_COMPOSITION_END ();
3701               continue;
3702
3703             case '[':           /* specification of direction */
3704               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3705                 goto invalid_code;
3706               /* For the moment, nested direction is not supported.
3707                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3708                  left-to-right, and nonzero means right-to-left.  */
3709               ONE_MORE_BYTE (c1);
3710               switch (c1)
3711                 {
3712                 case ']':       /* end of the current direction */
3713                   coding->mode &= ~CODING_MODE_DIRECTION;
3714
3715                 case '0':       /* end of the current direction */
3716                 case '1':       /* start of left-to-right direction */
3717                   ONE_MORE_BYTE (c1);
3718                   if (c1 == ']')
3719                     coding->mode &= ~CODING_MODE_DIRECTION;
3720                   else
3721                     goto invalid_code;
3722                   break;
3723
3724                 case '2':       /* start of right-to-left direction */
3725                   ONE_MORE_BYTE (c1);
3726                   if (c1 == ']')
3727                     coding->mode |= CODING_MODE_DIRECTION;
3728                   else
3729                     goto invalid_code;
3730                   break;
3731
3732                 default:
3733                   goto invalid_code;
3734                 }
3735               continue;
3736
3737             case '%':
3738               ONE_MORE_BYTE (c1);
3739               if (c1 == '/')
3740                 {
3741                   /* CTEXT extended segment:
3742                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3743                      We keep these bytes as is for the moment.
3744                      They may be decoded by post-read-conversion.  */
3745                   int dim, M, L;
3746                   int size;
3747
3748                   ONE_MORE_BYTE (dim);
3749                   if (dim < '0' || dim > '4')
3750                     goto invalid_code;
3751                   ONE_MORE_BYTE (M);
3752                   if (M < 128)
3753                     goto invalid_code;
3754                   ONE_MORE_BYTE (L);
3755                   if (L < 128)
3756                     goto invalid_code;
3757                   size = ((M - 128) * 128) + (L - 128);
3758                   if (charbuf + 6 > charbuf_end)
3759                     goto break_loop;
3760                   *charbuf++ = ISO_CODE_ESC;
3761                   *charbuf++ = '%';
3762                   *charbuf++ = '/';
3763                   *charbuf++ = dim;
3764                   *charbuf++ = BYTE8_TO_CHAR (M);
3765                   *charbuf++ = BYTE8_TO_CHAR (L);
3766                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3767                 }
3768               else if (c1 == 'G')
3769                 {
3770                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3771                      ESC % G --UTF-8-BYTES-- ESC % @
3772                      We keep these bytes as is for the moment.
3773                      They may be decoded by post-read-conversion.  */
3774                   if (charbuf + 3 > charbuf_end)
3775                     goto break_loop;
3776                   *charbuf++ = ISO_CODE_ESC;
3777                   *charbuf++ = '%';
3778                   *charbuf++ = 'G';
3779                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3780                 }
3781               else
3782                 goto invalid_code;
3783               continue;
3784               break;
3785
3786             default:
3787               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3788                 goto invalid_code;
3789               {
3790                 int reg, chars96;
3791
3792                 if (c1 >= 0x28 && c1 <= 0x2B)
3793                   { /* designation of DIMENSION1_CHARS94 character set */
3794                     reg = c1 - 0x28, chars96 = 0;
3795                     ONE_MORE_BYTE (c1);
3796                   }
3797                 else if (c1 >= 0x2C && c1 <= 0x2F)
3798                   { /* designation of DIMENSION1_CHARS96 character set */
3799                     reg = c1 - 0x2C, chars96 = 1;
3800                     ONE_MORE_BYTE (c1);
3801                   }
3802                 else
3803                   goto invalid_code;
3804                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3805                 /* We must update these variables now.  */
3806                 if (reg == 0)
3807                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3808                 else if (reg == 1)
3809                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3810                 if (chars96 < 0)
3811                   goto invalid_code;
3812               }
3813               continue;
3814             }
3815           break;
3816
3817         default:
3818           emacs_abort ();
3819         }
3820
3821       if (cmp_status->state == COMPOSING_NO
3822           && charset->id != charset_ascii
3823           && last_id != charset->id)
3824         {
3825           if (last_id != charset_ascii)
3826             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3827           last_id = charset->id;
3828           last_offset = char_offset;
3829         }
3830
3831       /* Now we know CHARSET and 1st position code C1 of a character.
3832          Produce a decoded character while getting 2nd and 3rd
3833          position codes C2, C3 if necessary.  */
3834       if (CHARSET_DIMENSION (charset) > 1)
3835         {
3836           ONE_MORE_BYTE (c2);
3837           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3838               || ((c1 & 0x80) != (c2 & 0x80)))
3839             /* C2 is not in a valid range.  */
3840             goto invalid_code;
3841           if (CHARSET_DIMENSION (charset) == 2)
3842             c1 = (c1 << 8) | c2;
3843           else
3844             {
3845               ONE_MORE_BYTE (c3);
3846               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3847                   || ((c1 & 0x80) != (c3 & 0x80)))
3848                 /* C3 is not in a valid range.  */
3849                 goto invalid_code;
3850               c1 = (c1 << 16) | (c2 << 8) | c2;
3851             }
3852         }
3853       c1 &= 0x7F7F7F;
3854       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3855       if (c < 0)
3856         {
3857           MAYBE_FINISH_COMPOSITION ();
3858           for (; src_base < src; src_base++, char_offset++)
3859             {
3860               if (ASCII_BYTE_P (*src_base))
3861                 *charbuf++ = *src_base;
3862               else
3863                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3864             }
3865         }
3866       else if (cmp_status->state == COMPOSING_NO)
3867         {
3868           *charbuf++ = c;
3869           char_offset++;
3870         }
3871       else if ((cmp_status->state == COMPOSING_CHAR
3872                 ? cmp_status->nchars
3873                 : cmp_status->ncomps)
3874                >= MAX_COMPOSITION_COMPONENTS)
3875         {
3876           /* Too long composition.  */
3877           MAYBE_FINISH_COMPOSITION ();
3878           *charbuf++ = c;
3879           char_offset++;
3880         }
3881       else
3882         STORE_COMPOSITION_CHAR (c);
3883       continue;
3884
3885     invalid_code:
3886       MAYBE_FINISH_COMPOSITION ();
3887       src = src_base;
3888       consumed_chars = consumed_chars_base;
3889       ONE_MORE_BYTE (c);
3890       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3891       char_offset++;
3892       coding->errors++;
3893       continue;
3894
3895     break_loop:
3896       break;
3897     }
3898
3899  no_more_source:
3900   if (cmp_status->state != COMPOSING_NO)
3901     {
3902       if (coding->mode & CODING_MODE_LAST_BLOCK)
3903         MAYBE_FINISH_COMPOSITION ();
3904       else
3905         {
3906           charbuf -= cmp_status->length;
3907           for (i = 0; i < cmp_status->length; i++)
3908             cmp_status->carryover[i] = charbuf[i];
3909         }
3910     }
3911   else if (last_id != charset_ascii)
3912     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3913   coding->consumed_char += consumed_chars_base;
3914   coding->consumed = src_base - coding->source;
3915   coding->charbuf_used = charbuf - coding->charbuf;
3916 }
3917
3918
3919 /* ISO2022 encoding stuff.  */
3920
3921 /*
3922    It is not enough to say just "ISO2022" on encoding, we have to
3923    specify more details.  In Emacs, each coding system of ISO2022
3924    variant has the following specifications:
3925         1. Initial designation to G0 thru G3.
3926         2. Allows short-form designation?
3927         3. ASCII should be designated to G0 before control characters?
3928         4. ASCII should be designated to G0 at end of line?
3929         5. 7-bit environment or 8-bit environment?
3930         6. Use locking-shift?
3931         7. Use Single-shift?
3932    And the following two are only for Japanese:
3933         8. Use ASCII in place of JIS0201-1976-Roman?
3934         9. Use JISX0208-1983 in place of JISX0208-1978?
3935    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3936    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3937    details.
3938 */
3939
3940 /* Produce codes (escape sequence) for designating CHARSET to graphic
3941    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3942    '@', 'A', or 'B' and the coding system CODING allows, produce
3943    designation sequence of short-form.  */
3944
#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Final byte that identifies CHARSET in the escape sequence.  */   \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    int desig_96 = CHARSET_ISO_CHARS_96 (charset) ? 1 : 0;              \
    int desig_multi = CHARSET_DIMENSION (charset) != 1;                 \
    /* Intermediate byte selecting graphic register REG; the 94- and    \
       96-character families use different byte sets.  */               \
    int desig_inter = (desig_96 ? ",-./" : "()*+")[(reg)];              \
    /* A revision is announced only when the coding system asks for     \
       it and the charset reports a non-negative revision.  */          \
    int desig_revision                                                  \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
    /* Short form: a multi-byte 94-charset whose final byte is '@',     \
       'A' or 'B', designated to register 0, omits the intermediate     \
       byte unless the long form is requested.  */                      \
    int desig_short                                                     \
      = (desig_multi && ! desig_96                                      \
         && ! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)   \
         && (reg) == 0                                                  \
         && desig_final >= '@' && desig_final <= 'B');                  \
                                                                        \
    if (desig_revision >= 0)                                            \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_revision);                           \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (desig_multi)                                                    \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
      }                                                                 \
    if (! desig_short)                                                  \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE (desig_inter);                              \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    /* Remember that REG now holds CHARSET.  */                         \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3987
3988
3989 /* The following two macros produce codes (control character or escape
3990    sequence) for ISO2022 single-shift functions (single-shift-2 and
3991    single-shift-3).  */
3992
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    /* Without the seven-bit flag single-shift-2 is the single byte     \
       ISO_CODE_SS2; with it, the escape sequence ESC N is used.  */    \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    /* Flag that a single shift is pending for the next character.  */  \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4001
4002
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    /* Without the seven-bit flag single-shift-3 is the single byte     \
       ISO_CODE_SS3; with it, the escape sequence ESC O is used.  */    \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    /* Flag that a single shift is pending for the next character.  */  \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4011
4012
4013 /* The following four macros produce codes (control character or
4014    escape sequence) for ISO2022 locking-shift functions (shift-in,
4015    shift-out, locking-shift-2, and locking-shift-3).  */
4016
#define ENCODE_SHIFT_IN                                                 \
  do {                                                                  \
    /* Emit SI, then record that graphic plane 0 invokes graphic        \
       register 0.  */                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;                              \
  } while (0)
4022
4023
#define ENCODE_SHIFT_OUT                                                \
  do {                                                                  \
    /* Emit SO, then record that graphic plane 0 invokes graphic        \
       register 1.  */                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;                              \
  } while (0)
4029
4030
#define ENCODE_LOCKING_SHIFT_2                                          \
  do {                                                                  \
    /* Emit ESC n, then record that graphic plane 0 invokes graphic     \
       register 2.  */                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');                           \
    CODING_ISO_INVOCATION (coding, 0) = 2;                              \
  } while (0)
4036
4037
#define ENCODE_LOCKING_SHIFT_3                                          \
  do {                                                                  \
    /* Locking-shift-3 is ESC o.  ESC n is locking-shift-2, so          \
       emitting it here would make a decoder invoke register 2 while    \
       this encoder records register 3 below.  */                       \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');                           \
    /* Record that graphic plane 0 now invokes graphic register 3.  */  \
    CODING_ISO_INVOCATION (coding, 0) = 3;                              \
  } while (0)
4043
4044
4045 /* Produce codes for a DIMENSION1 character whose character set is
4046    CHARSET and whose position-code is C1.  Designation and invocation
4047    sequences are also produced in advance if necessary.  */
4048
#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int dim1_id = CHARSET_ID (charset);                                 \
    /* Nonzero: emit C1 with the 8th bit cleared; zero: with it set.  */\
    int dim1_low;                                                       \
                                                                        \
    /* Coding systems flagged USE_ROMAN write ASCII through             \
       charset_jisx0201_roman instead.  */                              \
    if (dim1_id == charset_ascii                                        \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN))     \
      {                                                                 \
        dim1_id = charset_jisx0201_roman;                               \
        charset = CHARSET_FROM_ID (dim1_id);                            \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift was just emitted: it covers exactly this      \
           character, so clear the pending flag.  */                    \
        dim1_low = ((CODING_ISO_FLAGS (coding)                          \
                     & CODING_ISO_FLAG_SEVEN_BITS) != 0);               \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
      }                                                                 \
    else if (dim1_id == CODING_ISO_INVOKED_CHARSET (coding, 0))         \
      dim1_low = 1;                                                     \
    else if (dim1_id == CODING_ISO_INVOKED_CHARSET (coding, 1))         \
      dim1_low = 0;                                                     \
    else                                                                \
      {                                                                 \
        /* CHARSET is not invoked to either graphic plane yet.          \
           Produce the designation and/or invocation, then go round     \
           the loop again to emit the character itself.  */             \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
        continue;                                                       \
      }                                                                 \
                                                                        \
    /* C1 is parenthesized so that an expression argument is masked     \
       as a whole, as ENCODE_ISO_CHARACTER_DIMENSION2 already does.  */ \
    if (dim1_low)                                                       \
      EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                                \
    else                                                                \
      EMIT_ONE_BYTE ((c1) | 0x80);                                      \
    break;                                                              \
  } while (1)
4087
4088
4089 /* Produce codes for a DIMENSION2 character whose character set is
4090    CHARSET and whose position-codes are C1 and C2.  Designation and
4091    invocation codes are also produced in advance if necessary.  */
4092
#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int dim2_id = CHARSET_ID (charset);                                 \
    /* Nonzero: emit both bytes with the 8th bit cleared; zero: with    \
       it set.  */                                                      \
    int dim2_low;                                                       \
                                                                        \
    /* Coding systems flagged USE_OLDJIS write charset_jisx0208         \
       through charset_jisx0208_1978 instead.  */                       \
    if (dim2_id == charset_jisx0208                                     \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        dim2_id = charset_jisx0208_1978;                                \
        charset = CHARSET_FROM_ID (dim2_id);                            \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A pending single shift covers exactly this character.  */    \
        dim2_low = ((CODING_ISO_FLAGS (coding)                          \
                     & CODING_ISO_FLAG_SEVEN_BITS) != 0);               \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
      }                                                                 \
    else if (dim2_id == CODING_ISO_INVOKED_CHARSET (coding, 0))         \
      dim2_low = 1;                                                     \
    else if (dim2_id == CODING_ISO_INVOKED_CHARSET (coding, 1))         \
      dim2_low = 0;                                                     \
    else                                                                \
      {                                                                 \
        /* CHARSET is not invoked to either graphic plane yet.          \
           Produce the designation and/or invocation, then go round     \
           the loop again to emit the character itself.  */             \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
        continue;                                                       \
      }                                                                 \
                                                                        \
    if (dim2_low)                                                       \
      EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
    else                                                                \
      EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
    break;                                                              \
  } while (1)
4131
4132
/* Encode character C of CHARSET: obtain its code point through
   CODING_ENCODE_CHAR, then hand it to the one- or two-dimensional
   emitter according to CHARSET's dimension.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      /* High byte is the first position code, low byte the second.  */    \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
4143
4144
4145 /* Produce designation and invocation codes at a place pointed by DST
4146    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4147    Return new DST.  */
4148
4149 static unsigned char *
4150 encode_invocation_designation (struct charset *charset,
4151                                struct coding_system *coding,
4152                                unsigned char *dst, ptrdiff_t *p_nchars)
4153 {
4154   bool multibytep = coding->dst_multibyte;
4155   ptrdiff_t produced_chars = *p_nchars;
4156   int reg;                      /* graphic register number */
4157   int id = CHARSET_ID (charset);
4158
4159   /* At first, check designations.  */
4160   for (reg = 0; reg < 4; reg++)
4161     if (id == CODING_ISO_DESIGNATION (coding, reg))
4162       break;
4163
4164   if (reg >= 4)
4165     {
4166       /* CHARSET is not yet designated to any graphic registers.  */
4167       /* At first check the requested designation.  */
4168       reg = CODING_ISO_REQUEST (coding, id);
4169       if (reg < 0)
4170         /* Since CHARSET requests no special designation, designate it
4171            to graphic register 0.  */
4172         reg = 0;
4173
4174       ENCODE_DESIGNATION (charset, reg, coding);
4175     }
4176
4177   if (CODING_ISO_INVOCATION (coding, 0) != reg
4178       && CODING_ISO_INVOCATION (coding, 1) != reg)
4179     {
4180       /* Since the graphic register REG is not invoked to any graphic
4181          planes, invoke it to graphic plane 0.  */
4182       switch (reg)
4183         {
4184         case 0:                 /* graphic register 0 */
4185           ENCODE_SHIFT_IN;
4186           break;
4187
4188         case 1:                 /* graphic register 1 */
4189           ENCODE_SHIFT_OUT;
4190           break;
4191
4192         case 2:                 /* graphic register 2 */
4193           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4194             ENCODE_SINGLE_SHIFT_2;
4195           else
4196             ENCODE_LOCKING_SHIFT_2;
4197           break;
4198
4199         case 3:                 /* graphic register 3 */
4200           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4201             ENCODE_SINGLE_SHIFT_3;
4202           else
4203             ENCODE_LOCKING_SHIFT_3;
4204           break;
4205         }
4206     }
4207
4208   *p_nchars = produced_chars;
4209   return dst;
4210 }
4211
4212
4213 /* Produce codes for designation and invocation to reset the graphic
4214    planes and registers to initial state.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reset_reg;                                                      \
                                                                        \
    /* Bring graphic plane 0 back to register 0 first.  */              \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
                                                                        \
    /* Then re-designate every register that has an initial charset     \
       but currently holds a different one.  */                         \
    for (reset_reg = 0; reset_reg < 4; reset_reg++)                     \
      {                                                                 \
        int reset_initial = CODING_ISO_INITIAL (coding, reset_reg);     \
        struct charset *reset_charset;                                  \
                                                                        \
        if (reset_initial < 0                                           \
            || (CODING_ISO_DESIGNATION (coding, reset_reg)              \
                == reset_initial))                                      \
          continue;                                                     \
        reset_charset = CHARSET_FROM_ID (reset_initial);                \
        ENCODE_DESIGNATION (reset_charset, reset_reg, coding);          \
      }                                                                 \
  } while (0)
4231
4232
/* Produce designation sequences of charsets in the line starting at
   CHARBUF to a place pointed to by DST, and return the number of
   produced bytes.  DST should not point directly into a buffer text
   area, which may be relocated by a char_charset call.
4237
4238    If the current block ends before any end-of-line, we may fail to
4239    find all the necessary designations.  */
4240
4241 static ptrdiff_t
4242 encode_designation_at_bol (struct coding_system *coding,
4243                            int *charbuf, int *charbuf_end,
4244                            unsigned char *dst)
4245 {
4246   unsigned char *orig = dst;
4247   struct charset *charset;
4248   /* Table of charsets to be designated to each graphic register.  */
4249   int r[4];
4250   int c, found = 0, reg;
4251   ptrdiff_t produced_chars = 0;
4252   bool multibytep = coding->dst_multibyte;
4253   Lisp_Object attrs;
4254   Lisp_Object charset_list;
4255
4256   attrs = CODING_ID_ATTRS (coding->id);
4257   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4258   if (EQ (charset_list, Qiso_2022))
4259     charset_list = Viso_2022_charset_list;
4260
4261   for (reg = 0; reg < 4; reg++)
4262     r[reg] = -1;
4263
4264   while (charbuf < charbuf_end && found < 4)
4265     {
4266       int id;
4267
4268       c = *charbuf++;
4269       if (c == '\n')
4270         break;
4271       charset = char_charset (c, charset_list, NULL);
4272       id = CHARSET_ID (charset);
4273       reg = CODING_ISO_REQUEST (coding, id);
4274       if (reg >= 0 && r[reg] < 0)
4275         {
4276           found++;
4277           r[reg] = id;
4278         }
4279     }
4280
4281   if (found)
4282     {
4283       for (reg = 0; reg < 4; reg++)
4284         if (r[reg] >= 0
4285             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4286           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4287     }
4288
4289   return dst - orig;
4290 }
4291
4292 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4293
4294 static bool
4295 encode_coding_iso_2022 (struct coding_system *coding)
4296 {
4297   bool multibytep = coding->dst_multibyte;
4298   int *charbuf = coding->charbuf;
4299   int *charbuf_end = charbuf + coding->charbuf_used;
4300   unsigned char *dst = coding->destination + coding->produced;
4301   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4302   int safe_room = 16;
4303   bool bol_designation
4304     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4305        && CODING_ISO_BOL (coding));
4306   ptrdiff_t produced_chars = 0;
4307   Lisp_Object attrs, eol_type, charset_list;
4308   bool ascii_compatible;
4309   int c;
4310   int preferred_charset_id = -1;
4311
4312   CODING_GET_INFO (coding, attrs, charset_list);
4313   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4314   if (VECTORP (eol_type))
4315     eol_type = Qunix;
4316
4317   setup_iso_safe_charsets (attrs);
4318   /* Charset list may have been changed.  */
4319   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4320   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4321
4322   ascii_compatible
4323     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4324        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4325                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4326
4327   while (charbuf < charbuf_end)
4328     {
4329       ASSURE_DESTINATION (safe_room);
4330
4331       if (bol_designation)
4332         {
4333           /* We have to produce designation sequences if any now.  */
4334           unsigned char desig_buf[16];
4335           int nbytes;
4336           ptrdiff_t offset;
4337
4338           charset_map_loaded = 0;
4339           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4340                                               desig_buf);
4341           if (charset_map_loaded
4342               && (offset = coding_change_destination (coding)))
4343             {
4344               dst += offset;
4345               dst_end += offset;
4346             }
4347           memcpy (dst, desig_buf, nbytes);
4348           dst += nbytes;
4349           /* We are sure that designation sequences are all ASCII bytes.  */
4350           produced_chars += nbytes;
4351           bol_designation = 0;
4352           ASSURE_DESTINATION (safe_room);
4353         }
4354
4355       c = *charbuf++;
4356
4357       if (c < 0)
4358         {
4359           /* Handle an annotation.  */
4360           switch (*charbuf)
4361             {
4362             case CODING_ANNOTATE_COMPOSITION_MASK:
4363               /* Not yet implemented.  */
4364               break;
4365             case CODING_ANNOTATE_CHARSET_MASK:
4366               preferred_charset_id = charbuf[2];
4367               if (preferred_charset_id >= 0
4368                   && NILP (Fmemq (make_number (preferred_charset_id),
4369                                   charset_list)))
4370                 preferred_charset_id = -1;
4371               break;
4372             default:
4373               emacs_abort ();
4374             }
4375           charbuf += -c - 1;
4376           continue;
4377         }
4378
4379       /* Now encode the character C.  */
4380       if (c < 0x20 || c == 0x7F)
4381         {
4382           if (c == '\n'
4383               || (c == '\r' && EQ (eol_type, Qmac)))
4384             {
4385               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4386                 ENCODE_RESET_PLANE_AND_REGISTER ();
4387               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4388                 {
4389                   int i;
4390
4391                   for (i = 0; i < 4; i++)
4392                     CODING_ISO_DESIGNATION (coding, i)
4393                       = CODING_ISO_INITIAL (coding, i);
4394                 }
4395               bol_designation = ((CODING_ISO_FLAGS (coding)
4396                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4397                                  != 0);
4398             }
4399           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4400             ENCODE_RESET_PLANE_AND_REGISTER ();
4401           EMIT_ONE_ASCII_BYTE (c);
4402         }
4403       else if (ASCII_CHAR_P (c))
4404         {
4405           if (ascii_compatible)
4406             EMIT_ONE_ASCII_BYTE (c);
4407           else
4408             {
4409               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4410               ENCODE_ISO_CHARACTER (charset, c);
4411             }
4412         }
4413       else if (CHAR_BYTE8_P (c))
4414         {
4415           c = CHAR_TO_BYTE8 (c);
4416           EMIT_ONE_BYTE (c);
4417         }
4418       else
4419         {
4420           struct charset *charset;
4421
4422           if (preferred_charset_id >= 0)
4423             {
4424               bool result;
4425
4426               charset = CHARSET_FROM_ID (preferred_charset_id);
4427               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4428               if (! result)
4429                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4430                                      NULL, charset);
4431             }
4432           else
4433             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4434                                  NULL, charset);
4435           if (!charset)
4436             {
4437               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4438                 {
4439                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4440                   charset = CHARSET_FROM_ID (charset_ascii);
4441                 }
4442               else
4443                 {
4444                   c = coding->default_char;
4445                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4446                                        charset_list, NULL, charset);
4447                 }
4448             }
4449           ENCODE_ISO_CHARACTER (charset, c);
4450         }
4451     }
4452
4453   if (coding->mode & CODING_MODE_LAST_BLOCK
4454       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4455     {
4456       ASSURE_DESTINATION (safe_room);
4457       ENCODE_RESET_PLANE_AND_REGISTER ();
4458     }
4459   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4460   CODING_ISO_BOL (coding) = bol_designation;
4461   coding->produced_char += produced_chars;
4462   coding->produced = dst - coding->destination;
4463   return 0;
4464 }
4465
4466 \f
/*** 8. SJIS and BIG5 handlers ***/
4468
4469 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4470    quite widely.  So, for the moment, Emacs supports them in the bare
4471    C code.  But, in the future, they may be supported only by CCL.  */
4472
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, with its two position codes split and
   shifted so that they fit in the ranges below.
4479
4480    --- CODE RANGE of SJIS ---
4481    (character set)      (range)
4482    ASCII                0x00 .. 0x7F
4483    KATAKANA-JISX0201    0xA0 .. 0xDF
4484    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4485             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4486    -------------------------------
4487
4488 */
4489
4490 /* BIG5 is a coding system encoding two character sets: ASCII and
4491    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4492    character set and is encoded in two-byte.
4493
4494    --- CODE RANGE of BIG5 ---
4495    (character set)      (range)
4496    ASCII                0x00 .. 0x7F
4497    Big5 (1st byte)      0xA1 .. 0xFE
4498         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4499    --------------------------
4500
4501   */
4502
4503 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4504    Return true if a text is encoded in SJIS.

        The scan starts CODING->head_ascii bytes into CODING->source.
        Values below 0x80 are skipped.  A lead byte in 0x81..0x9F, or in
        0xE0..0xEF (0xE0..0xFC when the charset list of CODING has more
        than three elements), must be followed by a byte in 0x40..0xFC
        other than 0x7F.  A byte in 0xA0..0xDF is accepted on its own.
        Anything else adds CATEGORY_MASK_SJIS to DETECT_INFO->rejected and
        makes the function return 0.

        When the source is exhausted the function returns 1 after adding
        CATEGORY_MASK_SJIS to DETECT_INFO->found if some non-ASCII
        sequence was accepted.  The exception is a source that ends in
        the middle of a sequence while CODING_MODE_LAST_BLOCK is set: that
        is a rejection too.  */
4505
4506 static bool
4507 detect_coding_sjis (struct coding_system *coding,
4508                     struct coding_detection_info *detect_info)
4509 {
4510   const unsigned char *src = coding->source, *src_base;
4511   const unsigned char *src_end = coding->source + coding->src_bytes;
4512   bool multibytep = coding->src_multibyte;
4513   ptrdiff_t consumed_chars = 0;
4514   int found = 0;
4515   int c;
4516   Lisp_Object attrs, charset_list;
4517   int max_first_byte_of_2_byte_code;
4518
4519   CODING_GET_INFO (coding, attrs, charset_list);
       /* Lead bytes above 0xEF are accepted only when the coding system
          lists more than three charsets.  */
4520   max_first_byte_of_2_byte_code
4521     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4522
4523   detect_info->checked |= CATEGORY_MASK_SJIS;
4524   /* A coding system of this category is always ASCII compatible.  */
4525   src += coding->head_ascii;
4526
4527   while (1)
4528     {
           /* Remember where this sequence starts.  ONE_MORE_BYTE is
              defined elsewhere in this file; it is relied on here to jump
              to no_more_source once the source is exhausted.  */
4529       src_base = src;
4530       ONE_MORE_BYTE (c);
4531       if (c < 0x80)
4532         continue;
4533       if ((c >= 0x81 && c <= 0x9F)
4534           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4535         {
               /* Lead byte of a two-byte code: validate the trail byte.  */
4536           ONE_MORE_BYTE (c);
4537           if (c < 0x40 || c == 0x7F || c > 0xFC)
4538             break;
4539           found = CATEGORY_MASK_SJIS;
4540         }
4541       else if (c >= 0xA0 && c < 0xE0)
4542         found = CATEGORY_MASK_SJIS;
4543       else
4544         break;
4545     }
       /* Reached only by `break': an invalid byte was seen.  */
4546   detect_info->rejected |= CATEGORY_MASK_SJIS;
4547   return 0;
4548
4549  no_more_source:
       /* SRC_BASE < SRC presumably means the source ended after the first
          byte of a sequence had been read; that is fatal only when no
          further block will follow.  */
4550   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4551     {
4552       detect_info->rejected |= CATEGORY_MASK_SJIS;
4553       return 0;
4554     }
4555   detect_info->found |= found;
4556   return 1;
4557 }
4558
4559 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4560    Return true if a text is encoded in BIG5.

        The scan starts CODING->head_ascii bytes into CODING->source.
        Values below 0x80 are skipped.  A byte of 0xA1 or above must be
        followed by a byte that is at least 0x40 and not in 0x7F..0xA0.
        Anything else adds CATEGORY_MASK_BIG5 to DETECT_INFO->rejected and
        makes the function return 0.

        When the source is exhausted the function returns 1 after adding
        CATEGORY_MASK_BIG5 to DETECT_INFO->found if some two-byte
        sequence was accepted, unless the source ends in the middle of a
        sequence while CODING_MODE_LAST_BLOCK is set, which is a
        rejection.

        NOTE(review): the bytes 0xFF are accepted here both as lead and as
        trail byte although decode_coding_big5 treats them as invalid --
        confirm whether the detector is meant to be this lenient.  */
4561
4562 static bool
4563 detect_coding_big5 (struct coding_system *coding,
4564                     struct coding_detection_info *detect_info)
4565 {
4566   const unsigned char *src = coding->source, *src_base;
4567   const unsigned char *src_end = coding->source + coding->src_bytes;
4568   bool multibytep = coding->src_multibyte;
4569   ptrdiff_t consumed_chars = 0;
4570   int found = 0;
4571   int c;
4572
4573   detect_info->checked |= CATEGORY_MASK_BIG5;
4574   /* A coding system of this category is always ASCII compatible.  */
4575   src += coding->head_ascii;
4576
4577   while (1)
4578     {
           /* Remember where this sequence starts.  ONE_MORE_BYTE is
              defined elsewhere in this file; it is relied on here to jump
              to no_more_source once the source is exhausted.  */
4579       src_base = src;
4580       ONE_MORE_BYTE (c);
4581       if (c < 0x80)
4582         continue;
4583       if (c >= 0xA1)
4584         {
4585           ONE_MORE_BYTE (c);
               /* An invalid trail byte must leave through the common
                  rejection code below, as in detect_coding_sjis; returning
                  directly would report failure without marking the
                  category as rejected.  */
4586           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4587             break;
4588           found = CATEGORY_MASK_BIG5;
4589         }
4590       else
4591         break;
4592     }
       /* Reached only by `break': an invalid byte was seen.  */
4593   detect_info->rejected |= CATEGORY_MASK_BIG5;
4594   return 0;
4595
4596  no_more_source:
       /* SRC_BASE < SRC presumably means the source ended after the lead
          byte of a sequence had been read; that is fatal only when no
          further block will follow.  */
4597   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4598     {
4599       detect_info->rejected |= CATEGORY_MASK_BIG5;
4600       return 0;
4601     }
4602   detect_info->found |= found;
4603   return 1;
4604 }
4605
4606 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Decode the SJIS bytes of CODING->source, starting at offset
        CODING->consumed, into CODING->charbuf.  The charset list of
        CODING supplies, in order, the charsets used for bytes below
        0x80, for the single bytes 0xA1..0xDF, for two-byte codes whose
        lead byte is 0x81..0x9F or 0xE0..0xEF, and optionally for two-byte
        codes whose lead byte is 0xF0..0xFC.  A byte that cannot start a
        valid sequence is stored through BYTE8_TO_CHAR and counted in
        CODING->errors.  On return CODING->consumed, CODING->consumed_char
        and CODING->charbuf_used reflect the progress made.  */
4607
4608 static void
4609 decode_coding_sjis (struct coding_system *coding)
4610 {
4611   const unsigned char *src = coding->source + coding->consumed;
4612   const unsigned char *src_end = coding->source + coding->src_bytes;
4613   const unsigned char *src_base;
4614   int *charbuf = coding->charbuf + coding->charbuf_used;
4615   /* We may produce one charset annotation in one loop and one more at
4616      the end.  */
4617   int *charbuf_end
4618     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4619   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4620   bool multibytep = coding->src_multibyte;
4621   struct charset *charset_roman, *charset_kanji, *charset_kana;
4622   struct charset *charset_kanji2;
4623   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
          LAST_ID describe the current run of characters that share a
          non-ASCII charset.  */
4624   ptrdiff_t char_offset = coding->produced_char;
4625   ptrdiff_t last_offset = char_offset;
4626   int last_id = charset_ascii;
4627   bool eol_dos
4628     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* When EOL_DOS, the byte following a CR is read together with the
          CR and parked here until the next iteration; -1 means none.  */
4629   int byte_after_cr = -1;
4630
4631   CODING_GET_INFO (coding, attrs, charset_list);
4632
       /* The first three charsets are mandatory, the fourth is optional.  */
4633   val = charset_list;
4634   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4635   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4636   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4637   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4638
4639   while (1)
4640     {
4641       int c, c1;
4642       struct charset *charset;
4643
           /* Checkpoint, so that an invalid or incomplete sequence can be
              rewound.  */
4644       src_base = src;
4645       consumed_chars_base = consumed_chars;
4646
4647       if (charbuf >= charbuf_end)
4648         {
               /* The parked byte has been read from the source but not
                  decoded; step back so it is not reported as consumed.  */
4649           if (byte_after_cr >= 0)
4650             src_base--;
4651           break;
4652         }
4653
4654       if (byte_after_cr >= 0)
4655         c = byte_after_cr, byte_after_cr = -1;
4656       else
4657         ONE_MORE_BYTE (c);
4658       if (c < 0)
4659         goto invalid_code;
4660       if (c < 0x80)
4661         {
4662           if (eol_dos && c == '\r')
4663             ONE_MORE_BYTE (byte_after_cr);
4664           charset = charset_roman;
4665         }
4666       else if (c == 0x80 || c == 0xA0)
4667         goto invalid_code;
4668       else if (c >= 0xA1 && c <= 0xDF)
4669         {
4670           /* SJIS -> JISX0201-Kana */
4671           c &= 0x7F;
4672           charset = charset_kana;
4673         }
4674       else if (c <= 0xEF)
4675         {
4676           /* SJIS -> JISX0208 */
4677           ONE_MORE_BYTE (c1);
4678           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4679             goto invalid_code;
4680           c = (c << 8) | c1;
4681           SJIS_TO_JIS (c);
4682           charset = charset_kanji;
4683         }
4684       else if (c <= 0xFC && charset_kanji2)
4685         {
4686           /* SJIS -> JISX0213-2 */
4687           ONE_MORE_BYTE (c1);
4688           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4689             goto invalid_code;
4690           c = (c << 8) | c1;
4691           SJIS_TO_JIS2 (c);
4692           charset = charset_kanji2;
4693         }
4694       else
4695         goto invalid_code;
           /* On a switch to a different non-ASCII charset, close the
              previous run (ADD_CHARSET_DATA presumably appends the charset
              annotation mentioned above) and start a new one.  */
4696       if (charset->id != charset_ascii
4697           && last_id != charset->id)
4698         {
4699           if (last_id != charset_ascii)
4700             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4701           last_id = charset->id;
4702           last_offset = char_offset;
4703         }
4704       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4705       *charbuf++ = c;
4706       char_offset++;
4707       continue;
4708
4709     invalid_code:
           /* Rewind to the checkpoint and pass exactly one byte through
              undecoded.  */
4710       src = src_base;
4711       consumed_chars = consumed_chars_base;
4712       ONE_MORE_BYTE (c);
4713       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4714       char_offset++;
4715       coding->errors++;
4716     }
4717
4718  no_more_source:
       /* Progress is reported up to the last checkpoint, so a sequence
          cut short by the end of the source is left unconsumed.  */
4719   if (last_id != charset_ascii)
4720     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4721   coding->consumed_char += consumed_chars_base;
4722   coding->consumed = src_base - coding->source;
4723   coding->charbuf_used = charbuf - coding->charbuf;
4724 }
4725
4726 static void
4727 decode_coding_big5 (struct coding_system *coding)
4728 {
       /* Decode the BIG5 bytes of CODING->source, starting at offset
          CODING->consumed, into CODING->charbuf.  The first charset in
          the charset list of CODING is used for bytes below 0x80 and the
          second for two-byte codes (lead byte 0xA1..0xFE, trail byte
          0x40..0x7E or 0xA1..0xFE).  A byte that cannot start a valid
          sequence is stored through BYTE8_TO_CHAR and counted in
          CODING->errors.  On return CODING->consumed,
          CODING->consumed_char and CODING->charbuf_used reflect the
          progress made.  */
4729   const unsigned char *src = coding->source + coding->consumed;
4730   const unsigned char *src_end = coding->source + coding->src_bytes;
4731   const unsigned char *src_base;
4732   int *charbuf = coding->charbuf + coding->charbuf_used;
4733   /* We may produce one charset annotation in one loop and one more at
4734      the end.  */
4735   int *charbuf_end
4736     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4737   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4738   bool multibytep = coding->src_multibyte;
4739   struct charset *charset_roman, *charset_big5;
4740   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
          LAST_ID describe the current run of characters that share a
          non-ASCII charset.  */
4741   ptrdiff_t char_offset = coding->produced_char;
4742   ptrdiff_t last_offset = char_offset;
4743   int last_id = charset_ascii;
4744   bool eol_dos
4745     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* When EOL_DOS, the byte following a CR is read together with the
          CR and parked here until the next iteration; -1 means none.  */
4746   int byte_after_cr = -1;
4747
4748   CODING_GET_INFO (coding, attrs, charset_list);
4749   val = charset_list;
4750   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4751   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4752
4753   while (1)
4754     {
4755       int c, c1;
4756       struct charset *charset;
4757
           /* Checkpoint, so that an invalid or incomplete sequence can be
              rewound.  */
4758       src_base = src;
4759       consumed_chars_base = consumed_chars;
4760
4761       if (charbuf >= charbuf_end)
4762         {
               /* The parked byte has been read from the source but not
                  decoded; step back so it is not reported as consumed.  */
4763           if (byte_after_cr >= 0)
4764             src_base--;
4765           break;
4766         }
4767
4768       if (byte_after_cr >= 0)
4769         c = byte_after_cr, byte_after_cr = -1;
4770       else
4771         ONE_MORE_BYTE (c);
4772
4773       if (c < 0)
4774         goto invalid_code;
4775       if (c < 0x80)
4776         {
4777           if (eol_dos && c == '\r')
4778             ONE_MORE_BYTE (byte_after_cr);
4779           charset = charset_roman;
4780         }
4781       else
4782         {
4783           /* BIG5 -> Big5 */
4784           if (c < 0xA1 || c > 0xFE)
4785             goto invalid_code;
4786           ONE_MORE_BYTE (c1);
4787           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4788             goto invalid_code;
4789           c = c << 8 | c1;
4790           charset = charset_big5;
4791         }
           /* On a switch to a different non-ASCII charset, close the
              previous run (ADD_CHARSET_DATA presumably appends the charset
              annotation mentioned above) and start a new one.  */
4792       if (charset->id != charset_ascii
4793           && last_id != charset->id)
4794         {
4795           if (last_id != charset_ascii)
4796             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4797           last_id = charset->id;
4798           last_offset = char_offset;
4799         }
4800       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4801       *charbuf++ = c;
4802       char_offset++;
4803       continue;
4804
4805     invalid_code:
           /* Rewind to the checkpoint and pass exactly one byte through
              undecoded.  */
4806       src = src_base;
4807       consumed_chars = consumed_chars_base;
4808       ONE_MORE_BYTE (c);
4809       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4810       char_offset++;
4811       coding->errors++;
4812     }
4813
4814  no_more_source:
       /* Progress is reported up to the last checkpoint, so a sequence
          cut short by the end of the source is left unconsumed.  */
4815   if (last_id != charset_ascii)
4816     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4817   coding->consumed_char += consumed_chars_base;
4818   coding->consumed = src_base - coding->source;
4819   coding->charbuf_used = charbuf - coding->charbuf;
4820 }
4821
4822 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4823    Encode the characters in CODING->charbuf as SJIS.  The charset list
4824    of CODING is read as follows: the first element is skipped here,
4825    the second is the single-byte kana charset, the third the two-byte
4826    kanji charset, and an optional fourth a second two-byte kanji
4827    charset.

        ASCII characters (when the coding system is ASCII compatible) and
        raw 8-bit characters are emitted as a single byte.  Any other
        character is looked up with CODING_CHAR_CHARSET; a character found
        in none of the charsets is replaced by
        CODING_INHIBIT_CHARACTER_SUBSTITUTION in safe-encoding mode and
        by CODING->default_char otherwise.  Codes of a charset other than
        the kana and kanji ones are emitted as one byte masked with 0x7F.

        Always returns 0 after recording CODING_RESULT_SUCCESS.  */
4828
4829 static bool
4830 encode_coding_sjis (struct coding_system *coding)
4831 {
4832   bool multibytep = coding->dst_multibyte;
4833   int *charbuf = coding->charbuf;
4834   int *charbuf_end = charbuf + coding->charbuf_used;
4835   unsigned char *dst = coding->destination + coding->produced;
4836   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4837   int safe_room = 4;
4838   ptrdiff_t produced_chars = 0;
4839   Lisp_Object attrs, charset_list, val;
4840   bool ascii_compatible;
4841   struct charset *charset_kanji, *charset_kana;
4842   struct charset *charset_kanji2;
4843   int c;
4844
4845   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first charset of the list.  */
4846   val = XCDR (charset_list);
4847   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4848   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4849   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4850
4851   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4852
4853   while (charbuf < charbuf_end)
4854     {
           /* ASSURE_DESTINATION and the EMIT_* macros are defined
              elsewhere in this file; presumably they operate on DST,
              DST_END, MULTIBYTEP and PRODUCED_CHARS declared above.  */
4855       ASSURE_DESTINATION (safe_room);
4856       c = *charbuf++;
4857       /* Now encode the character C.  */
4858       if (ASCII_CHAR_P (c) && ascii_compatible)
4859         EMIT_ONE_ASCII_BYTE (c);
4860       else if (CHAR_BYTE8_P (c))
4861         {
4862           c = CHAR_TO_BYTE8 (c);
4863           EMIT_ONE_BYTE (c);
4864         }
4865       else
4866         {
4867           unsigned code;
4868           struct charset *charset;
4869           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4870                                &code, charset);
4871
4872           if (!charset)
4873             {
                   /* C belongs to none of the supported charsets.  */
4874               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4875                 {
4876                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4877                   charset = CHARSET_FROM_ID (charset_ascii);
4878                 }
4879               else
4880                 {
4881                   c = coding->default_char;
4882                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4883                                        charset_list, &code, charset);
4884                 }
4885             }
4886           if (code == CHARSET_INVALID_CODE (charset))
4887             emacs_abort ();
4888           if (charset == charset_kanji)
4889             {
4890               int c1, c2;
4891               JIS_TO_SJIS (code);
4892               c1 = code >> 8, c2 = code & 0xFF;
4893               EMIT_TWO_BYTES (c1, c2);
4894             }
4895           else if (charset == charset_kana)
4896             EMIT_ONE_BYTE (code | 0x80);
4897           else if (charset_kanji2 && charset == charset_kanji2)
4898             {
4899               int c1, c2;
4900
                   /* Only the rows tested below are converted with
                      JIS_TO_SJIS2; a code in any other row is emitted as
                      a single byte masked with 0x7F.  */
4901               c1 = code >> 8;
4902               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4903                   || c1 == 0x28
4904                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4905                 {
4906                   JIS_TO_SJIS2 (code);
4907                   c1 = code >> 8, c2 = code & 0xFF;
4908                   EMIT_TWO_BYTES (c1, c2);
4909                 }
4910               else
4911                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4912             }
4913           else
4914             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4915         }
4916     }
4917   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4918   coding->produced_char += produced_chars;
4919   coding->produced = dst - coding->destination;
4920   return 0;
4921 }
4922
4923 static bool
4924 encode_coding_big5 (struct coding_system *coding)
4925 {
       /* Encode the characters in CODING->charbuf as BIG5.  The second
          element of the charset list of CODING is the two-byte Big5
          charset; its codes are emitted high byte first.  ASCII
          characters (when the coding system is ASCII compatible) and raw
          8-bit characters are emitted as a single byte.  A character
          found in none of the charsets is replaced by
          CODING_INHIBIT_CHARACTER_SUBSTITUTION in safe-encoding mode and
          by CODING->default_char otherwise.  Always returns 0 after
          recording CODING_RESULT_SUCCESS.  */
4926   bool multibytep = coding->dst_multibyte;
4927   int *charbuf = coding->charbuf;
4928   int *charbuf_end = charbuf + coding->charbuf_used;
4929   unsigned char *dst = coding->destination + coding->produced;
4930   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4931   int safe_room = 4;
4932   ptrdiff_t produced_chars = 0;
4933   Lisp_Object attrs, charset_list, val;
4934   bool ascii_compatible;
4935   struct charset *charset_big5;
4936   int c;
4937
4938   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first charset of the list.  */
4939   val = XCDR (charset_list);
4940   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4941   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4942
4943   while (charbuf < charbuf_end)
4944     {
           /* ASSURE_DESTINATION and the EMIT_* macros are defined
              elsewhere in this file; presumably they operate on DST,
              DST_END, MULTIBYTEP and PRODUCED_CHARS declared above.  */
4945       ASSURE_DESTINATION (safe_room);
4946       c = *charbuf++;
4947       /* Now encode the character C.  */
4948       if (ASCII_CHAR_P (c) && ascii_compatible)
4949         EMIT_ONE_ASCII_BYTE (c);
4950       else if (CHAR_BYTE8_P (c))
4951         {
4952           c = CHAR_TO_BYTE8 (c);
4953           EMIT_ONE_BYTE (c);
4954         }
4955       else
4956         {
4957           unsigned code;
4958           struct charset *charset;
4959           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4960                                &code, charset);
4961
4962           if (! charset)
4963             {
                   /* C belongs to none of the supported charsets.  */
4964               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4965                 {
4966                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4967                   charset = CHARSET_FROM_ID (charset_ascii);
4968                 }
4969               else
4970                 {
4971                   c = coding->default_char;
4972                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4973                                        charset_list, &code, charset);
4974                 }
4975             }
4976           if (code == CHARSET_INVALID_CODE (charset))
4977             emacs_abort ();
4978           if (charset == charset_big5)
4979             {
4980               int c1, c2;
4981
4982               c1 = code >> 8, c2 = code & 0xFF;
4983               EMIT_TWO_BYTES (c1, c2);
4984             }
4985           else
                 /* Any other charset: one byte, masked to seven bits.  */
4986             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4987         }
4988     }
4989   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4990   coding->produced_char += produced_chars;
4991   coding->produced = dst - coding->destination;
4992   return 0;
4993 }
4994
4995 \f
4996 /*** 10. CCL handlers ***/
4997
4998 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4999    Return true if a text is encoded in a coding system of which
5000    encoder/decoder are written in CCL program.

        Each source byte is looked up in the table returned by
        CODING_CCL_VALIDS for the coding system registered for the CCL
        category.  A negative value from ONE_MORE_BYTE or a byte whose
        entry is zero adds CATEGORY_MASK_CCL to DETECT_INFO->rejected and
        makes the function return 0.  A byte whose entry is greater than 1
        counts as positive evidence.  When the source is exhausted the
        function returns 1, adding CATEGORY_MASK_CCL to
        DETECT_INFO->found if such evidence was seen.  */
5001
5002 static bool
5003 detect_coding_ccl (struct coding_system *coding,
5004                    struct coding_detection_info *detect_info)
5005 {
5006   const unsigned char *src = coding->source, *src_base;
5007   const unsigned char *src_end = coding->source + coding->src_bytes;
5008   bool multibytep = coding->src_multibyte;
5009   ptrdiff_t consumed_chars = 0;
5010   int found = 0;
5011   unsigned char *valids;
       /* Saved now because CODING is redirected below.  */
5012   ptrdiff_t head_ascii = coding->head_ascii;
5013   Lisp_Object attrs;
5014
5015   detect_info->checked |= CATEGORY_MASK_CCL;
5016
       /* From here on CODING is the coding system of the CCL category,
          not the one passed by the caller.  */
5017   coding = &coding_categories[coding_category_ccl];
5018   valids = CODING_CCL_VALIDS (coding);
5019   attrs = CODING_ID_ATTRS (coding->id);
       /* The leading ASCII bytes need no check if the coding system is
          ASCII compatible.  */
5020   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5021     src += head_ascii;
5022
5023   while (1)
5024     {
5025       int c;
5026
           /* ONE_MORE_BYTE is defined elsewhere in this file; it is
              relied on here to jump to no_more_source once the source is
              exhausted.  */
5027       src_base = src;
5028       ONE_MORE_BYTE (c);
5029       if (c < 0 || ! valids[c])
5030         break;
5031       if ((valids[c] > 1))
5032         found = CATEGORY_MASK_CCL;
5033     }
5034   detect_info->rejected |= CATEGORY_MASK_CCL;
5035   return 0;
5036
5037  no_more_source:
5038   detect_info->found |= found;
5039   return 1;
5040 }
5041
5042 static void
5043 decode_coding_ccl (struct coding_system *coding)
5044 {
       /* Decode CODING->source, starting at offset CODING->consumed, by
          running the CCL program of CODING.  The source is handed to
          ccl_driver in batches of at most 1024 elements, and the output
          goes directly into CODING->charbuf.  The final CCL status is
          translated into a conversion result, and CODING->consumed,
          CODING->consumed_char and CODING->charbuf_used are updated.  */
5045   const unsigned char *src = coding->source + coding->consumed;
5046   const unsigned char *src_end = coding->source + coding->src_bytes;
5047   int *charbuf = coding->charbuf + coding->charbuf_used;
5048   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5049   ptrdiff_t consumed_chars = 0;
5050   bool multibytep = coding->src_multibyte;
5051   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5052   int source_charbuf[1024];
       /* Byte offset from SRC of each element of SOURCE_CHARBUF, plus one
          extra entry for the offset just past the last element.  Filled
          in only for a multibyte source.  */
5053   int source_byteidx[1025];
5054   Lisp_Object attrs, charset_list;
5055
5056   CODING_GET_INFO (coding, attrs, charset_list);
5057
5058   while (1)
5059     {
5060       const unsigned char *p = src;
5061       ptrdiff_t offset;
5062       int i = 0;
5063
           /* Gather the next batch: characters for a multibyte source,
              raw bytes otherwise.  */
5064       if (multibytep)
5065         {
5066           while (i < 1024 && p < src_end)
5067             {
5068               source_byteidx[i] = p - src;
5069               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5070             }
5071           source_byteidx[i] = p - src;
5072         }
5073       else
5074         while (i < 1024 && p < src_end)
5075           source_charbuf[i++] = *p++;
5076
5077       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5078         ccl->last_block = 1;
5079       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5080       charset_map_loaded = 0;
5081       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5082                   charset_list);
           /* If a charset map was loaded and the source moved, shift the
              source pointers by the reported offset.  */
5083       if (charset_map_loaded
5084           && (offset = coding_change_source (coding)))
5085         {
5086           p += offset;
5087           src += offset;
5088           src_end += offset;
5089         }
5090       charbuf += ccl->produced;
           /* Advance SRC by what the program consumed; for a multibyte
              source the element count is converted to bytes.  */
5091       if (multibytep)
5092         src += source_byteidx[ccl->consumed];
5093       else
5094         src += ccl->consumed;
5095       consumed_chars += ccl->consumed;
           /* Continue only while more source remains to be gathered and
              the program stopped for lack of source.  */
5096       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5097         break;
5098     }
5099
5100   switch (ccl->status)
5101     {
5102     case CCL_STAT_SUSPEND_BY_SRC:
5103       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5104       break;
5105     case CCL_STAT_SUSPEND_BY_DST:
5106       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5107       break;
5108     case CCL_STAT_QUIT:
5109     case CCL_STAT_INVALID_CMD:
5110       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5111       break;
5112     default:
5113       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5114       break;
5115     }
5116   coding->consumed_char += consumed_chars;
5117   coding->consumed = src - coding->source;
5118   coding->charbuf_used = charbuf - coding->charbuf;
5119 }
5120
5121 static bool
5122 encode_coding_ccl (struct coding_system *coding)
5123 {
       /* Encode the characters in CODING->charbuf by running the CCL
          program of CODING.  Each call of ccl_driver writes at most 1024
          elements into a local buffer, and the low eight bits of every
          element are then copied to the destination.  The final CCL
          status is translated into a conversion result.  Always
          returns 0.  */
5124   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5125   bool multibytep = coding->dst_multibyte;
5126   int *charbuf = coding->charbuf;
5127   int *charbuf_end = charbuf + coding->charbuf_used;
5128   unsigned char *dst = coding->destination + coding->produced;
5129   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5130   int destination_charbuf[1024];
5131   ptrdiff_t produced_chars = 0;
5132   int i;
5133   Lisp_Object attrs, charset_list;
5134
5135   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the program that no further input will follow once all
          source characters have been consumed in the last block.  */
5136   if (coding->consumed_char == coding->src_chars
5137       && coding->mode & CODING_MODE_LAST_BLOCK)
5138     ccl->last_block = 1;
5139
5140   do
5141     {
5142       ptrdiff_t offset;
5143
5144       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5145       charset_map_loaded = 0;
5146       ccl_driver (ccl, charbuf, destination_charbuf,
5147                   charbuf_end - charbuf, 1024, charset_list);
5148       if (charset_map_loaded
5149           && (offset = coding_change_destination (coding)))
5150         dst += offset;
           /* ASSURE_DESTINATION and EMIT_ONE_BYTE are defined elsewhere
              in this file.  Twice the produced count is requested for a
              multibyte destination, presumably because one byte may need
              two bytes there.  */
5151       if (multibytep)
5152         {
5153           ASSURE_DESTINATION (ccl->produced * 2);
5154           for (i = 0; i < ccl->produced; i++)
5155             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5156         }
5157       else
5158         {
5159           ASSURE_DESTINATION (ccl->produced);
5160           for (i = 0; i < ccl->produced; i++)
5161             *dst++ = destination_charbuf[i] & 0xFF;
5162           produced_chars += ccl->produced;
5163         }
5164       charbuf += ccl->consumed;
           /* Stop at once if the program quit or hit an invalid
              command.  */
5165       if (ccl->status == CCL_STAT_QUIT
5166           || ccl->status == CCL_STAT_INVALID_CMD)
5167         break;
5168     }
5169   while (charbuf < charbuf_end);
5170
5171   switch (ccl->status)
5172     {
5173     case CCL_STAT_SUSPEND_BY_SRC:
5174       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5175       break;
5176     case CCL_STAT_SUSPEND_BY_DST:
5177       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5178       break;
5179     case CCL_STAT_QUIT:
5180     case CCL_STAT_INVALID_CMD:
5181       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5182       break;
5183     default:
5184       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5185       break;
5186     }
5187
5188   coding->produced_char += produced_chars;
5189   coding->produced = dst - coding->destination;
5190   return 0;
5191 }
5192
5193 \f
5194 /*** 10, 11. no-conversion handlers ***/
5195
5196 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Raw text is not converted: the whole source is marked as consumed
        and CODING->chars_at_source is set.  The one exception applies
        when DOS-style end-of-line conversion is in effect and the source
        ends with a CR: that CR is left unconsumed and
        CODING_RESULT_INSUFFICIENT_SRC is recorded.  In every other case,
        including an empty source, CODING_RESULT_SUCCESS is recorded.  */
5197
5198 static void
5199 decode_coding_raw_text (struct coding_system *coding)
5200 {
5201   bool eol_dos
5202     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5203
5204   coding->chars_at_source = 1;
5205   coding->consumed_char = coding->src_chars;
5206   coding->consumed = coding->src_bytes;
       /* An empty source has no last byte to examine; without the size
          check the test below would read the byte before the source.  */
5207   if (eol_dos
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5208     {
5209       coding->consumed_char--;
5210       coding->consumed--;
5211       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5212     }
5213   else
5214     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5215 }
5216
5217 static bool
5218 encode_coding_raw_text (struct coding_system *coding)
5219 {
       /* Copy the contents of CODING->charbuf to the destination without
          applying any charset conversion.  Four cases are distinguished
          by whether the destination and the source are multibyte; they
          are described at each branch below.  Always returns 0 after
          recording CODING_RESULT_SUCCESS.  */
5220   bool multibytep = coding->dst_multibyte;
5221   int *charbuf = coding->charbuf;
5222   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5223   unsigned char *dst = coding->destination + coding->produced;
5224   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5225   ptrdiff_t produced_chars = 0;
5226   int c;
5227
5228   if (multibytep)
5229     {
5230       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5231
           /* Multibyte destination, multibyte source: an ASCII or raw
              8-bit character becomes one emitted byte; any other
              character is first written to STR (presumably in its
              multibyte form) and each of those bytes is emitted.  */
5232       if (coding->src_multibyte)
5233         while (charbuf < charbuf_end)
5234           {
5235             ASSURE_DESTINATION (safe_room);
5236             c = *charbuf++;
5237             if (ASCII_CHAR_P (c))
5238               EMIT_ONE_ASCII_BYTE (c);
5239             else if (CHAR_BYTE8_P (c))
5240               {
5241                 c = CHAR_TO_BYTE8 (c);
5242                 EMIT_ONE_BYTE (c);
5243               }
5244             else
5245               {
5246                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5247
5248                 CHAR_STRING_ADVANCE (c, p1);
5249                 do
5250                   {
5251                     EMIT_ONE_BYTE (*p0);
5252                     p0++;
5253                   }
5254                 while (p0 < p1);
5255               }
5256           }
5257       else
             /* Multibyte destination, unibyte source: every element is
                emitted as one byte.  */
5258         while (charbuf < charbuf_end)
5259           {
5260             ASSURE_DESTINATION (safe_room);
5261             c = *charbuf++;
5262             EMIT_ONE_BYTE (c);
5263           }
5264     }
5265   else
5266     {
5267       if (coding->src_multibyte)
5268         {
               /* Unibyte destination, multibyte source: ASCII and raw
                  8-bit characters are stored as one byte; for any other
                  character CHAR_STRING_ADVANCE writes to DST directly.  */
5269           int safe_room = MAX_MULTIBYTE_LENGTH;
5270
5271           while (charbuf < charbuf_end)
5272             {
5273               ASSURE_DESTINATION (safe_room);
5274               c = *charbuf++;
5275               if (ASCII_CHAR_P (c))
5276                 *dst++ = c;
5277               else if (CHAR_BYTE8_P (c))
5278                 *dst++ = CHAR_TO_BYTE8 (c);
5279               else
5280                 CHAR_STRING_ADVANCE (c, dst);
5281             }
5282         }
5283       else
5284         {
               /* Unibyte destination, unibyte source: plain element by
                  element copy, never writing past DST_END.  */
5285           ASSURE_DESTINATION (charbuf_end - charbuf);
5286           while (charbuf < charbuf_end && dst < dst_end)
5287             *dst++ = *charbuf++;
5288         }
           /* For a unibyte destination the count reported is the number
              of bytes written by this call.  */
5289       produced_chars = dst - (coding->destination + coding->produced);
5290     }
5291   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5292   coding->produced_char += produced_chars;
5293   coding->produced = dst - coding->destination;
5294   return 0;
5295 }
5296
5297 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5298    Return true if a text is encoded in a charset-based coding system.

        The check uses the coding system registered for the charset
        category, not the one passed in CODING.  Its
        `coding_attr_charset_valids' attribute is indexed by the first
        byte of a sequence and holds nil (the byte is invalid), a charset
        ID, or a list of charset IDs.  For a charset of dimension greater
        than one, the following bytes are compared against the code space
        of the charset.

        When the name of that coding system starts with "iso-8859-" or
        "iso-latin-", a byte in 0x80..0x9F is accepted only if
        Vlatin_extra_code_table is a vector with a non-nil entry for it.

        On an invalid or truncated sequence CATEGORY_MASK_CHARSET is added
        to DETECT_INFO->rejected and 0 is returned.  When the source is
        exhausted 1 is returned, and CATEGORY_MASK_CHARSET is added to
        DETECT_INFO->found if a byte of 0x80 or above was accepted.  */
5299
5300 static bool
5301 detect_coding_charset (struct coding_system *coding,
5302                        struct coding_detection_info *detect_info)
5303 {
5304   const unsigned char *src = coding->source, *src_base;
5305   const unsigned char *src_end = coding->source + coding->src_bytes;
5306   bool multibytep = coding->src_multibyte;
5307   ptrdiff_t consumed_chars = 0;
5308   Lisp_Object attrs, valids, name;
5309   int found = 0;
       /* Saved now because CODING is redirected below.  */
5310   ptrdiff_t head_ascii = coding->head_ascii;
5311   bool check_latin_extra = 0;
5312
5313   detect_info->checked |= CATEGORY_MASK_CHARSET;
5314
       /* From here on CODING is the coding system of the charset
          category, not the one passed by the caller.  */
5315   coding = &coding_categories[coding_category_charset];
5316   attrs = CODING_ID_ATTRS (coding->id);
5317   valids = AREF (attrs, coding_attr_charset_valids);
5318   name = CODING_ID_NAME (coding->id);
5319   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5320                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5321       || strncmp (SSDATA (SYMBOL_NAME (name)),
5322                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5323     check_latin_extra = 1;
5324
       /* The leading ASCII bytes need no check if the coding system is
          ASCII compatible.  */
5325   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5326     src += head_ascii;
5327
5328   while (1)
5329     {
5330       int c;
5331       Lisp_Object val;
5332       struct charset *charset;
5333       int dim, idx;
5334
           /* ONE_MORE_BYTE is defined elsewhere in this file; it is
              relied on here to jump to no_more_source once the source is
              exhausted.  A negative value is simply skipped.  */
5335       src_base = src;
5336       ONE_MORE_BYTE (c);
5337       if (c < 0)
5338         continue;
5339       val = AREF (valids, c);
5340       if (NILP (val))
5341         break;
5342       if (c >= 0x80)
5343         {
5344           if (c < 0xA0
5345               && check_latin_extra
5346               && (!VECTORP (Vlatin_extra_code_table)
5347                   || NILP (AREF (Vlatin_extra_code_table, c))))
5348             break;
5349           found = CATEGORY_MASK_CHARSET;
5350         }
5351       if (INTEGERP (val))
5352         {
               /* A single candidate charset: every remaining byte of the
                  sequence must lie within the charset's code space.  */
5353           charset = CHARSET_FROM_ID (XFASTINT (val));
5354           dim = CHARSET_DIMENSION (charset);
5355           for (idx = 1; idx < dim; idx++)
5356             {
5357               if (src == src_end)
5358                 goto too_short;
5359               ONE_MORE_BYTE (c);
5360               if (c < charset->code_space[(dim - 1 - idx) * 4]
5361                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5362                 break;
5363             }
5364           if (idx < dim)
5365             break;
5366         }
5367       else
5368         {
               /* Several candidate charsets: try them in list order.  IDX
                  is not reset between candidates, so bytes already
                  accepted are not read again.  */
5369           idx = 1;
5370           for (; CONSP (val); val = XCDR (val))
5371             {
5372               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5373               dim = CHARSET_DIMENSION (charset);
5374               while (idx < dim)
5375                 {
5376                   if (src == src_end)
5377                     goto too_short;
5378                   ONE_MORE_BYTE (c);
5379                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5380                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5381                     break;
5382                   idx++;
5383                 }
5384               if (idx == dim)
5385                 {
5386                   val = Qnil;
5387                   break;
5388                 }
5389             }
               /* NOTE(review): the `for' loop above ends either with VAL
                  set to Qnil on a match or with VAL no longer a cons, so
                  this test looks as if it can never be true and a
                  sequence matching none of the listed charsets is not
                  rejected here -- confirm the intent.  */
5390           if (CONSP (val))
5391             break;
5392         }
5393     }
5394  too_short:
5395   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5396   return 0;
5397
5398  no_more_source:
5399   detect_info->found |= found;
5400   return 1;
5401 }
5402
5403 static void
5404 decode_coding_charset (struct coding_system *coding)
5405 {
5406   const unsigned char *src = coding->source + coding->consumed;
5407   const unsigned char *src_end = coding->source + coding->src_bytes;
5408   const unsigned char *src_base;
5409   int *charbuf = coding->charbuf + coding->charbuf_used;
5410   /* We may produce one charset annotation in one loop and one more at
5411      the end.  */
5412   int *charbuf_end
5413     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5414   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5415   bool multibytep = coding->src_multibyte;
5416   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5417   Lisp_Object valids;
5418   ptrdiff_t char_offset = coding->produced_char;
5419   ptrdiff_t last_offset = char_offset;
5420   int last_id = charset_ascii;
5421   bool eol_dos
5422     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5423   int byte_after_cr = -1;
5424
5425   valids = AREF (attrs, coding_attr_charset_valids);
5426
5427   while (1)
5428     {
5429       int c;
5430       Lisp_Object val;
5431       struct charset *charset;
5432       int dim;
5433       int len = 1;
5434       unsigned code;
5435
5436       src_base = src;
5437       consumed_chars_base = consumed_chars;
5438
5439       if (charbuf >= charbuf_end)
5440         {
5441           if (byte_after_cr >= 0)
5442             src_base--;
5443           break;
5444         }
5445
5446       if (byte_after_cr >= 0)
5447         {
5448           c = byte_after_cr;
5449           byte_after_cr = -1;
5450         }
5451       else
5452         {
5453           ONE_MORE_BYTE (c);
5454           if (eol_dos && c == '\r')
5455             ONE_MORE_BYTE (byte_after_cr);
5456         }
5457       if (c < 0)
5458         goto invalid_code;
5459       code = c;
5460
5461       val = AREF (valids, c);
5462       if (! INTEGERP (val) && ! CONSP (val))
5463         goto invalid_code;
5464       if (INTEGERP (val))
5465         {
5466           charset = CHARSET_FROM_ID (XFASTINT (val));
5467           dim = CHARSET_DIMENSION (charset);
5468           while (len < dim)
5469             {
5470               ONE_MORE_BYTE (c);
5471               code = (code << 8) | c;
5472               len++;
5473             }
5474           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5475                               charset, code, c);
5476         }
5477       else
5478         {
5479           /* VAL is a list of charset IDs.  It is assured that the
5480              list is sorted by charset dimensions (smaller one
5481              comes first).  */
5482           while (CONSP (val))
5483             {
5484               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5485               dim = CHARSET_DIMENSION (charset);
5486               while (len < dim)
5487                 {
5488                   ONE_MORE_BYTE (c);
5489                   code = (code << 8) | c;
5490                   len++;
5491                 }
5492               CODING_DECODE_CHAR (coding, src, src_base,
5493                                   src_end, charset, code, c);
5494               if (c >= 0)
5495                 break;
5496               val = XCDR (val);
5497             }
5498         }
5499       if (c < 0)
5500         goto invalid_code;
5501       if (charset->id != charset_ascii
5502           && last_id != charset->id)
5503         {
5504           if (last_id != charset_ascii)
5505             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5506           last_id = charset->id;
5507           last_offset = char_offset;
5508         }
5509
5510       *charbuf++ = c;
5511       char_offset++;
5512       continue;
5513
5514     invalid_code:
5515       src = src_base;
5516       consumed_chars = consumed_chars_base;
5517       ONE_MORE_BYTE (c);
5518       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5519       char_offset++;
5520       coding->errors++;
5521     }
5522
5523  no_more_source:
5524   if (last_id != charset_ascii)
5525     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5526   coding->consumed_char += consumed_chars_base;
5527   coding->consumed = src_base - coding->source;
5528   coding->charbuf_used = charbuf - coding->charbuf;
5529 }
5530
5531 static bool
5532 encode_coding_charset (struct coding_system *coding)
5533 {
5534   bool multibytep = coding->dst_multibyte;
5535   int *charbuf = coding->charbuf;
5536   int *charbuf_end = charbuf + coding->charbuf_used;
5537   unsigned char *dst = coding->destination + coding->produced;
5538   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5539   int safe_room = MAX_MULTIBYTE_LENGTH;
5540   ptrdiff_t produced_chars = 0;
5541   Lisp_Object attrs, charset_list;
5542   bool ascii_compatible;
5543   int c;
5544
5545   CODING_GET_INFO (coding, attrs, charset_list);
5546   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5547
5548   while (charbuf < charbuf_end)
5549     {
5550       struct charset *charset;
5551       unsigned code;
5552
5553       ASSURE_DESTINATION (safe_room);
5554       c = *charbuf++;
5555       if (ascii_compatible && ASCII_CHAR_P (c))
5556         EMIT_ONE_ASCII_BYTE (c);
5557       else if (CHAR_BYTE8_P (c))
5558         {
5559           c = CHAR_TO_BYTE8 (c);
5560           EMIT_ONE_BYTE (c);
5561         }
5562       else
5563         {
5564           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5565                                &code, charset);
5566
5567           if (charset)
5568             {
5569               if (CHARSET_DIMENSION (charset) == 1)
5570                 EMIT_ONE_BYTE (code);
5571               else if (CHARSET_DIMENSION (charset) == 2)
5572                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5573               else if (CHARSET_DIMENSION (charset) == 3)
5574                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5575               else
5576                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5577                                  (code >> 8) & 0xFF, code & 0xFF);
5578             }
5579           else
5580             {
5581               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5582                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5583               else
5584                 c = coding->default_char;
5585               EMIT_ONE_BYTE (c);
5586             }
5587         }
5588     }
5589
5590   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5591   coding->produced_char += produced_chars;
5592   coding->produced = dst - coding->destination;
5593   return 0;
5594 }
5595
5596 \f
5597 /*** 10. C library functions ***/
5598
5599 /* Setup coding context CODING from information about CODING_SYSTEM.
5600    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5601    CODING_SYSTEM is invalid, signal an error.  */
5602
5603 void
5604 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5605 {
5606   Lisp_Object attrs;
5607   Lisp_Object eol_type;
5608   Lisp_Object coding_type;
5609   Lisp_Object val;
5610
5611   if (NILP (coding_system))
5612     coding_system = Qundecided;
5613
5614   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5615
5616   attrs = CODING_ID_ATTRS (coding->id);
5617   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5618
5619   coding->mode = 0;
5620   coding->head_ascii = -1;
5621   if (VECTORP (eol_type))
5622     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5623                             | CODING_REQUIRE_DETECTION_MASK);
5624   else if (! EQ (eol_type, Qunix))
5625     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5626                             | CODING_REQUIRE_ENCODING_MASK);
5627   else
5628     coding->common_flags = 0;
5629   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5630     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5631   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5632     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5633   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5634     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5635
5636   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5637   coding->max_charset_id = SCHARS (val) - 1;
5638   coding->safe_charsets = SDATA (val);
5639   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5640   coding->carryover_bytes = 0;
5641
5642   coding_type = CODING_ATTR_TYPE (attrs);
5643   if (EQ (coding_type, Qundecided))
5644     {
5645       coding->detector = NULL;
5646       coding->decoder = decode_coding_raw_text;
5647       coding->encoder = encode_coding_raw_text;
5648       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5649     }
5650   else if (EQ (coding_type, Qiso_2022))
5651     {
5652       int i;
5653       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5654
5655       /* Invoke graphic register 0 to plane 0.  */
5656       CODING_ISO_INVOCATION (coding, 0) = 0;
5657       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5658       CODING_ISO_INVOCATION (coding, 1)
5659         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5660       /* Setup the initial status of designation.  */
5661       for (i = 0; i < 4; i++)
5662         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5663       /* Not single shifting initially.  */
5664       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5665       /* Beginning of buffer should also be regarded as bol. */
5666       CODING_ISO_BOL (coding) = 1;
5667       coding->detector = detect_coding_iso_2022;
5668       coding->decoder = decode_coding_iso_2022;
5669       coding->encoder = encode_coding_iso_2022;
5670       if (flags & CODING_ISO_FLAG_SAFE)
5671         coding->mode |= CODING_MODE_SAFE_ENCODING;
5672       coding->common_flags
5673         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5674             | CODING_REQUIRE_FLUSHING_MASK);
5675       if (flags & CODING_ISO_FLAG_COMPOSITION)
5676         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5677       if (flags & CODING_ISO_FLAG_DESIGNATION)
5678         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5679       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5680         {
5681           setup_iso_safe_charsets (attrs);
5682           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5683           coding->max_charset_id = SCHARS (val) - 1;
5684           coding->safe_charsets = SDATA (val);
5685         }
5686       CODING_ISO_FLAGS (coding) = flags;
5687       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5688       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5689       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5690       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5691     }
5692   else if (EQ (coding_type, Qcharset))
5693     {
5694       coding->detector = detect_coding_charset;
5695       coding->decoder = decode_coding_charset;
5696       coding->encoder = encode_coding_charset;
5697       coding->common_flags
5698         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5699     }
5700   else if (EQ (coding_type, Qutf_8))
5701     {
5702       val = AREF (attrs, coding_attr_utf_bom);
5703       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5704                                    : EQ (val, Qt) ? utf_with_bom
5705                                    : utf_without_bom);
5706       coding->detector = detect_coding_utf_8;
5707       coding->decoder = decode_coding_utf_8;
5708       coding->encoder = encode_coding_utf_8;
5709       coding->common_flags
5710         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5711       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5712         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5713     }
5714   else if (EQ (coding_type, Qutf_16))
5715     {
5716       val = AREF (attrs, coding_attr_utf_bom);
5717       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5718                                     : EQ (val, Qt) ? utf_with_bom
5719                                     : utf_without_bom);
5720       val = AREF (attrs, coding_attr_utf_16_endian);
5721       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5722                                        : utf_16_little_endian);
5723       CODING_UTF_16_SURROGATE (coding) = 0;
5724       coding->detector = detect_coding_utf_16;
5725       coding->decoder = decode_coding_utf_16;
5726       coding->encoder = encode_coding_utf_16;
5727       coding->common_flags
5728         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5729       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5730         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5731     }
5732   else if (EQ (coding_type, Qccl))
5733     {
5734       coding->detector = detect_coding_ccl;
5735       coding->decoder = decode_coding_ccl;
5736       coding->encoder = encode_coding_ccl;
5737       coding->common_flags
5738         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5739             | CODING_REQUIRE_FLUSHING_MASK);
5740     }
5741   else if (EQ (coding_type, Qemacs_mule))
5742     {
5743       coding->detector = detect_coding_emacs_mule;
5744       coding->decoder = decode_coding_emacs_mule;
5745       coding->encoder = encode_coding_emacs_mule;
5746       coding->common_flags
5747         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5748       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5749           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5750         {
5751           Lisp_Object tail, safe_charsets;
5752           int max_charset_id = 0;
5753
5754           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5755                tail = XCDR (tail))
5756             if (max_charset_id < XFASTINT (XCAR (tail)))
5757               max_charset_id = XFASTINT (XCAR (tail));
5758           safe_charsets = make_uninit_string (max_charset_id + 1);
5759           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5760           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5761                tail = XCDR (tail))
5762             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5763           coding->max_charset_id = max_charset_id;
5764           coding->safe_charsets = SDATA (safe_charsets);
5765         }
5766       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5767       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5768     }
5769   else if (EQ (coding_type, Qshift_jis))
5770     {
5771       coding->detector = detect_coding_sjis;
5772       coding->decoder = decode_coding_sjis;
5773       coding->encoder = encode_coding_sjis;
5774       coding->common_flags
5775         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5776     }
5777   else if (EQ (coding_type, Qbig5))
5778     {
5779       coding->detector = detect_coding_big5;
5780       coding->decoder = decode_coding_big5;
5781       coding->encoder = encode_coding_big5;
5782       coding->common_flags
5783         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5784     }
5785   else                          /* EQ (coding_type, Qraw_text) */
5786     {
5787       coding->detector = NULL;
5788       coding->decoder = decode_coding_raw_text;
5789       coding->encoder = encode_coding_raw_text;
5790       if (! EQ (eol_type, Qunix))
5791         {
5792           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5793           if (! VECTORP (eol_type))
5794             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5795         }
5796
5797     }
5798
5799   return;
5800 }
5801
5802 /* Return a list of charsets supported by CODING.  */
5803
5804 Lisp_Object
5805 coding_charset_list (struct coding_system *coding)
5806 {
5807   Lisp_Object attrs, charset_list;
5808
5809   CODING_GET_INFO (coding, attrs, charset_list);
5810   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5811     {
5812       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5813
5814       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5815         charset_list = Viso_2022_charset_list;
5816     }
5817   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5818     {
5819       charset_list = Vemacs_mule_charset_list;
5820     }
5821   return charset_list;
5822 }
5823
5824
5825 /* Return a list of charsets supported by CODING-SYSTEM.  */
5826
5827 Lisp_Object
5828 coding_system_charset_list (Lisp_Object coding_system)
5829 {
5830   ptrdiff_t id;
5831   Lisp_Object attrs, charset_list;
5832
5833   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5834   attrs = CODING_ID_ATTRS (id);
5835
5836   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5837     {
5838       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5839
5840       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5841         charset_list = Viso_2022_charset_list;
5842       else
5843         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5844     }
5845   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5846     {
5847       charset_list = Vemacs_mule_charset_list;
5848     }
5849   else
5850     {
5851       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5852     }
5853   return charset_list;
5854 }
5855
5856
5857 /* Return raw-text or one of its subsidiaries that has the same
5858    eol_type as CODING-SYSTEM.  */
5859
5860 Lisp_Object
5861 raw_text_coding_system (Lisp_Object coding_system)
5862 {
5863   Lisp_Object spec, attrs;
5864   Lisp_Object eol_type, raw_text_eol_type;
5865
5866   if (NILP (coding_system))
5867     return Qraw_text;
5868   spec = CODING_SYSTEM_SPEC (coding_system);
5869   attrs = AREF (spec, 0);
5870
5871   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5872     return coding_system;
5873
5874   eol_type = AREF (spec, 2);
5875   if (VECTORP (eol_type))
5876     return Qraw_text;
5877   spec = CODING_SYSTEM_SPEC (Qraw_text);
5878   raw_text_eol_type = AREF (spec, 2);
5879   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5880           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5881           : AREF (raw_text_eol_type, 2));
5882 }
5883
5884
5885 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5886    the subsidiary that has the same eol-spec as PARENT (if it is not
5887    nil and specifies end-of-line format) or the system's setting
5888    (system_eol_type).  */
5889
5890 Lisp_Object
5891 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5892 {
5893   Lisp_Object spec, eol_type;
5894
5895   if (NILP (coding_system))
5896     coding_system = Qraw_text;
5897   spec = CODING_SYSTEM_SPEC (coding_system);
5898   eol_type = AREF (spec, 2);
5899   if (VECTORP (eol_type))
5900     {
5901       Lisp_Object parent_eol_type;
5902
5903       if (! NILP (parent))
5904         {
5905           Lisp_Object parent_spec;
5906
5907           parent_spec = CODING_SYSTEM_SPEC (parent);
5908           parent_eol_type = AREF (parent_spec, 2);
5909           if (VECTORP (parent_eol_type))
5910             parent_eol_type = system_eol_type;
5911         }
5912       else
5913         parent_eol_type = system_eol_type;
5914       if (EQ (parent_eol_type, Qunix))
5915         coding_system = AREF (eol_type, 0);
5916       else if (EQ (parent_eol_type, Qdos))
5917         coding_system = AREF (eol_type, 1);
5918       else if (EQ (parent_eol_type, Qmac))
5919         coding_system = AREF (eol_type, 2);
5920     }
5921   return coding_system;
5922 }
5923
5924
5925 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5926    decided for writing to a process.  If not, complement them, and
5927    return a new coding system.  */
5928
5929 Lisp_Object
5930 complement_process_encoding_system (Lisp_Object coding_system)
5931 {
5932   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5933   Lisp_Object spec, attrs;
5934   int i;
5935
5936   for (i = 0; i < 3; i++)
5937     {
5938       if (i == 1)
5939         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5940       else if (i == 2)
5941         coding_system = preferred_coding_system ();
5942       spec = CODING_SYSTEM_SPEC (coding_system);
5943       if (NILP (spec))
5944         continue;
5945       attrs = AREF (spec, 0);
5946       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5947         coding_base = CODING_ATTR_BASE_NAME (attrs);
5948       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5949         eol_base = coding_system;
5950       if (! NILP (coding_base) && ! NILP (eol_base))
5951         break;
5952     }
5953
5954   if (i > 0)
5955     /* The original CODING_SYSTEM didn't specify text-conversion or
5956        eol-conversion.  Be sure that we return a fully complemented
5957        coding system.  */
5958     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5959   return coding_system;
5960 }
5961
5962
5963 /* Emacs has a mechanism to automatically detect a coding system if it
5964    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5965    it's impossible to distinguish some coding systems accurately
5966    because they use the same range of codes.  So, at first, coding
5967    systems are categorized into the following categories:
5968
5969    o coding-category-emacs-mule
5970
5971         The category for a coding system which has the same code range
5972         as Emacs' internal format.  Assigned the coding-system (Lisp
5973         symbol) `emacs-mule' by default.
5974
5975    o coding-category-sjis
5976
5977         The category for a coding system which has the same code range
5978         as SJIS.  Assigned the coding-system (Lisp
5979         symbol) `japanese-shift-jis' by default.
5980
5981    o coding-category-iso-7
5982
5983         The category for a coding system which has the same code range
5984         as ISO2022 of 7-bit environment.  This doesn't use any locking
5985         shift and single shift functions.  This can encode/decode all
5986         charsets.  Assigned the coding-system (Lisp symbol)
5987         `iso-2022-7bit' by default.
5988
5989    o coding-category-iso-7-tight
5990
5991         Same as coding-category-iso-7 except that this can
5992         encode/decode only the specified charsets.
5993
5994    o coding-category-iso-8-1
5995
5996         The category for a coding system which has the same code range
5997         as ISO2022 of 8-bit environment and graphic plane 1 used only
5998         for DIMENSION1 charset.  This doesn't use any locking shift
5999         and single shift functions.  Assigned the coding-system (Lisp
6000         symbol) `iso-latin-1' by default.
6001
6002    o coding-category-iso-8-2
6003
6004         The category for a coding system which has the same code range
6005         as ISO2022 of 8-bit environment and graphic plane 1 used only
6006         for DIMENSION2 charset.  This doesn't use any locking shift
6007         and single shift functions.  Assigned the coding-system (Lisp
6008         symbol) `japanese-iso-8bit' by default.
6009
6010    o coding-category-iso-7-else
6011
6012         The category for a coding system which has the same code range
6013         as ISO2022 of 7-bit environment but uses locking shift or
6014         single shift functions.  Assigned the coding-system (Lisp
6015         symbol) `iso-2022-7bit-lock' by default.
6016
6017    o coding-category-iso-8-else
6018
6019         The category for a coding system which has the same code range
6020         as ISO2022 of 8-bit environment but uses locking shift or
6021         single shift functions.  Assigned the coding-system (Lisp
6022         symbol) `iso-2022-8bit-ss2' by default.
6023
6024    o coding-category-big5
6025
6026         The category for a coding system which has the same code range
6027         as BIG5.  Assigned the coding-system (Lisp symbol)
6028         `cn-big5' by default.
6029
6030    o coding-category-utf-8
6031
6032         The category for a coding system which has the same code range
6033         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6034         symbol) `utf-8' by default.
6035
6036    o coding-category-utf-16-be
6037
6038         The category for a coding system in which a text has an
6039         Unicode signature (cf. Unicode Standard) in the order of BIG
6040         endian at the head.  Assigned the coding-system (Lisp symbol)
6041         `utf-16-be' by default.
6042
6043    o coding-category-utf-16-le
6044
6045         The category for a coding system in which a text has an
6046         Unicode signature (cf. Unicode Standard) in the order of
6047         LITTLE endian at the head.  Assigned the coding-system (Lisp
6048         symbol) `utf-16-le' by default.
6049
6050    o coding-category-ccl
6051
6052         The category for a coding system of which encoder/decoder is
6053         written in CCL programs.  The default value is nil, i.e., no
6054         coding system is assigned.
6055
6056    o coding-category-binary
6057
6058         The category for a coding system not categorized in any of the
6059         above.  Assigned the coding-system (Lisp symbol)
6060         `no-conversion' by default.
6061
6062    Each of them is a Lisp symbol and the value is an actual
6063    `coding-system's (this is also a Lisp symbol) assigned by a user.
6064    What Emacs does actually is to detect a category of coding system.
6065    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6066    decide only one possible category, it selects a category of the
6067    highest priority.  Priorities of categories are also specified by a
6068    user in a Lisp variable `coding-category-list'.
6069
6070 */
6071
6072 #define EOL_SEEN_NONE   0
6073 #define EOL_SEEN_LF     1
6074 #define EOL_SEEN_CR     2
6075 #define EOL_SEEN_CRLF   4
6076
6077 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6078    SOURCE is encoded.  If CATEGORY is one of
6079    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6080    two-byte, else they are encoded by one-byte.
6081
6082    Return one of EOL_SEEN_XXX.  */
6083
6084 #define MAX_EOL_CHECK_COUNT 3
6085
6086 static int
6087 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6088             enum coding_category category)
6089 {
6090   const unsigned char *src = source, *src_end = src + src_bytes;
6091   unsigned char c;
6092   int total  = 0;
6093   int eol_seen = EOL_SEEN_NONE;
6094
6095   if ((1 << category) & CATEGORY_MASK_UTF_16)
6096     {
6097       bool msb = category == (coding_category_utf_16_le
6098                               | coding_category_utf_16_le_nosig);
6099       bool lsb = !msb;
6100
6101       while (src + 1 < src_end)
6102         {
6103           c = src[lsb];
6104           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6105             {
6106               int this_eol;
6107
6108               if (c == '\n')
6109                 this_eol = EOL_SEEN_LF;
6110               else if (src + 3 >= src_end
6111                        || src[msb + 2] != 0
6112                        || src[lsb + 2] != '\n')
6113                 this_eol = EOL_SEEN_CR;
6114               else
6115                 {
6116                   this_eol = EOL_SEEN_CRLF;
6117                   src += 2;
6118                 }
6119
6120               if (eol_seen == EOL_SEEN_NONE)
6121                 /* This is the first end-of-line.  */
6122                 eol_seen = this_eol;
6123               else if (eol_seen != this_eol)
6124                 {
6125                   /* The found type is different from what found before.
6126                      Allow for stray ^M characters in DOS EOL files.  */
6127                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6128                       || (eol_seen == EOL_SEEN_CRLF
6129                           && this_eol == EOL_SEEN_CR))
6130                     eol_seen = EOL_SEEN_CRLF;
6131                   else
6132                     {
6133                       eol_seen = EOL_SEEN_LF;
6134                       break;
6135                     }
6136                 }
6137               if (++total == MAX_EOL_CHECK_COUNT)
6138                 break;
6139             }
6140           src += 2;
6141         }
6142     }
6143   else
6144     while (src < src_end)
6145       {
6146         c = *src++;
6147         if (c == '\n' || c == '\r')
6148           {
6149             int this_eol;
6150
6151             if (c == '\n')
6152               this_eol = EOL_SEEN_LF;
6153             else if (src >= src_end || *src != '\n')
6154               this_eol = EOL_SEEN_CR;
6155             else
6156               this_eol = EOL_SEEN_CRLF, src++;
6157
6158             if (eol_seen == EOL_SEEN_NONE)
6159               /* This is the first end-of-line.  */
6160               eol_seen = this_eol;
6161             else if (eol_seen != this_eol)
6162               {
6163                 /* The found type is different from what found before.
6164                    Allow for stray ^M characters in DOS EOL files.  */
6165                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6166                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6167                   eol_seen = EOL_SEEN_CRLF;
6168                 else
6169                   {
6170                     eol_seen = EOL_SEEN_LF;
6171                     break;
6172                   }
6173               }
6174             if (++total == MAX_EOL_CHECK_COUNT)
6175               break;
6176           }
6177       }
6178   return eol_seen;
6179 }
6180
6181
6182 static Lisp_Object
6183 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6184 {
6185   Lisp_Object eol_type;
6186
6187   eol_type = CODING_ID_EOL_TYPE (coding->id);
6188   if (eol_seen & EOL_SEEN_LF)
6189     {
6190       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6191       eol_type = Qunix;
6192     }
6193   else if (eol_seen & EOL_SEEN_CRLF)
6194     {
6195       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6196       eol_type = Qdos;
6197     }
6198   else if (eol_seen & EOL_SEEN_CR)
6199     {
6200       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6201       eol_type = Qmac;
6202     }
6203   return eol_type;
6204 }
6205
6206 /* Detect how a text specified in CODING is encoded.  If a coding
6207    system is detected, update fields of CODING by the detected coding
6208    system.  */
6209
6210 static void
6211 detect_coding (struct coding_system *coding)
6212 {
6213   const unsigned char *src, *src_end;
6214   unsigned int saved_mode = coding->mode;
6215
6216   coding->consumed = coding->consumed_char = 0;
6217   coding->produced = coding->produced_char = 0;
6218   coding_set_source (coding);
6219
6220   src_end = coding->source + coding->src_bytes;
6221   coding->head_ascii = 0;
6222
6223   /* If we have not yet decided the text encoding type, detect it
6224      now.  */
6225   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6226     {
6227       int c, i;
6228       struct coding_detection_info detect_info;
6229       bool null_byte_found = 0, eight_bit_found = 0;
6230
6231       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6232       for (src = coding->source; src < src_end; src++)
6233         {
6234           c = *src;
6235           if (c & 0x80)
6236             {
6237               eight_bit_found = 1;
6238               if (null_byte_found)
6239                 break;
6240             }
6241           else if (c < 0x20)
6242             {
6243               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6244                   && ! inhibit_iso_escape_detection
6245                   && ! detect_info.checked)
6246                 {
6247                   if (detect_coding_iso_2022 (coding, &detect_info))
6248                     {
6249                       /* We have scanned the whole data.  */
6250                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6251                         {
6252                           /* We didn't find an 8-bit code.  We may
6253                              have found a null-byte, but it's very
6254                              rare that a binary file conforms to
6255                              ISO-2022.  */
6256                           src = src_end;
6257                           coding->head_ascii = src - coding->source;
6258                         }
6259                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6260                       break;
6261                     }
6262                 }
6263               else if (! c && !inhibit_null_byte_detection)
6264                 {
6265                   null_byte_found = 1;
6266                   if (eight_bit_found)
6267                     break;
6268                 }
6269               if (! eight_bit_found)
6270                 coding->head_ascii++;
6271             }
6272           else if (! eight_bit_found)
6273             coding->head_ascii++;
6274         }
6275
6276       if (null_byte_found || eight_bit_found
6277           || coding->head_ascii < coding->src_bytes
6278           || detect_info.found)
6279         {
6280           enum coding_category category;
6281           struct coding_system *this;
6282
6283           if (coding->head_ascii == coding->src_bytes)
6284             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6285             for (i = 0; i < coding_category_raw_text; i++)
6286               {
6287                 category = coding_priorities[i];
6288                 this = coding_categories + category;
6289                 if (detect_info.found & (1 << category))
6290                   break;
6291               }
6292           else
6293             {
6294               if (null_byte_found)
6295                 {
6296                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6297                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6298                 }
6299               for (i = 0; i < coding_category_raw_text; i++)
6300                 {
6301                   category = coding_priorities[i];
6302                   this = coding_categories + category;
6303                   /* Some of this->detector (e.g. detect_coding_sjis)
6304                      require this information.  */
6305                   coding->id = this->id;
6306                   if (this->id < 0)
6307                     {
6308                       /* No coding system of this category is defined.  */
6309                       detect_info.rejected |= (1 << category);
6310                     }
6311                   else if (category >= coding_category_raw_text)
6312                     continue;
6313                   else if (detect_info.checked & (1 << category))
6314                     {
6315                       if (detect_info.found & (1 << category))
6316                         break;
6317                     }
6318                   else if ((*(this->detector)) (coding, &detect_info)
6319                            && detect_info.found & (1 << category))
6320                     {
6321                       if (category == coding_category_utf_16_auto)
6322                         {
6323                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6324                             category = coding_category_utf_16_le;
6325                           else
6326                             category = coding_category_utf_16_be;
6327                         }
6328                       break;
6329                     }
6330                 }
6331             }
6332
6333           if (i < coding_category_raw_text)
6334             setup_coding_system (CODING_ID_NAME (this->id), coding);
6335           else if (null_byte_found)
6336             setup_coding_system (Qno_conversion, coding);
6337           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6338                    == CATEGORY_MASK_ANY)
6339             setup_coding_system (Qraw_text, coding);
6340           else if (detect_info.rejected)
6341             for (i = 0; i < coding_category_raw_text; i++)
6342               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6343                 {
6344                   this = coding_categories + coding_priorities[i];
6345                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6346                   break;
6347                 }
6348         }
6349     }
6350   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6351            == coding_category_utf_8_auto)
6352     {
6353       Lisp_Object coding_systems;
6354       struct coding_detection_info detect_info;
6355
6356       coding_systems
6357         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6358       detect_info.found = detect_info.rejected = 0;
6359       coding->head_ascii = 0;
6360       if (CONSP (coding_systems)
6361           && detect_coding_utf_8 (coding, &detect_info))
6362         {
6363           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6364             setup_coding_system (XCAR (coding_systems), coding);
6365           else
6366             setup_coding_system (XCDR (coding_systems), coding);
6367         }
6368     }
6369   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6370            == coding_category_utf_16_auto)
6371     {
6372       Lisp_Object coding_systems;
6373       struct coding_detection_info detect_info;
6374
6375       coding_systems
6376         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6377       detect_info.found = detect_info.rejected = 0;
6378       coding->head_ascii = 0;
6379       if (CONSP (coding_systems)
6380           && detect_coding_utf_16 (coding, &detect_info))
6381         {
6382           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6383             setup_coding_system (XCAR (coding_systems), coding);
6384           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6385             setup_coding_system (XCDR (coding_systems), coding);
6386         }
6387     }
6388   coding->mode = saved_mode;
6389 }
6390
6391
6392 static void
6393 decode_eol (struct coding_system *coding)
6394 {
6395   Lisp_Object eol_type;
6396   unsigned char *p, *pbeg, *pend;
6397
6398   eol_type = CODING_ID_EOL_TYPE (coding->id);
6399   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6400     return;
6401
6402   if (NILP (coding->dst_object))
6403     pbeg = coding->destination;
6404   else
6405     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6406   pend = pbeg + coding->produced;
6407
6408   if (VECTORP (eol_type))
6409     {
6410       int eol_seen = EOL_SEEN_NONE;
6411
6412       for (p = pbeg; p < pend; p++)
6413         {
6414           if (*p == '\n')
6415             eol_seen |= EOL_SEEN_LF;
6416           else if (*p == '\r')
6417             {
6418               if (p + 1 < pend && *(p + 1) == '\n')
6419                 {
6420                   eol_seen |= EOL_SEEN_CRLF;
6421                   p++;
6422                 }
6423               else
6424                 eol_seen |= EOL_SEEN_CR;
6425             }
6426         }
6427       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6428       if ((eol_seen & EOL_SEEN_CRLF) != 0
6429           && (eol_seen & EOL_SEEN_CR) != 0
6430           && (eol_seen & EOL_SEEN_LF) == 0)
6431         eol_seen = EOL_SEEN_CRLF;
6432       else if (eol_seen != EOL_SEEN_NONE
6433           && eol_seen != EOL_SEEN_LF
6434           && eol_seen != EOL_SEEN_CRLF
6435           && eol_seen != EOL_SEEN_CR)
6436         eol_seen = EOL_SEEN_LF;
6437       if (eol_seen != EOL_SEEN_NONE)
6438         eol_type = adjust_coding_eol_type (coding, eol_seen);
6439     }
6440
6441   if (EQ (eol_type, Qmac))
6442     {
6443       for (p = pbeg; p < pend; p++)
6444         if (*p == '\r')
6445           *p = '\n';
6446     }
6447   else if (EQ (eol_type, Qdos))
6448     {
6449       ptrdiff_t n = 0;
6450
6451       if (NILP (coding->dst_object))
6452         {
6453           /* Start deleting '\r' from the tail to minimize the memory
6454              movement.  */
6455           for (p = pend - 2; p >= pbeg; p--)
6456             if (*p == '\r')
6457               {
6458                 memmove (p, p + 1, pend-- - p - 1);
6459                 n++;
6460               }
6461         }
6462       else
6463         {
6464           ptrdiff_t pos_byte = coding->dst_pos_byte;
6465           ptrdiff_t pos = coding->dst_pos;
6466           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6467
6468           while (pos < pos_end)
6469             {
6470               p = BYTE_POS_ADDR (pos_byte);
6471               if (*p == '\r' && p[1] == '\n')
6472                 {
6473                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6474                   n++;
6475                   pos_end--;
6476                 }
6477               pos++;
6478               if (coding->dst_multibyte)
6479                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6480               else
6481                 pos_byte++;
6482             }
6483         }
6484       coding->produced -= n;
6485       coding->produced_char -= n;
6486     }
6487 }
6488
6489
6490 /* Return a translation table (or list of them) from coding system
6491    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6492    not ENCODEP). */
6493
6494 static Lisp_Object
6495 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6496 {
6497   Lisp_Object standard, translation_table;
6498   Lisp_Object val;
6499
6500   if (NILP (Venable_character_translation))
6501     {
6502       if (max_lookup)
6503         *max_lookup = 0;
6504       return Qnil;
6505     }
6506   if (encodep)
6507     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6508       standard = Vstandard_translation_table_for_encode;
6509   else
6510     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6511       standard = Vstandard_translation_table_for_decode;
6512   if (NILP (translation_table))
6513     translation_table = standard;
6514   else
6515     {
6516       if (SYMBOLP (translation_table))
6517         translation_table = Fget (translation_table, Qtranslation_table);
6518       else if (CONSP (translation_table))
6519         {
6520           translation_table = Fcopy_sequence (translation_table);
6521           for (val = translation_table; CONSP (val); val = XCDR (val))
6522             if (SYMBOLP (XCAR (val)))
6523               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6524         }
6525       if (CHAR_TABLE_P (standard))
6526         {
6527           if (CONSP (translation_table))
6528             translation_table = nconc2 (translation_table,
6529                                         Fcons (standard, Qnil));
6530           else
6531             translation_table = Fcons (translation_table,
6532                                        Fcons (standard, Qnil));
6533         }
6534     }
6535
6536   if (max_lookup)
6537     {
6538       *max_lookup = 1;
6539       if (CHAR_TABLE_P (translation_table)
6540           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6541         {
6542           val = XCHAR_TABLE (translation_table)->extras[1];
6543           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6544             *max_lookup = XFASTINT (val);
6545         }
6546       else if (CONSP (translation_table))
6547         {
6548           Lisp_Object tail;
6549
6550           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6551             if (CHAR_TABLE_P (XCAR (tail))
6552                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6553               {
6554                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6555                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6556                   *max_lookup = XFASTINT (tailval);
6557               }
6558         }
6559     }
6560   return translation_table;
6561 }
6562
/* Look up character C in TABLE and store the outcome in C and TRANS.
   TABLE is a char-table or a list whose char-table elements are
   consulted in order; any other TABLE just sets TRANS to nil.

   Whenever a char-table maps C to a character, C is replaced by that
   character and TRANS is reset to nil.  If a char-table maps C to some
   other non-nil value, TRANS is left set to that value; in the list
   case the remaining tables are then not consulted.

   C and TRANS must be lvalues.  The arguments are evaluated more than
   once, so they must not have side effects.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (table, c);                            \
        if (CHARACTERP (trans))                                         \
          (c) = XFASTINT (trans), (trans) = Qnil;                       \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = (table); CONSP (lookup_tail_);              \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), c);        \
              if (CHARACTERP (trans))                                   \
                (c) = XFASTINT (trans), (trans) = Qnil;                 \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6587
6588
6589 /* Return a translation of character(s) at BUF according to TRANS.
6590    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6591    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6592    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6593    translation is found, and Qnil if not found..
6594    If BUF is too short to lookup characters in FROM, return Qt.  */
6595
6596 static Lisp_Object
6597 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6598 {
6599
6600   if (INTEGERP (trans))
6601     return trans;
6602   for (; CONSP (trans); trans = XCDR (trans))
6603     {
6604       Lisp_Object val = XCAR (trans);
6605       Lisp_Object from = XCAR (val);
6606       ptrdiff_t len = ASIZE (from);
6607       ptrdiff_t i;
6608
6609       for (i = 0; i < len; i++)
6610         {
6611           if (buf + i == buf_end)
6612             return Qt;
6613           if (XINT (AREF (from, i)) != buf[i])
6614             break;
6615         }
6616       if (i == len)
6617         return val;
6618     }
6619   return Qnil;
6620 }
6621
6622
/* Store the characters decoded by CODING as bytes, starting at
   CODING->destination + CODING->produced.

   If CODING->chars_at_source is zero, the characters are the
   CODING->charbuf_used elements of CODING->charbuf.  Each non-negative
   element is a character and is translated by TRANSLATION_TABLE; a
   negative element -N starts an annotation datum of N elements, which
   is skipped here.  Otherwise the CODING->consumed bytes at
   CODING->source are copied, with a conversion when
   CODING->src_multibyte and CODING->dst_multibyte differ.

   LAST_BLOCK zero means more characters may follow: when a
   multi-character translation cannot be decided because the charbuf
   ends too early, stop there and leave the rest to the caller.

   Add the number of bytes and characters produced to CODING->produced
   and CODING->produced_char; if CODING->dst_object is a buffer and at
   least one character was produced, call insert_from_gap for them.

   Return the number of elements at the end of CODING->charbuf that
   were not processed (always 0 when CODING->chars_at_source).  */

static int
produce_chars (struct coding_system *coding, Lisp_Object translation_table,
               bool last_block)
{
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  ptrdiff_t produced;
  ptrdiff_t produced_chars = 0;
  int carryover = 0;

  if (! coding->chars_at_source)
    {
      /* Source characters are in coding->charbuf.  */
      int *buf = coding->charbuf;
      int *buf_end = buf + coding->charbuf_used;

      if (EQ (coding->src_object, coding->dst_object))
        {
          /* Source and destination are the same object: the
             destination may only extend up to the end of the source
             bytes consumed so far.  */
          coding_set_source (coding);
          dst_end = ((unsigned char *) coding->source) + coding->consumed;
        }

      while (buf < buf_end)
        {
          int c = *buf;
          ptrdiff_t i;

          if (c >= 0)
            {
              /* Number of charbuf elements this step consumes, and
                 number of characters it produces.  */
              ptrdiff_t from_nchars = 1, to_nchars = 1;
              Lisp_Object trans = Qnil;

              /* This either replaces C directly, or leaves a
                 non-character table entry in TRANS.  */
              LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
              if (! NILP (trans))
                {
                  trans = get_translation (trans, buf, buf_end);
                  if (INTEGERP (trans))
                    c = XINT (trans);
                  else if (CONSP (trans))
                    {
                      /* TRANS is ([FROM-CHAR ...] . TO).  */
                      from_nchars = ASIZE (XCAR (trans));
                      trans = XCDR (trans);
                      if (INTEGERP (trans))
                        c = XINT (trans);
                      else
                        {
                          /* TO is a vector; its first element is
                             stored now, the rest in the loop
                             below.  */
                          to_nchars = ASIZE (trans);
                          c = XINT (AREF (trans, 0));
                        }
                    }
                  /* The charbuf ended before the rule could be
                     decided.  Unless this is the last block, leave
                     the remaining elements as carryover.  */
                  else if (EQ (trans, Qt) && ! last_block)
                    break;
                }

              /* Make sure there is room for TO_NCHARS characters of
                 the maximum multibyte length.  */
              if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
                {
                  /* Guard the size computation below against
                     overflow.  */
                  if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
                       / MAX_MULTIBYTE_LENGTH)
                      < to_nchars)
                    memory_full (SIZE_MAX);
                  dst = alloc_destination (coding,
                                           buf_end - buf
                                           + MAX_MULTIBYTE_LENGTH * to_nchars,
                                           dst);
                  /* Recompute the end of the destination the same way
                     it was set up initially.  */
                  if (EQ (coding->src_object, coding->dst_object))
                    {
                      coding_set_source (coding);
                      dst_end = (((unsigned char *) coding->source)
                                 + coding->consumed);
                    }
                  else
                    dst_end = coding->destination + coding->dst_bytes;
                }

              for (i = 0; i < to_nchars; i++)
                {
                  if (i > 0)
                    c = XINT (AREF (trans, i));
                  /* A character satisfying CHAR_BYTE8_P that goes to
                     a unibyte destination is stored as the single
                     byte CHAR_TO_BYTE8 gives; everything else is
                     stored by CHAR_STRING_ADVANCE_NO_UNIFY.  */
                  if (coding->dst_multibyte
                      || ! CHAR_BYTE8_P (c))
                    CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
                  else
                    *dst++ = CHAR_TO_BYTE8 (c);
                }
              produced_chars += to_nchars;
              buf += from_nchars;
            }
          else
            /* This is an annotation datum.  (-C) is the length.  */
            buf += -c;
        }
      /* Elements left unprocessed by the `break' above.  */
      carryover = buf_end - buf;
    }
  else
    {
      /* Source characters are at coding->source.  */
      const unsigned char *src = coding->source;
      const unsigned char *src_end = src + coding->consumed;

      /* When source and destination are the same object, never write
         beyond the bytes already read.  */
      if (EQ (coding->dst_object, coding->src_object))
        dst_end = (unsigned char *) src;
      if (coding->src_multibyte != coding->dst_multibyte)
        {
          if (coding->src_multibyte)
            {
              /* Multibyte source, unibyte destination.
                 NOTE(review): multibytep, consumed_chars and src_base
                 look unused here; presumably ONE_MORE_BYTE (defined
                 elsewhere in this file) refers to them and jumps to
                 the label no_more_source at the end of the source --
                 confirm before renaming or removing any of them.  */
              bool multibytep = 1;
              ptrdiff_t consumed_chars = 0;

              while (1)
                {
                  const unsigned char *src_base = src;
                  int c;

                  ONE_MORE_BYTE (c);
                  if (dst == dst_end)
                    {
                      /* With a shared object, reading has advanced
                         SRC, so more room may be available now.  */
                      if (EQ (coding->src_object, coding->dst_object))
                        dst_end = (unsigned char *) src;
                      if (dst == dst_end)
                        {
                          ptrdiff_t offset = src - coding->source;

                          dst = alloc_destination (coding, src_end - src + 1,
                                                   dst);
                          dst_end = coding->destination + coding->dst_bytes;
                          /* Refresh the source pointers from the
                             saved offset after the allocation.  */
                          coding_set_source (coding);
                          src = coding->source + offset;
                          src_end = coding->source + coding->consumed;
                          if (EQ (coding->src_object, coding->dst_object))
                            dst_end = (unsigned char *) src;
                        }
                    }
                  *dst++ = c;
                  produced_chars++;
                }
            no_more_source:
              ;
            }
          else
            /* Unibyte source, multibyte destination.  */
            while (src < src_end)
              {
                /* NOTE(review): presumably consulted by EMIT_ONE_BYTE
                   (defined elsewhere in this file) -- confirm.  */
                bool multibytep = 1;
                int c = *src++;

                /* Keep room for two bytes; presumably EMIT_ONE_BYTE
                   may store that many for one source byte.  */
                if (dst >= dst_end - 1)
                  {
                    if (EQ (coding->src_object, coding->dst_object))
                      dst_end = (unsigned char *) src;
                    if (dst >= dst_end - 1)
                      {
                        ptrdiff_t offset = src - coding->source;
                        ptrdiff_t more_bytes;

                        if (EQ (coding->src_object, coding->dst_object))
                          more_bytes = ((src_end - src) / 2) + 2;
                        else
                          more_bytes = src_end - src + 2;
                        dst = alloc_destination (coding, more_bytes, dst);
                        dst_end = coding->destination + coding->dst_bytes;
                        /* Refresh the source pointers from the saved
                           offset after the allocation.  */
                        coding_set_source (coding);
                        src = coding->source + offset;
                        src_end = coding->source + coding->consumed;
                        if (EQ (coding->src_object, coding->dst_object))
                          dst_end = (unsigned char *) src;
                      }
                  }
                EMIT_ONE_BYTE (c);
              }
        }
      else
        {
          /* Source and destination have the same multibyteness; the
             bytes are copied unchanged.  */
          if (!EQ (coding->src_object, coding->dst_object))
            {
              ptrdiff_t require = coding->src_bytes - coding->dst_bytes;

              if (require > 0)
                {
                  ptrdiff_t offset = src - coding->source;

                  dst = alloc_destination (coding, require, dst);
                  coding_set_source (coding);
                  src = coding->source + offset;
                  src_end = coding->source + coding->consumed;
                }
            }
          produced_chars = coding->consumed_char;
          while (src < src_end)
            *dst++ = *src++;
        }
    }

  /* Number of bytes stored by this call.  */
  produced = dst - (coding->destination + coding->produced);
  if (BUFFERP (coding->dst_object) && produced_chars > 0)
    insert_from_gap (produced_chars, produced);
  coding->produced += produced;
  coding->produced_char += produced_chars;
  return carryover;
}
6821
6822 /* Compose text in CODING->object according to the annotation data at
6823    CHARBUF.  CHARBUF is an array:
6824      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6825  */
6826
6827 static void
6828 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6829 {
6830   int len;
6831   ptrdiff_t to;
6832   enum composition_method method;
6833   Lisp_Object components;
6834
6835   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6836   to = pos + charbuf[2];
6837   method = (enum composition_method) (charbuf[4]);
6838
6839   if (method == COMPOSITION_RELATIVE)
6840     components = Qnil;
6841   else
6842     {
6843       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6844       int i, j;
6845
6846       if (method == COMPOSITION_WITH_RULE)
6847         len = charbuf[2] * 3 - 2;
6848       charbuf += MAX_ANNOTATION_LENGTH;
6849       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6850       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6851         {
6852           if (charbuf[i] >= 0)
6853             args[j] = make_number (charbuf[i]);
6854           else
6855             {
6856               i++;
6857               args[j] = make_number (charbuf[i] % 0x100);
6858             }
6859         }
6860       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6861     }
6862   compose_text (pos, to, components, Qnil, coding->dst_object);
6863 }
6864
6865
6866 /* Put `charset' property on text in CODING->object according to
6867    the annotation data at CHARBUF.  CHARBUF is an array:
6868      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6869  */
6870
6871 static void
6872 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6873 {
6874   ptrdiff_t from = pos - charbuf[2];
6875   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6876
6877   Fput_text_property (make_number (from), make_number (pos),
6878                       Qcharset, CHARSET_NAME (charset),
6879                       coding->dst_object);
6880 }
6881
6882
/* Number of `int' elements first tried for the work area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work area CODING->charbuf with alloca and record its
   size, in elements, in CODING->charbuf_size.  Start with
   CHARBUF_SIZE elements and halve the request while alloca yields a
   null pointer and the size is still above 1024 elements.

   If no area could be obtained, record
   CODING_RESULT_INSUFFICIENT_MEM and return from the calling function,
   which therefore must return void.  Since the area comes from
   alloca, it presumably lives only until the calling function
   returns.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int work_size_;                                                     \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    for (work_size_ = CHARBUF_SIZE; work_size_ > 1024;                  \
         work_size_ >>= 1)                                              \
      {                                                                 \
        (coding)->charbuf = alloca (sizeof (int) * work_size_);         \
        if ((coding)->charbuf)                                          \
          break;                                                        \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return;                                                         \
      }                                                                 \
    (coding)->charbuf_size = work_size_;                                \
  } while (0)
6904
6905
6906 static void
6907 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6908 {
6909   int *charbuf = coding->charbuf;
6910   int *charbuf_end = charbuf + coding->charbuf_used;
6911
6912   if (NILP (coding->dst_object))
6913     return;
6914
6915   while (charbuf < charbuf_end)
6916     {
6917       if (*charbuf >= 0)
6918         pos++, charbuf++;
6919       else
6920         {
6921           int len = -*charbuf;
6922
6923           if (len > 2)
6924             switch (charbuf[1])
6925               {
6926               case CODING_ANNOTATE_COMPOSITION_MASK:
6927                 produce_composition (coding, charbuf, pos);
6928                 break;
6929               case CODING_ANNOTATE_CHARSET_MASK:
6930                 produce_charset (coding, charbuf, pos);
6931                 break;
6932               }
6933           charbuf += len;
6934         }
6935     }
6936 }
6937
6938 /* Decode the data at CODING->src_object into CODING->dst_object.
6939    CODING->src_object is a buffer, a string, or nil.
6940    CODING->dst_object is a buffer.
6941
6942    If CODING->src_object is a buffer, it must be the current buffer.
6943    In this case, if CODING->src_pos is positive, it is a position of
6944    the source text in the buffer, otherwise, the source text is in the
6945    gap area of the buffer, and CODING->src_pos specifies the offset of
6946    the text from GPT (which must be the same as PT).  If this is the
6947    same buffer as CODING->dst_object, CODING->src_pos must be
6948    negative.
6949
6950    If CODING->src_object is a string, CODING->src_pos is an index to
6951    that string.
6952
6953    If CODING->src_object is nil, CODING->source must already point to
6954    the non-relocatable memory area.  In this case, CODING->src_pos is
6955    an offset from CODING->source.
6956
6957    The decoded data is inserted at the current point of the buffer
6958    CODING->dst_object.
6959 */
6960
6961 static void
6962 decode_coding (struct coding_system *coding)
6963 {
6964   Lisp_Object attrs;
6965   Lisp_Object undo_list;
6966   Lisp_Object translation_table;
6967   struct ccl_spec cclspec;
6968   int carryover;
6969   int i;
6970
6971   if (BUFFERP (coding->src_object)
6972       && coding->src_pos > 0
6973       && coding->src_pos < GPT
6974       && coding->src_pos + coding->src_chars > GPT)
6975     move_gap_both (coding->src_pos, coding->src_pos_byte);
6976
6977   undo_list = Qt;
6978   if (BUFFERP (coding->dst_object))
6979     {
6980       set_buffer_internal (XBUFFER (coding->dst_object));
6981       if (GPT != PT)
6982         move_gap_both (PT, PT_BYTE);
6983
6984       /* We must disable undo_list in order to record the whole insert
6985          transaction via record_insert at the end.  But doing so also
6986          disables the recording of the first change to the undo_list.
6987          Therefore we check for first change here and record it via
6988          record_first_change if needed.  */
6989       if (MODIFF <= SAVE_MODIFF)
6990         record_first_change ();
6991
6992       undo_list = BVAR (current_buffer, undo_list);
6993       bset_undo_list (current_buffer, Qt);
6994     }
6995
6996   coding->consumed = coding->consumed_char = 0;
6997   coding->produced = coding->produced_char = 0;
6998   coding->chars_at_source = 0;
6999   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7000   coding->errors = 0;
7001
7002   ALLOC_CONVERSION_WORK_AREA (coding);
7003
7004   attrs = CODING_ID_ATTRS (coding->id);
7005   translation_table = get_translation_table (attrs, 0, NULL);
7006
7007   carryover = 0;
7008   if (coding->decoder == decode_coding_ccl)
7009     {
7010       coding->spec.ccl = &cclspec;
7011       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7012     }
7013   do
7014     {
7015       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7016
7017       coding_set_source (coding);
7018       coding->annotated = 0;
7019       coding->charbuf_used = carryover;
7020       (*(coding->decoder)) (coding);
7021       coding_set_destination (coding);
7022       carryover = produce_chars (coding, translation_table, 0);
7023       if (coding->annotated)
7024         produce_annotation (coding, pos);
7025       for (i = 0; i < carryover; i++)
7026         coding->charbuf[i]
7027           = coding->charbuf[coding->charbuf_used - carryover + i];
7028     }
7029   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7030          || (coding->consumed < coding->src_bytes
7031              && (coding->result == CODING_RESULT_SUCCESS
7032                  || coding->result == CODING_RESULT_INVALID_SRC)));
7033
7034   if (carryover > 0)
7035     {
7036       coding_set_destination (coding);
7037       coding->charbuf_used = carryover;
7038       produce_chars (coding, translation_table, 1);
7039     }
7040
7041   coding->carryover_bytes = 0;
7042   if (coding->consumed < coding->src_bytes)
7043     {
7044       int nbytes = coding->src_bytes - coding->consumed;
7045       const unsigned char *src;
7046
7047       coding_set_source (coding);
7048       coding_set_destination (coding);
7049       src = coding->source + coding->consumed;
7050
7051       if (coding->mode & CODING_MODE_LAST_BLOCK)
7052         {
7053           /* Flush out unprocessed data as binary chars.  We are sure
7054              that the number of data is less than the size of
7055              coding->charbuf.  */
7056           coding->charbuf_used = 0;
7057           coding->chars_at_source = 0;
7058
7059           while (nbytes-- > 0)
7060             {
7061               int c = *src++;
7062
7063               if (c & 0x80)
7064                 c = BYTE8_TO_CHAR (c);
7065               coding->charbuf[coding->charbuf_used++] = c;
7066             }
7067           produce_chars (coding, Qnil, 1);
7068         }
7069       else
7070         {
7071           /* Record unprocessed bytes in coding->carryover.  We are
7072              sure that the number of data is less than the size of
7073              coding->carryover.  */
7074           unsigned char *p = coding->carryover;
7075
7076           if (nbytes > sizeof coding->carryover)
7077             nbytes = sizeof coding->carryover;
7078           coding->carryover_bytes = nbytes;
7079           while (nbytes-- > 0)
7080             *p++ = *src++;
7081         }
7082       coding->consumed = coding->src_bytes;
7083     }
7084
7085   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7086       && !inhibit_eol_conversion)
7087     decode_eol (coding);
7088   if (BUFFERP (coding->dst_object))
7089     {
7090       bset_undo_list (current_buffer, undo_list);
7091       record_insert (coding->dst_pos, coding->produced_char);
7092     }
7093 }
7094
7095
7096 /* Extract an annotation datum from a composition starting at POS and
7097    ending before LIMIT of CODING->src_object (buffer or string), store
7098    the data in BUF, set *STOP to a starting position of the next
7099    composition (if any) or to LIMIT, and return the address of the
7100    next element of BUF.
7101
7102    If such an annotation is not found, set *STOP to a starting
7103    position of a composition after POS (if any) or to LIMIT, and
7104    return BUF.  */
7105
7106 static int *
7107 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7108                                struct coding_system *coding, int *buf,
7109                                ptrdiff_t *stop)
7110 {
7111   ptrdiff_t start, end;
7112   Lisp_Object prop;
7113
7114   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7115       || end > limit)
7116     *stop = limit;
7117   else if (start > pos)
7118     *stop = start;
7119   else
7120     {
7121       if (start == pos)
7122         {
7123           /* We found a composition.  Store the corresponding
7124              annotation data in BUF.  */
7125           int *head = buf;
7126           enum composition_method method = COMPOSITION_METHOD (prop);
7127           int nchars = COMPOSITION_LENGTH (prop);
7128
7129           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7130           if (method != COMPOSITION_RELATIVE)
7131             {
7132               Lisp_Object components;
7133               ptrdiff_t i, len, i_byte;
7134
7135               components = COMPOSITION_COMPONENTS (prop);
7136               if (VECTORP (components))
7137                 {
7138                   len = ASIZE (components);
7139                   for (i = 0; i < len; i++)
7140                     *buf++ = XINT (AREF (components, i));
7141                 }
7142               else if (STRINGP (components))
7143                 {
7144                   len = SCHARS (components);
7145                   i = i_byte = 0;
7146                   while (i < len)
7147                     {
7148                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7149                       buf++;
7150                     }
7151                 }
7152               else if (INTEGERP (components))
7153                 {
7154                   len = 1;
7155                   *buf++ = XINT (components);
7156                 }
7157               else if (CONSP (components))
7158                 {
7159                   for (len = 0; CONSP (components);
7160                        len++, components = XCDR (components))
7161                     *buf++ = XINT (XCAR (components));
7162                 }
7163               else
7164                 emacs_abort ();
7165               *head -= len;
7166             }
7167         }
7168
7169       if (find_composition (end, limit, &start, &end, &prop,
7170                             coding->src_object)
7171           && end <= limit)
7172         *stop = start;
7173       else
7174         *stop = limit;
7175     }
7176   return buf;
7177 }
7178
7179
7180 /* Extract an annotation datum from a text property `charset' at POS of
7181    CODING->src_object (buffer of string), store the data in BUF, set
7182    *STOP to the position where the value of `charset' property changes
7183    (limiting by LIMIT), and return the address of the next element of
7184    BUF.
7185
7186    If the property value is nil, set *STOP to the position where the
7187    property value is non-nil (limiting by LIMIT), and return BUF.  */
7188
7189 static int *
7190 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7191                            struct coding_system *coding, int *buf,
7192                            ptrdiff_t *stop)
7193 {
7194   Lisp_Object val, next;
7195   int id;
7196
7197   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7198   if (! NILP (val) && CHARSETP (val))
7199     id = XINT (CHARSET_SYMBOL_ID (val));
7200   else
7201     id = -1;
7202   ADD_CHARSET_DATA (buf, 0, id);
7203   next = Fnext_single_property_change (make_number (pos), Qcharset,
7204                                        coding->src_object,
7205                                        make_number (limit));
7206   *stop = XINT (next);
7207   return buf;
7208 }
7209
7210
7211 static void
7212 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7213                int max_lookup)
7214 {
7215   int *buf = coding->charbuf;
7216   int *buf_end = coding->charbuf + coding->charbuf_size;
7217   const unsigned char *src = coding->source + coding->consumed;
7218   const unsigned char *src_end = coding->source + coding->src_bytes;
7219   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7220   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7221   bool multibytep = coding->src_multibyte;
7222   Lisp_Object eol_type;
7223   int c;
7224   ptrdiff_t stop, stop_composition, stop_charset;
7225   int *lookup_buf = NULL;
7226
7227   if (! NILP (translation_table))
7228     lookup_buf = alloca (sizeof (int) * max_lookup);
7229
7230   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7231   if (VECTORP (eol_type))
7232     eol_type = Qunix;
7233
7234   /* Note: composition handling is not yet implemented.  */
7235   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7236
7237   if (NILP (coding->src_object))
7238     stop = stop_composition = stop_charset = end_pos;
7239   else
7240     {
7241       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7242         stop = stop_composition = pos;
7243       else
7244         stop = stop_composition = end_pos;
7245       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7246         stop = stop_charset = pos;
7247       else
7248         stop_charset = end_pos;
7249     }
7250
7251   /* Compensate for CRLF and conversion.  */
7252   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7253   while (buf < buf_end)
7254     {
7255       Lisp_Object trans;
7256
7257       if (pos == stop)
7258         {
7259           if (pos == end_pos)
7260             break;
7261           if (pos == stop_composition)
7262             buf = handle_composition_annotation (pos, end_pos, coding,
7263                                                  buf, &stop_composition);
7264           if (pos == stop_charset)
7265             buf = handle_charset_annotation (pos, end_pos, coding,
7266                                              buf, &stop_charset);
7267           stop = (stop_composition < stop_charset
7268                   ? stop_composition : stop_charset);
7269         }
7270
7271       if (! multibytep)
7272         {
7273           int bytes;
7274
7275           if (coding->encoder == encode_coding_raw_text
7276               || coding->encoder == encode_coding_ccl)
7277             c = *src++, pos++;
7278           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7279             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7280           else
7281             c = BYTE8_TO_CHAR (*src), src++, pos++;
7282         }
7283       else
7284         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7285       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7286         c = '\n';
7287       if (! EQ (eol_type, Qunix))
7288         {
7289           if (c == '\n')
7290             {
7291               if (EQ (eol_type, Qdos))
7292                 *buf++ = '\r';
7293               else
7294                 c = '\r';
7295             }
7296         }
7297
7298       trans = Qnil;
7299       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7300       if (NILP (trans))
7301         *buf++ = c;
7302       else
7303         {
7304           ptrdiff_t from_nchars = 1, to_nchars = 1;
7305           int *lookup_buf_end;
7306           const unsigned char *p = src;
7307           int i;
7308
7309           lookup_buf[0] = c;
7310           for (i = 1; i < max_lookup && p < src_end; i++)
7311             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7312           lookup_buf_end = lookup_buf + i;
7313           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7314           if (INTEGERP (trans))
7315             c = XINT (trans);
7316           else if (CONSP (trans))
7317             {
7318               from_nchars = ASIZE (XCAR (trans));
7319               trans = XCDR (trans);
7320               if (INTEGERP (trans))
7321                 c = XINT (trans);
7322               else
7323                 {
7324                   to_nchars = ASIZE (trans);
7325                   if (buf_end - buf < to_nchars)
7326                     break;
7327                   c = XINT (AREF (trans, 0));
7328                 }
7329             }
7330           else
7331             break;
7332           *buf++ = c;
7333           for (i = 1; i < to_nchars; i++)
7334             *buf++ = XINT (AREF (trans, i));
7335           for (i = 1; i < from_nchars; i++, pos++)
7336             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7337         }
7338     }
7339
7340   coding->consumed = src - coding->source;
7341   coding->consumed_char = pos - coding->src_pos;
7342   coding->charbuf_used = buf - coding->charbuf;
7343   coding->chars_at_source = 0;
7344 }
7345
7346
7347 /* Encode the text at CODING->src_object into CODING->dst_object.
7348    CODING->src_object is a buffer or a string.
7349    CODING->dst_object is a buffer or nil.
7350
7351    If CODING->src_object is a buffer, it must be the current buffer.
7352    In this case, if CODING->src_pos is positive, it is a position of
7353    the source text in the buffer, otherwise. the source text is in the
7354    gap area of the buffer, and coding->src_pos specifies the offset of
7355    the text from GPT (which must be the same as PT).  If this is the
7356    same buffer as CODING->dst_object, CODING->src_pos must be
7357    negative and CODING should not have `pre-write-conversion'.
7358
7359    If CODING->src_object is a string, CODING should not have
7360    `pre-write-conversion'.
7361
7362    If CODING->dst_object is a buffer, the encoded data is inserted at
7363    the current point of that buffer.
7364
7365    If CODING->dst_object is nil, the encoded data is placed at the
7366    memory area specified by CODING->destination.  */
7367
7368 static void
7369 encode_coding (struct coding_system *coding)
7370 {
7371   Lisp_Object attrs;
7372   Lisp_Object translation_table;
7373   int max_lookup;
7374   struct ccl_spec cclspec;
7375
7376   attrs = CODING_ID_ATTRS (coding->id);
7377   if (coding->encoder == encode_coding_raw_text)
7378     translation_table = Qnil, max_lookup = 0;
7379   else
7380     translation_table = get_translation_table (attrs, 1, &max_lookup);
7381
7382   if (BUFFERP (coding->dst_object))
7383     {
7384       set_buffer_internal (XBUFFER (coding->dst_object));
7385       coding->dst_multibyte
7386         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7387     }
7388
7389   coding->consumed = coding->consumed_char = 0;
7390   coding->produced = coding->produced_char = 0;
7391   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7392   coding->errors = 0;
7393
7394   ALLOC_CONVERSION_WORK_AREA (coding);
7395
7396   if (coding->encoder == encode_coding_ccl)
7397     {
7398       coding->spec.ccl = &cclspec;
7399       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7400     }
7401   do {
7402     coding_set_source (coding);
7403     consume_chars (coding, translation_table, max_lookup);
7404     coding_set_destination (coding);
7405     (*(coding->encoder)) (coding);
7406   } while (coding->consumed_char < coding->src_chars);
7407
7408   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7409     insert_from_gap (coding->produced_char, coding->produced);
7410 }
7411
7412
/* Name (or base name) of work buffer for code conversion.
   make_conversion_work_buffer passes it to Fget_buffer_create for the
   reused buffer and to Fgenerate_new_buffer_name for the others.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* True iff Vcode_conversion_reused_workbuf is already in use.  Set by
   make_conversion_work_buffer when it hands out that buffer, and
   cleared by code_conversion_restore.  */
static bool reused_workbuf_in_use;
7425
7426
7427 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7428    multibyteness of returning buffer.  */
7429
7430 static Lisp_Object
7431 make_conversion_work_buffer (bool multibyte)
7432 {
7433   Lisp_Object name, workbuf;
7434   struct buffer *current;
7435
7436   if (reused_workbuf_in_use)
7437     {
7438       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7439       workbuf = Fget_buffer_create (name);
7440     }
7441   else
7442     {
7443       reused_workbuf_in_use = 1;
7444       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7445         Vcode_conversion_reused_workbuf
7446           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7447       workbuf = Vcode_conversion_reused_workbuf;
7448     }
7449   current = current_buffer;
7450   set_buffer_internal (XBUFFER (workbuf));
7451   /* We can't allow modification hooks to run in the work buffer.  For
7452      instance, directory_files_internal assumes that file decoding
7453      doesn't compile new regexps.  */
7454   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7455   Ferase_buffer ();
7456   bset_undo_list (current_buffer, Qt);
7457   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7458   set_buffer_internal (current);
7459   return workbuf;
7460 }
7461
7462
7463 static Lisp_Object
7464 code_conversion_restore (Lisp_Object arg)
7465 {
7466   Lisp_Object current, workbuf;
7467   struct gcpro gcpro1;
7468
7469   GCPRO1 (arg);
7470   current = XCAR (arg);
7471   workbuf = XCDR (arg);
7472   if (! NILP (workbuf))
7473     {
7474       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7475         reused_workbuf_in_use = 0;
7476       else
7477         Fkill_buffer (workbuf);
7478     }
7479   set_buffer_internal (XBUFFER (current));
7480   UNGCPRO;
7481   return Qnil;
7482 }
7483
7484 Lisp_Object
7485 code_conversion_save (bool with_work_buf, bool multibyte)
7486 {
7487   Lisp_Object workbuf = Qnil;
7488
7489   if (with_work_buf)
7490     workbuf = make_conversion_work_buffer (multibyte);
7491   record_unwind_protect (code_conversion_restore,
7492                          Fcons (Fcurrent_buffer (), workbuf));
7493   return workbuf;
7494 }
7495
7496 void
7497 decode_coding_gap (struct coding_system *coding,
7498                    ptrdiff_t chars, ptrdiff_t bytes)
7499 {
7500   ptrdiff_t count = SPECPDL_INDEX ();
7501   Lisp_Object attrs;
7502
7503   code_conversion_save (0, 0);
7504
7505   coding->src_object = Fcurrent_buffer ();
7506   coding->src_chars = chars;
7507   coding->src_bytes = bytes;
7508   coding->src_pos = -chars;
7509   coding->src_pos_byte = -bytes;
7510   coding->src_multibyte = chars < bytes;
7511   coding->dst_object = coding->src_object;
7512   coding->dst_pos = PT;
7513   coding->dst_pos_byte = PT_BYTE;
7514   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7515
7516   if (CODING_REQUIRE_DETECTION (coding))
7517     detect_coding (coding);
7518
7519   coding->mode |= CODING_MODE_LAST_BLOCK;
7520   current_buffer->text->inhibit_shrinking = 1;
7521   decode_coding (coding);
7522   current_buffer->text->inhibit_shrinking = 0;
7523
7524   attrs = CODING_ID_ATTRS (coding->id);
7525   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7526     {
7527       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7528       Lisp_Object val;
7529
7530       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7531       val = call1 (CODING_ATTR_POST_READ (attrs),
7532                    make_number (coding->produced_char));
7533       CHECK_NATNUM (val);
7534       coding->produced_char += Z - prev_Z;
7535       coding->produced += Z_BYTE - prev_Z_BYTE;
7536     }
7537
7538   unbind_to (count, Qnil);
7539 }
7540
7541
7542 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7543    SRC_OBJECT into DST_OBJECT by coding context CODING.
7544
7545    SRC_OBJECT is a buffer, a string, or Qnil.
7546
7547    If it is a buffer, the text is at point of the buffer.  FROM and TO
7548    are positions in the buffer.
7549
7550    If it is a string, the text is at the beginning of the string.
7551    FROM and TO are indices to the string.
7552
7553    If it is nil, the text is at coding->source.  FROM and TO are
7554    indices to coding->source.
7555
7556    DST_OBJECT is a buffer, Qt, or Qnil.
7557
7558    If it is a buffer, the decoded text is inserted at point of the
7559    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7560    is deleted.
7561
7562    If it is Qt, a string is made from the decoded text, and
7563    set in CODING->dst_object.
7564
7565    If it is Qnil, the decoded text is stored at CODING->destination.
7566    The caller must allocate CODING->dst_bytes bytes at
7567    CODING->destination by xmalloc.  If the decoded text is longer than
7568    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7569  */
7570
7571 void
7572 decode_coding_object (struct coding_system *coding,
7573                       Lisp_Object src_object,
7574                       ptrdiff_t from, ptrdiff_t from_byte,
7575                       ptrdiff_t to, ptrdiff_t to_byte,
7576                       Lisp_Object dst_object)
7577 {
7578   ptrdiff_t count = SPECPDL_INDEX ();
7579   unsigned char *destination IF_LINT (= NULL);
7580   ptrdiff_t dst_bytes IF_LINT (= 0);
7581   ptrdiff_t chars = to - from;
7582   ptrdiff_t bytes = to_byte - from_byte;
7583   Lisp_Object attrs;
7584   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7585   bool need_marker_adjustment = 0;
7586   Lisp_Object old_deactivate_mark;
7587
7588   old_deactivate_mark = Vdeactivate_mark;
7589
7590   if (NILP (dst_object))
7591     {
7592       destination = coding->destination;
7593       dst_bytes = coding->dst_bytes;
7594     }
7595
7596   coding->src_object = src_object;
7597   coding->src_chars = chars;
7598   coding->src_bytes = bytes;
7599   coding->src_multibyte = chars < bytes;
7600
7601   if (STRINGP (src_object))
7602     {
7603       coding->src_pos = from;
7604       coding->src_pos_byte = from_byte;
7605     }
7606   else if (BUFFERP (src_object))
7607     {
7608       set_buffer_internal (XBUFFER (src_object));
7609       if (from != GPT)
7610         move_gap_both (from, from_byte);
7611       if (EQ (src_object, dst_object))
7612         {
7613           struct Lisp_Marker *tail;
7614
7615           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7616             {
7617               tail->need_adjustment
7618                 = tail->charpos == (tail->insertion_type ? from : to);
7619               need_marker_adjustment |= tail->need_adjustment;
7620             }
7621           saved_pt = PT, saved_pt_byte = PT_BYTE;
7622           TEMP_SET_PT_BOTH (from, from_byte);
7623           current_buffer->text->inhibit_shrinking = 1;
7624           del_range_both (from, from_byte, to, to_byte, 1);
7625           coding->src_pos = -chars;
7626           coding->src_pos_byte = -bytes;
7627         }
7628       else
7629         {
7630           coding->src_pos = from;
7631           coding->src_pos_byte = from_byte;
7632         }
7633     }
7634
7635   if (CODING_REQUIRE_DETECTION (coding))
7636     detect_coding (coding);
7637   attrs = CODING_ID_ATTRS (coding->id);
7638
7639   if (EQ (dst_object, Qt)
7640       || (! NILP (CODING_ATTR_POST_READ (attrs))
7641           && NILP (dst_object)))
7642     {
7643       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7644       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7645       coding->dst_pos = BEG;
7646       coding->dst_pos_byte = BEG_BYTE;
7647     }
7648   else if (BUFFERP (dst_object))
7649     {
7650       code_conversion_save (0, 0);
7651       coding->dst_object = dst_object;
7652       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7653       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7654       coding->dst_multibyte
7655         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7656     }
7657   else
7658     {
7659       code_conversion_save (0, 0);
7660       coding->dst_object = Qnil;
7661       /* Most callers presume this will return a multibyte result, and they
7662          won't use `binary' or `raw-text' anyway, so let's not worry about
7663          CODING_FOR_UNIBYTE.  */
7664       coding->dst_multibyte = 1;
7665     }
7666
7667   decode_coding (coding);
7668
7669   if (BUFFERP (coding->dst_object))
7670     set_buffer_internal (XBUFFER (coding->dst_object));
7671
7672   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7673     {
7674       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7675       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7676       Lisp_Object val;
7677
7678       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7679       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7680               old_deactivate_mark);
7681       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7682                         make_number (coding->produced_char));
7683       UNGCPRO;
7684       CHECK_NATNUM (val);
7685       coding->produced_char += Z - prev_Z;
7686       coding->produced += Z_BYTE - prev_Z_BYTE;
7687     }
7688
7689   if (EQ (dst_object, Qt))
7690     {
7691       coding->dst_object = Fbuffer_string ();
7692     }
7693   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7694     {
7695       set_buffer_internal (XBUFFER (coding->dst_object));
7696       if (dst_bytes < coding->produced)
7697         {
7698           destination = xrealloc (destination, coding->produced);
7699           if (! destination)
7700             {
7701               record_conversion_result (coding,
7702                                         CODING_RESULT_INSUFFICIENT_MEM);
7703               unbind_to (count, Qnil);
7704               return;
7705             }
7706           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7707             move_gap_both (BEGV, BEGV_BYTE);
7708           memcpy (destination, BEGV_ADDR, coding->produced);
7709           coding->destination = destination;
7710         }
7711     }
7712
7713   if (saved_pt >= 0)
7714     {
7715       /* This is the case of:
7716          (BUFFERP (src_object) && EQ (src_object, dst_object))
7717          As we have moved PT while replacing the original buffer
7718          contents, we must recover it now.  */
7719       set_buffer_internal (XBUFFER (src_object));
7720       current_buffer->text->inhibit_shrinking = 0;
7721       if (saved_pt < from)
7722         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7723       else if (saved_pt < from + chars)
7724         TEMP_SET_PT_BOTH (from, from_byte);
7725       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7726         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7727                           saved_pt_byte + (coding->produced - bytes));
7728       else
7729         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7730                           saved_pt_byte + (coding->produced - bytes));
7731
7732       if (need_marker_adjustment)
7733         {
7734           struct Lisp_Marker *tail;
7735
7736           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7737             if (tail->need_adjustment)
7738               {
7739                 tail->need_adjustment = 0;
7740                 if (tail->insertion_type)
7741                   {
7742                     tail->bytepos = from_byte;
7743                     tail->charpos = from;
7744                   }
7745                 else
7746                   {
7747                     tail->bytepos = from_byte + coding->produced;
7748                     tail->charpos
7749                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7750                          ? tail->bytepos : from + coding->produced_char);
7751                   }
7752               }
7753         }
7754     }
7755
7756   Vdeactivate_mark = old_deactivate_mark;
7757   unbind_to (count, coding->dst_object);
7758 }
7759
7760
7761 void
7762 encode_coding_object (struct coding_system *coding,
7763                       Lisp_Object src_object,
7764                       ptrdiff_t from, ptrdiff_t from_byte,
7765                       ptrdiff_t to, ptrdiff_t to_byte,
7766                       Lisp_Object dst_object)
7767 {
7768   ptrdiff_t count = SPECPDL_INDEX ();
7769   ptrdiff_t chars = to - from;
7770   ptrdiff_t bytes = to_byte - from_byte;
7771   Lisp_Object attrs;
7772   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7773   bool need_marker_adjustment = 0;
7774   bool kill_src_buffer = 0;
7775   Lisp_Object old_deactivate_mark;
7776
7777   old_deactivate_mark = Vdeactivate_mark;
7778
7779   coding->src_object = src_object;
7780   coding->src_chars = chars;
7781   coding->src_bytes = bytes;
7782   coding->src_multibyte = chars < bytes;
7783
7784   attrs = CODING_ID_ATTRS (coding->id);
7785
7786   if (EQ (src_object, dst_object))
7787     {
7788       struct Lisp_Marker *tail;
7789
7790       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7791         {
7792           tail->need_adjustment
7793             = tail->charpos == (tail->insertion_type ? from : to);
7794           need_marker_adjustment |= tail->need_adjustment;
7795         }
7796     }
7797
7798   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7799     {
7800       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7801       set_buffer_internal (XBUFFER (coding->src_object));
7802       if (STRINGP (src_object))
7803         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7804       else if (BUFFERP (src_object))
7805         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7806       else
7807         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7808
7809       if (EQ (src_object, dst_object))
7810         {
7811           set_buffer_internal (XBUFFER (src_object));
7812           saved_pt = PT, saved_pt_byte = PT_BYTE;
7813           del_range_both (from, from_byte, to, to_byte, 1);
7814           set_buffer_internal (XBUFFER (coding->src_object));
7815         }
7816
7817       {
7818         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7819
7820         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7821                 old_deactivate_mark);
7822         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7823                     make_number (BEG), make_number (Z));
7824         UNGCPRO;
7825       }
7826       if (XBUFFER (coding->src_object) != current_buffer)
7827         kill_src_buffer = 1;
7828       coding->src_object = Fcurrent_buffer ();
7829       if (BEG != GPT)
7830         move_gap_both (BEG, BEG_BYTE);
7831       coding->src_chars = Z - BEG;
7832       coding->src_bytes = Z_BYTE - BEG_BYTE;
7833       coding->src_pos = BEG;
7834       coding->src_pos_byte = BEG_BYTE;
7835       coding->src_multibyte = Z < Z_BYTE;
7836     }
7837   else if (STRINGP (src_object))
7838     {
7839       code_conversion_save (0, 0);
7840       coding->src_pos = from;
7841       coding->src_pos_byte = from_byte;
7842     }
7843   else if (BUFFERP (src_object))
7844     {
7845       code_conversion_save (0, 0);
7846       set_buffer_internal (XBUFFER (src_object));
7847       if (EQ (src_object, dst_object))
7848         {
7849           saved_pt = PT, saved_pt_byte = PT_BYTE;
7850           coding->src_object = del_range_1 (from, to, 1, 1);
7851           coding->src_pos = 0;
7852           coding->src_pos_byte = 0;
7853         }
7854       else
7855         {
7856           if (from < GPT && to >= GPT)
7857             move_gap_both (from, from_byte);
7858           coding->src_pos = from;
7859           coding->src_pos_byte = from_byte;
7860         }
7861     }
7862   else
7863     code_conversion_save (0, 0);
7864
7865   if (BUFFERP (dst_object))
7866     {
7867       coding->dst_object = dst_object;
7868       if (EQ (src_object, dst_object))
7869         {
7870           coding->dst_pos = from;
7871           coding->dst_pos_byte = from_byte;
7872         }
7873       else
7874         {
7875           struct buffer *current = current_buffer;
7876
7877           set_buffer_temp (XBUFFER (dst_object));
7878           coding->dst_pos = PT;
7879           coding->dst_pos_byte = PT_BYTE;
7880           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7881           set_buffer_temp (current);
7882         }
7883       coding->dst_multibyte
7884         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7885     }
7886   else if (EQ (dst_object, Qt))
7887     {
7888       ptrdiff_t dst_bytes = max (1, coding->src_chars);
7889       coding->dst_object = Qnil;
7890       coding->destination = xmalloc (dst_bytes);
7891       coding->dst_bytes = dst_bytes;
7892       coding->dst_multibyte = 0;
7893     }
7894   else
7895     {
7896       coding->dst_object = Qnil;
7897       coding->dst_multibyte = 0;
7898     }
7899
7900   encode_coding (coding);
7901
7902   if (EQ (dst_object, Qt))
7903     {
7904       if (BUFFERP (coding->dst_object))
7905         coding->dst_object = Fbuffer_string ();
7906       else
7907         {
7908           coding->dst_object
7909             = make_unibyte_string ((char *) coding->destination,
7910                                    coding->produced);
7911           xfree (coding->destination);
7912         }
7913     }
7914
7915   if (saved_pt >= 0)
7916     {
7917       /* This is the case of:
7918          (BUFFERP (src_object) && EQ (src_object, dst_object))
7919          As we have moved PT while replacing the original buffer
7920          contents, we must recover it now.  */
7921       set_buffer_internal (XBUFFER (src_object));
7922       if (saved_pt < from)
7923         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7924       else if (saved_pt < from + chars)
7925         TEMP_SET_PT_BOTH (from, from_byte);
7926       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7927         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7928                           saved_pt_byte + (coding->produced - bytes));
7929       else
7930         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7931                           saved_pt_byte + (coding->produced - bytes));
7932
7933       if (need_marker_adjustment)
7934         {
7935           struct Lisp_Marker *tail;
7936
7937           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7938             if (tail->need_adjustment)
7939               {
7940                 tail->need_adjustment = 0;
7941                 if (tail->insertion_type)
7942                   {
7943                     tail->bytepos = from_byte;
7944                     tail->charpos = from;
7945                   }
7946                 else
7947                   {
7948                     tail->bytepos = from_byte + coding->produced;
7949                     tail->charpos
7950                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7951                          ? tail->bytepos : from + coding->produced_char);
7952                   }
7953               }
7954         }
7955     }
7956
7957   if (kill_src_buffer)
7958     Fkill_buffer (coding->src_object);
7959
7960   Vdeactivate_mark = old_deactivate_mark;
7961   unbind_to (count, Qnil);
7962 }
7963
7964
7965 Lisp_Object
7966 preferred_coding_system (void)
7967 {
7968   int id = coding_categories[coding_priorities[0]].id;
7969
7970   return CODING_ID_NAME (id);
7971 }
7972
7973 #if defined (WINDOWSNT) || defined (CYGWIN)
7974
7975 Lisp_Object
7976 from_unicode (Lisp_Object str)
7977 {
7978   CHECK_STRING (str);
7979   if (!STRING_MULTIBYTE (str) &&
7980       SBYTES (str) & 1)
7981     {
7982       str = Fsubstring (str, make_number (0), make_number (-1));
7983     }
7984
7985   return code_convert_string_norecord (str, Qutf_16le, 0);
7986 }
7987
7988 wchar_t *
7989 to_unicode (Lisp_Object str, Lisp_Object *buf)
7990 {
7991   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
7992   /* We need to make a another copy (in addition to the one made by
7993      code_convert_string_norecord) to ensure that the final string is
7994      _doubly_ zero terminated --- that is, that the string is
7995      terminated by two zero bytes and one utf-16le null character.
7996      Because strings are already terminated with a single zero byte,
7997      we just add one additional zero. */
7998   str = make_uninit_string (SBYTES (*buf) + 1);
7999   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8000   SDATA (str) [SBYTES (*buf)] = '\0';
8001   *buf = str;
8002   return WCSDATA (*buf);
8003 }
8004
8005 #endif /* WINDOWSNT || CYGWIN */
8006
8007 \f
8008 #ifdef emacs
8009 /*** 8. Emacs Lisp library functions ***/
8010
8011 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8012        doc: /* Return t if OBJECT is nil or a coding-system.
8013 See the documentation of `define-coding-system' for information
8014 about coding-system objects.  */)
8015   (Lisp_Object object)
8016 {
8017   if (NILP (object)
8018       || CODING_SYSTEM_ID (object) >= 0)
8019     return Qt;
8020   if (! SYMBOLP (object)
8021       || NILP (Fget (object, Qcoding_system_define_form)))
8022     return Qnil;
8023   return Qt;
8024 }
8025
8026 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8027        Sread_non_nil_coding_system, 1, 1, 0,
8028        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8029   (Lisp_Object prompt)
8030 {
8031   Lisp_Object val;
8032   do
8033     {
8034       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8035                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8036     }
8037   while (SCHARS (val) == 0);
8038   return (Fintern (val, Qnil));
8039 }
8040
8041 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8042        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8043 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8044 Ignores case when completing coding systems (all Emacs coding systems
8045 are lower-case).  */)
8046   (Lisp_Object prompt, Lisp_Object default_coding_system)
8047 {
8048   Lisp_Object val;
8049   ptrdiff_t count = SPECPDL_INDEX ();
8050
8051   if (SYMBOLP (default_coding_system))
8052     default_coding_system = SYMBOL_NAME (default_coding_system);
8053   specbind (Qcompletion_ignore_case, Qt);
8054   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8055                           Qt, Qnil, Qcoding_system_history,
8056                           default_coding_system, Qnil);
8057   unbind_to (count, Qnil);
8058   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8059 }
8060
8061 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8062        1, 1, 0,
8063        doc: /* Check validity of CODING-SYSTEM.
8064 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8065 It is valid if it is nil or a symbol defined as a coding system by the
8066 function `define-coding-system'.  */)
8067   (Lisp_Object coding_system)
8068 {
8069   Lisp_Object define_form;
8070
8071   define_form = Fget (coding_system, Qcoding_system_define_form);
8072   if (! NILP (define_form))
8073     {
8074       Fput (coding_system, Qcoding_system_define_form, Qnil);
8075       safe_eval (define_form);
8076     }
8077   if (!NILP (Fcoding_system_p (coding_system)))
8078     return coding_system;
8079   xsignal1 (Qcoding_system_error, coding_system);
8080 }
8081
8082 \f
8083 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8084    HIGHEST, return the coding system of the highest
8085    priority among the detected coding systems.  Otherwise return a
8086    list of detected coding systems sorted by their priorities.  If
8087    MULTIBYTEP, it is assumed that the bytes are in correct
8088    multibyte form but contain only ASCII and eight-bit chars.
8089    Otherwise, the bytes are raw bytes.
8090
8091    CODING-SYSTEM controls the detection as below:
8092
8093    If it is nil, detect both text-format and eol-format.  If the
8094    text-format part of CODING-SYSTEM is already specified
8095    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8096    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8097    detect only text-format.

        SRC_CHARS is the character count of the source; this function
        only records it in the working coding structure.

        When HIGHEST, the value is the first candidate, or nil if the
        candidate list came out empty.  Where end-of-line detection
        applies and reports LF, CRLF or CR, a candidate is replaced by
        element 0, 1 or 2 of its eol-type vector; otherwise the plain
        name of the candidate is used.  */
8098
8099 Lisp_Object
8100 detect_coding_system (const unsigned char *src,
8101                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8102                       bool highest, bool multibytep,
8103                       Lisp_Object coding_system)
8104 {
8105   const unsigned char *src_end = src + src_bytes;
8106   Lisp_Object attrs, eol_type;
       /* VAL collects candidate coding system ids first; the eol pass at
          the end replaces each id by a coding system name.  */
8107   Lisp_Object val = Qnil;
8108   struct coding_system coding;
8109   ptrdiff_t id;
8110   struct coding_detection_info detect_info;
8111   enum coding_category base_category;
8112   bool null_byte_found = 0, eight_bit_found = 0;
8113
       /* A nil CODING_SYSTEM is treated as `undecided'.  */
8114   if (NILP (coding_system))
8115     coding_system = Qundecided;
8116   setup_coding_system (coding_system, &coding);
8117   attrs = CODING_ID_ATTRS (coding.id);
8118   eol_type = CODING_ID_EOL_TYPE (coding.id);
8119   coding_system = CODING_ATTR_BASE_NAME (attrs);
8120
       /* Describe the source text in CODING for the detector functions.  */
8121   coding.source = src;
8122   coding.src_chars = src_chars;
8123   coding.src_bytes = src_bytes;
8124   coding.src_multibyte = multibytep;
8125   coding.consumed = 0;
8126   coding.mode |= CODING_MODE_LAST_BLOCK;
8127   coding.head_ascii = 0;
8128
8129   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8130
8131   /* At first, detect text-format if necessary.  */
8132   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8133   if (base_category == coding_category_undecided)
8134     {
           /* CATEGORY and THIS are assigned by the category loops below
              and read again when VAL is built for HIGHEST.  The IF_LINT
              initializers are presumably there only to quiet
              uninitialized-variable diagnostics -- confirm against the
              macro's definition.  */
8135       enum coding_category category IF_LINT (= 0);
8136       struct coding_system *this IF_LINT (= NULL);
8137       int c, i;
8138
8139       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
           /* While scanning, coding.head_ascii counts the 7-bit bytes
              seen before the first 8-bit byte.  The scan stops early
              once both a null byte and an 8-bit byte have been seen, or
              when the ISO-2022 detector has scanned the data.  */
8140       for (; src < src_end; src++)
8141         {
8142           c = *src;
8143           if (c & 0x80)
8144             {
8145               eight_bit_found = 1;
8146               if (null_byte_found)
8147                 break;
8148             }
8149           else if (c < 0x20)
8150             {
8151               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8152                   && ! inhibit_iso_escape_detection
8153                   && ! detect_info.checked)
8154                 {
8155                   if (detect_coding_iso_2022 (&coding, &detect_info))
8156                     {
8157                       /* We have scanned the whole data.  */
8158                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8159                         {
8160                           /* We didn't find an 8-bit code.  We may
8161                              have found a null-byte, but it's very
8162                              rare that a binary file conform to
8163                              ISO-2022.  */
8164                           src = src_end;
8165                           coding.head_ascii = src - coding.source;
8166                         }
8167                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8168                       break;
8169                     }
8170                 }
8171               else if (! c && !inhibit_null_byte_detection)
8172                 {
8173                   null_byte_found = 1;
8174                   if (eight_bit_found)
8175                     break;
8176                 }
8177               if (! eight_bit_found)
8178                 coding.head_ascii++;
8179             }
8180           else if (! eight_bit_found)
8181             coding.head_ascii++;
8182         }
8183
           /* Look at the categories only if the scan saw a null byte or
              an 8-bit byte, did not count every byte in head_ascii, or
              a detector has already reported a category.  */
8184       if (null_byte_found || eight_bit_found
8185           || coding.head_ascii < coding.src_bytes
8186           || detect_info.found)
8187         {
8188           if (coding.head_ascii == coding.src_bytes)
8189             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8190             for (i = 0; i < coding_category_raw_text; i++)
8191               {
8192                 category = coding_priorities[i];
8193                 this = coding_categories + category;
8194                 if (detect_info.found & (1 << category))
8195                   break;
8196               }
8197           else
8198             {
8199               if (null_byte_found)
8200                 {
                       /* With a null byte present, mark every category
                          other than the UTF-16 ones as already checked
                          and rejected.  */
8201                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8202                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8203                 }
                   /* Walk the categories in priority order, running the
                      detector of each one not checked yet.  With HIGHEST
                      the walk stops at the first category found.  */
8204               for (i = 0; i < coding_category_raw_text; i++)
8205                 {
8206                   category = coding_priorities[i];
8207                   this = coding_categories + category;
8208
8209                   if (this->id < 0)
8210                     {
8211                       /* No coding system of this category is defined.  */
8212                       detect_info.rejected |= (1 << category);
8213                     }
8214                   else if (category >= coding_category_raw_text)
8215                     continue;
8216                   else if (detect_info.checked & (1 << category))
8217                     {
8218                       if (highest
8219                           && (detect_info.found & (1 << category)))
8220                         break;
8221                     }
8222                   else if ((*(this->detector)) (&coding, &detect_info)
8223                            && highest
8224                            && (detect_info.found & (1 << category)))
8225                     {
                           /* For utf-16-auto, narrow CATEGORY to the
                              LE or BE category; THIS is left pointing
                              at the utf-16-auto entry.  */
8226                       if (category == coding_category_utf_16_auto)
8227                         {
8228                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8229                             category = coding_category_utf_16_le;
8230                           else
8231                             category = coding_category_utf_16_be;
8232                         }
8233                       break;
8234                     }
8235                 }
8236             }
8237         }
8238
           /* Build VAL from the detection result.  Every category
              rejected, or a null byte seen: no-conversion.  Nothing
              rejected and nothing found: the `undecided' category.  */
8239       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8240           || null_byte_found)
8241         {
8242           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8243           id = CODING_SYSTEM_ID (Qno_conversion);
8244           val = Fcons (make_number (id), Qnil);
8245         }
8246       else if (! detect_info.rejected && ! detect_info.found)
8247         {
8248           detect_info.found = CATEGORY_MASK_ANY;
8249           id = coding_categories[coding_category_undecided].id;
8250           val = Fcons (make_number (id), Qnil);
8251         }
8252       else if (highest)
8253         {
8254           if (detect_info.found)
8255             {
                   /* CATEGORY and THIS still hold whatever the category
                      loops above left in them.  */
8256               detect_info.found = 1 << category;
8257               val = Fcons (make_number (this->id), Qnil);
8258             }
8259           else
                 /* Nothing was found: take the first category, in
                    priority order, that has not been rejected.  */
8260             for (i = 0; i < coding_category_raw_text; i++)
8261               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8262                 {
8263                   detect_info.found = 1 << coding_priorities[i];
8264                   id = coding_categories[coding_priorities[i]].id;
8265                   val = Fcons (make_number (id), Qnil);
8266                   break;
8267                 }
8268         }
8269       else
8270         {
               /* Both loops run from the lowest priority upwards and
                  push onto the front of VAL, so VAL ends up holding the
                  found categories in priority order, followed by the
                  categories neither found nor rejected, also in
                  priority order.  */
8271           int mask = detect_info.rejected | detect_info.found;
8272           int found = 0;
8273
8274           for (i = coding_category_raw_text - 1; i >= 0; i--)
8275             {
8276               category = coding_priorities[i];
8277               if (! (mask & (1 << category)))
8278                 {
8279                   found |= 1 << category;
8280                   id = coding_categories[category].id;
8281                   if (id >= 0)
8282                     val = Fcons (make_number (id), val);
8283                 }
8284             }
8285           for (i = coding_category_raw_text - 1; i >= 0; i--)
8286             {
8287               category = coding_priorities[i];
8288               if (detect_info.found & (1 << category))
8289                 {
8290                   id = coding_categories[category].id;
8291                   val = Fcons (make_number (id), val);
8292                 }
8293             }
8294           detect_info.found |= found;
8295         }
8296     }
8297   else if (base_category == coding_category_utf_8_auto)
8298     {
           /* Only choose between the with-signature and
              without-signature UTF-8 categories.  VAL stays nil if the
              detector returns zero.  */
8299       if (detect_coding_utf_8 (&coding, &detect_info))
8300         {
8301           struct coding_system *this;
8302
8303           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8304             this = coding_categories + coding_category_utf_8_sig;
8305           else
8306             this = coding_categories + coding_category_utf_8_nosig;
8307           val = Fcons (make_number (this->id), Qnil);
8308         }
8309     }
8310   else if (base_category == coding_category_utf_16_auto)
8311     {
           /* Only choose among the UTF-16 LE/BE categories, with or
              without signature.  VAL stays nil if the detector returns
              zero.  */
8312       if (detect_coding_utf_16 (&coding, &detect_info))
8313         {
8314           struct coding_system *this;
8315
8316           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8317             this = coding_categories + coding_category_utf_16_le;
8318           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8319             this = coding_categories + coding_category_utf_16_be;
8320           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8321             this = coding_categories + coding_category_utf_16_be_nosig;
8322           else
8323             this = coding_categories + coding_category_utf_16_le_nosig;
8324           val = Fcons (make_number (this->id), Qnil);
8325         }
8326     }
8327   else
8328     {
           /* The text format is already fixed by CODING_SYSTEM; use it
              as the only candidate.  */
8329       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8330       val = Fcons (make_number (coding.id), Qnil);
8331     }
8332
8333   /* Then, detect eol-format if necessary.  */
8334   {
         /* One eol result per family of encodings; each starts at -1
            and keeps that value unless it is set below.  */
8335     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8336     Lisp_Object tail;
8337
8338     if (VECTORP (eol_type))
8339       {
8340         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8341           {
                 /* Data with a null byte is not scanned for line ends;
                    LF is used directly.  */
8342             if (null_byte_found)
8343               normal_eol = EOL_SEEN_LF;
8344             else
8345               normal_eol = detect_eol (coding.source, src_bytes,
8346                                        coding_category_raw_text);
8347           }
8348         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8349                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8350           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8351                                       coding_category_utf_16_be);
8352         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8353                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8354           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8355                                       coding_category_utf_16_le);
8356       }
8357     else
8358       {
             /* The eol format is already fixed by CODING_SYSTEM: unix,
                dos, or anything else (treated as CR).  */
8359         if (EQ (eol_type, Qunix))
8360           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8361         else if (EQ (eol_type, Qdos))
8362           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8363         else
8364           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8365       }
8366
         /* Replace each id in VAL, in place, by a coding system name:
            the eol-specific element of the candidate's eol-type vector
            when an eol format is known, else the candidate's own
            name.  */
8367     for (tail = val; CONSP (tail); tail = XCDR (tail))
8368       {
8369         enum coding_category category;
8370         int this_eol;
8371
8372         id = XINT (XCAR (tail));
8373         attrs = CODING_ID_ATTRS (id);
8374         category = XINT (CODING_ATTR_CATEGORY (attrs));
8375         eol_type = CODING_ID_EOL_TYPE (id);
8376         if (VECTORP (eol_type))
8377           {
8378             if (category == coding_category_utf_16_be
8379                 || category == coding_category_utf_16_be_nosig)
8380               this_eol = utf_16_be_eol;
8381             else if (category == coding_category_utf_16_le
8382                      || category == coding_category_utf_16_le_nosig)
8383               this_eol = utf_16_le_eol;
8384             else
8385               this_eol = normal_eol;
8386
8387             if (this_eol == EOL_SEEN_LF)
8388               XSETCAR (tail, AREF (eol_type, 0));
8389             else if (this_eol == EOL_SEEN_CRLF)
8390               XSETCAR (tail, AREF (eol_type, 1));
8391             else if (this_eol == EOL_SEEN_CR)
8392               XSETCAR (tail, AREF (eol_type, 2));
8393             else
8394               XSETCAR (tail, CODING_ID_NAME (id));
8395           }
8396         else
8397           XSETCAR (tail, CODING_ID_NAME (id));
8398       }
8399   }
8400
8401   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8402 }
8403
8404
8405 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8406        2, 3, 0,
8407        doc: /* Detect coding system of the text in the region between START and END.
8408 Return a list of possible coding systems ordered by priority.
8409 The coding systems to try and their priorities follows what
8410 the function `coding-system-priority-list' (which see) returns.
8411
8412 If only ASCII characters are found (except for such ISO-2022 control
8413 characters as ESC), it returns a list of single element `undecided'
8414 or its subsidiary coding system according to a detected end-of-line
8415 format.
8416
8417 If optional argument HIGHEST is non-nil, return the coding system of
8418 highest priority.  */)
8419   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8420 {
8421   ptrdiff_t from, to;
8422   ptrdiff_t from_byte, to_byte;
8423
8424   validate_region (&start, &end);
8425   from = XINT (start), to = XINT (end);
8426   from_byte = CHAR_TO_BYTE (from);
8427   to_byte = CHAR_TO_BYTE (to);
8428
8429   if (from < GPT && to >= GPT)
8430     move_gap_both (to, to_byte);
8431
8432   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8433                                to - from, to_byte - from_byte,
8434                                !NILP (highest),
8435                                !NILP (BVAR (current_buffer
8436                                       , enable_multibyte_characters)),
8437                                Qnil);
8438 }
8439
8440 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8441        1, 2, 0,
8442        doc: /* Detect coding system of the text in STRING.
8443 Return a list of possible coding systems ordered by priority.
8444 The coding systems to try and their priorities follows what
8445 the function `coding-system-priority-list' (which see) returns.
8446
8447 If only ASCII characters are found (except for such ISO-2022 control
8448 characters as ESC), it returns a list of single element `undecided'
8449 or its subsidiary coding system according to a detected end-of-line
8450 format.
8451
8452 If optional argument HIGHEST is non-nil, return the coding system of
8453 highest priority.  */)
8454   (Lisp_Object string, Lisp_Object highest)
8455 {
8456   CHECK_STRING (string);
8457
8458   return detect_coding_system (SDATA (string),
8459                                SCHARS (string), SBYTES (string),
8460                                !NILP (highest), STRING_MULTIBYTE (string),
8461                                Qnil);
8462 }
8463
8464
8465 static bool
8466 char_encodable_p (int c, Lisp_Object attrs)
8467 {
8468   Lisp_Object tail;
8469   struct charset *charset;
8470   Lisp_Object translation_table;
8471
8472   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8473   if (! NILP (translation_table))
8474     c = translate_char (translation_table, c);
8475   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8476        CONSP (tail); tail = XCDR (tail))
8477     {
8478       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8479       if (CHAR_CHARSET_P (c, charset))
8480         break;
8481     }
8482   return (! NILP (tail));
8483 }
8484
8485
8486 /* Return a list of coding systems that safely encode the text between
8487    START and END.  If EXCLUDE is non-nil, it is a list of coding
8488    systems not to check.  The returned list doesn't contain any such
8489    coding systems.  In any case, if the text contains only ASCII or is
8490    unibyte, return t.  */
8491
8492 DEFUN ("find-coding-systems-region-internal",
8493        Ffind_coding_systems_region_internal,
8494        Sfind_coding_systems_region_internal, 2, 3, 0,
8495        doc: /* Internal use only.  */)
8496   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8497 {
8498   Lisp_Object coding_attrs_list, safe_codings;
8499   ptrdiff_t start_byte, end_byte;
8500   const unsigned char *p, *pbeg, *pend;
8501   int c;
8502   Lisp_Object tail, elt, work_table;
8503
8504   if (STRINGP (start))
8505     {
8506       if (!STRING_MULTIBYTE (start)
8507           || SCHARS (start) == SBYTES (start))
8508         return Qt;
8509       start_byte = 0;
8510       end_byte = SBYTES (start);
8511     }
8512   else
8513     {
8514       CHECK_NUMBER_COERCE_MARKER (start);
8515       CHECK_NUMBER_COERCE_MARKER (end);
8516       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8517         args_out_of_range (start, end);
8518       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8519         return Qt;
8520       start_byte = CHAR_TO_BYTE (XINT (start));
8521       end_byte = CHAR_TO_BYTE (XINT (end));
8522       if (XINT (end) - XINT (start) == end_byte - start_byte)
8523         return Qt;
8524
8525       if (XINT (start) < GPT && XINT (end) > GPT)
8526         {
8527           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8528             move_gap_both (XINT (start), start_byte);
8529           else
8530             move_gap_both (XINT (end), end_byte);
8531         }
8532     }
8533
8534   coding_attrs_list = Qnil;
8535   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8536     if (NILP (exclude)
8537         || NILP (Fmemq (XCAR (tail), exclude)))
8538       {
8539         Lisp_Object attrs;
8540
8541         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8542         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8543             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8544           {
8545             ASET (attrs, coding_attr_trans_tbl,
8546                   get_translation_table (attrs, 1, NULL));
8547             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8548           }
8549       }
8550
8551   if (STRINGP (start))
8552     p = pbeg = SDATA (start);
8553   else
8554     p = pbeg = BYTE_POS_ADDR (start_byte);
8555   pend = p + (end_byte - start_byte);
8556
8557   while (p < pend && ASCII_BYTE_P (*p)) p++;
8558   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8559
8560   work_table = Fmake_char_table (Qnil, Qnil);
8561   while (p < pend)
8562     {
8563       if (ASCII_BYTE_P (*p))
8564         p++;
8565       else
8566         {
8567           c = STRING_CHAR_ADVANCE (p);
8568           if (!NILP (char_table_ref (work_table, c)))
8569             /* This character was already checked.  Ignore it.  */
8570             continue;
8571
8572           charset_map_loaded = 0;
8573           for (tail = coding_attrs_list; CONSP (tail);)
8574             {
8575               elt = XCAR (tail);
8576               if (NILP (elt))
8577                 tail = XCDR (tail);
8578               else if (char_encodable_p (c, elt))
8579                 tail = XCDR (tail);
8580               else if (CONSP (XCDR (tail)))
8581                 {
8582                   XSETCAR (tail, XCAR (XCDR (tail)));
8583                   XSETCDR (tail, XCDR (XCDR (tail)));
8584                 }
8585               else
8586                 {
8587                   XSETCAR (tail, Qnil);
8588                   tail = XCDR (tail);
8589                 }
8590             }
8591           if (charset_map_loaded)
8592             {
8593               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8594
8595               if (STRINGP (start))
8596                 pbeg = SDATA (start);
8597               else
8598                 pbeg = BYTE_POS_ADDR (start_byte);
8599               p = pbeg + p_offset;
8600               pend = pbeg + pend_offset;
8601             }
8602           char_table_set (work_table, c, Qt);
8603         }
8604     }
8605
8606   safe_codings = list2 (Qraw_text, Qno_conversion);
8607   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8608     if (! NILP (XCAR (tail)))
8609       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8610
8611   return safe_codings;
8612 }
8613
8614
8615 DEFUN ("unencodable-char-position", Funencodable_char_position,
8616        Sunencodable_char_position, 3, 5, 0,
8617        doc: /*
8618 Return position of first un-encodable character in a region.
8619 START and END specify the region and CODING-SYSTEM specifies the
8620 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8621
8622 If optional 4th argument COUNT is non-nil, it specifies at most how
8623 many un-encodable characters to search.  In this case, the value is a
8624 list of positions.
8625
8626 If optional 5th argument STRING is non-nil, it is a string to search
8627 for un-encodable characters.  In that case, START and END are indexes
8628 to the string.  */)
8629   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8630 {
8631   EMACS_INT n;
8632   struct coding_system coding;
8633   Lisp_Object attrs, charset_list, translation_table;
8634   Lisp_Object positions;
8635   ptrdiff_t from, to;
8636   const unsigned char *p, *stop, *pend;
8637   bool ascii_compatible;
8638
8639   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8640   attrs = CODING_ID_ATTRS (coding.id);
8641   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8642     return Qnil;
8643   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8644   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8645   translation_table = get_translation_table (attrs, 1, NULL);
8646
8647   if (NILP (string))
8648     {
8649       validate_region (&start, &end);
8650       from = XINT (start);
8651       to = XINT (end);
8652       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8653           || (ascii_compatible
8654               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8655         return Qnil;
8656       p = CHAR_POS_ADDR (from);
8657       pend = CHAR_POS_ADDR (to);
8658       if (from < GPT && to >= GPT)
8659         stop = GPT_ADDR;
8660       else
8661         stop = pend;
8662     }
8663   else
8664     {
8665       CHECK_STRING (string);
8666       CHECK_NATNUM (start);
8667       CHECK_NATNUM (end);
8668       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8669         args_out_of_range_3 (string, start, end);
8670       from = XINT (start);
8671       to = XINT (end);
8672       if (! STRING_MULTIBYTE (string))
8673         return Qnil;
8674       p = SDATA (string) + string_char_to_byte (string, from);
8675       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8676       if (ascii_compatible && (to - from) == (pend - p))
8677         return Qnil;
8678     }
8679
8680   if (NILP (count))
8681     n = 1;
8682   else
8683     {
8684       CHECK_NATNUM (count);
8685       n = XINT (count);
8686     }
8687
8688   positions = Qnil;
8689   charset_map_loaded = 0;
8690   while (1)
8691     {
8692       int c;
8693
8694       if (ascii_compatible)
8695         while (p < stop && ASCII_BYTE_P (*p))
8696           p++, from++;
8697       if (p >= stop)
8698         {
8699           if (p >= pend)
8700             break;
8701           stop = pend;
8702           p = GAP_END_ADDR;
8703         }
8704
8705       c = STRING_CHAR_ADVANCE (p);
8706       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8707           && ! char_charset (translate_char (translation_table, c),
8708                              charset_list, NULL))
8709         {
8710           positions = Fcons (make_number (from), positions);
8711           n--;
8712           if (n == 0)
8713             break;
8714         }
8715
8716       from++;
8717       if (charset_map_loaded && NILP (string))
8718         {
8719           p = CHAR_POS_ADDR (from);
8720           pend = CHAR_POS_ADDR (to);
8721           if (from < GPT && to >= GPT)
8722             stop = GPT_ADDR;
8723           else
8724             stop = pend;
8725           charset_map_loaded = 0;
8726         }
8727     }
8728
8729   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8730 }
8731
8732
8733 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8734        Scheck_coding_systems_region, 3, 3, 0,
8735        doc: /* Check if the region is encodable by coding systems.
8736
8737 START and END are buffer positions specifying the region.
8738 CODING-SYSTEM-LIST is a list of coding systems to check.
8739
8740 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8741 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8742 whole region, POS0, POS1, ... are buffer positions where non-encodable
8743 characters are found.
8744
8745 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8746 value is nil.
8747
8748 START may be a string.  In that case, check if the string is
8749 encodable, and the value contains indices to the string instead of
8750 buffer positions.  END is ignored.
8751
8752 If the current buffer (or START if it is a string) is unibyte, the value
8753 is nil.  */)
8754   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8755 {
8756   Lisp_Object list;
8757   ptrdiff_t start_byte, end_byte;
8758   ptrdiff_t pos;
8759   const unsigned char *p, *pbeg, *pend;
8760   int c;
8761   Lisp_Object tail, elt, attrs;
8762
8763   if (STRINGP (start))
8764     {
8765       if (!STRING_MULTIBYTE (start)
8766           || SCHARS (start) == SBYTES (start))
8767         return Qnil;
8768       start_byte = 0;
8769       end_byte = SBYTES (start);
8770       pos = 0;
8771     }
8772   else
8773     {
8774       CHECK_NUMBER_COERCE_MARKER (start);
8775       CHECK_NUMBER_COERCE_MARKER (end);
8776       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8777         args_out_of_range (start, end);
8778       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8779         return Qnil;
8780       start_byte = CHAR_TO_BYTE (XINT (start));
8781       end_byte = CHAR_TO_BYTE (XINT (end));
8782       if (XINT (end) - XINT (start) == end_byte - start_byte)
8783         return Qnil;
8784
8785       if (XINT (start) < GPT && XINT (end) > GPT)
8786         {
8787           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8788             move_gap_both (XINT (start), start_byte);
8789           else
8790             move_gap_both (XINT (end), end_byte);
8791         }
8792       pos = XINT (start);
8793     }
8794
8795   list = Qnil;
8796   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8797     {
8798       elt = XCAR (tail);
8799       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8800       ASET (attrs, coding_attr_trans_tbl,
8801             get_translation_table (attrs, 1, NULL));
8802       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8803     }
8804
8805   if (STRINGP (start))
8806     p = pbeg = SDATA (start);
8807   else
8808     p = pbeg = BYTE_POS_ADDR (start_byte);
8809   pend = p + (end_byte - start_byte);
8810
8811   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8812   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8813
8814   while (p < pend)
8815     {
8816       if (ASCII_BYTE_P (*p))
8817         p++;
8818       else
8819         {
8820           c = STRING_CHAR_ADVANCE (p);
8821
8822           charset_map_loaded = 0;
8823           for (tail = list; CONSP (tail); tail = XCDR (tail))
8824             {
8825               elt = XCDR (XCAR (tail));
8826               if (! char_encodable_p (c, XCAR (elt)))
8827                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8828             }
8829           if (charset_map_loaded)
8830             {
8831               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8832
8833               if (STRINGP (start))
8834                 pbeg = SDATA (start);
8835               else
8836                 pbeg = BYTE_POS_ADDR (start_byte);
8837               p = pbeg + p_offset;
8838               pend = pbeg + pend_offset;
8839             }
8840         }
8841       pos++;
8842     }
8843
8844   tail = list;
8845   list = Qnil;
8846   for (; CONSP (tail); tail = XCDR (tail))
8847     {
8848       elt = XCAR (tail);
8849       if (CONSP (XCDR (XCDR (elt))))
8850         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8851                       list);
8852     }
8853
8854   return list;
8855 }
8856
8857
8858 static Lisp_Object
8859 code_convert_region (Lisp_Object start, Lisp_Object end,
8860                      Lisp_Object coding_system, Lisp_Object dst_object,
8861                      bool encodep, bool norecord)
8862 {
8863   struct coding_system coding;
8864   ptrdiff_t from, from_byte, to, to_byte;
8865   Lisp_Object src_object;
8866
8867   if (NILP (coding_system))
8868     coding_system = Qno_conversion;
8869   else
8870     CHECK_CODING_SYSTEM (coding_system);
8871   src_object = Fcurrent_buffer ();
8872   if (NILP (dst_object))
8873     dst_object = src_object;
8874   else if (! EQ (dst_object, Qt))
8875     CHECK_BUFFER (dst_object);
8876
8877   validate_region (&start, &end);
8878   from = XFASTINT (start);
8879   from_byte = CHAR_TO_BYTE (from);
8880   to = XFASTINT (end);
8881   to_byte = CHAR_TO_BYTE (to);
8882
8883   setup_coding_system (coding_system, &coding);
8884   coding.mode |= CODING_MODE_LAST_BLOCK;
8885
8886   if (encodep)
8887     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8888                           dst_object);
8889   else
8890     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8891                           dst_object);
8892   if (! norecord)
8893     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8894
8895   return (BUFFERP (dst_object)
8896           ? make_number (coding.produced_char)
8897           : coding.dst_object);
8898 }
8899
8900
8901 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8902        3, 4, "r\nzCoding system: ",
8903        doc: /* Decode the current region from the specified coding system.
8904 When called from a program, takes four arguments:
8905         START, END, CODING-SYSTEM, and DESTINATION.
8906 START and END are buffer positions.
8907
8908 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8909 If nil, the region between START and END is replaced by the decoded text.
8910 If buffer, the decoded text is inserted in that buffer after point (point
8911 does not move).
8912 In those cases, the length of the decoded text is returned.
8913 If DESTINATION is t, the decoded text is returned.
8914
8915 This function sets `last-coding-system-used' to the precise coding system
8916 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8917 not fully specified.)  */)
8918   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8919 {
8920   return code_convert_region (start, end, coding_system, destination, 0, 0);
8921 }
8922
8923 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8924        3, 4, "r\nzCoding system: ",
8925        doc: /* Encode the current region by specified coding system.
8926 When called from a program, takes four arguments:
8927         START, END, CODING-SYSTEM and DESTINATION.
8928 START and END are buffer positions.
8929
8930 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8931 If nil, the region between START and END is replace by the encoded text.
8932 If buffer, the encoded text is inserted in that buffer after point (point
8933 does not move).
8934 In those cases, the length of the encoded text is returned.
8935 If DESTINATION is t, the encoded text is returned.
8936
8937 This function sets `last-coding-system-used' to the precise coding system
8938 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8939 not fully specified.)  */)
8940   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8941 {
8942   return code_convert_region (start, end, coding_system, destination, 1, 0);
8943 }
8944
8945 Lisp_Object
8946 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8947                      Lisp_Object dst_object, bool encodep, bool nocopy,
8948                      bool norecord)
8949 {
8950   struct coding_system coding;
8951   ptrdiff_t chars, bytes;
8952
8953   CHECK_STRING (string);
8954   if (NILP (coding_system))
8955     {
8956       if (! norecord)
8957         Vlast_coding_system_used = Qno_conversion;
8958       if (NILP (dst_object))
8959         return (nocopy ? Fcopy_sequence (string) : string);
8960     }
8961
8962   if (NILP (coding_system))
8963     coding_system = Qno_conversion;
8964   else
8965     CHECK_CODING_SYSTEM (coding_system);
8966   if (NILP (dst_object))
8967     dst_object = Qt;
8968   else if (! EQ (dst_object, Qt))
8969     CHECK_BUFFER (dst_object);
8970
8971   setup_coding_system (coding_system, &coding);
8972   coding.mode |= CODING_MODE_LAST_BLOCK;
8973   chars = SCHARS (string);
8974   bytes = SBYTES (string);
8975   if (encodep)
8976     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8977   else
8978     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8979   if (! norecord)
8980     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8981
8982   return (BUFFERP (dst_object)
8983           ? make_number (coding.produced_char)
8984           : coding.dst_object);
8985 }
8986
8987
8988 /* Encode or decode STRING according to CODING_SYSTEM.
8989    Do not set Vlast_coding_system_used.
8990
8991    This function is called only from macros DECODE_FILE and
8992    ENCODE_FILE, thus we ignore character composition.  */
8993
8994 Lisp_Object
8995 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
8996                               bool encodep)
8997 {
8998   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8999 }
9000
9001
9002 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9003        2, 4, 0,
9004        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9005
9006 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9007 if the decoding operation is trivial.
9008
9009 Optional fourth arg BUFFER non-nil means that the decoded text is
9010 inserted in that buffer after point (point does not move).  In this
9011 case, the return value is the length of the decoded text.
9012
9013 This function sets `last-coding-system-used' to the precise coding system
9014 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9015 not fully specified.)  */)
9016   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9017 {
9018   return code_convert_string (string, coding_system, buffer,
9019                               0, ! NILP (nocopy), 0);
9020 }
9021
9022 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9023        2, 4, 0,
9024        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9025
9026 Optional third arg NOCOPY non-nil means it is OK to return STRING
9027 itself if the encoding operation is trivial.
9028
9029 Optional fourth arg BUFFER non-nil means that the encoded text is
9030 inserted in that buffer after point (point does not move).  In this
9031 case, the return value is the length of the encoded text.
9032
9033 This function sets `last-coding-system-used' to the precise coding system
9034 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9035 not fully specified.)  */)
9036   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9037 {
9038   return code_convert_string (string, coding_system, buffer,
9039                               1, ! NILP (nocopy), 0);
9040 }
9041
9042 \f
9043 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9044        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9045 Return the corresponding character.  */)
9046   (Lisp_Object code)
9047 {
9048   Lisp_Object spec, attrs, val;
9049   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9050   EMACS_INT ch;
9051   int c;
9052
9053   CHECK_NATNUM (code);
9054   ch = XFASTINT (code);
9055   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9056   attrs = AREF (spec, 0);
9057
9058   if (ASCII_BYTE_P (ch)
9059       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9060     return code;
9061
9062   val = CODING_ATTR_CHARSET_LIST (attrs);
9063   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9064   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9065   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9066
9067   if (ch <= 0x7F)
9068     {
9069       c = ch;
9070       charset = charset_roman;
9071     }
9072   else if (ch >= 0xA0 && ch < 0xDF)
9073     {
9074       c = ch - 0x80;
9075       charset = charset_kana;
9076     }
9077   else
9078     {
9079       EMACS_INT c1 = ch >> 8;
9080       int c2 = ch & 0xFF;
9081
9082       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9083           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9084         error ("Invalid code: %"pI"d", ch);
9085       c = ch;
9086       SJIS_TO_JIS (c);
9087       charset = charset_kanji;
9088     }
9089   c = DECODE_CHAR (charset, c);
9090   if (c < 0)
9091     error ("Invalid code: %"pI"d", ch);
9092   return make_number (c);
9093 }
9094
9095
9096 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9097        doc: /* Encode a Japanese character CH to shift_jis encoding.
9098 Return the corresponding code in SJIS.  */)
9099   (Lisp_Object ch)
9100 {
9101   Lisp_Object spec, attrs, charset_list;
9102   int c;
9103   struct charset *charset;
9104   unsigned code;
9105
9106   CHECK_CHARACTER (ch);
9107   c = XFASTINT (ch);
9108   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9109   attrs = AREF (spec, 0);
9110
9111   if (ASCII_CHAR_P (c)
9112       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9113     return ch;
9114
9115   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9116   charset = char_charset (c, charset_list, &code);
9117   if (code == CHARSET_INVALID_CODE (charset))
9118     error ("Can't encode by shift_jis encoding: %c", c);
9119   JIS_TO_SJIS (code);
9120
9121   return make_number (code);
9122 }
9123
9124 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9125        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9126 Return the corresponding character.  */)
9127   (Lisp_Object code)
9128 {
9129   Lisp_Object spec, attrs, val;
9130   struct charset *charset_roman, *charset_big5, *charset;
9131   EMACS_INT ch;
9132   int c;
9133
9134   CHECK_NATNUM (code);
9135   ch = XFASTINT (code);
9136   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9137   attrs = AREF (spec, 0);
9138
9139   if (ASCII_BYTE_P (ch)
9140       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9141     return code;
9142
9143   val = CODING_ATTR_CHARSET_LIST (attrs);
9144   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9145   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9146
9147   if (ch <= 0x7F)
9148     {
9149       c = ch;
9150       charset = charset_roman;
9151     }
9152   else
9153     {
9154       EMACS_INT b1 = ch >> 8;
9155       int b2 = ch & 0x7F;
9156       if (b1 < 0xA1 || b1 > 0xFE
9157           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9158         error ("Invalid code: %"pI"d", ch);
9159       c = ch;
9160       charset = charset_big5;
9161     }
9162   c = DECODE_CHAR (charset, c);
9163   if (c < 0)
9164     error ("Invalid code: %"pI"d", ch);
9165   return make_number (c);
9166 }
9167
9168 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9169        doc: /* Encode the Big5 character CH to BIG5 coding system.
9170 Return the corresponding character code in Big5.  */)
9171   (Lisp_Object ch)
9172 {
9173   Lisp_Object spec, attrs, charset_list;
9174   struct charset *charset;
9175   int c;
9176   unsigned code;
9177
9178   CHECK_CHARACTER (ch);
9179   c = XFASTINT (ch);
9180   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9181   attrs = AREF (spec, 0);
9182   if (ASCII_CHAR_P (c)
9183       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9184     return ch;
9185
9186   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9187   charset = char_charset (c, charset_list, &code);
9188   if (code == CHARSET_INVALID_CODE (charset))
9189     error ("Can't encode by Big5 encoding: %c", c);
9190
9191   return make_number (code);
9192 }
9193
9194 \f
9195 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9196        Sset_terminal_coding_system_internal, 1, 2, 0,
9197        doc: /* Internal use only.  */)
9198   (Lisp_Object coding_system, Lisp_Object terminal)
9199 {
9200   struct terminal *term = get_terminal (terminal, 1);
9201   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9202   CHECK_SYMBOL (coding_system);
9203   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9204   /* We had better not send unsafe characters to terminal.  */
9205   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9206   /* Character composition should be disabled.  */
9207   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9208   terminal_coding->src_multibyte = 1;
9209   terminal_coding->dst_multibyte = 0;
9210   tset_charset_list
9211     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9212             ? coding_charset_list (terminal_coding)
9213             : Fcons (make_number (charset_ascii), Qnil)));
9214   return Qnil;
9215 }
9216
9217 DEFUN ("set-safe-terminal-coding-system-internal",
9218        Fset_safe_terminal_coding_system_internal,
9219        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9220        doc: /* Internal use only.  */)
9221   (Lisp_Object coding_system)
9222 {
9223   CHECK_SYMBOL (coding_system);
9224   setup_coding_system (Fcheck_coding_system (coding_system),
9225                        &safe_terminal_coding);
9226   /* Character composition should be disabled.  */
9227   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9228   safe_terminal_coding.src_multibyte = 1;
9229   safe_terminal_coding.dst_multibyte = 0;
9230   return Qnil;
9231 }
9232
9233 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9234        Sterminal_coding_system, 0, 1, 0,
9235        doc: /* Return coding system specified for terminal output on the given terminal.
9236 TERMINAL may be a terminal object, a frame, or nil for the selected
9237 frame's terminal device.  */)
9238   (Lisp_Object terminal)
9239 {
9240   struct coding_system *terminal_coding
9241     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9242   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9243
9244   /* For backward compatibility, return nil if it is `undecided'.  */
9245   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9246 }
9247
9248 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9249        Sset_keyboard_coding_system_internal, 1, 2, 0,
9250        doc: /* Internal use only.  */)
9251   (Lisp_Object coding_system, Lisp_Object terminal)
9252 {
9253   struct terminal *t = get_terminal (terminal, 1);
9254   CHECK_SYMBOL (coding_system);
9255   if (NILP (coding_system))
9256     coding_system = Qno_conversion;
9257   else
9258     Fcheck_coding_system (coding_system);
9259   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9260   /* Character composition should be disabled.  */
9261   TERMINAL_KEYBOARD_CODING (t)->common_flags
9262     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9263   return Qnil;
9264 }
9265
9266 DEFUN ("keyboard-coding-system",
9267        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9268        doc: /* Return coding system specified for decoding keyboard input.  */)
9269   (Lisp_Object terminal)
9270 {
9271   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9272                          (get_terminal (terminal, 1))->id);
9273 }
9274
9275 \f
9276 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9277        Sfind_operation_coding_system,  1, MANY, 0,
9278        doc: /* Choose a coding system for an operation based on the target name.
9279 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9280 DECODING-SYSTEM is the coding system to use for decoding
9281 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9282 for encoding (in case OPERATION does encoding).
9283
9284 The first argument OPERATION specifies an I/O primitive:
9285   For file I/O, `insert-file-contents' or `write-region'.
9286   For process I/O, `call-process', `call-process-region', or `start-process'.
9287   For network I/O, `open-network-stream'.
9288
9289 The remaining arguments should be the same arguments that were passed
9290 to the primitive.  Depending on which primitive, one of those arguments
9291 is selected as the TARGET.  For example, if OPERATION does file I/O,
9292 whichever argument specifies the file name is TARGET.
9293
9294 TARGET has a meaning which depends on OPERATION:
9295   For file I/O, TARGET is a file name (except for the special case below).
9296   For process I/O, TARGET is a process name.
9297   For network I/O, TARGET is a service name or a port number.
9298
9299 This function looks up what is specified for TARGET in
9300 `file-coding-system-alist', `process-coding-system-alist',
9301 or `network-coding-system-alist' depending on OPERATION.
9302 They may specify a coding system, a cons of coding systems,
9303 or a function symbol to call.
9304 In the last case, we call the function with one argument,
9305 which is a list of all the arguments given to this function.
9306 If the function can't decide a coding system, it can return
9307 `undecided' so that the normal code-detection is performed.
9308
9309 If OPERATION is `insert-file-contents', the argument corresponding to
9310 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9311 file name to look up, and BUFFER is a buffer that contains the file's
9312 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9313 function to call for FILENAME, that function should examine the
9314 contents of BUFFER instead of reading the file.
9315
9316 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9317   (ptrdiff_t nargs, Lisp_Object *args)
9318 {
9319   Lisp_Object operation, target_idx, target, val;
9320   register Lisp_Object chain;
9321
9322   if (nargs < 2)
9323     error ("Too few arguments");
9324   operation = args[0];
9325   if (!SYMBOLP (operation)
9326       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9327     error ("Invalid first argument");
9328   if (nargs <= 1 + XFASTINT (target_idx))
9329     error ("Too few arguments for operation `%s'",
9330            SDATA (SYMBOL_NAME (operation)));
9331   target = args[XFASTINT (target_idx) + 1];
9332   if (!(STRINGP (target)
9333         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9334             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9335         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9336     error ("Invalid argument %"pI"d of operation `%s'",
9337            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9338   if (CONSP (target))
9339     target = XCAR (target);
9340
9341   chain = ((EQ (operation, Qinsert_file_contents)
9342             || EQ (operation, Qwrite_region))
9343            ? Vfile_coding_system_alist
9344            : (EQ (operation, Qopen_network_stream)
9345               ? Vnetwork_coding_system_alist
9346               : Vprocess_coding_system_alist));
9347   if (NILP (chain))
9348     return Qnil;
9349
9350   for (; CONSP (chain); chain = XCDR (chain))
9351     {
9352       Lisp_Object elt;
9353
9354       elt = XCAR (chain);
9355       if (CONSP (elt)
9356           && ((STRINGP (target)
9357                && STRINGP (XCAR (elt))
9358                && fast_string_match (XCAR (elt), target) >= 0)
9359               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9360         {
9361           val = XCDR (elt);
9362           /* Here, if VAL is both a valid coding system and a valid
9363              function symbol, we return VAL as a coding system.  */
9364           if (CONSP (val))
9365             return val;
9366           if (! SYMBOLP (val))
9367             return Qnil;
9368           if (! NILP (Fcoding_system_p (val)))
9369             return Fcons (val, val);
9370           if (! NILP (Ffboundp (val)))
9371             {
9372               /* We use call1 rather than safe_call1
9373                  so as to get bug reports about functions called here
9374                  which don't handle the current interface.  */
9375               val = call1 (val, Flist (nargs, args));
9376               if (CONSP (val))
9377                 return val;
9378               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9379                 return Fcons (val, val);
9380             }
9381           return Qnil;
9382         }
9383     }
9384   return Qnil;
9385 }
9386
9387 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9388        Sset_coding_system_priority, 0, MANY, 0,
9389        doc: /* Assign higher priority to the coding systems given as arguments.
9390 If multiple coding systems belong to the same category,
9391 all but the first one are ignored.
9392
9393 usage: (set-coding-system-priority &rest coding-systems)  */)
9394   (ptrdiff_t nargs, Lisp_Object *args)
9395 {
9396   ptrdiff_t i, j;
9397   bool changed[coding_category_max];
9398   enum coding_category priorities[coding_category_max];
9399
9400   memset (changed, 0, sizeof changed);
9401
9402   for (i = j = 0; i < nargs; i++)
9403     {
9404       enum coding_category category;
9405       Lisp_Object spec, attrs;
9406
9407       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9408       attrs = AREF (spec, 0);
9409       category = XINT (CODING_ATTR_CATEGORY (attrs));
9410       if (changed[category])
9411         /* Ignore this coding system because a coding system of the
9412            same category already had a higher priority.  */
9413         continue;
9414       changed[category] = 1;
9415       priorities[j++] = category;
9416       if (coding_categories[category].id >= 0
9417           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9418         setup_coding_system (args[i], &coding_categories[category]);
9419       Fset (AREF (Vcoding_category_table, category), args[i]);
9420     }
9421
9422   /* Now we have decided top J priorities.  Reflect the order of the
9423      original priorities to the remaining priorities.  */
9424
9425   for (i = j, j = 0; i < coding_category_max; i++, j++)
9426     {
9427       while (j < coding_category_max
9428              && changed[coding_priorities[j]])
9429         j++;
9430       if (j == coding_category_max)
9431         emacs_abort ();
9432       priorities[i] = coding_priorities[j];
9433     }
9434
9435   memcpy (coding_priorities, priorities, sizeof priorities);
9436
9437   /* Update `coding-category-list'.  */
9438   Vcoding_category_list = Qnil;
9439   for (i = coding_category_max; i-- > 0; )
9440     Vcoding_category_list
9441       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9442                Vcoding_category_list);
9443
9444   return Qnil;
9445 }
9446
9447 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9448        Scoding_system_priority_list, 0, 1, 0,
9449        doc: /* Return a list of coding systems ordered by their priorities.
9450 The list contains a subset of coding systems; i.e. coding systems
9451 assigned to each coding category (see `coding-category-list').
9452
9453 HIGHESTP non-nil means just return the highest priority one.  */)
9454   (Lisp_Object highestp)
9455 {
9456   int i;
9457   Lisp_Object val;
9458
9459   for (i = 0, val = Qnil; i < coding_category_max; i++)
9460     {
9461       enum coding_category category = coding_priorities[i];
9462       int id = coding_categories[category].id;
9463       Lisp_Object attrs;
9464
9465       if (id < 0)
9466         continue;
9467       attrs = CODING_ID_ATTRS (id);
9468       if (! NILP (highestp))
9469         return CODING_ATTR_BASE_NAME (attrs);
9470       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9471     }
9472   return Fnreverse (val);
9473 }
9474
9475 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9476
9477 static Lisp_Object
9478 make_subsidiaries (Lisp_Object base)
9479 {
9480   Lisp_Object subsidiaries;
9481   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9482   char *buf = alloca (base_name_len + 6);
9483   int i;
9484
9485   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9486   subsidiaries = Fmake_vector (make_number (3), Qnil);
9487   for (i = 0; i < 3; i++)
9488     {
9489       strcpy (buf + base_name_len, suffixes[i]);
9490       ASET (subsidiaries, i, intern (buf));
9491     }
9492   return subsidiaries;
9493 }
9494
9495
9496 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9497        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9498        doc: /* For internal use only.
9499 usage: (define-coding-system-internal ...)  */)
9500   (ptrdiff_t nargs, Lisp_Object *args)
9501 {
9502   Lisp_Object name;
9503   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9504   Lisp_Object attrs;            /* Vector of attributes.  */
9505   Lisp_Object eol_type;
9506   Lisp_Object aliases;
9507   Lisp_Object coding_type, charset_list, safe_charsets;
9508   enum coding_category category;
9509   Lisp_Object tail, val;
9510   int max_charset_id = 0;
9511   int i;
9512
9513   if (nargs < coding_arg_max)
9514     goto short_args;
9515
9516   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9517
9518   name = args[coding_arg_name];
9519   CHECK_SYMBOL (name);
9520   ASET (attrs, coding_attr_base_name, name);
9521
9522   val = args[coding_arg_mnemonic];
9523   if (! STRINGP (val))
9524     CHECK_CHARACTER (val);
9525   ASET (attrs, coding_attr_mnemonic, val);
9526
9527   coding_type = args[coding_arg_coding_type];
9528   CHECK_SYMBOL (coding_type);
9529   ASET (attrs, coding_attr_type, coding_type);
9530
9531   charset_list = args[coding_arg_charset_list];
9532   if (SYMBOLP (charset_list))
9533     {
9534       if (EQ (charset_list, Qiso_2022))
9535         {
9536           if (! EQ (coding_type, Qiso_2022))
9537             error ("Invalid charset-list");
9538           charset_list = Viso_2022_charset_list;
9539         }
9540       else if (EQ (charset_list, Qemacs_mule))
9541         {
9542           if (! EQ (coding_type, Qemacs_mule))
9543             error ("Invalid charset-list");
9544           charset_list = Vemacs_mule_charset_list;
9545         }
9546       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9547         {
9548           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9549             error ("Invalid charset-list");
9550           if (max_charset_id < XFASTINT (XCAR (tail)))
9551             max_charset_id = XFASTINT (XCAR (tail));
9552         }
9553     }
9554   else
9555     {
9556       charset_list = Fcopy_sequence (charset_list);
9557       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9558         {
9559           struct charset *charset;
9560
9561           val = XCAR (tail);
9562           CHECK_CHARSET_GET_CHARSET (val, charset);
9563           if (EQ (coding_type, Qiso_2022)
9564               ? CHARSET_ISO_FINAL (charset) < 0
9565               : EQ (coding_type, Qemacs_mule)
9566               ? CHARSET_EMACS_MULE_ID (charset) < 0
9567               : 0)
9568             error ("Can't handle charset `%s'",
9569                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9570
9571           XSETCAR (tail, make_number (charset->id));
9572           if (max_charset_id < charset->id)
9573             max_charset_id = charset->id;
9574         }
9575     }
9576   ASET (attrs, coding_attr_charset_list, charset_list);
9577
9578   safe_charsets = make_uninit_string (max_charset_id + 1);
9579   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9580   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9581     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9582   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9583
9584   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9585
9586   val = args[coding_arg_decode_translation_table];
9587   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9588     CHECK_SYMBOL (val);
9589   ASET (attrs, coding_attr_decode_tbl, val);
9590
9591   val = args[coding_arg_encode_translation_table];
9592   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9593     CHECK_SYMBOL (val);
9594   ASET (attrs, coding_attr_encode_tbl, val);
9595
9596   val = args[coding_arg_post_read_conversion];
9597   CHECK_SYMBOL (val);
9598   ASET (attrs, coding_attr_post_read, val);
9599
9600   val = args[coding_arg_pre_write_conversion];
9601   CHECK_SYMBOL (val);
9602   ASET (attrs, coding_attr_pre_write, val);
9603
9604   val = args[coding_arg_default_char];
9605   if (NILP (val))
9606     ASET (attrs, coding_attr_default_char, make_number (' '));
9607   else
9608     {
9609       CHECK_CHARACTER (val);
9610       ASET (attrs, coding_attr_default_char, val);
9611     }
9612
9613   val = args[coding_arg_for_unibyte];
9614   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9615
9616   val = args[coding_arg_plist];
9617   CHECK_LIST (val);
9618   ASET (attrs, coding_attr_plist, val);
9619
9620   if (EQ (coding_type, Qcharset))
9621     {
9622       /* Generate a lisp vector of 256 elements.  Each element is nil,
9623          integer, or a list of charset IDs.
9624
9625          If Nth element is nil, the byte code N is invalid in this
9626          coding system.
9627
9628          If Nth element is a number NUM, N is the first byte of a
9629          charset whose ID is NUM.
9630
9631          If Nth element is a list of charset IDs, N is the first byte
9632          of one of them.  The list is sorted by dimensions of the
9633          charsets.  A charset of smaller dimension comes first. */
9634       val = Fmake_vector (make_number (256), Qnil);
9635
9636       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9637         {
9638           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9639           int dim = CHARSET_DIMENSION (charset);
9640           int idx = (dim - 1) * 4;
9641
9642           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9643             ASET (attrs, coding_attr_ascii_compat, Qt);
9644
9645           for (i = charset->code_space[idx];
9646                i <= charset->code_space[idx + 1]; i++)
9647             {
9648               Lisp_Object tmp, tmp2;
9649               int dim2;
9650
9651               tmp = AREF (val, i);
9652               if (NILP (tmp))
9653                 tmp = XCAR (tail);
9654               else if (NUMBERP (tmp))
9655                 {
9656                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9657                   if (dim < dim2)
9658                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9659                   else
9660                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9661                 }
9662               else
9663                 {
9664                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9665                     {
9666                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9667                       if (dim < dim2)
9668                         break;
9669                     }
9670                   if (NILP (tmp2))
9671                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9672                   else
9673                     {
9674                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9675                       XSETCAR (tmp2, XCAR (tail));
9676                     }
9677                 }
9678               ASET (val, i, tmp);
9679             }
9680         }
9681       ASET (attrs, coding_attr_charset_valids, val);
9682       category = coding_category_charset;
9683     }
9684   else if (EQ (coding_type, Qccl))
9685     {
9686       Lisp_Object valids;
9687
9688       if (nargs < coding_arg_ccl_max)
9689         goto short_args;
9690
9691       val = args[coding_arg_ccl_decoder];
9692       CHECK_CCL_PROGRAM (val);
9693       if (VECTORP (val))
9694         val = Fcopy_sequence (val);
9695       ASET (attrs, coding_attr_ccl_decoder, val);
9696
9697       val = args[coding_arg_ccl_encoder];
9698       CHECK_CCL_PROGRAM (val);
9699       if (VECTORP (val))
9700         val = Fcopy_sequence (val);
9701       ASET (attrs, coding_attr_ccl_encoder, val);
9702
9703       val = args[coding_arg_ccl_valids];
9704       valids = Fmake_string (make_number (256), make_number (0));
9705       for (tail = val; CONSP (tail); tail = XCDR (tail))
9706         {
9707           int from, to;
9708
9709           val = XCAR (tail);
9710           if (INTEGERP (val))
9711             {
9712               if (! (0 <= XINT (val) && XINT (val) <= 255))
9713                 args_out_of_range_3 (val, make_number (0), make_number (255));
9714               from = to = XINT (val);
9715             }
9716           else
9717             {
9718               CHECK_CONS (val);
9719               CHECK_NATNUM_CAR (val);
9720               CHECK_NUMBER_CDR (val);
9721               if (XINT (XCAR (val)) > 255)
9722                 args_out_of_range_3 (XCAR (val),
9723                                      make_number (0), make_number (255));
9724               from = XINT (XCAR (val));
9725               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9726                 args_out_of_range_3 (XCDR (val),
9727                                      XCAR (val), make_number (255));
9728               to = XINT (XCDR (val));
9729             }
9730           for (i = from; i <= to; i++)
9731             SSET (valids, i, 1);
9732         }
9733       ASET (attrs, coding_attr_ccl_valids, valids);
9734
9735       category = coding_category_ccl;
9736     }
9737   else if (EQ (coding_type, Qutf_16))
9738     {
9739       Lisp_Object bom, endian;
9740
9741       ASET (attrs, coding_attr_ascii_compat, Qnil);
9742
9743       if (nargs < coding_arg_utf16_max)
9744         goto short_args;
9745
9746       bom = args[coding_arg_utf16_bom];
9747       if (! NILP (bom) && ! EQ (bom, Qt))
9748         {
9749           CHECK_CONS (bom);
9750           val = XCAR (bom);
9751           CHECK_CODING_SYSTEM (val);
9752           val = XCDR (bom);
9753           CHECK_CODING_SYSTEM (val);
9754         }
9755       ASET (attrs, coding_attr_utf_bom, bom);
9756
9757       endian = args[coding_arg_utf16_endian];
9758       CHECK_SYMBOL (endian);
9759       if (NILP (endian))
9760         endian = Qbig;
9761       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9762         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9763       ASET (attrs, coding_attr_utf_16_endian, endian);
9764
9765       category = (CONSP (bom)
9766                   ? coding_category_utf_16_auto
9767                   : NILP (bom)
9768                   ? (EQ (endian, Qbig)
9769                      ? coding_category_utf_16_be_nosig
9770                      : coding_category_utf_16_le_nosig)
9771                   : (EQ (endian, Qbig)
9772                      ? coding_category_utf_16_be
9773                      : coding_category_utf_16_le));
9774     }
9775   else if (EQ (coding_type, Qiso_2022))
9776     {
9777       Lisp_Object initial, reg_usage, request, flags;
9778
9779       if (nargs < coding_arg_iso2022_max)
9780         goto short_args;
9781
9782       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9783       CHECK_VECTOR (initial);
9784       for (i = 0; i < 4; i++)
9785         {
9786           val = Faref (initial, make_number (i));
9787           if (! NILP (val))
9788             {
9789               struct charset *charset;
9790
9791               CHECK_CHARSET_GET_CHARSET (val, charset);
9792               ASET (initial, i, make_number (CHARSET_ID (charset)));
9793               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9794                 ASET (attrs, coding_attr_ascii_compat, Qt);
9795             }
9796           else
9797             ASET (initial, i, make_number (-1));
9798         }
9799
9800       reg_usage = args[coding_arg_iso2022_reg_usage];
9801       CHECK_CONS (reg_usage);
9802       CHECK_NUMBER_CAR (reg_usage);
9803       CHECK_NUMBER_CDR (reg_usage);
9804
9805       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9806       for (tail = request; CONSP (tail); tail = XCDR (tail))
9807         {
9808           int id;
9809           Lisp_Object tmp1;
9810
9811           val = XCAR (tail);
9812           CHECK_CONS (val);
9813           tmp1 = XCAR (val);
9814           CHECK_CHARSET_GET_ID (tmp1, id);
9815           CHECK_NATNUM_CDR (val);
9816           if (XINT (XCDR (val)) >= 4)
9817             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9818           XSETCAR (val, make_number (id));
9819         }
9820
9821       flags = args[coding_arg_iso2022_flags];
9822       CHECK_NATNUM (flags);
9823       i = XINT (flags) & INT_MAX;
9824       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9825         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9826       flags = make_number (i);
9827
9828       ASET (attrs, coding_attr_iso_initial, initial);
9829       ASET (attrs, coding_attr_iso_usage, reg_usage);
9830       ASET (attrs, coding_attr_iso_request, request);
9831       ASET (attrs, coding_attr_iso_flags, flags);
9832       setup_iso_safe_charsets (attrs);
9833
9834       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9835         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9836                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9837                     ? coding_category_iso_7_else
9838                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9839                     ? coding_category_iso_7
9840                     : coding_category_iso_7_tight);
9841       else
9842         {
9843           int id = XINT (AREF (initial, 1));
9844
9845           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9846                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9847                        || id < 0)
9848                       ? coding_category_iso_8_else
9849                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9850                       ? coding_category_iso_8_1
9851                       : coding_category_iso_8_2);
9852         }
9853       if (category != coding_category_iso_8_1
9854           && category != coding_category_iso_8_2)
9855         ASET (attrs, coding_attr_ascii_compat, Qnil);
9856     }
9857   else if (EQ (coding_type, Qemacs_mule))
9858     {
9859       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9860         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9861       ASET (attrs, coding_attr_ascii_compat, Qt);
9862       category = coding_category_emacs_mule;
9863     }
9864   else if (EQ (coding_type, Qshift_jis))
9865     {
9866
9867       struct charset *charset;
9868
9869       if (XINT (Flength (charset_list)) != 3
9870           && XINT (Flength (charset_list)) != 4)
9871         error ("There should be three or four charsets");
9872
9873       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9874       if (CHARSET_DIMENSION (charset) != 1)
9875         error ("Dimension of charset %s is not one",
9876                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9877       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9878         ASET (attrs, coding_attr_ascii_compat, Qt);
9879
9880       charset_list = XCDR (charset_list);
9881       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9882       if (CHARSET_DIMENSION (charset) != 1)
9883         error ("Dimension of charset %s is not one",
9884                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9885
9886       charset_list = XCDR (charset_list);
9887       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9888       if (CHARSET_DIMENSION (charset) != 2)
9889         error ("Dimension of charset %s is not two",
9890                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9891
9892       charset_list = XCDR (charset_list);
9893       if (! NILP (charset_list))
9894         {
9895           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9896           if (CHARSET_DIMENSION (charset) != 2)
9897             error ("Dimension of charset %s is not two",
9898                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9899         }
9900
9901       category = coding_category_sjis;
9902       Vsjis_coding_system = name;
9903     }
9904   else if (EQ (coding_type, Qbig5))
9905     {
9906       struct charset *charset;
9907
9908       if (XINT (Flength (charset_list)) != 2)
9909         error ("There should be just two charsets");
9910
9911       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9912       if (CHARSET_DIMENSION (charset) != 1)
9913         error ("Dimension of charset %s is not one",
9914                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9915       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9916         ASET (attrs, coding_attr_ascii_compat, Qt);
9917
9918       charset_list = XCDR (charset_list);
9919       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9920       if (CHARSET_DIMENSION (charset) != 2)
9921         error ("Dimension of charset %s is not two",
9922                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9923
9924       category = coding_category_big5;
9925       Vbig5_coding_system = name;
9926     }
9927   else if (EQ (coding_type, Qraw_text))
9928     {
9929       category = coding_category_raw_text;
9930       ASET (attrs, coding_attr_ascii_compat, Qt);
9931     }
9932   else if (EQ (coding_type, Qutf_8))
9933     {
9934       Lisp_Object bom;
9935
9936       if (nargs < coding_arg_utf8_max)
9937         goto short_args;
9938
9939       bom = args[coding_arg_utf8_bom];
9940       if (! NILP (bom) && ! EQ (bom, Qt))
9941         {
9942           CHECK_CONS (bom);
9943           val = XCAR (bom);
9944           CHECK_CODING_SYSTEM (val);
9945           val = XCDR (bom);
9946           CHECK_CODING_SYSTEM (val);
9947         }
9948       ASET (attrs, coding_attr_utf_bom, bom);
9949       if (NILP (bom))
9950         ASET (attrs, coding_attr_ascii_compat, Qt);
9951
9952       category = (CONSP (bom) ? coding_category_utf_8_auto
9953                   : NILP (bom) ? coding_category_utf_8_nosig
9954                   : coding_category_utf_8_sig);
9955     }
9956   else if (EQ (coding_type, Qundecided))
9957     category = coding_category_undecided;
9958   else
9959     error ("Invalid coding system type: %s",
9960            SDATA (SYMBOL_NAME (coding_type)));
9961
9962   ASET (attrs, coding_attr_category, make_number (category));
9963   ASET (attrs, coding_attr_plist,
9964         Fcons (QCcategory,
9965                Fcons (AREF (Vcoding_category_table, category),
9966                       CODING_ATTR_PLIST (attrs))));
9967   ASET (attrs, coding_attr_plist,
9968         Fcons (QCascii_compatible_p,
9969                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9970                       CODING_ATTR_PLIST (attrs))));
9971
9972   eol_type = args[coding_arg_eol_type];
9973   if (! NILP (eol_type)
9974       && ! EQ (eol_type, Qunix)
9975       && ! EQ (eol_type, Qdos)
9976       && ! EQ (eol_type, Qmac))
9977     error ("Invalid eol-type");
9978
9979   aliases = Fcons (name, Qnil);
9980
9981   if (NILP (eol_type))
9982     {
9983       eol_type = make_subsidiaries (name);
9984       for (i = 0; i < 3; i++)
9985         {
9986           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9987
9988           this_name = AREF (eol_type, i);
9989           this_aliases = Fcons (this_name, Qnil);
9990           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9991           this_spec = Fmake_vector (make_number (3), attrs);
9992           ASET (this_spec, 1, this_aliases);
9993           ASET (this_spec, 2, this_eol_type);
9994           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9995           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9996           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9997           if (NILP (val))
9998             Vcoding_system_alist
9999               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10000                        Vcoding_system_alist);
10001         }
10002     }
10003
10004   spec_vec = Fmake_vector (make_number (3), attrs);
10005   ASET (spec_vec, 1, aliases);
10006   ASET (spec_vec, 2, eol_type);
10007
10008   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10009   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10010   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10011   if (NILP (val))
10012     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10013                                   Vcoding_system_alist);
10014
10015   {
10016     int id = coding_categories[category].id;
10017
10018     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10019       setup_coding_system (name, &coding_categories[category]);
10020   }
10021
10022   return Qnil;
10023
10024  short_args:
10025   return Fsignal (Qwrong_number_of_arguments,
10026                   Fcons (intern ("define-coding-system-internal"),
10027                          make_number (nargs)));
10028 }
10029
10030
10031 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10032        3, 3, 0,
10033        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10034   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10035 {
10036   Lisp_Object spec, attrs;
10037
10038   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10039   attrs = AREF (spec, 0);
10040   if (EQ (prop, QCmnemonic))
10041     {
10042       if (! STRINGP (val))
10043         CHECK_CHARACTER (val);
10044       ASET (attrs, coding_attr_mnemonic, val);
10045     }
10046   else if (EQ (prop, QCdefault_char))
10047     {
10048       if (NILP (val))
10049         val = make_number (' ');
10050       else
10051         CHECK_CHARACTER (val);
10052       ASET (attrs, coding_attr_default_char, val);
10053     }
10054   else if (EQ (prop, QCdecode_translation_table))
10055     {
10056       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10057         CHECK_SYMBOL (val);
10058       ASET (attrs, coding_attr_decode_tbl, val);
10059     }
10060   else if (EQ (prop, QCencode_translation_table))
10061     {
10062       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10063         CHECK_SYMBOL (val);
10064       ASET (attrs, coding_attr_encode_tbl, val);
10065     }
10066   else if (EQ (prop, QCpost_read_conversion))
10067     {
10068       CHECK_SYMBOL (val);
10069       ASET (attrs, coding_attr_post_read, val);
10070     }
10071   else if (EQ (prop, QCpre_write_conversion))
10072     {
10073       CHECK_SYMBOL (val);
10074       ASET (attrs, coding_attr_pre_write, val);
10075     }
10076   else if (EQ (prop, QCascii_compatible_p))
10077     {
10078       ASET (attrs, coding_attr_ascii_compat, val);
10079     }
10080
10081   ASET (attrs, coding_attr_plist,
10082         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10083   return val;
10084 }
10085
10086
10087 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10088        Sdefine_coding_system_alias, 2, 2, 0,
10089        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10090   (Lisp_Object alias, Lisp_Object coding_system)
10091 {
10092   Lisp_Object spec, aliases, eol_type, val;
10093
10094   CHECK_SYMBOL (alias);
10095   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10096   aliases = AREF (spec, 1);
10097   /* ALIASES should be a list of length more than zero, and the first
10098      element is a base coding system.  Append ALIAS at the tail of the
10099      list.  */
10100   while (!NILP (XCDR (aliases)))
10101     aliases = XCDR (aliases);
10102   XSETCDR (aliases, Fcons (alias, Qnil));
10103
10104   eol_type = AREF (spec, 2);
10105   if (VECTORP (eol_type))
10106     {
10107       Lisp_Object subsidiaries;
10108       int i;
10109
10110       subsidiaries = make_subsidiaries (alias);
10111       for (i = 0; i < 3; i++)
10112         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10113                                      AREF (eol_type, i));
10114     }
10115
10116   Fputhash (alias, spec, Vcoding_system_hash_table);
10117   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10118   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10119   if (NILP (val))
10120     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10121                                   Vcoding_system_alist);
10122
10123   return Qnil;
10124 }
10125
10126 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10127        1, 1, 0,
10128        doc: /* Return the base of CODING-SYSTEM.
10129 Any alias or subsidiary coding system is not a base coding system.  */)
10130   (Lisp_Object coding_system)
10131 {
10132   Lisp_Object spec, attrs;
10133
10134   if (NILP (coding_system))
10135     return (Qno_conversion);
10136   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10137   attrs = AREF (spec, 0);
10138   return CODING_ATTR_BASE_NAME (attrs);
10139 }
10140
10141 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10142        1, 1, 0,
10143        doc: "Return the property list of CODING-SYSTEM.")
10144   (Lisp_Object coding_system)
10145 {
10146   Lisp_Object spec, attrs;
10147
10148   if (NILP (coding_system))
10149     coding_system = Qno_conversion;
10150   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10151   attrs = AREF (spec, 0);
10152   return CODING_ATTR_PLIST (attrs);
10153 }
10154
10155
10156 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10157        1, 1, 0,
10158        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10159   (Lisp_Object coding_system)
10160 {
10161   Lisp_Object spec;
10162
10163   if (NILP (coding_system))
10164     coding_system = Qno_conversion;
10165   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10166   return AREF (spec, 1);
10167 }
10168
10169 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10170        Scoding_system_eol_type, 1, 1, 0,
10171        doc: /* Return eol-type of CODING-SYSTEM.
10172 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10173
10174 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10175 and CR respectively.
10176
10177 A vector value indicates that a format of end-of-line should be
10178 detected automatically.  Nth element of the vector is the subsidiary
10179 coding system whose eol-type is N.  */)
10180   (Lisp_Object coding_system)
10181 {
10182   Lisp_Object spec, eol_type;
10183   int n;
10184
10185   if (NILP (coding_system))
10186     coding_system = Qno_conversion;
10187   if (! CODING_SYSTEM_P (coding_system))
10188     return Qnil;
10189   spec = CODING_SYSTEM_SPEC (coding_system);
10190   eol_type = AREF (spec, 2);
10191   if (VECTORP (eol_type))
10192     return Fcopy_sequence (eol_type);
10193   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10194   return make_number (n);
10195 }
10196
10197 #endif /* emacs */
10198
10199 \f
/*** 12. Postamble ***/
10201
10202 void
10203 init_coding_once (void)
10204 {
10205   int i;
10206
10207   for (i = 0; i < coding_category_max; i++)
10208     {
10209       coding_categories[i].id = -1;
10210       coding_priorities[i] = i;
10211     }
10212
10213   /* ISO2022 specific initialize routine.  */
10214   for (i = 0; i < 0x20; i++)
10215     iso_code_class[i] = ISO_control_0;
10216   for (i = 0x21; i < 0x7F; i++)
10217     iso_code_class[i] = ISO_graphic_plane_0;
10218   for (i = 0x80; i < 0xA0; i++)
10219     iso_code_class[i] = ISO_control_1;
10220   for (i = 0xA1; i < 0xFF; i++)
10221     iso_code_class[i] = ISO_graphic_plane_1;
10222   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10223   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10224   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10225   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10226   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10227   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10228   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10229   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10230   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10231
10232   for (i = 0; i < 256; i++)
10233     {
10234       emacs_mule_bytes[i] = 1;
10235     }
10236   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10237   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10238   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10239   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10240 }
10241
10242 #ifdef emacs
10243
10244 void
10245 syms_of_coding (void)
10246 {
10247   staticpro (&Vcoding_system_hash_table);
10248   {
10249     Lisp_Object args[2];
10250     args[0] = QCtest;
10251     args[1] = Qeq;
10252     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10253   }
10254
10255   staticpro (&Vsjis_coding_system);
10256   Vsjis_coding_system = Qnil;
10257
10258   staticpro (&Vbig5_coding_system);
10259   Vbig5_coding_system = Qnil;
10260
10261   staticpro (&Vcode_conversion_reused_workbuf);
10262   Vcode_conversion_reused_workbuf = Qnil;
10263
10264   staticpro (&Vcode_conversion_workbuf_name);
10265   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10266
10267   reused_workbuf_in_use = 0;
10268
10269   DEFSYM (Qcharset, "charset");
10270   DEFSYM (Qtarget_idx, "target-idx");
10271   DEFSYM (Qcoding_system_history, "coding-system-history");
10272   Fset (Qcoding_system_history, Qnil);
10273
10274   /* Target FILENAME is the first argument.  */
10275   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10276   /* Target FILENAME is the third argument.  */
10277   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10278
10279   DEFSYM (Qcall_process, "call-process");
10280   /* Target PROGRAM is the first argument.  */
10281   Fput (Qcall_process, Qtarget_idx, make_number (0));
10282
10283   DEFSYM (Qcall_process_region, "call-process-region");
10284   /* Target PROGRAM is the third argument.  */
10285   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10286
10287   DEFSYM (Qstart_process, "start-process");
10288   /* Target PROGRAM is the third argument.  */
10289   Fput (Qstart_process, Qtarget_idx, make_number (2));
10290
10291   DEFSYM (Qopen_network_stream, "open-network-stream");
10292   /* Target SERVICE is the fourth argument.  */
10293   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10294
10295   DEFSYM (Qcoding_system, "coding-system");
10296   DEFSYM (Qcoding_aliases, "coding-aliases");
10297
10298   DEFSYM (Qeol_type, "eol-type");
10299   DEFSYM (Qunix, "unix");
10300   DEFSYM (Qdos, "dos");
10301   DEFSYM (Qmac, "mac");
10302
10303   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10304   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10305   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10306   DEFSYM (Qdefault_char, "default-char");
10307   DEFSYM (Qundecided, "undecided");
10308   DEFSYM (Qno_conversion, "no-conversion");
10309   DEFSYM (Qraw_text, "raw-text");
10310
10311   DEFSYM (Qiso_2022, "iso-2022");
10312
10313   DEFSYM (Qutf_8, "utf-8");
10314   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10315
10316 #if defined (WINDOWSNT) || defined (CYGWIN)
10317   /* No, not utf-16-le: that one has a BOM.  */
10318   DEFSYM (Qutf_16le, "utf-16le");
10319 #endif
10320
10321   DEFSYM (Qutf_16, "utf-16");
10322   DEFSYM (Qbig, "big");
10323   DEFSYM (Qlittle, "little");
10324
10325   DEFSYM (Qshift_jis, "shift-jis");
10326   DEFSYM (Qbig5, "big5");
10327
10328   DEFSYM (Qcoding_system_p, "coding-system-p");
10329
10330   DEFSYM (Qcoding_system_error, "coding-system-error");
10331   Fput (Qcoding_system_error, Qerror_conditions,
10332         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10333   Fput (Qcoding_system_error, Qerror_message,
10334         build_pure_c_string ("Invalid coding system"));
10335
10336   /* Intern this now in case it isn't already done.
10337      Setting this variable twice is harmless.
10338      But don't staticpro it here--that is done in alloc.c.  */
10339   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10340
10341   DEFSYM (Qtranslation_table, "translation-table");
10342   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10343   DEFSYM (Qtranslation_table_id, "translation-table-id");
10344   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10345   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10346
10347   DEFSYM (Qvalid_codes, "valid-codes");
10348
10349   DEFSYM (Qemacs_mule, "emacs-mule");
10350
10351   DEFSYM (QCcategory, ":category");
10352   DEFSYM (QCmnemonic, ":mnemonic");
10353   DEFSYM (QCdefault_char, ":default-char");
10354   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10355   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10356   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10357   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10358   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10359
10360   Vcoding_category_table
10361     = Fmake_vector (make_number (coding_category_max), Qnil);
10362   staticpro (&Vcoding_category_table);
10363   /* Followings are target of code detection.  */
10364   ASET (Vcoding_category_table, coding_category_iso_7,
10365         intern_c_string ("coding-category-iso-7"));
10366   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10367         intern_c_string ("coding-category-iso-7-tight"));
10368   ASET (Vcoding_category_table, coding_category_iso_8_1,
10369         intern_c_string ("coding-category-iso-8-1"));
10370   ASET (Vcoding_category_table, coding_category_iso_8_2,
10371         intern_c_string ("coding-category-iso-8-2"));
10372   ASET (Vcoding_category_table, coding_category_iso_7_else,
10373         intern_c_string ("coding-category-iso-7-else"));
10374   ASET (Vcoding_category_table, coding_category_iso_8_else,
10375         intern_c_string ("coding-category-iso-8-else"));
10376   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10377         intern_c_string ("coding-category-utf-8-auto"));
10378   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10379         intern_c_string ("coding-category-utf-8"));
10380   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10381         intern_c_string ("coding-category-utf-8-sig"));
10382   ASET (Vcoding_category_table, coding_category_utf_16_be,
10383         intern_c_string ("coding-category-utf-16-be"));
10384   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10385         intern_c_string ("coding-category-utf-16-auto"));
10386   ASET (Vcoding_category_table, coding_category_utf_16_le,
10387         intern_c_string ("coding-category-utf-16-le"));
10388   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10389         intern_c_string ("coding-category-utf-16-be-nosig"));
10390   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10391         intern_c_string ("coding-category-utf-16-le-nosig"));
10392   ASET (Vcoding_category_table, coding_category_charset,
10393         intern_c_string ("coding-category-charset"));
10394   ASET (Vcoding_category_table, coding_category_sjis,
10395         intern_c_string ("coding-category-sjis"));
10396   ASET (Vcoding_category_table, coding_category_big5,
10397         intern_c_string ("coding-category-big5"));
10398   ASET (Vcoding_category_table, coding_category_ccl,
10399         intern_c_string ("coding-category-ccl"));
10400   ASET (Vcoding_category_table, coding_category_emacs_mule,
10401         intern_c_string ("coding-category-emacs-mule"));
10402   /* Followings are NOT target of code detection.  */
10403   ASET (Vcoding_category_table, coding_category_raw_text,
10404         intern_c_string ("coding-category-raw-text"));
10405   ASET (Vcoding_category_table, coding_category_undecided,
10406         intern_c_string ("coding-category-undecided"));
10407
10408   DEFSYM (Qinsufficient_source, "insufficient-source");
10409   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10410   DEFSYM (Qinvalid_source, "invalid-source");
10411   DEFSYM (Qinterrupted, "interrupted");
10412   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10413   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10414
10415   defsubr (&Scoding_system_p);
10416   defsubr (&Sread_coding_system);
10417   defsubr (&Sread_non_nil_coding_system);
10418   defsubr (&Scheck_coding_system);
10419   defsubr (&Sdetect_coding_region);
10420   defsubr (&Sdetect_coding_string);
10421   defsubr (&Sfind_coding_systems_region_internal);
10422   defsubr (&Sunencodable_char_position);
10423   defsubr (&Scheck_coding_systems_region);
10424   defsubr (&Sdecode_coding_region);
10425   defsubr (&Sencode_coding_region);
10426   defsubr (&Sdecode_coding_string);
10427   defsubr (&Sencode_coding_string);
10428   defsubr (&Sdecode_sjis_char);
10429   defsubr (&Sencode_sjis_char);
10430   defsubr (&Sdecode_big5_char);
10431   defsubr (&Sencode_big5_char);
10432   defsubr (&Sset_terminal_coding_system_internal);
10433   defsubr (&Sset_safe_terminal_coding_system_internal);
10434   defsubr (&Sterminal_coding_system);
10435   defsubr (&Sset_keyboard_coding_system_internal);
10436   defsubr (&Skeyboard_coding_system);
10437   defsubr (&Sfind_operation_coding_system);
10438   defsubr (&Sset_coding_system_priority);
10439   defsubr (&Sdefine_coding_system_internal);
10440   defsubr (&Sdefine_coding_system_alias);
10441   defsubr (&Scoding_system_put);
10442   defsubr (&Scoding_system_base);
10443   defsubr (&Scoding_system_plist);
10444   defsubr (&Scoding_system_aliases);
10445   defsubr (&Scoding_system_eol_type);
10446   defsubr (&Scoding_system_priority_list);
10447
10448   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10449                doc: /* List of coding systems.
10450
10451 Do not alter the value of this variable manually.  This variable should be
10452 updated by the functions `define-coding-system' and
10453 `define-coding-system-alias'.  */);
10454   Vcoding_system_list = Qnil;
10455
10456   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10457                doc: /* Alist of coding system names.
10458 Each element is a one-element list containing a coding system name.
10459 This variable is given to `completing-read' as COLLECTION argument.
10460
10461 Do not alter the value of this variable manually.  This variable should be
10462 updated by the functions `make-coding-system' and
10463 `define-coding-system-alias'.  */);
10464   Vcoding_system_alist = Qnil;
10465
10466   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10467                doc: /* List of coding-categories (symbols) ordered by priority.
10468
10469 On detecting a coding system, Emacs tries code detection algorithms
10470 associated with each coding-category one by one in this order.  When
10471 one algorithm agrees with a byte sequence of source text, the coding
10472 system bound to the corresponding coding-category is selected.
10473
10474 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10475   {
10476     int i;
10477
10478     Vcoding_category_list = Qnil;
10479     for (i = coding_category_max - 1; i >= 0; i--)
10480       Vcoding_category_list
10481         = Fcons (AREF (Vcoding_category_table, i),
10482                  Vcoding_category_list);
10483   }
10484
10485   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10486                doc: /* Specify the coding system for read operations.
10487 It is useful to bind this variable with `let', but do not set it globally.
10488 If the value is a coding system, it is used for decoding on read operation.
10489 If not, an appropriate element is used from one of the coding system alists.
10490 There are three such tables: `file-coding-system-alist',
10491 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10492   Vcoding_system_for_read = Qnil;
10493
10494   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10495                doc: /* Specify the coding system for write operations.
10496 Programs bind this variable with `let', but you should not set it globally.
10497 If the value is a coding system, it is used for encoding of output,
10498 when writing it to a file and when sending it to a file or subprocess.
10499
10500 If this does not specify a coding system, an appropriate element
10501 is used from one of the coding system alists.
10502 There are three such tables: `file-coding-system-alist',
10503 `process-coding-system-alist', and `network-coding-system-alist'.
10504 For output to files, if the above procedure does not specify a coding system,
10505 the value of `buffer-file-coding-system' is used.  */);
10506   Vcoding_system_for_write = Qnil;
10507
10508   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10509                doc: /*
10510 Coding system used in the latest file or process I/O.  */);
10511   Vlast_coding_system_used = Qnil;
10512
10513   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10514                doc: /*
10515 Error status of the last code conversion.
10516
10517 When an error was detected in the last code conversion, this variable
10518 is set to one of the following symbols.
10519   `insufficient-source'
10520   `inconsistent-eol'
10521   `invalid-source'
10522   `interrupted'
10523   `insufficient-memory'
10524 When no error was detected, the value doesn't change.  So, to check
10525 the error status of a code conversion by this variable, you must
10526 explicitly set this variable to nil before performing code
10527 conversion.  */);
10528   Vlast_code_conversion_error = Qnil;
10529
10530   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10531                doc: /*
10532 *Non-nil means always inhibit code conversion of end-of-line format.
10533 See info node `Coding Systems' and info node `Text and Binary' concerning
10534 such conversion.  */);
10535   inhibit_eol_conversion = 0;
10536
10537   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10538                doc: /*
10539 Non-nil means process buffer inherits coding system of process output.
10540 Bind it to t if the process output is to be treated as if it were a file
10541 read from some filesystem.  */);
10542   inherit_process_coding_system = 0;
10543
10544   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10545                doc: /*
10546 Alist to decide a coding system to use for a file I/O operation.
10547 The format is ((PATTERN . VAL) ...),
10548 where PATTERN is a regular expression matching a file name,
10549 VAL is a coding system, a cons of coding systems, or a function symbol.
10550 If VAL is a coding system, it is used for both decoding and encoding
10551 the file contents.
10552 If VAL is a cons of coding systems, the car part is used for decoding,
10553 and the cdr part is used for encoding.
10554 If VAL is a function symbol, the function must return a coding system
10555 or a cons of coding systems which are used as above.  The function is
10556 called with an argument that is a list of the arguments with which
10557 `find-operation-coding-system' was called.  If the function can't decide
10558 a coding system, it can return `undecided' so that the normal
10559 code-detection is performed.
10560
10561 See also the function `find-operation-coding-system'
10562 and the variable `auto-coding-alist'.  */);
10563   Vfile_coding_system_alist = Qnil;
10564
10565   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10566                doc: /*
10567 Alist to decide a coding system to use for a process I/O operation.
10568 The format is ((PATTERN . VAL) ...),
10569 where PATTERN is a regular expression matching a program name,
10570 VAL is a coding system, a cons of coding systems, or a function symbol.
10571 If VAL is a coding system, it is used for both decoding what is received
10572 from the program and encoding what is sent to the program.
10573 If VAL is a cons of coding systems, the car part is used for decoding,
10574 and the cdr part is used for encoding.
10575 If VAL is a function symbol, the function must return a coding system
10576 or a cons of coding systems which are used as above.
10577
10578 See also the function `find-operation-coding-system'.  */);
10579   Vprocess_coding_system_alist = Qnil;
10580
10581   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10582                doc: /*
10583 Alist to decide a coding system to use for a network I/O operation.
10584 The format is ((PATTERN . VAL) ...),
10585 where PATTERN is a regular expression matching a network service name
10586 or is a port number to connect to,
10587 VAL is a coding system, a cons of coding systems, or a function symbol.
10588 If VAL is a coding system, it is used for both decoding what is received
10589 from the network stream and encoding what is sent to the network stream.
10590 If VAL is a cons of coding systems, the car part is used for decoding,
10591 and the cdr part is used for encoding.
10592 If VAL is a function symbol, the function must return a coding system
10593 or a cons of coding systems which are used as above.
10594
10595 See also the function `find-operation-coding-system'.  */);
10596   Vnetwork_coding_system_alist = Qnil;
10597
10598   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10599                doc: /* Coding system to use with system messages.
10600 Also used for decoding keyboard input on X Window system.  */);
10601   Vlocale_coding_system = Qnil;
10602
10603   /* The eol mnemonics are reset in startup.el system-dependently.  */
10604   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10605                doc: /*
10606 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10607   eol_mnemonic_unix = build_pure_c_string (":");
10608
10609   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10610                doc: /*
10611 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10612   eol_mnemonic_dos = build_pure_c_string ("\\");
10613
10614   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10615                doc: /*
10616 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10617   eol_mnemonic_mac = build_pure_c_string ("/");
10618
10619   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10620                doc: /*
10621 *String displayed in mode line when end-of-line format is not yet determined.  */);
10622   eol_mnemonic_undecided = build_pure_c_string (":");
10623
10624   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10625                doc: /*
10626 *Non-nil enables character translation while encoding and decoding.  */);
10627   Venable_character_translation = Qt;
10628
10629   DEFVAR_LISP ("standard-translation-table-for-decode",
10630                Vstandard_translation_table_for_decode,
10631                doc: /* Table for translating characters while decoding.  */);
10632   Vstandard_translation_table_for_decode = Qnil;
10633
10634   DEFVAR_LISP ("standard-translation-table-for-encode",
10635                Vstandard_translation_table_for_encode,
10636                doc: /* Table for translating characters while encoding.  */);
10637   Vstandard_translation_table_for_encode = Qnil;
10638
10639   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10640                doc: /* Alist of charsets vs revision numbers.
10641 While encoding, if a charset (car part of an element) is found,
10642 designate it with the escape sequence identifying revision (cdr part
10643 of the element).  */);
10644   Vcharset_revision_table = Qnil;
10645
10646   DEFVAR_LISP ("default-process-coding-system",
10647                Vdefault_process_coding_system,
10648                doc: /* Cons of coding systems used for process I/O by default.
10649 The car part is used for decoding a process output,
10650 the cdr part is used for encoding a text to be sent to a process.  */);
10651   Vdefault_process_coding_system = Qnil;
10652
10653   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10654                doc: /*
10655 Table of extra Latin codes in the range 128..159 (inclusive).
10656 This is a vector of length 256.
10657 If Nth element is non-nil, the existence of code N in a file
10658 \(or output of subprocess) doesn't prevent it from being detected as
10659 a coding system of ISO 2022 variant which has a flag
10660 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10661 or reading output of a subprocess.
10662 Only 128th through 159th elements have a meaning.  */);
10663   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10664
10665   DEFVAR_LISP ("select-safe-coding-system-function",
10666                Vselect_safe_coding_system_function,
10667                doc: /*
10668 Function to call to select safe coding system for encoding a text.
10669
10670 If set, this function is called to force a user to select a proper
10671 coding system which can encode the text in the case that a default
10672 coding system used in each operation can't encode the text.  The
10673 function should take care that the buffer is not modified while
10674 the coding system is being selected.
10675
10676 The default value is `select-safe-coding-system' (which see).  */);
10677   Vselect_safe_coding_system_function = Qnil;
10678
10679   DEFVAR_BOOL ("coding-system-require-warning",
10680                coding_system_require_warning,
10681                doc: /* Internal use only.
10682 If non-nil, on writing a file, `select-safe-coding-system-function' is
10683 called even if `coding-system-for-write' is non-nil.  The command
10684 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10685   coding_system_require_warning = 0;
10686
10687
10688   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10689                inhibit_iso_escape_detection,
10690                doc: /*
10691 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10692
10693 When Emacs reads text, it tries to detect how the text is encoded.
10694 This code detection is sensitive to escape sequences.  If Emacs sees
10695 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10696 of the ISO2022 encodings, and decodes text by the corresponding coding
10697 system (e.g. `iso-2022-7bit').
10698
10699 However, there may be a case that you want to read escape sequences in
10700 a file as is.  In such a case, you can set this variable to non-nil.
10701 Then the code detection will ignore any escape sequences, and no text is
10702 detected as encoded in some ISO-2022 encoding.  The result is that all
10703 escape sequences become visible in a buffer.
10704
10705 The default value is nil, and it is strongly recommended not to change
10706 it.  That is because many Emacs Lisp source files that contain
10707 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10708 in Emacs's distribution, and they won't be decoded correctly on
10709 reading if you suppress escape sequence detection.
10710
10711 The other way to read escape sequences in a file without decoding is
10712 to explicitly specify some coding system that doesn't use ISO-2022
10713 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument].  */);
10714   inhibit_iso_escape_detection = 0;
10715
10716   DEFVAR_BOOL ("inhibit-null-byte-detection",
10717                inhibit_null_byte_detection,
10718                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10719 By default, Emacs treats it as binary data, and does not attempt to
10720 decode it.  The effect is as if you specified `no-conversion' for
10721 reading that text.
10722
10723 Set this to non-nil when a regular text happens to include null bytes.
10724 Examples are Index nodes of Info files and null-byte delimited output
10725 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10726 decode text as usual.  */);
10727   inhibit_null_byte_detection = 0;
10728
10729   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10730                doc: /* Char table for translating self-inserting characters.
10731 This is applied to the result of input methods, not their input.
10732 See also `keyboard-translate-table'.
10733
10734 Use of this variable for character code unification was rendered
10735 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10736 internal character representation.  */);
10737     Vtranslation_table_for_input = Qnil;
10738
10739   {
10740     Lisp_Object args[coding_arg_max];
10741     Lisp_Object plist[16];
10742     int i;
10743
10744     for (i = 0; i < coding_arg_max; i++)
10745       args[i] = Qnil;
10746
10747     plist[0] = intern_c_string (":name");
10748     plist[1] = args[coding_arg_name] = Qno_conversion;
10749     plist[2] = intern_c_string (":mnemonic");
10750     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10751     plist[4] = intern_c_string (":coding-type");
10752     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10753     plist[6] = intern_c_string (":ascii-compatible-p");
10754     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10755     plist[8] = intern_c_string (":default-char");
10756     plist[9] = args[coding_arg_default_char] = make_number (0);
10757     plist[10] = intern_c_string (":for-unibyte");
10758     plist[11] = args[coding_arg_for_unibyte] = Qt;
10759     plist[12] = intern_c_string (":docstring");
10760     plist[13] = build_pure_c_string ("Do no conversion.\n\
10761 \n\
10762 When you visit a file with this coding, the file is read into a\n\
10763 unibyte buffer as is, thus each byte of a file is treated as a\n\
10764 character.");
10765     plist[14] = intern_c_string (":eol-type");
10766     plist[15] = args[coding_arg_eol_type] = Qunix;
10767     args[coding_arg_plist] = Flist (16, plist);
10768     Fdefine_coding_system_internal (coding_arg_max, args);
10769
10770     plist[1] = args[coding_arg_name] = Qundecided;
10771     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10772     plist[5] = args[coding_arg_coding_type] = Qundecided;
10773     /* This is already set.
10774        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10775     plist[8] = intern_c_string (":charset-list");
10776     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10777     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10778     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10779     plist[15] = args[coding_arg_eol_type] = Qnil;
10780     args[coding_arg_plist] = Flist (16, plist);
10781     Fdefine_coding_system_internal (coding_arg_max, args);
10782   }
10783
10784   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10785
10786   {
10787     int i;
10788
10789     for (i = 0; i < coding_category_max; i++)
10790       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10791   }
10792 #if defined (DOS_NT)
10793   system_eol_type = Qdos;
10794 #else
10795   system_eol_type = Qunix;
10796 #endif
10797   staticpro (&system_eol_type);
10798 }
10799
10800 char *
10801 emacs_strerror (int error_number)
10802 {
10803   char *str;
10804
10805   synchronize_system_messages_locale ();
10806   str = strerror (error_number);
10807
10808   if (! NILP (Vlocale_coding_system))
10809     {
10810       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10811                                                       Vlocale_coding_system,
10812                                                       0);
10813       str = SSDATA (dec);
10814     }
10815
10816   return str;
10817 }
10818
10819 #endif /* emacs */