src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2014 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce a encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding system emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* Coding-systems are handed between Emacs Lisp programs and C internal
 355    routines by the following three variables.  */
 356 /* Coding system to be used to encode text for terminal display when
 357    terminal coding system is nil.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372
 373 #define CODING_ISO_INITIAL(coding, reg)                 \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 375                      coding_attr_iso_initial),          \
 376                reg)))
 377
 378
 379 #define CODING_ISO_REQUEST(coding, charset_id)          \
 380   (((charset_id) <= (coding)->max_charset_id            \
 381     ? ((coding)->safe_charsets[charset_id] != 255       \
 382        ? (coding)->safe_charsets[charset_id]            \
 383        : -1)                                            \
 384     : -1))
 385
 386
 387 #define CODING_ISO_FLAGS(coding)        \
 388   ((coding)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(coding, reg)     \
 390   ((coding)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(coding, plane)    \
 392   ((coding)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 394   ((coding)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(coding)  \
 396   ((coding)->spec.iso_2022.bol)
 397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 398   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 399 #define CODING_ISO_CMP_STATUS(coding)   \
 400   (&(coding)->spec.iso_2022.cmp_status)
 401 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 402   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 403 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 404   ((coding)->spec.iso_2022.embedded_utf_8)
 405
 406 /* Control characters of ISO2022.  */
 407                         /* code */      /* function */
 408 #define ISO_CODE_SO     0x0E            /* shift-out */
 409 #define ISO_CODE_SI     0x0F            /* shift-in */
 410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 411 #define ISO_CODE_ESC    0x1B            /* escape */
 412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415
 416 /* All code (1-byte) of ISO2022 is classified into one of the
 417    followings.  */
 418 enum iso_code_class_type
 419   {
 420     ISO_control_0,              /* Control codes in the range
 421                                    0x00..0x1F and 0x7F, except for the
 422                                    following 5 codes.  */
 423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 427     ISO_control_1,              /* Control codes in the range
 428                                    0x80..0x9F, except for the
 429                                    following 3 codes.  */
 430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 437   };
 438
 439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 440     `iso-flags' attribute of an iso2022 coding system.  */
 441
 442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 443    instead of the correct short-form sequence (e.g. ESC $ A).  */
 444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 445
 446 /* If set, reset graphic planes and registers at end-of-line to the
 447    initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 449
 450 /* If set, reset graphic planes and registers before any control
 451    characters to the initial state.  */
 452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 453
 454 /* If set, encode by 7-bit environment.  */
 455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 456
 457 /* If set, use locking-shift function.  */
 458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 459
 460 /* If set, use single-shift function.  Overwrite
 461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 463
 464 /* If set, use designation escape sequence.  */
 465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 466
 467 /* If set, produce revision number sequence.  */
 468 #define CODING_ISO_FLAG_REVISION        0x0080
 469
 470 /* If set, produce ISO6429's direction specifying sequence.  */
 471 #define CODING_ISO_FLAG_DIRECTION       0x0100
 472
 473 /* If set, assume designation states are reset at beginning of line on
 474    output.  */
 475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 476
 477 /* If set, designation sequence should be placed at beginning of line
 478    on output.  */
 479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 480
 481 /* If set, do not encode unsafe characters on output.  */
 482 #define CODING_ISO_FLAG_SAFE            0x0800
 483
 484 /* If set, extra latin codes (128..159) are accepted as a valid code
 485    on input.  */
 486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 487
 488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 489
 490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 491
 492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 493
 494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 495
 496 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 497
 498 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 499
 500 /* A character to be produced on output if encoding of the original
 501    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 502 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 503
 504 /* UTF-8 section */
 505 #define CODING_UTF_8_BOM(coding)        \
 506   ((coding)->spec.utf_8_bom)
 507
 508 /* UTF-16 section */
 509 #define CODING_UTF_16_BOM(coding)       \
 510   ((coding)->spec.utf_16.bom)
 511
 512 #define CODING_UTF_16_ENDIAN(coding)    \
 513   ((coding)->spec.utf_16.endian)
 514
 515 #define CODING_UTF_16_SURROGATE(coding) \
 516   ((coding)->spec.utf_16.surrogate)
 517
 518
 519 /* CCL section */
 520 #define CODING_CCL_DECODER(coding)      \
 521   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 522 #define CODING_CCL_ENCODER(coding)      \
 523   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 524 #define CODING_CCL_VALIDS(coding)                                          \
 525   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 526
 527 /* Index for each coding category in `coding_categories' */
 528
 529 enum coding_category
 530   {
 531     coding_category_iso_7,
 532     coding_category_iso_7_tight,
 533     coding_category_iso_8_1,
 534     coding_category_iso_8_2,
 535     coding_category_iso_7_else,
 536     coding_category_iso_8_else,
 537     coding_category_utf_8_auto,
 538     coding_category_utf_8_nosig,
 539     coding_category_utf_8_sig,
 540     coding_category_utf_16_auto,
 541     coding_category_utf_16_be,
 542     coding_category_utf_16_le,
 543     coding_category_utf_16_be_nosig,
 544     coding_category_utf_16_le_nosig,
 545     coding_category_charset,
 546     coding_category_sjis,
 547     coding_category_big5,
 548     coding_category_ccl,
 549     coding_category_emacs_mule,
 550     /* All above are targets of code detection.  */
 551     coding_category_raw_text,
 552     coding_category_undecided,
 553     coding_category_max
 554   };
 555
 556 /* Definitions of flag bits used in detect_coding_XXXX.  */
 557 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 558 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 559 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 560 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 561 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 562 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 563 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 564 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 565 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 566 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 567 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 568 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 569 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 570 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 571 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 572 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 573 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 574 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 575 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 576 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 577
 578 /* This value is returned if detect_coding_mask () find nothing other
 579    than ASCII characters.  */
 580 #define CATEGORY_MASK_ANY               \
 581   (CATEGORY_MASK_ISO_7                  \
 582    | CATEGORY_MASK_ISO_7_TIGHT          \
 583    | CATEGORY_MASK_ISO_8_1              \
 584    | CATEGORY_MASK_ISO_8_2              \
 585    | CATEGORY_MASK_ISO_7_ELSE           \
 586    | CATEGORY_MASK_ISO_8_ELSE           \
 587    | CATEGORY_MASK_UTF_8_AUTO           \
 588    | CATEGORY_MASK_UTF_8_NOSIG          \
 589    | CATEGORY_MASK_UTF_8_SIG            \
 590    | CATEGORY_MASK_UTF_16_AUTO          \
 591    | CATEGORY_MASK_UTF_16_BE            \
 592    | CATEGORY_MASK_UTF_16_LE            \
 593    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 594    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 595    | CATEGORY_MASK_CHARSET              \
 596    | CATEGORY_MASK_SJIS                 \
 597    | CATEGORY_MASK_BIG5                 \
 598    | CATEGORY_MASK_CCL                  \
 599    | CATEGORY_MASK_EMACS_MULE)
 600
 601
 602 #define CATEGORY_MASK_ISO_7BIT \
 603   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 604
 605 #define CATEGORY_MASK_ISO_8BIT \
 606   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 607
 608 #define CATEGORY_MASK_ISO_ELSE \
 609   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 610
 611 #define CATEGORY_MASK_ISO_ESCAPE        \
 612   (CATEGORY_MASK_ISO_7                  \
 613    | CATEGORY_MASK_ISO_7_TIGHT          \
 614    | CATEGORY_MASK_ISO_7_ELSE           \
 615    | CATEGORY_MASK_ISO_8_ELSE)
 616
 617 #define CATEGORY_MASK_ISO       \
 618   (  CATEGORY_MASK_ISO_7BIT     \
 619      | CATEGORY_MASK_ISO_8BIT   \
 620      | CATEGORY_MASK_ISO_ELSE)
 621
 622 #define CATEGORY_MASK_UTF_16            \
 623   (CATEGORY_MASK_UTF_16_AUTO            \
 624    | CATEGORY_MASK_UTF_16_BE            \
 625    | CATEGORY_MASK_UTF_16_LE            \
 626    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 627    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 628
 629 #define CATEGORY_MASK_UTF_8     \
 630   (CATEGORY_MASK_UTF_8_AUTO     \
 631    | CATEGORY_MASK_UTF_8_NOSIG  \
 632    | CATEGORY_MASK_UTF_8_SIG)
 633
 634 /* Table of coding categories (Lisp symbols).  This variable is for
 635    internal use only.  */
 636 static Lisp_Object Vcoding_category_table;
 637
 638 /* Table of coding-categories ordered by priority.  */
 639 static enum coding_category coding_priorities[coding_category_max];
 640
 641 /* Nth element is a coding context for the coding system bound to the
 642    Nth coding category.  */
 643 static struct coding_system coding_categories[coding_category_max];
 644
 645 /*** Commonly used macros and functions ***/
 646
 647 #ifndef min
 648 #define min(a, b) ((a) < (b) ? (a) : (b))
 649 #endif
 650 #ifndef max
 651 #define max(a, b) ((a) > (b) ? (a) : (b))
 652 #endif
 653
 654 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 655
 656 static int
 657 encode_inhibit_flag (Lisp_Object flag)
 658 {
 659   return NILP (flag) ? -1 : EQ (flag, Qt);
 660 }
 661
 662 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 663    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 664
 665 static bool
 666 inhibit_flag (int encoded_flag, bool var)
 667 {
 668   return 0 < encoded_flag + var;
 669 }
 670
 671 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 672   do {                                                  \
 673     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 674     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 675   } while (0)
 676
 677 static void
 678 CHECK_NATNUM_CAR (Lisp_Object x)
 679 {
 680   Lisp_Object tmp = XCAR (x);
 681   CHECK_NATNUM (tmp);
 682   XSETCAR (x, tmp);
 683 }
 684
 685 static void
 686 CHECK_NATNUM_CDR (Lisp_Object x)
 687 {
 688   Lisp_Object tmp = XCDR (x);
 689   CHECK_NATNUM (tmp);
 690   XSETCDR (x, tmp);
 691 }
 692
 693
 694 /* Safely get one byte from the source text pointed by SRC which ends
 695    at SRC_END, and set C to that byte.  If there are not enough bytes
 696    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
 697    and a multibyte character is found at SRC, set C to the
 698    negative value of the character code.  The caller should declare
 699    and set these variables appropriately in advance:
 700         src, src_end, multibytep */
 701
 702 #define ONE_MORE_BYTE(c)                                \
 703   do {                                                  \
 704     if (src == src_end)                                 \
 705       {                                                 \
 706         if (src_base < src)                             \
 707           record_conversion_result                      \
 708             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 709         goto no_more_source;                            \
 710       }                                                 \
 711     c = *src++;                                         \
 712     if (multibytep && (c & 0x80))                       \
 713       {                                                 \
 714         if ((c & 0xFE) == 0xC0)                         \
 715           c = ((c & 1) << 6) | *src++;                  \
 716         else                                            \
 717           {                                             \
 718             src--;                                      \
 719             c = - string_char (src, &src, NULL);        \
 720             record_conversion_result                    \
 721               (coding, CODING_RESULT_INVALID_SRC);      \
 722           }                                             \
 723       }                                                 \
 724     consumed_chars++;                                   \
 725   } while (0)
 726
 727 /* Safely get two bytes from the source text pointed by SRC which ends
 728    at SRC_END, and set C1 and C2 to those bytes while skipping the
 729    heading multibyte characters.  If there are not enough bytes in the
 730    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 731    a multibyte character is found for C2, set C2 to the negative value
 732    of the character code.  The caller should declare and set these
 733    variables appropriately in advance:
 734         src, src_end, multibytep
 735    It is intended that this macro is used in detect_coding_utf_16.  */
 736
 737 #define TWO_MORE_BYTES(c1, c2)                          \
 738   do {                                                  \
 739     do {                                                \
 740       if (src == src_end)                               \
 741         goto no_more_source;                            \
 742       c1 = *src++;                                      \
 743       if (multibytep && (c1 & 0x80))                    \
 744         {                                               \
 745           if ((c1 & 0xFE) == 0xC0)                      \
 746             c1 = ((c1 & 1) << 6) | *src++;              \
 747           else                                          \
 748             {                                           \
 749               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 750               c1 = -1;                                  \
 751             }                                           \
 752         }                                               \
 753     } while (c1 < 0);                                   \
 754     if (src == src_end)                                 \
 755       goto no_more_source;                              \
 756     c2 = *src++;                                        \
 757     if (multibytep && (c2 & 0x80))                      \
 758       {                                                 \
 759         if ((c2 & 0xFE) == 0xC0)                        \
 760           c2 = ((c2 & 1) << 6) | *src++;                \
 761         else                                            \
 762           c2 = -1;                                      \
 763       }                                                 \
 764   } while (0)
 765
 766
 767 /* Store a byte C in the place pointed by DST and increment DST to the
 768    next free point, and increment PRODUCED_CHARS.  The caller should
 769    assure that C is 0..127, and declare and set the variable `dst'
 770    appropriately in advance.
 771 */
 772
 773
 774 #define EMIT_ONE_ASCII_BYTE(c)  \
 775   do {                          \
 776     produced_chars++;           \
 777     *dst++ = (c);               \
 778   } while (0)
 779
 780
 781 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 782
 783 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 784   do {                                  \
 785     produced_chars += 2;                \
 786     *dst++ = (c1), *dst++ = (c2);       \
 787   } while (0)
 788
 789
 790 /* Store a byte C in the place pointed by DST and increment DST to the
 791    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 792    store in an appropriate multibyte form.  The caller should
 793    declare and set the variables `dst' and `multibytep' appropriately
 794    in advance.  */
 795
 796 #define EMIT_ONE_BYTE(c)                \
 797   do {                                  \
 798     produced_chars++;                   \
 799     if (multibytep)                     \
 800       {                                 \
 801         unsigned ch = (c);              \
 802         if (ch >= 0x80)                 \
 803           ch = BYTE8_TO_CHAR (ch);      \
 804         CHAR_STRING_ADVANCE (ch, dst);  \
 805       }                                 \
 806     else                                \
 807       *dst++ = (c);                     \
 808   } while (0)
 809
 810
 811 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 812
 813 #define EMIT_TWO_BYTES(c1, c2)          \
 814   do {                                  \
 815     produced_chars += 2;                \
 816     if (multibytep)                     \
 817       {                                 \
 818         unsigned ch;                    \
 819                                         \
 820         ch = (c1);                      \
 821         if (ch >= 0x80)                 \
 822           ch = BYTE8_TO_CHAR (ch);      \
 823         CHAR_STRING_ADVANCE (ch, dst);  \
 824         ch = (c2);                      \
 825         if (ch >= 0x80)                 \
 826           ch = BYTE8_TO_CHAR (ch);      \
 827         CHAR_STRING_ADVANCE (ch, dst);  \
 828       }                                 \
 829     else                                \
 830       {                                 \
 831         *dst++ = (c1);                  \
 832         *dst++ = (c2);                  \
 833       }                                 \
 834   } while (0)
 835
 836
 837 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 838   do {                                  \
 839     EMIT_ONE_BYTE (c1);                 \
 840     EMIT_TWO_BYTES (c2, c3);            \
 841   } while (0)
 842
 843
 844 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 845   do {                                          \
 846     EMIT_TWO_BYTES (c1, c2);                    \
 847     EMIT_TWO_BYTES (c3, c4);                    \
 848   } while (0)
 849
 850
 851 static void
 852 record_conversion_result (struct coding_system *coding,
 853                           enum coding_result_code result)
 854 {
 855   coding->result = result;
 856   switch (result)
 857     {
 858     case CODING_RESULT_INSUFFICIENT_SRC:
 859       Vlast_code_conversion_error = Qinsufficient_source;
 860       break;
 861     case CODING_RESULT_INVALID_SRC:
 862       Vlast_code_conversion_error = Qinvalid_source;
 863       break;
 864     case CODING_RESULT_INTERRUPT:
 865       Vlast_code_conversion_error = Qinterrupted;
 866       break;
 867     case CODING_RESULT_INSUFFICIENT_DST:
 868       /* Don't record this error in Vlast_code_conversion_error
 869          because it happens just temporarily and is resolved when the
 870          whole conversion is finished.  */
 871       break;
 872     case CODING_RESULT_SUCCESS:
 873       break;
 874     default:
 875       Vlast_code_conversion_error = intern ("Unknown error");
 876     }
 877 }
 878
 879 /* These wrapper macros are used to preserve validity of pointers into
 880    buffer text across calls to decode_char, encode_char, etc, which
 881    could cause relocation of buffers if it loads a charset map,
 882    because loading a charset map allocates large structures.  */
 883
 884 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 885   do {                                                                       \
 886     ptrdiff_t offset;                                                        \
 887                                                                              \
 888     charset_map_loaded = 0;                                                  \
 889     c = DECODE_CHAR (charset, code);                                         \
 890     if (charset_map_loaded                                                   \
 891         && (offset = coding_change_source (coding)))                         \
 892       {                                                                      \
 893         src += offset;                                                       \
 894         src_base += offset;                                                  \
 895         src_end += offset;                                                   \
 896       }                                                                      \
 897   } while (0)
 898
 899 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 900   do {                                                                  \
 901     ptrdiff_t offset;                                                   \
 902                                                                         \
 903     charset_map_loaded = 0;                                             \
 904     code = ENCODE_CHAR (charset, c);                                    \
 905     if (charset_map_loaded                                              \
 906         && (offset = coding_change_destination (coding)))               \
 907       {                                                                 \
 908         dst += offset;                                                  \
 909         dst_end += offset;                                              \
 910       }                                                                 \
 911   } while (0)
 912
 913 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 914   do {                                                                  \
 915     ptrdiff_t offset;                                                   \
 916                                                                         \
 917     charset_map_loaded = 0;                                             \
 918     charset = char_charset (c, charset_list, code_return);              \
 919     if (charset_map_loaded                                              \
 920         && (offset = coding_change_destination (coding)))               \
 921       {                                                                 \
 922         dst += offset;                                                  \
 923         dst_end += offset;                                              \
 924       }                                                                 \
 925   } while (0)
 926
 927 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 928   do {                                                                  \
 929     ptrdiff_t offset;                                                   \
 930                                                                         \
 931     charset_map_loaded = 0;                                             \
 932     result = CHAR_CHARSET_P (c, charset);                               \
 933     if (charset_map_loaded                                              \
 934         && (offset = coding_change_destination (coding)))               \
 935       {                                                                 \
 936         dst += offset;                                                  \
 937         dst_end += offset;                                              \
 938       }                                                                 \
 939   } while (0)
 940
 941
 942 /* If there are at least BYTES length of room at dst, allocate memory
 943    for coding->destination and update dst and dst_end.  We don't have
 944    to take care of coding->source which will be relocated.  It is
 945    handled by calling coding_set_source in encode_coding.  */
 946
 947 #define ASSURE_DESTINATION(bytes)                               \
 948   do {                                                          \
 949     if (dst + (bytes) >= dst_end)                               \
 950       {                                                         \
 951         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 952                                                                 \
 953         dst = alloc_destination (coding, more_bytes, dst);      \
 954         dst_end = coding->destination + coding->dst_bytes;      \
 955       }                                                         \
 956   } while (0)
 957
 958
 959 /* Store multibyte form of the character C in P, and advance P to the
 960    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 961    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 962    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 963
 964 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 965
 966 /* Return the character code of character whose multibyte form is at
 967    P, and advance P to the end of the multibyte form.  This used to be
 968    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 969    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 970
 971 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 972
 973 /* Set coding->source from coding->src_object.  */
 974
 975 static void
 976 coding_set_source (struct coding_system *coding)
 977 {
 978   if (BUFFERP (coding->src_object))
 979     {
 980       struct buffer *buf = XBUFFER (coding->src_object);
 981
 982       if (coding->src_pos < 0)
 983         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 984       else
 985         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 986     }
 987   else if (STRINGP (coding->src_object))
 988     {
 989       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 990     }
 991   else
 992     {
 993       /* Otherwise, the source is C string and is never relocated
 994          automatically.  Thus we don't have to update anything.  */
 995     }
 996 }
 997
 998
 999 /* Set coding->source from coding->src_object, and return how many
1000    bytes coding->source was changed.  */
1001
1002 static ptrdiff_t
1003 coding_change_source (struct coding_system *coding)
1004 {
1005   const unsigned char *orig = coding->source;
1006   coding_set_source (coding);
1007   return coding->source - orig;
1008 }
1009
1010
1011 /* Set coding->destination from coding->dst_object.  */
1012
1013 static void
1014 coding_set_destination (struct coding_system *coding)
1015 {
1016   if (BUFFERP (coding->dst_object))
1017     {
1018       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1019         {
1020           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1021           coding->dst_bytes = (GAP_END_ADDR
1022                                - (coding->src_bytes - coding->consumed)
1023                                - coding->destination);
1024         }
1025       else
1026         {
1027           /* We are sure that coding->dst_pos_byte is before the gap
1028              of the buffer. */
1029           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1030                                  + coding->dst_pos_byte - BEG_BYTE);
1031           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1032                                - coding->destination);
1033         }
1034     }
1035   else
1036     {
1037       /* Otherwise, the destination is C string and is never relocated
1038          automatically.  Thus we don't have to update anything.  */
1039     }
1040 }
1041
1042
1043 /* Set coding->destination from coding->dst_object, and return how
1044    many bytes coding->destination was changed.  */
1045
1046 static ptrdiff_t
1047 coding_change_destination (struct coding_system *coding)
1048 {
1049   const unsigned char *orig = coding->destination;
1050   coding_set_destination (coding);
1051   return coding->destination - orig;
1052 }
1053
1054
1055 static void
1056 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1057 {
1058   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1059     string_overflow ();
1060   coding->destination = xrealloc (coding->destination,
1061                                   coding->dst_bytes + bytes);
1062   coding->dst_bytes += bytes;
1063 }
1064
1065 static void
1066 coding_alloc_by_making_gap (struct coding_system *coding,
1067                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1068 {
1069   if (EQ (coding->src_object, coding->dst_object))
1070     {
1071       /* The gap may contain the produced data at the head and not-yet
1072          consumed data at the tail.  To preserve those data, we at
1073          first make the gap size to zero, then increase the gap
1074          size.  */
1075       ptrdiff_t add = GAP_SIZE;
1076
1077       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1078       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1079       make_gap (bytes);
1080       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1081       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1082     }
1083   else
1084     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1085 }
1086
1087
1088 static unsigned char *
1089 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1090                    unsigned char *dst)
1091 {
1092   ptrdiff_t offset = dst - coding->destination;
1093
1094   if (BUFFERP (coding->dst_object))
1095     {
1096       struct buffer *buf = XBUFFER (coding->dst_object);
1097
1098       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1099     }
1100   else
1101     coding_alloc_by_realloc (coding, nbytes);
1102   coding_set_destination (coding);
1103   dst = coding->destination + offset;
1104   return dst;
1105 }
1106
1107 /** Macros for annotations.  */
1108
1109 /* An annotation data is stored in the array coding->charbuf in this
1110    format:
1111      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1112    LENGTH is the number of elements in the annotation.
1113    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1114    NCHARS is the number of characters in the text annotated.
1115
1116    The format of the following elements depend on ANNOTATION_MASK.
1117
1118    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1119    follows:
1120      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1121
1122    NBYTES is the number of bytes specified in the header part of
1123    old-style emacs-mule encoding, or 0 for the other kind of
1124    composition.
1125
1126    METHOD is one of enum composition_method.
1127
1128    Optional COMPOSITION-COMPONENTS are characters and composition
1129    rules.
1130
1131    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1132    follows.
1133
1134    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1135    recover from an invalid annotation, and should be skipped by
1136    produce_annotation.  */
1137
1138 /* Maximum length of the header of annotation data.  */
1139 #define MAX_ANNOTATION_LENGTH 5
1140
1141 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1142   do {                                                  \
1143     *(buf)++ = -(len);                                  \
1144     *(buf)++ = (mask);                                  \
1145     *(buf)++ = (nchars);                                \
1146     coding->annotated = 1;                              \
1147   } while (0);
1148
1149 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1150   do {                                                                      \
1151     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1152     *buf++ = nbytes;                                                        \
1153     *buf++ = method;                                                        \
1154   } while (0)
1155
1156
1157 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1158   do {                                                                  \
1159     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1160     *buf++ = id;                                                        \
1161   } while (0)
1162
1163
1164 /* Bitmasks for coding->eol_seen.  */
1165
1166 #define EOL_SEEN_NONE   0
1167 #define EOL_SEEN_LF     1
1168 #define EOL_SEEN_CR     2
1169 #define EOL_SEEN_CRLF   4
1170
1171 \f
1172 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1173
1174
1175
1176 \f
1177 /*** 3. UTF-8 ***/
1178
1179 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1180    Return true if a text is encoded in UTF-8.  */
1181
1182 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1183 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1184 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1185 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1186 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1187 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1188
1189 #define UTF_8_BOM_1 0xEF
1190 #define UTF_8_BOM_2 0xBB
1191 #define UTF_8_BOM_3 0xBF
1192
1193 /* Unlike the other detect_coding_XXX, this function counts the number
1194    of characters and checks the EOL format.  */
1195
1196 static bool
1197 detect_coding_utf_8 (struct coding_system *coding,
1198                      struct coding_detection_info *detect_info)
1199 {
1200   const unsigned char *src = coding->source, *src_base;
1201   const unsigned char *src_end = coding->source + coding->src_bytes;
1202   bool multibytep = coding->src_multibyte;
1203   ptrdiff_t consumed_chars = 0;
1204   bool bom_found = 0;
1205   ptrdiff_t nchars = coding->head_ascii;
1206   int eol_seen = coding->eol_seen;
1207
1208   detect_info->checked |= CATEGORY_MASK_UTF_8;
1209   /* A coding system of this category is always ASCII compatible.  */
1210   src += nchars;
1211
1212   if (src == coding->source     /* BOM should be at the head.  */
1213       && src + 3 < src_end      /* BOM is 3-byte long.  */
1214       && src[0] == UTF_8_BOM_1
1215       && src[1] == UTF_8_BOM_2
1216       && src[2] == UTF_8_BOM_3)
1217     {
1218       bom_found = 1;
1219       src += 3;
1220       nchars++;
1221     }
1222
1223   while (1)
1224     {
1225       int c, c1, c2, c3, c4;
1226
1227       src_base = src;
1228       ONE_MORE_BYTE (c);
1229       if (c < 0 || UTF_8_1_OCTET_P (c))
1230         {
1231           nchars++;
1232           if (c == '\r')
1233             {
1234               if (src < src_end && *src == '\n')
1235                 {
1236                   eol_seen |= EOL_SEEN_CRLF;
1237                   src++;
1238                   nchars++;
1239                 }
1240               else
1241                 eol_seen |= EOL_SEEN_CR;
1242             }
1243           else if (c == '\n')
1244             eol_seen |= EOL_SEEN_LF;
1245           continue;
1246         }
1247       ONE_MORE_BYTE (c1);
1248       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1249         break;
1250       if (UTF_8_2_OCTET_LEADING_P (c))
1251         {
1252           nchars++;
1253           continue;
1254         }
1255       ONE_MORE_BYTE (c2);
1256       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1257         break;
1258       if (UTF_8_3_OCTET_LEADING_P (c))
1259         {
1260           nchars++;
1261           continue;
1262         }
1263       ONE_MORE_BYTE (c3);
1264       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1265         break;
1266       if (UTF_8_4_OCTET_LEADING_P (c))
1267         {
1268           nchars++;
1269           continue;
1270         }
1271       ONE_MORE_BYTE (c4);
1272       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1273         break;
1274       if (UTF_8_5_OCTET_LEADING_P (c))
1275         {
1276           nchars++;
1277           continue;
1278         }
1279       break;
1280     }
1281   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1282   return 0;
1283
1284  no_more_source:
1285   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1286     {
1287       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1288       return 0;
1289     }
1290   if (bom_found)
1291     {
1292       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1293       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1294     }
1295   else
1296     {
1297       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1298       if (nchars < src_end - coding->source)
1299         /* The found characters are less than source bytes, which
1300            means that we found a valid non-ASCII characters.  */
1301         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1302     }
1303   coding->detected_utf8_bytes = src_base - coding->source;
1304   coding->detected_utf8_chars = nchars;
1305   return 1;
1306 }
1307
1308
1309 static void
1310 decode_coding_utf_8 (struct coding_system *coding)
1311 {
1312   const unsigned char *src = coding->source + coding->consumed;
1313   const unsigned char *src_end = coding->source + coding->src_bytes;
1314   const unsigned char *src_base;
1315   int *charbuf = coding->charbuf + coding->charbuf_used;
1316   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1317   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1318   bool multibytep = coding->src_multibyte;
1319   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1320   bool eol_dos
1321     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1322   int byte_after_cr = -1;
1323
1324   if (bom != utf_without_bom)
1325     {
1326       int c1, c2, c3;
1327
1328       src_base = src;
1329       ONE_MORE_BYTE (c1);
1330       if (! UTF_8_3_OCTET_LEADING_P (c1))
1331         src = src_base;
1332       else
1333         {
1334           ONE_MORE_BYTE (c2);
1335           if (! UTF_8_EXTRA_OCTET_P (c2))
1336             src = src_base;
1337           else
1338             {
1339               ONE_MORE_BYTE (c3);
1340               if (! UTF_8_EXTRA_OCTET_P (c3))
1341                 src = src_base;
1342               else
1343                 {
1344                   if ((c1 != UTF_8_BOM_1)
1345                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1346                     src = src_base;
1347                   else
1348                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1349                 }
1350             }
1351         }
1352     }
1353   CODING_UTF_8_BOM (coding) = utf_without_bom;
1354
1355   while (1)
1356     {
1357       int c, c1, c2, c3, c4, c5;
1358
1359       src_base = src;
1360       consumed_chars_base = consumed_chars;
1361
1362       if (charbuf >= charbuf_end)
1363         {
1364           if (byte_after_cr >= 0)
1365             src_base--;
1366           break;
1367         }
1368
1369       /* In the simple case, rapidly handle ordinary characters */
1370       if (multibytep && ! eol_dos
1371           && charbuf < charbuf_end - 6 && src < src_end - 6)
1372         {
1373           while (charbuf < charbuf_end - 6 && src < src_end - 6)
1374             {
1375               c1 = *src;
1376               if (c1 & 0x80)
1377                 break;
1378               src++;
1379               consumed_chars++;
1380               *charbuf++ = c1;
1381
1382               c1 = *src;
1383               if (c1 & 0x80)
1384                 break;
1385               src++;
1386               consumed_chars++;
1387               *charbuf++ = c1;
1388
1389               c1 = *src;
1390               if (c1 & 0x80)
1391                 break;
1392               src++;
1393               consumed_chars++;
1394               *charbuf++ = c1;
1395
1396               c1 = *src;
1397               if (c1 & 0x80)
1398                 break;
1399               src++;
1400               consumed_chars++;
1401               *charbuf++ = c1;
1402             }
1403           /* If we handled at least one character, restart the main loop.  */
1404           if (src != src_base)
1405             continue;
1406         }
1407
1408       if (byte_after_cr >= 0)
1409         c1 = byte_after_cr, byte_after_cr = -1;
1410       else
1411         ONE_MORE_BYTE (c1);
1412       if (c1 < 0)
1413         {
1414           c = - c1;
1415         }
1416       else if (UTF_8_1_OCTET_P (c1))
1417         {
1418           if (eol_dos && c1 == '\r')
1419             ONE_MORE_BYTE (byte_after_cr);
1420           c = c1;
1421         }
1422       else
1423         {
1424           ONE_MORE_BYTE (c2);
1425           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1426             goto invalid_code;
1427           if (UTF_8_2_OCTET_LEADING_P (c1))
1428             {
1429               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1430               /* Reject overlong sequences here and below.  Encoders
1431                  producing them are incorrect, they can be misleading,
1432                  and they mess up read/write invariance.  */
1433               if (c < 128)
1434                 goto invalid_code;
1435             }
1436           else
1437             {
1438               ONE_MORE_BYTE (c3);
1439               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1440                 goto invalid_code;
1441               if (UTF_8_3_OCTET_LEADING_P (c1))
1442                 {
1443                   c = (((c1 & 0xF) << 12)
1444                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1445                   if (c < 0x800
1446                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1447                     goto invalid_code;
1448                 }
1449               else
1450                 {
1451                   ONE_MORE_BYTE (c4);
1452                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1453                     goto invalid_code;
1454                   if (UTF_8_4_OCTET_LEADING_P (c1))
1455                     {
1456                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1457                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1458                     if (c < 0x10000)
1459                       goto invalid_code;
1460                     }
1461                   else
1462                     {
1463                       ONE_MORE_BYTE (c5);
1464                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1465                         goto invalid_code;
1466                       if (UTF_8_5_OCTET_LEADING_P (c1))
1467                         {
1468                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1469                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1470                                | (c5 & 0x3F));
1471                           if ((c > MAX_CHAR) || (c < 0x200000))
1472                             goto invalid_code;
1473                         }
1474                       else
1475                         goto invalid_code;
1476                     }
1477                 }
1478             }
1479         }
1480
1481       *charbuf++ = c;
1482       continue;
1483
1484     invalid_code:
1485       src = src_base;
1486       consumed_chars = consumed_chars_base;
1487       ONE_MORE_BYTE (c);
1488       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1489       coding->errors++;
1490     }
1491
1492  no_more_source:
1493   coding->consumed_char += consumed_chars_base;
1494   coding->consumed = src_base - coding->source;
1495   coding->charbuf_used = charbuf - coding->charbuf;
1496 }
1497
1498
1499 static bool
1500 encode_coding_utf_8 (struct coding_system *coding)
1501 {
1502   bool multibytep = coding->dst_multibyte;
1503   int *charbuf = coding->charbuf;
1504   int *charbuf_end = charbuf + coding->charbuf_used;
1505   unsigned char *dst = coding->destination + coding->produced;
1506   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1507   ptrdiff_t produced_chars = 0;
1508   int c;
1509
1510   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1511     {
1512       ASSURE_DESTINATION (3);
1513       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1514       CODING_UTF_8_BOM (coding) = utf_without_bom;
1515     }
1516
1517   if (multibytep)
1518     {
1519       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1520
1521       while (charbuf < charbuf_end)
1522         {
1523           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1524
1525           ASSURE_DESTINATION (safe_room);
1526           c = *charbuf++;
1527           if (CHAR_BYTE8_P (c))
1528             {
1529               c = CHAR_TO_BYTE8 (c);
1530               EMIT_ONE_BYTE (c);
1531             }
1532           else
1533             {
1534               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1535               for (p = str; p < pend; p++)
1536                 EMIT_ONE_BYTE (*p);
1537             }
1538         }
1539     }
1540   else
1541     {
1542       int safe_room = MAX_MULTIBYTE_LENGTH;
1543
1544       while (charbuf < charbuf_end)
1545         {
1546           ASSURE_DESTINATION (safe_room);
1547           c = *charbuf++;
1548           if (CHAR_BYTE8_P (c))
1549             *dst++ = CHAR_TO_BYTE8 (c);
1550           else
1551             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1552         }
1553       produced_chars = dst - (coding->destination + coding->produced);
1554     }
1555   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1556   coding->produced_char += produced_chars;
1557   coding->produced = dst - coding->destination;
1558   return 0;
1559 }
1560
1561
1562 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1563    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1564
1565 #define UTF_16_HIGH_SURROGATE_P(val) \
1566   (((val) & 0xFC00) == 0xD800)
1567
1568 #define UTF_16_LOW_SURROGATE_P(val) \
1569   (((val) & 0xFC00) == 0xDC00)
1570
1571
1572 static bool
1573 detect_coding_utf_16 (struct coding_system *coding,
1574                       struct coding_detection_info *detect_info)
1575 {
1576   const unsigned char *src = coding->source;
1577   const unsigned char *src_end = coding->source + coding->src_bytes;
1578   bool multibytep = coding->src_multibyte;
1579   int c1, c2;
1580
1581   detect_info->checked |= CATEGORY_MASK_UTF_16;
1582   if (coding->mode & CODING_MODE_LAST_BLOCK
1583       && (coding->src_chars & 1))
1584     {
1585       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1586       return 0;
1587     }
1588
1589   TWO_MORE_BYTES (c1, c2);
1590   if ((c1 == 0xFF) && (c2 == 0xFE))
1591     {
1592       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1593                              | CATEGORY_MASK_UTF_16_AUTO);
1594       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1595                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1596                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1597     }
1598   else if ((c1 == 0xFE) && (c2 == 0xFF))
1599     {
1600       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1601                              | CATEGORY_MASK_UTF_16_AUTO);
1602       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1603                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1604                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1605     }
1606   else if (c2 < 0)
1607     {
1608       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1609       return 0;
1610     }
1611   else
1612     {
1613       /* We check the dispersion of Eth and Oth bytes where E is even and
1614          O is odd.  If both are high, we assume binary data.*/
1615       unsigned char e[256], o[256];
1616       unsigned e_num = 1, o_num = 1;
1617
1618       memset (e, 0, 256);
1619       memset (o, 0, 256);
1620       e[c1] = 1;
1621       o[c2] = 1;
1622
1623       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1624                                 |CATEGORY_MASK_UTF_16_BE
1625                                 | CATEGORY_MASK_UTF_16_LE);
1626
1627       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1628              != CATEGORY_MASK_UTF_16)
1629         {
1630           TWO_MORE_BYTES (c1, c2);
1631           if (c2 < 0)
1632             break;
1633           if (! e[c1])
1634             {
1635               e[c1] = 1;
1636               e_num++;
1637               if (e_num >= 128)
1638                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1639             }
1640           if (! o[c2])
1641             {
1642               o[c2] = 1;
1643               o_num++;
1644               if (o_num >= 128)
1645                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1646             }
1647         }
1648       return 0;
1649     }
1650
1651  no_more_source:
1652   return 1;
1653 }
1654
1655 static void
1656 decode_coding_utf_16 (struct coding_system *coding)
1657 {
1658   const unsigned char *src = coding->source + coding->consumed;
1659   const unsigned char *src_end = coding->source + coding->src_bytes;
1660   const unsigned char *src_base;
1661   int *charbuf = coding->charbuf + coding->charbuf_used;
1662   /* We may produces at most 3 chars in one loop.  */
1663   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1664   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1665   bool multibytep = coding->src_multibyte;
1666   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1667   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1668   int surrogate = CODING_UTF_16_SURROGATE (coding);
1669   bool eol_dos
1670     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1671   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1672
1673   if (bom == utf_with_bom)
1674     {
1675       int c, c1, c2;
1676
1677       src_base = src;
1678       ONE_MORE_BYTE (c1);
1679       ONE_MORE_BYTE (c2);
1680       c = (c1 << 8) | c2;
1681
1682       if (endian == utf_16_big_endian
1683           ? c != 0xFEFF : c != 0xFFFE)
1684         {
1685           /* The first two bytes are not BOM.  Treat them as bytes
1686              for a normal character.  */
1687           src = src_base;
1688           coding->errors++;
1689         }
1690       CODING_UTF_16_BOM (coding) = utf_without_bom;
1691     }
1692   else if (bom == utf_detect_bom)
1693     {
1694       /* We have already tried to detect BOM and failed in
1695          detect_coding.  */
1696       CODING_UTF_16_BOM (coding) = utf_without_bom;
1697     }
1698
1699   while (1)
1700     {
1701       int c, c1, c2;
1702
1703       src_base = src;
1704       consumed_chars_base = consumed_chars;
1705
1706       if (charbuf >= charbuf_end)
1707         {
1708           if (byte_after_cr1 >= 0)
1709             src_base -= 2;
1710           break;
1711         }
1712
1713       if (byte_after_cr1 >= 0)
1714         c1 = byte_after_cr1, byte_after_cr1 = -1;
1715       else
1716         ONE_MORE_BYTE (c1);
1717       if (c1 < 0)
1718         {
1719           *charbuf++ = -c1;
1720           continue;
1721         }
1722       if (byte_after_cr2 >= 0)
1723         c2 = byte_after_cr2, byte_after_cr2 = -1;
1724       else
1725         ONE_MORE_BYTE (c2);
1726       if (c2 < 0)
1727         {
1728           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1729           *charbuf++ = -c2;
1730           continue;
1731         }
1732       c = (endian == utf_16_big_endian
1733            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1734
1735       if (surrogate)
1736         {
1737           if (! UTF_16_LOW_SURROGATE_P (c))
1738             {
1739               if (endian == utf_16_big_endian)
1740                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1741               else
1742                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1743               *charbuf++ = c1;
1744               *charbuf++ = c2;
1745               coding->errors++;
1746               if (UTF_16_HIGH_SURROGATE_P (c))
1747                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1748               else
1749                 *charbuf++ = c;
1750             }
1751           else
1752             {
1753               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1754               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1755               *charbuf++ = 0x10000 + c;
1756             }
1757         }
1758       else
1759         {
1760           if (UTF_16_HIGH_SURROGATE_P (c))
1761             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1762           else
1763             {
1764               if (eol_dos && c == '\r')
1765                 {
1766                   ONE_MORE_BYTE (byte_after_cr1);
1767                   ONE_MORE_BYTE (byte_after_cr2);
1768                 }
1769               *charbuf++ = c;
1770             }
1771         }
1772     }
1773
1774  no_more_source:
1775   coding->consumed_char += consumed_chars_base;
1776   coding->consumed = src_base - coding->source;
1777   coding->charbuf_used = charbuf - coding->charbuf;
1778 }
1779
1780 static bool
1781 encode_coding_utf_16 (struct coding_system *coding)
1782 {
1783   bool multibytep = coding->dst_multibyte;
1784   int *charbuf = coding->charbuf;
1785   int *charbuf_end = charbuf + coding->charbuf_used;
1786   unsigned char *dst = coding->destination + coding->produced;
1787   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1788   int safe_room = 8;
1789   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1790   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1791   ptrdiff_t produced_chars = 0;
1792   int c;
1793
1794   if (bom != utf_without_bom)
1795     {
1796       ASSURE_DESTINATION (safe_room);
1797       if (big_endian)
1798         EMIT_TWO_BYTES (0xFE, 0xFF);
1799       else
1800         EMIT_TWO_BYTES (0xFF, 0xFE);
1801       CODING_UTF_16_BOM (coding) = utf_without_bom;
1802     }
1803
1804   while (charbuf < charbuf_end)
1805     {
1806       ASSURE_DESTINATION (safe_room);
1807       c = *charbuf++;
1808       if (c > MAX_UNICODE_CHAR)
1809         c = coding->default_char;
1810
1811       if (c < 0x10000)
1812         {
1813           if (big_endian)
1814             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1815           else
1816             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1817         }
1818       else
1819         {
1820           int c1, c2;
1821
1822           c -= 0x10000;
1823           c1 = (c >> 10) + 0xD800;
1824           c2 = (c & 0x3FF) + 0xDC00;
1825           if (big_endian)
1826             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1827           else
1828             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1829         }
1830     }
1831   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1832   coding->produced = dst - coding->destination;
1833   coding->produced_char += produced_chars;
1834   return 0;
1835 }
1836
1837 \f
1838 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1839
1840 /* Emacs' internal format for representation of multiple character
1841    sets is a kind of multi-byte encoding, i.e. characters are
1842    represented by variable-length sequences of one-byte codes.
1843
1844    ASCII characters and control characters (e.g. `tab', `newline') are
1845    represented by one-byte sequences which are their ASCII codes, in
1846    the range 0x00 through 0x7F.
1847
1848    8-bit characters of the range 0x80..0x9F are represented by
1849    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1850    code + 0x20).
1851
1852    8-bit characters of the range 0xA0..0xFF are represented by
1853    one-byte sequences which are their 8-bit code.
1854
1855    The other characters are represented by a sequence of `base
1856    leading-code', optional `extended leading-code', and one or two
1857    `position-code's.  The length of the sequence is determined by the
1858    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1859    whereas extended leading-code and position-code take the range 0xA0
1860    through 0xFF.  See `charset.h' for more details about leading-code
1861    and position-code.
1862
1863    --- CODE RANGE of Emacs' internal format ---
1864    character set        range
1865    -------------        -----
1866    ascii                0x00..0x7F
1867    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1868    eight-bit-graphic    0xA0..0xBF
1869    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1870    ---------------------------------------------
1871
1872    As this is the internal character representation, the format is
1873    usually not used externally (i.e. in a file or in a data sent to a
1874    process).  But, it is possible to have a text externally in this
1875    format (i.e. by encoding by the coding system `emacs-mule').
1876
1877    In that case, a sequence of one-byte codes has a slightly different
1878    form.
1879
1880    At first, all characters in eight-bit-control are represented by
1881    one-byte sequences which are their 8-bit code.
1882
1883    Next, character composition data are represented by the byte
1884    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1885    where,
1886         METHOD is 0xF2 plus one of composition method (enum
1887         composition_method),
1888
1889         BYTES is 0xA0 plus a byte length of this composition data,
1890
1891         CHARS is 0xA0 plus a number of characters composed by this
1892         data,
1893
1894         COMPONENTs are characters of multibyte form or composition
1895         rules encoded by two-byte of ASCII codes.
1896
1897    In addition, for backward compatibility, the following formats are
1898    also recognized as composition data on decoding.
1899
1900    0x80 MSEQ ...
1901    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1902
1903    Here,
1904         MSEQ is a multibyte form but in these special format:
1905           ASCII: 0xA0 ASCII_CODE+0x80,
1906           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1907         RULE is a one byte code of the range 0xA0..0xF0 that
1908         represents a composition rule.
1909   */
1910
1911 char emacs_mule_bytes[256];
1912
1913
1914 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1915    Return true if a text is encoded in 'emacs-mule'.  */
1916
1917 static bool
1918 detect_coding_emacs_mule (struct coding_system *coding,
1919                           struct coding_detection_info *detect_info)
1920 {
1921   const unsigned char *src = coding->source, *src_base;
1922   const unsigned char *src_end = coding->source + coding->src_bytes;
1923   bool multibytep = coding->src_multibyte;
1924   ptrdiff_t consumed_chars = 0;
1925   int c;
1926   int found = 0;
1927
1928   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1929   /* A coding system of this category is always ASCII compatible.  */
1930   src += coding->head_ascii;
1931
1932   while (1)
1933     {
1934       src_base = src;
1935       ONE_MORE_BYTE (c);
1936       if (c < 0)
1937         continue;
1938       if (c == 0x80)
1939         {
1940           /* Perhaps the start of composite character.  We simply skip
1941              it because analyzing it is too heavy for detecting.  But,
1942              at least, we check that the composite character
1943              constitutes of more than 4 bytes.  */
1944           const unsigned char *src_start;
1945
1946         repeat:
1947           src_start = src;
1948           do
1949             {
1950               ONE_MORE_BYTE (c);
1951             }
1952           while (c >= 0xA0);
1953
1954           if (src - src_start <= 4)
1955             break;
1956           found = CATEGORY_MASK_EMACS_MULE;
1957           if (c == 0x80)
1958             goto repeat;
1959         }
1960
1961       if (c < 0x80)
1962         {
1963           if (c < 0x20
1964               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1965             break;
1966         }
1967       else
1968         {
1969           int more_bytes = emacs_mule_bytes[c] - 1;
1970
1971           while (more_bytes > 0)
1972             {
1973               ONE_MORE_BYTE (c);
1974               if (c < 0xA0)
1975                 {
1976                   src--;        /* Unread the last byte.  */
1977                   break;
1978                 }
1979               more_bytes--;
1980             }
1981           if (more_bytes != 0)
1982             break;
1983           found = CATEGORY_MASK_EMACS_MULE;
1984         }
1985     }
1986   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1987   return 0;
1988
1989  no_more_source:
1990   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1991     {
1992       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1993       return 0;
1994     }
1995   detect_info->found |= found;
1996   return 1;
1997 }
1998
1999
2000 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2001    character.  If CMP_STATUS indicates that we must expect MSEQ or
2002    RULE described above, decode it and return the negative value of
2003    the decoded character or rule.  If an invalid byte is found, return
2004    -1.  If SRC is too short, return -2.  */
2005
2006 static int
2007 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2008                  int *nbytes, int *nchars, int *id,
2009                  struct composition_status *cmp_status)
2010 {
2011   const unsigned char *src_end = coding->source + coding->src_bytes;
2012   const unsigned char *src_base = src;
2013   bool multibytep = coding->src_multibyte;
2014   int charset_ID;
2015   unsigned code;
2016   int c;
2017   ptrdiff_t consumed_chars = 0;
2018   bool mseq_found = 0;
2019
2020   ONE_MORE_BYTE (c);
2021   if (c < 0)
2022     {
2023       c = -c;
2024       charset_ID = emacs_mule_charset[0];
2025     }
2026   else
2027     {
2028       if (c >= 0xA0)
2029         {
2030           if (cmp_status->state != COMPOSING_NO
2031               && cmp_status->old_form)
2032             {
2033               if (cmp_status->state == COMPOSING_CHAR)
2034                 {
2035                   if (c == 0xA0)
2036                     {
2037                       ONE_MORE_BYTE (c);
2038                       c -= 0x80;
2039                       if (c < 0)
2040                         goto invalid_code;
2041                     }
2042                   else
2043                     c -= 0x20;
2044                   mseq_found = 1;
2045                 }
2046               else
2047                 {
2048                   *nbytes = src - src_base;
2049                   *nchars = consumed_chars;
2050                   return -c;
2051                 }
2052             }
2053           else
2054             goto invalid_code;
2055         }
2056
2057       switch (emacs_mule_bytes[c])
2058         {
2059         case 2:
2060           if ((charset_ID = emacs_mule_charset[c]) < 0)
2061             goto invalid_code;
2062           ONE_MORE_BYTE (c);
2063           if (c < 0xA0)
2064             goto invalid_code;
2065           code = c & 0x7F;
2066           break;
2067
2068         case 3:
2069           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2070               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2071             {
2072               ONE_MORE_BYTE (c);
2073               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2074                 goto invalid_code;
2075               ONE_MORE_BYTE (c);
2076               if (c < 0xA0)
2077                 goto invalid_code;
2078               code = c & 0x7F;
2079             }
2080           else
2081             {
2082               if ((charset_ID = emacs_mule_charset[c]) < 0)
2083                 goto invalid_code;
2084               ONE_MORE_BYTE (c);
2085               if (c < 0xA0)
2086                 goto invalid_code;
2087               code = (c & 0x7F) << 8;
2088               ONE_MORE_BYTE (c);
2089               if (c < 0xA0)
2090                 goto invalid_code;
2091               code |= c & 0x7F;
2092             }
2093           break;
2094
2095         case 4:
2096           ONE_MORE_BYTE (c);
2097           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2098             goto invalid_code;
2099           ONE_MORE_BYTE (c);
2100           if (c < 0xA0)
2101             goto invalid_code;
2102           code = (c & 0x7F) << 8;
2103           ONE_MORE_BYTE (c);
2104           if (c < 0xA0)
2105             goto invalid_code;
2106           code |= c & 0x7F;
2107           break;
2108
2109         case 1:
2110           code = c;
2111           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2112           break;
2113
2114         default:
2115           emacs_abort ();
2116         }
2117       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2118                           CHARSET_FROM_ID (charset_ID), code, c);
2119       if (c < 0)
2120         goto invalid_code;
2121     }
2122   *nbytes = src - src_base;
2123   *nchars = consumed_chars;
2124   if (id)
2125     *id = charset_ID;
2126   return (mseq_found ? -c : c);
2127
2128  no_more_source:
2129   return -2;
2130
2131  invalid_code:
2132   return -1;
2133 }
2134
2135
2136 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2137
2138 /* Handle these composition sequence ('|': the end of header elements,
2139    BYTES and CHARS >= 0xA0):
2140
2141    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2142    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2143    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2144
2145    and these old form:
2146
2147    (4) relative composition: 0x80 | MSEQ ... MSEQ
2148    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2149
2150    When the starter 0x80 and the following header elements are found,
2151    this annotation header is produced.
2152
2153         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2154
2155    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2156    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2157
2158    Then, upon reading the following elements, these codes are produced
2159    until the composition end is found:
2160
2161    (1) CHAR ... CHAR
2162    (2) ALT ... ALT CHAR ... CHAR
2163    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2164    (4) CHAR ... CHAR
2165    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2166
2167    When the composition end is found, LENGTH and NCHARS in the
2168    annotation header is updated as below:
2169
2170    (1) LENGTH: unchanged, NCHARS: unchanged
2171    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2172    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2173    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2174    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2175
2176    If an error is found while composing, the annotation header is
2177    changed to the original composition header (plus filler -1s) as
2178    below:
2179
2180    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2181    (5)          [ 0x80 0xFF -1 -1- -1 ]
2182
2183    and the sequence [ -2 DECODED-RULE ] is changed to the original
2184    byte sequence as below:
2185         o the original byte sequence is B: [ B -1 ]
2186         o the original byte sequence is B1 B2: [ B1 B2 ]
2187
2188    Most of the routines are implemented by macros because many
2189    variables and labels in the caller decode_coding_emacs_mule must be
2190    accessible, and they are usually called just once (thus doesn't
2191    increase the size of compiled object).  */
2192
2193 /* Decode a composition rule represented by C as a component of
2194    composition sequence of Emacs 20 style.  Set RULE to the decoded
2195    rule. */
2196
2197 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2198   do {                                                  \
2199     int gref, nref;                                     \
2200                                                         \
2201     c -= 0xA0;                                          \
2202     if (c < 0 || c >= 81)                               \
2203       goto invalid_code;                                \
2204     gref = c / 9, nref = c % 9;                         \
2205     if (gref == 4) gref = 10;                           \
2206     if (nref == 4) nref = 10;                           \
2207     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2208   } while (0)
2209
2210
2211 /* Decode a composition rule represented by C and the following byte
2212    at SRC as a component of composition sequence of Emacs 21 style.
2213    Set RULE to the decoded rule.  */
2214
2215 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2216   do {                                                  \
2217     int gref, nref;                                     \
2218                                                         \
2219     gref = c - 0x20;                                    \
2220     if (gref < 0 || gref >= 81)                         \
2221       goto invalid_code;                                \
2222     ONE_MORE_BYTE (c);                                  \
2223     nref = c - 0x20;                                    \
2224     if (nref < 0 || nref >= 81)                         \
2225       goto invalid_code;                                \
2226     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2227   } while (0)
2228
2229
2230 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2231    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2232    byte length of this composition information, CHARS is the number of
2233    characters composed by this composition.  */
2234
2235 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2236   do {                                                                  \
2237     enum composition_method method = c - 0xF2;                          \
2238     int nbytes, nchars;                                                 \
2239                                                                         \
2240     ONE_MORE_BYTE (c);                                                  \
2241     if (c < 0)                                                          \
2242       goto invalid_code;                                                \
2243     nbytes = c - 0xA0;                                                  \
2244     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2245       goto invalid_code;                                                \
2246     ONE_MORE_BYTE (c);                                                  \
2247     nchars = c - 0xA0;                                                  \
2248     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2249       goto invalid_code;                                                \
2250     cmp_status->old_form = 0;                                           \
2251     cmp_status->method = method;                                        \
2252     if (method == COMPOSITION_RELATIVE)                                 \
2253       cmp_status->state = COMPOSING_CHAR;                               \
2254     else                                                                \
2255       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2256     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2257     cmp_status->nchars = nchars;                                        \
2258     cmp_status->ncomps = nbytes - 4;                                    \
2259     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2260   } while (0)
2261
2262
2263 /* Start of Emacs 20 style format for relative composition.  */
2264
2265 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2266   do {                                                          \
2267     cmp_status->old_form = 1;                                   \
2268     cmp_status->method = COMPOSITION_RELATIVE;                  \
2269     cmp_status->state = COMPOSING_CHAR;                         \
2270     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2271     cmp_status->nchars = cmp_status->ncomps = 0;                \
2272     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2273   } while (0)
2274
2275
2276 /* Start of Emacs 20 style format for rule-base composition.  */
2277
2278 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2279   do {                                                          \
2280     cmp_status->old_form = 1;                                   \
2281     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2282     cmp_status->state = COMPOSING_CHAR;                         \
2283     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2284     cmp_status->nchars = cmp_status->ncomps = 0;                \
2285     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2286   } while (0)
2287
2288
2289 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2290   do {                                                  \
2291     const unsigned char *current_src = src;             \
2292                                                         \
2293     ONE_MORE_BYTE (c);                                  \
2294     if (c < 0)                                          \
2295       goto invalid_code;                                \
2296     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2297         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2298       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2299     else if (c < 0xA0)                                  \
2300       goto invalid_code;                                \
2301     else if (c < 0xC0)                                  \
2302       {                                                 \
2303         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2304         /* Re-read C as a composition component.  */    \
2305         src = current_src;                              \
2306       }                                                 \
2307     else if (c == 0xFF)                                 \
2308       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2309     else                                                \
2310       goto invalid_code;                                \
2311   } while (0)
2312
2313 #define EMACS_MULE_COMPOSITION_END()                            \
2314   do {                                                          \
2315     int idx = - cmp_status->length;                             \
2316                                                                 \
2317     if (cmp_status->old_form)                                   \
2318       charbuf[idx + 2] = cmp_status->nchars;                    \
2319     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2320       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2321     cmp_status->state = COMPOSING_NO;                           \
2322   } while (0)
2323
2324
2325 static int
2326 emacs_mule_finish_composition (int *charbuf,
2327                                struct composition_status *cmp_status)
2328 {
2329   int idx = - cmp_status->length;
2330   int new_chars;
2331
2332   if (cmp_status->old_form && cmp_status->nchars > 0)
2333     {
2334       charbuf[idx + 2] = cmp_status->nchars;
2335       new_chars = 0;
2336       if (cmp_status->method == COMPOSITION_WITH_RULE
2337           && cmp_status->state == COMPOSING_CHAR)
2338         {
2339           /* The last rule was invalid.  */
2340           int rule = charbuf[-1] + 0xA0;
2341
2342           charbuf[-2] = BYTE8_TO_CHAR (rule);
2343           charbuf[-1] = -1;
2344           new_chars = 1;
2345         }
2346     }
2347   else
2348     {
2349       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2350
2351       if (cmp_status->method == COMPOSITION_WITH_RULE)
2352         {
2353           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2354           charbuf[idx++] = -3;
2355           charbuf[idx++] = 0;
2356           new_chars = 1;
2357         }
2358       else
2359         {
2360           int nchars = charbuf[idx + 1] + 0xA0;
2361           int nbytes = charbuf[idx + 2] + 0xA0;
2362
2363           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2364           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2365           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2366           charbuf[idx++] = -1;
2367           new_chars = 4;
2368         }
2369     }
2370   cmp_status->state = COMPOSING_NO;
2371   return new_chars;
2372 }
2373
2374 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2375   do {                                                                    \
2376     if (cmp_status->state != COMPOSING_NO)                                \
2377       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2378   } while (0)
2379
2380
2381 static void
2382 decode_coding_emacs_mule (struct coding_system *coding)
2383 {
2384   const unsigned char *src = coding->source + coding->consumed;
2385   const unsigned char *src_end = coding->source + coding->src_bytes;
2386   const unsigned char *src_base;
2387   int *charbuf = coding->charbuf + coding->charbuf_used;
2388   /* We may produce two annotations (charset and composition) in one
2389      loop and one more charset annotation at the end.  */
2390   int *charbuf_end
2391     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2392       /* We can produce up to 2 characters in a loop.  */
2393       - 1;
2394   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2395   bool multibytep = coding->src_multibyte;
2396   ptrdiff_t char_offset = coding->produced_char;
2397   ptrdiff_t last_offset = char_offset;
2398   int last_id = charset_ascii;
2399   bool eol_dos
2400     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2401   int byte_after_cr = -1;
2402   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2403
2404   if (cmp_status->state != COMPOSING_NO)
2405     {
2406       int i;
2407
2408       if (charbuf_end - charbuf < cmp_status->length)
2409         emacs_abort ();
2410       for (i = 0; i < cmp_status->length; i++)
2411         *charbuf++ = cmp_status->carryover[i];
2412       coding->annotated = 1;
2413     }
2414
2415   while (1)
2416     {
2417       int c, id IF_LINT (= 0);
2418
2419       src_base = src;
2420       consumed_chars_base = consumed_chars;
2421
2422       if (charbuf >= charbuf_end)
2423         {
2424           if (byte_after_cr >= 0)
2425             src_base--;
2426           break;
2427         }
2428
2429       if (byte_after_cr >= 0)
2430         c = byte_after_cr, byte_after_cr = -1;
2431       else
2432         ONE_MORE_BYTE (c);
2433
2434       if (c < 0 || c == 0x80)
2435         {
2436           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2437           if (c < 0)
2438             {
2439               *charbuf++ = -c;
2440               char_offset++;
2441             }
2442           else
2443             DECODE_EMACS_MULE_COMPOSITION_START ();
2444           continue;
2445         }
2446
2447       if (c < 0x80)
2448         {
2449           if (eol_dos && c == '\r')
2450             ONE_MORE_BYTE (byte_after_cr);
2451           id = charset_ascii;
2452           if (cmp_status->state != COMPOSING_NO)
2453             {
2454               if (cmp_status->old_form)
2455                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2456               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2457                 cmp_status->ncomps--;
2458             }
2459         }
2460       else
2461         {
2462           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2463           /* emacs_mule_char can load a charset map from a file, which
2464              allocates a large structure and might cause buffer text
2465              to be relocated as result.  Thus, we need to remember the
2466              original pointer to buffer text, and fix up all related
2467              pointers after the call.  */
2468           const unsigned char *orig = coding->source;
2469           ptrdiff_t offset;
2470
2471           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2472                                cmp_status);
2473           offset = coding->source - orig;
2474           if (offset)
2475             {
2476               src += offset;
2477               src_base += offset;
2478               src_end += offset;
2479             }
2480           if (c < 0)
2481             {
2482               if (c == -1)
2483                 goto invalid_code;
2484               if (c == -2)
2485                 break;
2486             }
2487           src = src_base + nbytes;
2488           consumed_chars = consumed_chars_base + nchars;
2489           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2490             cmp_status->ncomps -= nchars;
2491         }
2492
2493       /* Now if C >= 0, we found a normally encoded character, if C <
2494          0, we found an old-style composition component character or
2495          rule.  */
2496
2497       if (cmp_status->state == COMPOSING_NO)
2498         {
2499           if (last_id != id)
2500             {
2501               if (last_id != charset_ascii)
2502                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2503                                   last_id);
2504               last_id = id;
2505               last_offset = char_offset;
2506             }
2507           *charbuf++ = c;
2508           char_offset++;
2509         }
2510       else if (cmp_status->state == COMPOSING_CHAR)
2511         {
2512           if (cmp_status->old_form)
2513             {
2514               if (c >= 0)
2515                 {
2516                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2517                   *charbuf++ = c;
2518                   char_offset++;
2519                 }
2520               else
2521                 {
2522                   *charbuf++ = -c;
2523                   cmp_status->nchars++;
2524                   cmp_status->length++;
2525                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2526                     EMACS_MULE_COMPOSITION_END ();
2527                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2528                     cmp_status->state = COMPOSING_RULE;
2529                 }
2530             }
2531           else
2532             {
2533               *charbuf++ = c;
2534               cmp_status->length++;
2535               cmp_status->nchars--;
2536               if (cmp_status->nchars == 0)
2537                 EMACS_MULE_COMPOSITION_END ();
2538             }
2539         }
2540       else if (cmp_status->state == COMPOSING_RULE)
2541         {
2542           int rule;
2543
2544           if (c >= 0)
2545             {
2546               EMACS_MULE_COMPOSITION_END ();
2547               *charbuf++ = c;
2548               char_offset++;
2549             }
2550           else
2551             {
2552               c = -c;
2553               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2554               if (rule < 0)
2555                 goto invalid_code;
2556               *charbuf++ = -2;
2557               *charbuf++ = rule;
2558               cmp_status->length += 2;
2559               cmp_status->state = COMPOSING_CHAR;
2560             }
2561         }
2562       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2563         {
2564           *charbuf++ = c;
2565           cmp_status->length++;
2566           if (cmp_status->ncomps == 0)
2567             cmp_status->state = COMPOSING_CHAR;
2568           else if (cmp_status->ncomps > 0)
2569             {
2570               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2571                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2572             }
2573           else
2574             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2575         }
2576       else                      /* COMPOSING_COMPONENT_RULE */
2577         {
2578           int rule;
2579
2580           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2581           if (rule < 0)
2582             goto invalid_code;
2583           *charbuf++ = -2;
2584           *charbuf++ = rule;
2585           cmp_status->length += 2;
2586           cmp_status->ncomps--;
2587           if (cmp_status->ncomps > 0)
2588             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2589           else
2590             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2591         }
2592       continue;
2593
2594     invalid_code:
2595       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2596       src = src_base;
2597       consumed_chars = consumed_chars_base;
2598       ONE_MORE_BYTE (c);
2599       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2600       char_offset++;
2601       coding->errors++;
2602     }
2603
2604  no_more_source:
2605   if (cmp_status->state != COMPOSING_NO)
2606     {
2607       if (coding->mode & CODING_MODE_LAST_BLOCK)
2608         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2609       else
2610         {
2611           int i;
2612
2613           charbuf -= cmp_status->length;
2614           for (i = 0; i < cmp_status->length; i++)
2615             cmp_status->carryover[i] = charbuf[i];
2616         }
2617     }
2618   if (last_id != charset_ascii)
2619     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2620   coding->consumed_char += consumed_chars_base;
2621   coding->consumed = src_base - coding->source;
2622   coding->charbuf_used = charbuf - coding->charbuf;
2623 }
2624
2625
2626 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2627   do {                                          \
2628     if (id < 0xA0)                              \
2629       codes[0] = id, codes[1] = 0;              \
2630     else if (id < 0xE0)                         \
2631       codes[0] = 0x9A, codes[1] = id;           \
2632     else if (id < 0xF0)                         \
2633       codes[0] = 0x9B, codes[1] = id;           \
2634     else if (id < 0xF5)                         \
2635       codes[0] = 0x9C, codes[1] = id;           \
2636     else                                        \
2637       codes[0] = 0x9D, codes[1] = id;           \
2638   } while (0);
2639
2640
2641 static bool
2642 encode_coding_emacs_mule (struct coding_system *coding)
2643 {
2644   bool multibytep = coding->dst_multibyte;
2645   int *charbuf = coding->charbuf;
2646   int *charbuf_end = charbuf + coding->charbuf_used;
2647   unsigned char *dst = coding->destination + coding->produced;
2648   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2649   int safe_room = 8;
2650   ptrdiff_t produced_chars = 0;
2651   Lisp_Object attrs, charset_list;
2652   int c;
2653   int preferred_charset_id = -1;
2654
2655   CODING_GET_INFO (coding, attrs, charset_list);
2656   if (! EQ (charset_list, Vemacs_mule_charset_list))
2657     {
2658       charset_list = Vemacs_mule_charset_list;
2659       ASET (attrs, coding_attr_charset_list, charset_list);
2660     }
2661
2662   while (charbuf < charbuf_end)
2663     {
2664       ASSURE_DESTINATION (safe_room);
2665       c = *charbuf++;
2666
2667       if (c < 0)
2668         {
2669           /* Handle an annotation.  */
2670           switch (*charbuf)
2671             {
2672             case CODING_ANNOTATE_COMPOSITION_MASK:
2673               /* Not yet implemented.  */
2674               break;
2675             case CODING_ANNOTATE_CHARSET_MASK:
2676               preferred_charset_id = charbuf[3];
2677               if (preferred_charset_id >= 0
2678                   && NILP (Fmemq (make_number (preferred_charset_id),
2679                                   charset_list)))
2680                 preferred_charset_id = -1;
2681               break;
2682             default:
2683               emacs_abort ();
2684             }
2685           charbuf += -c - 1;
2686           continue;
2687         }
2688
2689       if (ASCII_CHAR_P (c))
2690         EMIT_ONE_ASCII_BYTE (c);
2691       else if (CHAR_BYTE8_P (c))
2692         {
2693           c = CHAR_TO_BYTE8 (c);
2694           EMIT_ONE_BYTE (c);
2695         }
2696       else
2697         {
2698           struct charset *charset;
2699           unsigned code;
2700           int dimension;
2701           int emacs_mule_id;
2702           unsigned char leading_codes[2];
2703
2704           if (preferred_charset_id >= 0)
2705             {
2706               bool result;
2707
2708               charset = CHARSET_FROM_ID (preferred_charset_id);
2709               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2710               if (result)
2711                 code = ENCODE_CHAR (charset, c);
2712               else
2713                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2714                                      &code, charset);
2715             }
2716           else
2717             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2718                                  &code, charset);
2719           if (! charset)
2720             {
2721               c = coding->default_char;
2722               if (ASCII_CHAR_P (c))
2723                 {
2724                   EMIT_ONE_ASCII_BYTE (c);
2725                   continue;
2726                 }
2727               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2728                                    &code, charset);
2729             }
2730           dimension = CHARSET_DIMENSION (charset);
2731           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2732           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2733           EMIT_ONE_BYTE (leading_codes[0]);
2734           if (leading_codes[1])
2735             EMIT_ONE_BYTE (leading_codes[1]);
2736           if (dimension == 1)
2737             EMIT_ONE_BYTE (code | 0x80);
2738           else
2739             {
2740               code |= 0x8080;
2741               EMIT_ONE_BYTE (code >> 8);
2742               EMIT_ONE_BYTE (code & 0xFF);
2743             }
2744         }
2745     }
2746   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2747   coding->produced_char += produced_chars;
2748   coding->produced = dst - coding->destination;
2749   return 0;
2750 }
2751
2752 \f
2753 /*** 7. ISO2022 handlers ***/
2754
2755 /* The following note describes the coding system ISO2022 briefly.
2756    Since the intention of this note is to help understand the
2757    functions in this file, some parts are NOT ACCURATE or are OVERLY
2758    SIMPLIFIED.  For thorough understanding, please refer to the
2759    original document of ISO2022.  This is equivalent to the standard
2760    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2761
2762    ISO2022 provides many mechanisms to encode several character sets
2763    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2764    is encoded using bytes less than 128.  This may make the encoded
2765    text a little bit longer, but the text passes more easily through
2766    several types of gateway, some of which strip off the MSB (Most
2767    Significant Bit).
2768
2769    There are two kinds of character sets: control character sets and
2770    graphic character sets.  The former contain control characters such
2771    as `newline' and `escape' to provide control functions (control
2772    functions are also provided by escape sequences).  The latter
2773    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2774    two control character sets and many graphic character sets.
2775
2776    Graphic character sets are classified into one of the following
2777    four classes, according to the number of bytes (DIMENSION) and
2778    number of characters in one dimension (CHARS) of the set:
2779    - DIMENSION1_CHARS94
2780    - DIMENSION1_CHARS96
2781    - DIMENSION2_CHARS94
2782    - DIMENSION2_CHARS96
2783
2784    In addition, each character set is assigned an identification tag,
2785    unique for each set, called the "final character" (denoted as <F>
2786    hereafter).  The <F> of each character set is decided by ECMA(*)
2787    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2788    (0x30..0x3F are for private use only).
2789
2790    Note (*): ECMA = European Computer Manufacturers Association
2791
2792    Here are examples of graphic character sets [NAME(<F>)]:
2793         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2794         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2795         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2796         o DIMENSION2_CHARS96 -- none for the moment
2797
2798    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2799         C0 [0x00..0x1F] -- control character plane 0
2800         GL [0x20..0x7F] -- graphic character plane 0
2801         C1 [0x80..0x9F] -- control character plane 1
2802         GR [0xA0..0xFF] -- graphic character plane 1
2803
2804    A control character set is directly designated and invoked to C0 or
2805    C1 by an escape sequence.  The most common case is that:
2806    - ISO646's  control character set is designated/invoked to C0, and
2807    - ISO6429's control character set is designated/invoked to C1,
2808    and usually these designations/invocations are omitted in encoded
2809    text.  In a 7-bit environment, only C0 can be used, and a control
2810    character for C1 is encoded by an appropriate escape sequence to
2811    fit into the environment.  All control characters for C1 are
2812    defined to have corresponding escape sequences.
2813
2814    A graphic character set is at first designated to one of four
2815    graphic registers (G0 through G3), then these graphic registers are
2816    invoked to GL or GR.  These designations and invocations can be
2817    done independently.  The most common case is that G0 is invoked to
2818    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2819    these invocations and designations are omitted in encoded text.
2820    In a 7-bit environment, only GL can be used.
2821
2822    When a graphic character set of CHARS94 is invoked to GL, codes
2823    0x20 and 0x7F of the GL area work as control characters SPACE and
2824    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2825    be used.
2826
2827    There are two ways of invocation: locking-shift and single-shift.
2828    With locking-shift, the invocation lasts until the next different
2829    invocation, whereas with single-shift, the invocation affects the
2830    following character only and doesn't affect the locking-shift
2831    state.  Invocations are done by the following control characters or
2832    escape sequences:
2833
2834    ----------------------------------------------------------------------
2835    abbrev  function                  cntrl escape seq   description
2836    ----------------------------------------------------------------------
2837    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2838    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2839    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2840    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2841    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2842    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2843    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2844    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2845    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2846    ----------------------------------------------------------------------
2847    (*) These are not used by any known coding system.
2848
2849    Control characters for these functions are defined by macros
2850    ISO_CODE_XXX in `coding.h'.
2851
2852    Designations are done by the following escape sequences:
2853    ----------------------------------------------------------------------
2854    escape sequence      description
2855    ----------------------------------------------------------------------
2856    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2857    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2858    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2859    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2860    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2861    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2862    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2863    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2864    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2865    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2866    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2867    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2868    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2869    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2870    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2871    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2872    ----------------------------------------------------------------------
2873
2874    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2875    of dimension 1, chars 94, and final character <F>, etc...
2876
2877    Note (*): Although these designations are not allowed in ISO2022,
2878    Emacs accepts them on decoding, and produces them on encoding
2879    CHARS96 character sets in a coding system which is characterized as
2880    7-bit environment, non-locking-shift, and non-single-shift.
2881
2882    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2883    '(' must be omitted.  We refer to this as "short-form" hereafter.
2884
2885    Now you may notice that there are a lot of ways of encoding the
2886    same multilingual text in ISO2022.  Actually, there exist many
2887    coding systems such as Compound Text (used in X11's inter client
2888    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2889    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2890    localized platforms), and all of these are variants of ISO2022.
2891
2892    In addition to the above, Emacs handles two more kinds of escape
2893    sequences: ISO6429's direction specification and Emacs' private
2894    sequence for specifying character composition.
2895
2896    ISO6429's direction specification takes the following form:
2897         o CSI ']'      -- end of the current direction
2898         o CSI '0' ']'  -- end of the current direction
2899         o CSI '1' ']'  -- start of left-to-right text
2900         o CSI '2' ']'  -- start of right-to-left text
2901    The control character CSI (0x9B: control sequence introducer) is
2902    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2903
2904    Character composition specification takes the following form:
2905         o ESC '0' -- start relative composition
2906         o ESC '1' -- end composition
2907         o ESC '2' -- start rule-base composition (*)
2908         o ESC '3' -- start relative composition with alternate chars  (**)
2909         o ESC '4' -- start rule-base composition with alternate chars  (**)
2910   Since these are not standard escape sequences of any ISO standard,
2911   the use of them with these meanings is restricted to Emacs only.
2912
2913   (*) This form is used only in Emacs 20.7 and older versions,
2914   but newer versions can safely decode it.
2915   (**) This form is used only in Emacs 21.1 and newer versions,
2916   and older versions can't decode it.
2917
2918   Here's a list of example usages of these composition escape
2919   sequences (categorized by `enum composition_method').
2920
2921   COMPOSITION_RELATIVE:
2922         ESC 0 CHAR [ CHAR ] ESC 1
2923   COMPOSITION_WITH_RULE:
2924         ESC 2 CHAR [ RULE CHAR ] ESC 1
2925   COMPOSITION_WITH_ALTCHARS:
2926         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2927   COMPOSITION_WITH_RULE_ALTCHARS:
2928         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2929
2930 static enum iso_code_class_type iso_code_class[256];
2931
2932 #define SAFE_CHARSET_P(coding, id)      \
2933   ((id) <= (coding)->max_charset_id     \
2934    && (coding)->safe_charsets[id] != 255)
2935
2936 static void
2937 setup_iso_safe_charsets (Lisp_Object attrs)
2938 {
2939   Lisp_Object charset_list, safe_charsets;
2940   Lisp_Object request;
2941   Lisp_Object reg_usage;
2942   Lisp_Object tail;
2943   EMACS_INT reg94, reg96;
2944   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2945   int max_charset_id;
2946
2947   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2948   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2949       && ! EQ (charset_list, Viso_2022_charset_list))
2950     {
2951       charset_list = Viso_2022_charset_list;
2952       ASET (attrs, coding_attr_charset_list, charset_list);
2953       ASET (attrs, coding_attr_safe_charsets, Qnil);
2954     }
2955
2956   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2957     return;
2958
2959   max_charset_id = 0;
2960   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2961     {
2962       int id = XINT (XCAR (tail));
2963       if (max_charset_id < id)
2964         max_charset_id = id;
2965     }
2966
2967   safe_charsets = make_uninit_string (max_charset_id + 1);
2968   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2969   request = AREF (attrs, coding_attr_iso_request);
2970   reg_usage = AREF (attrs, coding_attr_iso_usage);
2971   reg94 = XINT (XCAR (reg_usage));
2972   reg96 = XINT (XCDR (reg_usage));
2973
2974   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2975     {
2976       Lisp_Object id;
2977       Lisp_Object reg;
2978       struct charset *charset;
2979
2980       id = XCAR (tail);
2981       charset = CHARSET_FROM_ID (XINT (id));
2982       reg = Fcdr (Fassq (id, request));
2983       if (! NILP (reg))
2984         SSET (safe_charsets, XINT (id), XINT (reg));
2985       else if (charset->iso_chars_96)
2986         {
2987           if (reg96 < 4)
2988             SSET (safe_charsets, XINT (id), reg96);
2989         }
2990       else
2991         {
2992           if (reg94 < 4)
2993             SSET (safe_charsets, XINT (id), reg94);
2994         }
2995     }
2996   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2997 }
2998
2999
3000 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3001    Return true if a text is encoded in one of ISO-2022 based coding
3002    systems.  */
3003
3004 static bool
3005 detect_coding_iso_2022 (struct coding_system *coding,
3006                         struct coding_detection_info *detect_info)
3007 {
3008   const unsigned char *src = coding->source, *src_base = src;
3009   const unsigned char *src_end = coding->source + coding->src_bytes;
3010   bool multibytep = coding->src_multibyte;
3011   bool single_shifting = 0;
3012   int id;
3013   int c, c1;
3014   ptrdiff_t consumed_chars = 0;
3015   int i;
3016   int rejected = 0;
3017   int found = 0;
3018   int composition_count = -1;
3019
3020   detect_info->checked |= CATEGORY_MASK_ISO;
3021
3022   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3023     {
3024       struct coding_system *this = &(coding_categories[i]);
3025       Lisp_Object attrs, val;
3026
3027       if (this->id < 0)
3028         continue;
3029       attrs = CODING_ID_ATTRS (this->id);
3030       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3031           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3032         setup_iso_safe_charsets (attrs);
3033       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3034       this->max_charset_id = SCHARS (val) - 1;
3035       this->safe_charsets = SDATA (val);
3036     }
3037
3038   /* A coding system of this category is always ASCII compatible.  */
3039   src += coding->head_ascii;
3040
3041   while (rejected != CATEGORY_MASK_ISO)
3042     {
3043       src_base = src;
3044       ONE_MORE_BYTE (c);
3045       switch (c)
3046         {
3047         case ISO_CODE_ESC:
3048           if (inhibit_iso_escape_detection)
3049             break;
3050           single_shifting = 0;
3051           ONE_MORE_BYTE (c);
3052           if (c == 'N' || c == 'O')
3053             {
3054               /* ESC <Fe> for SS2 or SS3.  */
3055               single_shifting = 1;
3056               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3057             }
3058           else if (c == '1')
3059             {
3060               /* End of composition.  */
3061               if (composition_count < 0
3062                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3063                 /* Invalid */
3064                 break;
3065               composition_count = -1;
3066               found |= CATEGORY_MASK_ISO;
3067             }
3068           else if (c >= '0' && c <= '4')
3069             {
3070               /* ESC <Fp> for start/end composition.  */
3071               composition_count = 0;
3072             }
3073           else
3074             {
3075               if (c >= '(' && c <= '/')
3076                 {
3077                   /* Designation sequence for a charset of dimension 1.  */
3078                   ONE_MORE_BYTE (c1);
3079                   if (c1 < ' ' || c1 >= 0x80
3080                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3081                     {
3082                       /* Invalid designation sequence.  Just ignore.  */
3083                       if (c1 >= 0x80)
3084                         rejected |= (CATEGORY_MASK_ISO_7BIT
3085                                      | CATEGORY_MASK_ISO_7_ELSE);
3086                       break;
3087                     }
3088                 }
3089               else if (c == '$')
3090                 {
3091                   /* Designation sequence for a charset of dimension 2.  */
3092                   ONE_MORE_BYTE (c);
3093                   if (c >= '@' && c <= 'B')
3094                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3095                     id = iso_charset_table[1][0][c];
3096                   else if (c >= '(' && c <= '/')
3097                     {
3098                       ONE_MORE_BYTE (c1);
3099                       if (c1 < ' ' || c1 >= 0x80
3100                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3101                         {
3102                           /* Invalid designation sequence.  Just ignore.  */
3103                           if (c1 >= 0x80)
3104                             rejected |= (CATEGORY_MASK_ISO_7BIT
3105                                          | CATEGORY_MASK_ISO_7_ELSE);
3106                           break;
3107                         }
3108                     }
3109                   else
3110                     {
3111                       /* Invalid designation sequence.  Just ignore it.  */
3112                       if (c >= 0x80)
3113                         rejected |= (CATEGORY_MASK_ISO_7BIT
3114                                      | CATEGORY_MASK_ISO_7_ELSE);
3115                       break;
3116                     }
3117                 }
3118               else
3119                 {
3120                   /* Invalid escape sequence.  Just ignore it.  */
3121                   if (c >= 0x80)
3122                     rejected |= (CATEGORY_MASK_ISO_7BIT
3123                                  | CATEGORY_MASK_ISO_7_ELSE);
3124                   break;
3125                 }
3126
3127               /* We found a valid designation sequence for CHARSET.  */
3128               rejected |= CATEGORY_MASK_ISO_8BIT;
3129               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3130                                   id))
3131                 found |= CATEGORY_MASK_ISO_7;
3132               else
3133                 rejected |= CATEGORY_MASK_ISO_7;
3134               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3135                                   id))
3136                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3137               else
3138                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3139               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3140                                   id))
3141                 found |= CATEGORY_MASK_ISO_7_ELSE;
3142               else
3143                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3144               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3145                                   id))
3146                 found |= CATEGORY_MASK_ISO_8_ELSE;
3147               else
3148                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3149             }
3150           break;
3151
3152         case ISO_CODE_SO:
3153         case ISO_CODE_SI:
3154           /* Locking shift out/in.  */
3155           if (inhibit_iso_escape_detection)
3156             break;
3157           single_shifting = 0;
3158           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3159           break;
3160
3161         case ISO_CODE_CSI:
3162           /* Control sequence introducer.  */
3163           single_shifting = 0;
3164           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3165           found |= CATEGORY_MASK_ISO_8_ELSE;
3166           goto check_extra_latin;
3167
3168         case ISO_CODE_SS2:
3169         case ISO_CODE_SS3:
3170           /* Single shift.   */
3171           if (inhibit_iso_escape_detection)
3172             break;
3173           single_shifting = 0;
3174           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3175           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3176               & CODING_ISO_FLAG_SINGLE_SHIFT)
3177             {
3178               found |= CATEGORY_MASK_ISO_8_1;
3179               single_shifting = 1;
3180             }
3181           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3182               & CODING_ISO_FLAG_SINGLE_SHIFT)
3183             {
3184               found |= CATEGORY_MASK_ISO_8_2;
3185               single_shifting = 1;
3186             }
3187           if (single_shifting)
3188             break;
3189           goto check_extra_latin;
3190
3191         default:
3192           if (c < 0)
3193             continue;
3194           if (c < 0x80)
3195             {
3196               if (composition_count >= 0)
3197                 composition_count++;
3198               single_shifting = 0;
3199               break;
3200             }
3201           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3202           if (c >= 0xA0)
3203             {
3204               found |= CATEGORY_MASK_ISO_8_1;
3205               /* Check the length of succeeding codes of the range
3206                  0xA0..0FF.  If the byte length is even, we include
3207                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3208                  only when we are not single shifting.  */
3209               if (! single_shifting
3210                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3211                 {
3212                   ptrdiff_t len = 1;
3213                   while (src < src_end)
3214                     {
3215                       src_base = src;
3216                       ONE_MORE_BYTE (c);
3217                       if (c < 0xA0)
3218                         {
3219                           src = src_base;
3220                           break;
3221                         }
3222                       len++;
3223                     }
3224
3225                   if (len & 1 && src < src_end)
3226                     {
3227                       rejected |= CATEGORY_MASK_ISO_8_2;
3228                       if (composition_count >= 0)
3229                         composition_count += len;
3230                     }
3231                   else
3232                     {
3233                       found |= CATEGORY_MASK_ISO_8_2;
3234                       if (composition_count >= 0)
3235                         composition_count += len / 2;
3236                     }
3237                 }
3238               break;
3239             }
3240         check_extra_latin:
3241           if (! VECTORP (Vlatin_extra_code_table)
3242               || NILP (AREF (Vlatin_extra_code_table, c)))
3243             {
3244               rejected = CATEGORY_MASK_ISO;
3245               break;
3246             }
3247           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3248               & CODING_ISO_FLAG_LATIN_EXTRA)
3249             found |= CATEGORY_MASK_ISO_8_1;
3250           else
3251             rejected |= CATEGORY_MASK_ISO_8_1;
3252           rejected |= CATEGORY_MASK_ISO_8_2;
3253           break;
3254         }
3255     }
3256   detect_info->rejected |= CATEGORY_MASK_ISO;
3257   return 0;
3258
3259  no_more_source:
3260   detect_info->rejected |= rejected;
3261   detect_info->found |= (found & ~rejected);
3262   return 1;
3263 }
3264
3265
3266 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3267    escape sequence should be kept.  */
3268 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3269   do {                                                                  \
3270     int id, prev;                                                       \
3271                                                                         \
3272     if (final < '0' || final >= 128                                     \
3273         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3274         || !SAFE_CHARSET_P (coding, id))                                \
3275       {                                                                 \
3276         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3277         chars_96 = -1;                                                  \
3278         break;                                                          \
3279       }                                                                 \
3280     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3281     if (id == charset_jisx0201_roman)                                   \
3282       {                                                                 \
3283         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3284           id = charset_ascii;                                           \
3285       }                                                                 \
3286     else if (id == charset_jisx0208_1978)                               \
3287       {                                                                 \
3288         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3289           id = charset_jisx0208;                                        \
3290       }                                                                 \
3291     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3292     /* If there was an invalid designation to REG previously, and this  \
3293        designation is ASCII to REG, we should keep this designation     \
3294        sequence.  */                                                    \
3295     if (prev == -2 && id == charset_ascii)                              \
3296       chars_96 = -1;                                                    \
3297   } while (0)
3298
3299
3300 /* Handle these composition sequence (ALT: alternate char):
3301
3302    (1) relative composition: ESC 0 CHAR ... ESC 1
3303    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3304    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3305    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3306
3307    When the start sequence (ESC 0/2/3/4) is found, this annotation
3308    header is produced.
3309
3310         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3311
3312    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3313    produced until the end sequence (ESC 1) is found:
3314
3315    (1) CHAR ... CHAR
3316    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3317    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3318    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3319
3320    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3321    annotation header is updated as below:
3322
3323    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3324    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3325    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3326    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3327
3328    If an error is found while composing, the annotation header is
3329    changed to:
3330
3331         [ ESC '0'/'2'/'3'/'4' -2 0 ]
3332
3333    and the sequence [ -2 DECODED-RULE ] is changed to the original
3334    byte sequence as below:
3335         o the original byte sequence is B: [ B -1 ]
3336         o the original byte sequence is B1 B2: [ B1 B2 ]
3337    and the sequence [ -1 -1 ] is changed to the original byte
3338    sequence:
3339         [ ESC '0' ]
3340 */
3341
3342 /* Decode a composition rule C1 and maybe one more byte from the
3343    source, and set RULE to the encoded composition rule.  If the rule
3344    is invalid, goto invalid_code.  */
3345
3346 #define DECODE_COMPOSITION_RULE(rule)                                   \
3347   do {                                                                  \
3348     rule = c1 - 32;                                                     \
3349     if (rule < 0)                                                       \
3350       goto invalid_code;                                                \
3351     if (rule < 81)              /* old format (before ver.21) */        \
3352       {                                                                 \
3353         int gref = (rule) / 9;                                          \
3354         int nref = (rule) % 9;                                          \
3355         if (gref == 4) gref = 10;                                       \
3356         if (nref == 4) nref = 10;                                       \
3357         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3358       }                                                                 \
3359     else                        /* new format (after ver.21) */         \
3360       {                                                                 \
3361         int b;                                                          \
3362                                                                         \
3363         ONE_MORE_BYTE (b);                                              \
3364         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3365           goto invalid_code;                                            \
3366         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3367         rule += 0x100;   /* Distinguish it from the old format.  */     \
3368       }                                                                 \
3369   } while (0)
3370
3371 #define ENCODE_COMPOSITION_RULE(rule)                           \
3372   do {                                                          \
3373     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3374                                                                 \
3375     if (rule < 0x100)           /* old format */                \
3376       {                                                         \
3377         if (gref == 10) gref = 4;                               \
3378         if (nref == 10) nref = 4;                               \
3379         charbuf[idx] = 32 + gref * 9 + nref;                    \
3380         charbuf[idx + 1] = -1;                                  \
3381         new_chars++;                                            \
3382       }                                                         \
3383     else                                /* new format */        \
3384       {                                                         \
3385         charbuf[idx] = 32 + 81 + gref;                          \
3386         charbuf[idx + 1] = 32 + nref;                           \
3387         new_chars += 2;                                         \
3388       }                                                         \
3389   } while (0)
3390
3391 /* Finish the current composition as invalid.  */
3392
3393 static int
3394 finish_composition (int *charbuf, struct composition_status *cmp_status)
3395 {
3396   int idx = - cmp_status->length;
3397   int new_chars;
3398
3399   /* Recover the original ESC sequence */
3400   charbuf[idx++] = ISO_CODE_ESC;
3401   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3402                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3403                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3404                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3405                     : '4');
3406   charbuf[idx++] = -2;
3407   charbuf[idx++] = 0;
3408   charbuf[idx++] = -1;
3409   new_chars = cmp_status->nchars;
3410   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3411     for (; idx < 0; idx++)
3412       {
3413         int elt = charbuf[idx];
3414
3415         if (elt == -2)
3416           {
3417             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3418             idx++;
3419           }
3420         else if (elt == -1)
3421           {
3422             charbuf[idx++] = ISO_CODE_ESC;
3423             charbuf[idx] = '0';
3424             new_chars += 2;
3425           }
3426       }
3427   cmp_status->state = COMPOSING_NO;
3428   return new_chars;
3429 }
3430
3431 /* If characters are under composition, finish the composition.  */
3432 #define MAYBE_FINISH_COMPOSITION()                              \
3433   do {                                                          \
3434     if (cmp_status->state != COMPOSING_NO)                      \
3435       char_offset += finish_composition (charbuf, cmp_status);  \
3436   } while (0)
3437
3438 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3439
3440    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3441    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3442    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3443    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3444
3445    Produce this annotation sequence now:
3446
3447    [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3448 */
3449
3450 #define DECODE_COMPOSITION_START(c1)                                       \
3451   do {                                                                     \
3452     if (c1 == '0'                                                          \
3453         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3454              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3455             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3456                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3457       {                                                                    \
3458         *charbuf++ = -1;                                                   \
3459         *charbuf++= -1;                                                    \
3460         cmp_status->state = COMPOSING_CHAR;                                \
3461         cmp_status->length += 2;                                           \
3462       }                                                                    \
3463     else                                                                   \
3464       {                                                                    \
3465         MAYBE_FINISH_COMPOSITION ();                                       \
3466         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3467                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3468                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3469                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3470         cmp_status->state                                                  \
3471           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3472         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3473         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3474         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3475         coding->annotated = 1;                                             \
3476       }                                                                    \
3477   } while (0)
3478
3479
3480 /* Handle composition end sequence ESC 1.  */
3481
3482 #define DECODE_COMPOSITION_END()                                        \
3483   do {                                                                  \
3484     if (cmp_status->nchars == 0                                         \
3485         || ((cmp_status->state == COMPOSING_CHAR)                       \
3486             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3487       {                                                                 \
3488         MAYBE_FINISH_COMPOSITION ();                                    \
3489         goto invalid_code;                                              \
3490       }                                                                 \
3491     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3492       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3493     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3494       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3495     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3496     char_offset += cmp_status->nchars;                                  \
3497     cmp_status->state = COMPOSING_NO;                                   \
3498   } while (0)
3499
3500 /* Store a composition rule RULE in charbuf, and update cmp_status.  */
3501
3502 #define STORE_COMPOSITION_RULE(rule)    \
3503   do {                                  \
3504     *charbuf++ = -2;                    \
3505     *charbuf++ = rule;                  \
3506     cmp_status->length += 2;            \
3507     cmp_status->state--;                \
3508   } while (0)
3509
3510 /* Store a composed char or a component char C in charbuf, and update
3511    cmp_status.  */
3512
3513 #define STORE_COMPOSITION_CHAR(c)                                       \
3514   do {                                                                  \
3515     *charbuf++ = (c);                                                   \
3516     cmp_status->length++;                                               \
3517     if (cmp_status->state == COMPOSING_CHAR)                            \
3518       cmp_status->nchars++;                                             \
3519     else                                                                \
3520       cmp_status->ncomps++;                                             \
3521     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3522         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3523             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3524       cmp_status->state++;                                              \
3525   } while (0)
3526
3527
3528 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3529
3530 static void
3531 decode_coding_iso_2022 (struct coding_system *coding)
3532 {
3533   const unsigned char *src = coding->source + coding->consumed;
3534   const unsigned char *src_end = coding->source + coding->src_bytes;
3535   const unsigned char *src_base;
3536   int *charbuf = coding->charbuf + coding->charbuf_used;
3537   /* We may produce two annotations (charset and composition) in one
3538      loop and one more charset annotation at the end.  */
3539   int *charbuf_end
3540     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3541   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3542   bool multibytep = coding->src_multibyte;
3543   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3544   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3545   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3546   int charset_id_2, charset_id_3;
3547   struct charset *charset;
3548   int c;
3549   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3550   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3551   ptrdiff_t char_offset = coding->produced_char;
3552   ptrdiff_t last_offset = char_offset;
3553   int last_id = charset_ascii;
3554   bool eol_dos
3555     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3556   int byte_after_cr = -1;
3557   int i;
3558
3559   setup_iso_safe_charsets (attrs);
3560   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3561
3562   if (cmp_status->state != COMPOSING_NO)
3563     {
3564       if (charbuf_end - charbuf < cmp_status->length)
3565         emacs_abort ();
3566       for (i = 0; i < cmp_status->length; i++)
3567         *charbuf++ = cmp_status->carryover[i];
3568       coding->annotated = 1;
3569     }
3570
3571   while (1)
3572     {
3573       int c1, c2, c3;
3574
3575       src_base = src;
3576       consumed_chars_base = consumed_chars;
3577
3578       if (charbuf >= charbuf_end)
3579         {
3580           if (byte_after_cr >= 0)
3581             src_base--;
3582           break;
3583         }
3584
3585       if (byte_after_cr >= 0)
3586         c1 = byte_after_cr, byte_after_cr = -1;
3587       else
3588         ONE_MORE_BYTE (c1);
3589       if (c1 < 0)
3590         goto invalid_code;
3591
3592       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3593         {
3594           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3595           char_offset++;
3596           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3597           continue;
3598         }
3599
3600       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3601         {
3602           if (c1 == ISO_CODE_ESC)
3603             {
3604               if (src + 1 >= src_end)
3605                 goto no_more_source;
3606               *charbuf++ = ISO_CODE_ESC;
3607               char_offset++;
3608               if (src[0] == '%' && src[1] == '@')
3609                 {
3610                   src += 2;
3611                   consumed_chars += 2;
3612                   char_offset += 2;
3613                   /* We are sure charbuf can contain two more chars. */
3614                   *charbuf++ = '%';
3615                   *charbuf++ = '@';
3616                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3617                 }
3618             }
3619           else
3620             {
3621               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3622               char_offset++;
3623             }
3624           continue;
3625         }
3626
3627       if ((cmp_status->state == COMPOSING_RULE
3628            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3629           && c1 != ISO_CODE_ESC)
3630         {
3631           int rule;
3632
3633           DECODE_COMPOSITION_RULE (rule);
3634           STORE_COMPOSITION_RULE (rule);
3635           continue;
3636         }
3637
3638       /* We produce at most one character.  */
3639       switch (iso_code_class [c1])
3640         {
3641         case ISO_0x20_or_0x7F:
3642           if (charset_id_0 < 0
3643               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3644             /* This is SPACE or DEL.  */
3645             charset = CHARSET_FROM_ID (charset_ascii);
3646           else
3647             charset = CHARSET_FROM_ID (charset_id_0);
3648           break;
3649
3650         case ISO_graphic_plane_0:
3651           if (charset_id_0 < 0)
3652             charset = CHARSET_FROM_ID (charset_ascii);
3653           else
3654             charset = CHARSET_FROM_ID (charset_id_0);
3655           break;
3656
3657         case ISO_0xA0_or_0xFF:
3658           if (charset_id_1 < 0
3659               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3660               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3661             goto invalid_code;
3662           /* This is a graphic character, we fall down ... */
3663
3664         case ISO_graphic_plane_1:
3665           if (charset_id_1 < 0)
3666             goto invalid_code;
3667           charset = CHARSET_FROM_ID (charset_id_1);
3668           break;
3669
3670         case ISO_control_0:
3671           if (eol_dos && c1 == '\r')
3672             ONE_MORE_BYTE (byte_after_cr);
3673           MAYBE_FINISH_COMPOSITION ();
3674           charset = CHARSET_FROM_ID (charset_ascii);
3675           break;
3676
3677         case ISO_control_1:
3678           goto invalid_code;
3679
3680         case ISO_shift_out:
3681           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3682               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3683             goto invalid_code;
3684           CODING_ISO_INVOCATION (coding, 0) = 1;
3685           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3686           continue;
3687
3688         case ISO_shift_in:
3689           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3690             goto invalid_code;
3691           CODING_ISO_INVOCATION (coding, 0) = 0;
3692           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3693           continue;
3694
3695         case ISO_single_shift_2_7:
3696           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3697             goto invalid_code;
3698         case ISO_single_shift_2:
3699           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3700             goto invalid_code;
3701           /* SS2 is handled as an escape sequence of ESC 'N' */
3702           c1 = 'N';
3703           goto label_escape_sequence;
3704
3705         case ISO_single_shift_3:
3706           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3707             goto invalid_code;
3708           /* SS2 is handled as an escape sequence of ESC 'O' */
3709           c1 = 'O';
3710           goto label_escape_sequence;
3711
3712         case ISO_control_sequence_introducer:
3713           /* CSI is handled as an escape sequence of ESC '[' ...  */
3714           c1 = '[';
3715           goto label_escape_sequence;
3716
3717         case ISO_escape:
3718           ONE_MORE_BYTE (c1);
3719         label_escape_sequence:
3720           /* Escape sequences handled here are invocation,
3721              designation, direction specification, and character
3722              composition specification.  */
3723           switch (c1)
3724             {
3725             case '&':           /* revision of following character set */
3726               ONE_MORE_BYTE (c1);
3727               if (!(c1 >= '@' && c1 <= '~'))
3728                 goto invalid_code;
3729               ONE_MORE_BYTE (c1);
3730               if (c1 != ISO_CODE_ESC)
3731                 goto invalid_code;
3732               ONE_MORE_BYTE (c1);
3733               goto label_escape_sequence;
3734
3735             case '$':           /* designation of 2-byte character set */
3736               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3737                 goto invalid_code;
3738               {
3739                 int reg, chars96;
3740
3741                 ONE_MORE_BYTE (c1);
3742                 if (c1 >= '@' && c1 <= 'B')
3743                   {     /* designation of JISX0208.1978, GB2312.1980,
3744                            or JISX0208.1980 */
3745                     reg = 0, chars96 = 0;
3746                   }
3747                 else if (c1 >= 0x28 && c1 <= 0x2B)
3748                   { /* designation of DIMENSION2_CHARS94 character set */
3749                     reg = c1 - 0x28, chars96 = 0;
3750                     ONE_MORE_BYTE (c1);
3751                   }
3752                 else if (c1 >= 0x2C && c1 <= 0x2F)
3753                   { /* designation of DIMENSION2_CHARS96 character set */
3754                     reg = c1 - 0x2C, chars96 = 1;
3755                     ONE_MORE_BYTE (c1);
3756                   }
3757                 else
3758                   goto invalid_code;
3759                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3760                 /* We must update these variables now.  */
3761                 if (reg == 0)
3762                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3763                 else if (reg == 1)
3764                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3765                 if (chars96 < 0)
3766                   goto invalid_code;
3767               }
3768               continue;
3769
3770             case 'n':           /* invocation of locking-shift-2 */
3771               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3772                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3773                 goto invalid_code;
3774               CODING_ISO_INVOCATION (coding, 0) = 2;
3775               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3776               continue;
3777
3778             case 'o':           /* invocation of locking-shift-3 */
3779               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3780                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3781                 goto invalid_code;
3782               CODING_ISO_INVOCATION (coding, 0) = 3;
3783               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3784               continue;
3785
3786             case 'N':           /* invocation of single-shift-2 */
3787               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3788                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3789                 goto invalid_code;
3790               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3791               if (charset_id_2 < 0)
3792                 charset = CHARSET_FROM_ID (charset_ascii);
3793               else
3794                 charset = CHARSET_FROM_ID (charset_id_2);
3795               ONE_MORE_BYTE (c1);
3796               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3797                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3798                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3799                           ? c1 >= 0x80 : c1 < 0x80)))
3800                 goto invalid_code;
3801               break;
3802
3803             case 'O':           /* invocation of single-shift-3 */
3804               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3805                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3806                 goto invalid_code;
3807               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3808               if (charset_id_3 < 0)
3809                 charset = CHARSET_FROM_ID (charset_ascii);
3810               else
3811                 charset = CHARSET_FROM_ID (charset_id_3);
3812               ONE_MORE_BYTE (c1);
3813               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3814                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3815                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3816                           ? c1 >= 0x80 : c1 < 0x80)))
3817                 goto invalid_code;
3818               break;
3819
3820             case '0': case '2': case '3': case '4': /* start composition */
3821               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3822                 goto invalid_code;
3823               if (last_id != charset_ascii)
3824                 {
3825                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3826                   last_id = charset_ascii;
3827                   last_offset = char_offset;
3828                 }
3829               DECODE_COMPOSITION_START (c1);
3830               continue;
3831
3832             case '1':           /* end composition */
3833               if (cmp_status->state == COMPOSING_NO)
3834                 goto invalid_code;
3835               DECODE_COMPOSITION_END ();
3836               continue;
3837
3838             case '[':           /* specification of direction */
3839               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3840                 goto invalid_code;
3841               /* For the moment, nested direction is not supported.
3842                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3843                  left-to-right, and nonzero means right-to-left.  */
3844               ONE_MORE_BYTE (c1);
3845               switch (c1)
3846                 {
3847                 case ']':       /* end of the current direction */
3848                   coding->mode &= ~CODING_MODE_DIRECTION;
3849
3850                 case '0':       /* end of the current direction */
3851                 case '1':       /* start of left-to-right direction */
3852                   ONE_MORE_BYTE (c1);
3853                   if (c1 == ']')
3854                     coding->mode &= ~CODING_MODE_DIRECTION;
3855                   else
3856                     goto invalid_code;
3857                   break;
3858
3859                 case '2':       /* start of right-to-left direction */
3860                   ONE_MORE_BYTE (c1);
3861                   if (c1 == ']')
3862                     coding->mode |= CODING_MODE_DIRECTION;
3863                   else
3864                     goto invalid_code;
3865                   break;
3866
3867                 default:
3868                   goto invalid_code;
3869                 }
3870               continue;
3871
3872             case '%':
3873               ONE_MORE_BYTE (c1);
3874               if (c1 == '/')
3875                 {
3876                   /* CTEXT extended segment:
3877                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3878                      We keep these bytes as is for the moment.
3879                      They may be decoded by post-read-conversion.  */
3880                   int dim, M, L;
3881                   int size;
3882
3883                   ONE_MORE_BYTE (dim);
3884                   if (dim < '0' || dim > '4')
3885                     goto invalid_code;
3886                   ONE_MORE_BYTE (M);
3887                   if (M < 128)
3888                     goto invalid_code;
3889                   ONE_MORE_BYTE (L);
3890                   if (L < 128)
3891                     goto invalid_code;
3892                   size = ((M - 128) * 128) + (L - 128);
3893                   if (charbuf + 6 > charbuf_end)
3894                     goto break_loop;
3895                   *charbuf++ = ISO_CODE_ESC;
3896                   *charbuf++ = '%';
3897                   *charbuf++ = '/';
3898                   *charbuf++ = dim;
3899                   *charbuf++ = BYTE8_TO_CHAR (M);
3900                   *charbuf++ = BYTE8_TO_CHAR (L);
3901                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3902                 }
3903               else if (c1 == 'G')
3904                 {
3905                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3906                      ESC % G --UTF-8-BYTES-- ESC % @
3907                      We keep these bytes as is for the moment.
3908                      They may be decoded by post-read-conversion.  */
3909                   if (charbuf + 3 > charbuf_end)
3910                     goto break_loop;
3911                   *charbuf++ = ISO_CODE_ESC;
3912                   *charbuf++ = '%';
3913                   *charbuf++ = 'G';
3914                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3915                 }
3916               else
3917                 goto invalid_code;
3918               continue;
3919               break;
3920
3921             default:
3922               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3923                 goto invalid_code;
3924               {
3925                 int reg, chars96;
3926
3927                 if (c1 >= 0x28 && c1 <= 0x2B)
3928                   { /* designation of DIMENSION1_CHARS94 character set */
3929                     reg = c1 - 0x28, chars96 = 0;
3930                     ONE_MORE_BYTE (c1);
3931                   }
3932                 else if (c1 >= 0x2C && c1 <= 0x2F)
3933                   { /* designation of DIMENSION1_CHARS96 character set */
3934                     reg = c1 - 0x2C, chars96 = 1;
3935                     ONE_MORE_BYTE (c1);
3936                   }
3937                 else
3938                   goto invalid_code;
3939                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3940                 /* We must update these variables now.  */
3941                 if (reg == 0)
3942                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3943                 else if (reg == 1)
3944                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3945                 if (chars96 < 0)
3946                   goto invalid_code;
3947               }
3948               continue;
3949             }
3950           break;
3951
3952         default:
3953           emacs_abort ();
3954         }
3955
3956       if (cmp_status->state == COMPOSING_NO
3957           && charset->id != charset_ascii
3958           && last_id != charset->id)
3959         {
3960           if (last_id != charset_ascii)
3961             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3962           last_id = charset->id;
3963           last_offset = char_offset;
3964         }
3965
3966       /* Now we know CHARSET and 1st position code C1 of a character.
3967          Produce a decoded character while getting 2nd and 3rd
3968          position codes C2, C3 if necessary.  */
3969       if (CHARSET_DIMENSION (charset) > 1)
3970         {
3971           ONE_MORE_BYTE (c2);
3972           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3973               || ((c1 & 0x80) != (c2 & 0x80)))
3974             /* C2 is not in a valid range.  */
3975             goto invalid_code;
3976           if (CHARSET_DIMENSION (charset) == 2)
3977             c1 = (c1 << 8) | c2;
3978           else
3979             {
3980               ONE_MORE_BYTE (c3);
3981               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3982                   || ((c1 & 0x80) != (c3 & 0x80)))
3983                 /* C3 is not in a valid range.  */
3984                 goto invalid_code;
3985               c1 = (c1 << 16) | (c2 << 8) | c2;
3986             }
3987         }
3988       c1 &= 0x7F7F7F;
3989       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3990       if (c < 0)
3991         {
3992           MAYBE_FINISH_COMPOSITION ();
3993           for (; src_base < src; src_base++, char_offset++)
3994             {
3995               if (ASCII_BYTE_P (*src_base))
3996                 *charbuf++ = *src_base;
3997               else
3998                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3999             }
4000         }
4001       else if (cmp_status->state == COMPOSING_NO)
4002         {
4003           *charbuf++ = c;
4004           char_offset++;
4005         }
4006       else if ((cmp_status->state == COMPOSING_CHAR
4007                 ? cmp_status->nchars
4008                 : cmp_status->ncomps)
4009                >= MAX_COMPOSITION_COMPONENTS)
4010         {
4011           /* Too long composition.  */
4012           MAYBE_FINISH_COMPOSITION ();
4013           *charbuf++ = c;
4014           char_offset++;
4015         }
4016       else
4017         STORE_COMPOSITION_CHAR (c);
4018       continue;
4019
4020     invalid_code:
4021       MAYBE_FINISH_COMPOSITION ();
4022       src = src_base;
4023       consumed_chars = consumed_chars_base;
4024       ONE_MORE_BYTE (c);
4025       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4026       char_offset++;
4027       coding->errors++;
4028       /* Reset the invocation and designation status to the safest
4029          one; i.e. designate ASCII to the graphic register 0, and
4030          invoke that register to the graphic plane 0.  This typically
4031          helps the case that an designation sequence for ASCII "ESC (
4032          B" is somehow broken (e.g. broken by a newline).  */
4033       CODING_ISO_INVOCATION (coding, 0) = 0;
4034       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
4035       charset_id_0 = charset_ascii;
4036       continue;
4037
4038     break_loop:
4039       break;
4040     }
4041
4042  no_more_source:
4043   if (cmp_status->state != COMPOSING_NO)
4044     {
4045       if (coding->mode & CODING_MODE_LAST_BLOCK)
4046         MAYBE_FINISH_COMPOSITION ();
4047       else
4048         {
4049           charbuf -= cmp_status->length;
4050           for (i = 0; i < cmp_status->length; i++)
4051             cmp_status->carryover[i] = charbuf[i];
4052         }
4053     }
4054   else if (last_id != charset_ascii)
4055     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4056   coding->consumed_char += consumed_chars_base;
4057   coding->consumed = src_base - coding->source;
4058   coding->charbuf_used = charbuf - coding->charbuf;
4059 }
4060
4061
4062 /* ISO2022 encoding stuff.  */
4063
4064 /*
4065    It is not enough to say just "ISO2022" on encoding, we have to
4066    specify more details.  In Emacs, each coding system of ISO2022
4067    variant has the following specifications:
4068         1. Initial designation to G0 thru G3.
4069         2. Allows short-form designation?
4070         3. ASCII should be designated to G0 before control characters?
4071         4. ASCII should be designated to G0 at end of line?
4072         5. 7-bit environment or 8-bit environment?
4073         6. Use locking-shift?
4074         7. Use Single-shift?
4075    And the following two are only for Japanese:
4076         8. Use ASCII in place of JIS0201-1976-Roman?
4077         9. Use JISX0208-1983 in place of JISX0208-1978?
4078    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4079    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4080    details.
4081 */
4082
4083 /* Produce codes (escape sequence) for designating CHARSET to graphic
4084    register REG at DST, and increment DST.  If <final-char> of CHARSET is
4085    '@', 'A', or 'B' and the coding system CODING allows, produce
4086    designation sequence of short-form.  */
4087
4088 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4089   do {                                                                  \
4090     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
4091     const char *intermediate_char_94 = "()*+";                          \
4092     const char *intermediate_char_96 = ",-./";                          \
4093     int revision = -1;                                                  \
4094                                                                         \
4095     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4096       revision = CHARSET_ISO_REVISION (charset);                        \
4097                                                                         \
4098     if (revision >= 0)                                                  \
4099       {                                                                 \
4100         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4101         EMIT_ONE_BYTE ('@' + revision);                                 \
4102       }                                                                 \
4103     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4104     if (CHARSET_DIMENSION (charset) == 1)                               \
4105       {                                                                 \
4106         int b;                                                          \
4107         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4108           b = intermediate_char_94[reg];                                \
4109         else                                                            \
4110           b = intermediate_char_96[reg];                                \
4111         EMIT_ONE_ASCII_BYTE (b);                                        \
4112       }                                                                 \
4113     else                                                                \
4114       {                                                                 \
4115         EMIT_ONE_ASCII_BYTE ('$');                                      \
4116         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4117           {                                                             \
4118             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4119                 || reg != 0                                             \
4120                 || final_char < '@' || final_char > 'B')                \
4121               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4122           }                                                             \
4123         else                                                            \
4124           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4125       }                                                                 \
4126     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4127                                                                         \
4128     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4129   } while (0)
4130
4131
4132 /* The following two macros produce codes (control character or escape
4133    sequence) for ISO2022 single-shift functions (single-shift-2 and
4134    single-shift-3).  */
4135
4136 #define ENCODE_SINGLE_SHIFT_2                                           \
4137   do {                                                                  \
4138     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4139       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4140     else                                                                \
4141       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4142     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4143   } while (0)
4144
4145
4146 #define ENCODE_SINGLE_SHIFT_3                                           \
4147   do {                                                                  \
4148     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4149       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4150     else                                                                \
4151       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4152     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4153   } while (0)
4154
4155
4156 /* The following four macros produce codes (control character or
4157    escape sequence) for ISO2022 locking-shift functions (shift-in,
4158    shift-out, locking-shift-2, and locking-shift-3).  */
4159
4160 #define ENCODE_SHIFT_IN                                 \
4161   do {                                                  \
4162     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4163     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4164   } while (0)
4165
4166
4167 #define ENCODE_SHIFT_OUT                                \
4168   do {                                                  \
4169     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4170     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4171   } while (0)
4172
4173
4174 #define ENCODE_LOCKING_SHIFT_2                          \
4175   do {                                                  \
4176     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4177     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4178   } while (0)
4179
4180
4181 #define ENCODE_LOCKING_SHIFT_3                          \
4182   do {                                                  \
4183     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4184     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4185   } while (0)
4186
4187
4188 /* Produce codes for a DIMENSION1 character whose character set is
4189    CHARSET and whose position-code is C1.  Designation and invocation
4190    sequences are also produced in advance if necessary.  */
4191
4192 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4193   do {                                                                  \
4194     int id = CHARSET_ID (charset);                                      \
4195                                                                         \
4196     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4197         && id == charset_ascii)                                         \
4198       {                                                                 \
4199         id = charset_jisx0201_roman;                                    \
4200         charset = CHARSET_FROM_ID (id);                                 \
4201       }                                                                 \
4202                                                                         \
4203     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4204       {                                                                 \
4205         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4206           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4207         else                                                            \
4208           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4209         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4210         break;                                                          \
4211       }                                                                 \
4212     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4213       {                                                                 \
4214         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4215         break;                                                          \
4216       }                                                                 \
4217     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4218       {                                                                 \
4219         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4220         break;                                                          \
4221       }                                                                 \
4222     else                                                                \
4223       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4224          must invoke it, or, at first, designate it to some graphic     \
4225          register.  Then repeat the loop to actually produce the        \
4226          character.  */                                                 \
4227       dst = encode_invocation_designation (charset, coding, dst,        \
4228                                            &produced_chars);            \
4229   } while (1)
4230
4231
4232 /* Produce codes for a DIMENSION2 character whose character set is
4233    CHARSET and whose position-codes are C1 and C2.  Designation and
4234    invocation codes are also produced in advance if necessary.  */
4235
4236 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4237   do {                                                                  \
4238     int id = CHARSET_ID (charset);                                      \
4239                                                                         \
4240     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4241         && id == charset_jisx0208)                                      \
4242       {                                                                 \
4243         id = charset_jisx0208_1978;                                     \
4244         charset = CHARSET_FROM_ID (id);                                 \
4245       }                                                                 \
4246                                                                         \
4247     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4248       {                                                                 \
4249         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4250           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4251         else                                                            \
4252           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4253         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4254         break;                                                          \
4255       }                                                                 \
4256     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4257       {                                                                 \
4258         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4259         break;                                                          \
4260       }                                                                 \
4261     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4262       {                                                                 \
4263         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4264         break;                                                          \
4265       }                                                                 \
4266     else                                                                \
4267       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4268          must invoke it, or, at first, designate it to some graphic     \
4269          register.  Then repeat the loop to actually produce the        \
4270          character.  */                                                 \
4271       dst = encode_invocation_designation (charset, coding, dst,        \
4272                                            &produced_chars);            \
4273   } while (1)
4274
4275
4276 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4277   do {                                                                     \
4278     unsigned code;                                                         \
4279     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4280                                                                            \
4281     if (CHARSET_DIMENSION (charset) == 1)                                  \
4282       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4283     else                                                                   \
4284       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4285   } while (0)
4286
4287
4288 /* Produce designation and invocation codes at a place pointed by DST
4289    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4290    Return new DST.  */
4291
4292 static unsigned char *
4293 encode_invocation_designation (struct charset *charset,
4294                                struct coding_system *coding,
4295                                unsigned char *dst, ptrdiff_t *p_nchars)
4296 {
4297   bool multibytep = coding->dst_multibyte;
4298   ptrdiff_t produced_chars = *p_nchars;
4299   int reg;                      /* graphic register number */
4300   int id = CHARSET_ID (charset);
4301
4302   /* At first, check designations.  */
4303   for (reg = 0; reg < 4; reg++)
4304     if (id == CODING_ISO_DESIGNATION (coding, reg))
4305       break;
4306
4307   if (reg >= 4)
4308     {
4309       /* CHARSET is not yet designated to any graphic registers.  */
4310       /* At first check the requested designation.  */
4311       reg = CODING_ISO_REQUEST (coding, id);
4312       if (reg < 0)
4313         /* Since CHARSET requests no special designation, designate it
4314            to graphic register 0.  */
4315         reg = 0;
4316
4317       ENCODE_DESIGNATION (charset, reg, coding);
4318     }
4319
4320   if (CODING_ISO_INVOCATION (coding, 0) != reg
4321       && CODING_ISO_INVOCATION (coding, 1) != reg)
4322     {
4323       /* Since the graphic register REG is not invoked to any graphic
4324          planes, invoke it to graphic plane 0.  */
4325       switch (reg)
4326         {
4327         case 0:                 /* graphic register 0 */
4328           ENCODE_SHIFT_IN;
4329           break;
4330
4331         case 1:                 /* graphic register 1 */
4332           ENCODE_SHIFT_OUT;
4333           break;
4334
4335         case 2:                 /* graphic register 2 */
4336           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4337             ENCODE_SINGLE_SHIFT_2;
4338           else
4339             ENCODE_LOCKING_SHIFT_2;
4340           break;
4341
4342         case 3:                 /* graphic register 3 */
4343           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4344             ENCODE_SINGLE_SHIFT_3;
4345           else
4346             ENCODE_LOCKING_SHIFT_3;
4347           break;
4348         }
4349     }
4350
4351   *p_nchars = produced_chars;
4352   return dst;
4353 }
4354
4355
4356 /* Produce codes for designation and invocation to reset the graphic
4357    planes and registers to initial state.  */
4358 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4359   do {                                                                  \
4360     int reg;                                                            \
4361     struct charset *charset;                                            \
4362                                                                         \
4363     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4364       ENCODE_SHIFT_IN;                                                  \
4365     for (reg = 0; reg < 4; reg++)                                       \
4366       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4367           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4368               != CODING_ISO_INITIAL (coding, reg)))                     \
4369         {                                                               \
4370           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4371           ENCODE_DESIGNATION (charset, reg, coding);                    \
4372         }                                                               \
4373   } while (0)
4374
4375
4376 /* Produce designation sequences of charsets in the line started from
4377    CHARBUF to a place pointed by DST, and return the number of
4378    produced bytes.  DST should not directly point a buffer text area
4379    which may be relocated by char_charset call.
4380
4381    If the current block ends before any end-of-line, we may fail to
4382    find all the necessary designations.  */
4383
4384 static ptrdiff_t
4385 encode_designation_at_bol (struct coding_system *coding,
4386                            int *charbuf, int *charbuf_end,
4387                            unsigned char *dst)
4388 {
4389   unsigned char *orig = dst;
4390   struct charset *charset;
4391   /* Table of charsets to be designated to each graphic register.  */
4392   int r[4];
4393   int c, found = 0, reg;
4394   ptrdiff_t produced_chars = 0;
4395   bool multibytep = coding->dst_multibyte;
4396   Lisp_Object attrs;
4397   Lisp_Object charset_list;
4398
4399   attrs = CODING_ID_ATTRS (coding->id);
4400   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4401   if (EQ (charset_list, Qiso_2022))
4402     charset_list = Viso_2022_charset_list;
4403
4404   for (reg = 0; reg < 4; reg++)
4405     r[reg] = -1;
4406
4407   while (charbuf < charbuf_end && found < 4)
4408     {
4409       int id;
4410
4411       c = *charbuf++;
4412       if (c == '\n')
4413         break;
4414       charset = char_charset (c, charset_list, NULL);
4415       id = CHARSET_ID (charset);
4416       reg = CODING_ISO_REQUEST (coding, id);
4417       if (reg >= 0 && r[reg] < 0)
4418         {
4419           found++;
4420           r[reg] = id;
4421         }
4422     }
4423
4424   if (found)
4425     {
4426       for (reg = 0; reg < 4; reg++)
4427         if (r[reg] >= 0
4428             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4429           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4430     }
4431
4432   return dst - orig;
4433 }
4434
4435 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4436
4437 static bool
4438 encode_coding_iso_2022 (struct coding_system *coding)
4439 {
4440   bool multibytep = coding->dst_multibyte;
4441   int *charbuf = coding->charbuf;
4442   int *charbuf_end = charbuf + coding->charbuf_used;
4443   unsigned char *dst = coding->destination + coding->produced;
4444   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4445   int safe_room = 16;
4446   bool bol_designation
4447     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4448        && CODING_ISO_BOL (coding));
4449   ptrdiff_t produced_chars = 0;
4450   Lisp_Object attrs, eol_type, charset_list;
4451   bool ascii_compatible;
4452   int c;
4453   int preferred_charset_id = -1;
4454
4455   CODING_GET_INFO (coding, attrs, charset_list);
4456   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4457   if (VECTORP (eol_type))
4458     eol_type = Qunix;
4459
4460   setup_iso_safe_charsets (attrs);
4461   /* Charset list may have been changed.  */
4462   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4463   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4464
4465   ascii_compatible
4466     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4467        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4468                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4469
4470   while (charbuf < charbuf_end)
4471     {
4472       ASSURE_DESTINATION (safe_room);
4473
4474       if (bol_designation)
4475         {
4476           /* We have to produce designation sequences if any now.  */
4477           unsigned char desig_buf[16];
4478           ptrdiff_t nbytes;
4479           ptrdiff_t offset;
4480
4481           charset_map_loaded = 0;
4482           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4483                                               desig_buf);
4484           if (charset_map_loaded
4485               && (offset = coding_change_destination (coding)))
4486             {
4487               dst += offset;
4488               dst_end += offset;
4489             }
4490           memcpy (dst, desig_buf, nbytes);
4491           dst += nbytes;
4492           /* We are sure that designation sequences are all ASCII bytes.  */
4493           produced_chars += nbytes;
4494           bol_designation = 0;
4495           ASSURE_DESTINATION (safe_room);
4496         }
4497
4498       c = *charbuf++;
4499
4500       if (c < 0)
4501         {
4502           /* Handle an annotation.  */
4503           switch (*charbuf)
4504             {
4505             case CODING_ANNOTATE_COMPOSITION_MASK:
4506               /* Not yet implemented.  */
4507               break;
4508             case CODING_ANNOTATE_CHARSET_MASK:
4509               preferred_charset_id = charbuf[2];
4510               if (preferred_charset_id >= 0
4511                   && NILP (Fmemq (make_number (preferred_charset_id),
4512                                   charset_list)))
4513                 preferred_charset_id = -1;
4514               break;
4515             default:
4516               emacs_abort ();
4517             }
4518           charbuf += -c - 1;
4519           continue;
4520         }
4521
4522       /* Now encode the character C.  */
4523       if (c < 0x20 || c == 0x7F)
4524         {
4525           if (c == '\n'
4526               || (c == '\r' && EQ (eol_type, Qmac)))
4527             {
4528               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4529                 ENCODE_RESET_PLANE_AND_REGISTER ();
4530               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4531                 {
4532                   int i;
4533
4534                   for (i = 0; i < 4; i++)
4535                     CODING_ISO_DESIGNATION (coding, i)
4536                       = CODING_ISO_INITIAL (coding, i);
4537                 }
4538               bol_designation = ((CODING_ISO_FLAGS (coding)
4539                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4540                                  != 0);
4541             }
4542           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4543             ENCODE_RESET_PLANE_AND_REGISTER ();
4544           EMIT_ONE_ASCII_BYTE (c);
4545         }
4546       else if (ASCII_CHAR_P (c))
4547         {
4548           if (ascii_compatible)
4549             EMIT_ONE_ASCII_BYTE (c);
4550           else
4551             {
4552               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4553               ENCODE_ISO_CHARACTER (charset, c);
4554             }
4555         }
4556       else if (CHAR_BYTE8_P (c))
4557         {
4558           c = CHAR_TO_BYTE8 (c);
4559           EMIT_ONE_BYTE (c);
4560         }
4561       else
4562         {
4563           struct charset *charset;
4564
4565           if (preferred_charset_id >= 0)
4566             {
4567               bool result;
4568
4569               charset = CHARSET_FROM_ID (preferred_charset_id);
4570               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4571               if (! result)
4572                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4573                                      NULL, charset);
4574             }
4575           else
4576             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4577                                  NULL, charset);
4578           if (!charset)
4579             {
4580               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4581                 {
4582                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4583                   charset = CHARSET_FROM_ID (charset_ascii);
4584                 }
4585               else
4586                 {
4587                   c = coding->default_char;
4588                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4589                                        charset_list, NULL, charset);
4590                 }
4591             }
4592           ENCODE_ISO_CHARACTER (charset, c);
4593         }
4594     }
4595
4596   if (coding->mode & CODING_MODE_LAST_BLOCK
4597       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4598     {
4599       ASSURE_DESTINATION (safe_room);
4600       ENCODE_RESET_PLANE_AND_REGISTER ();
4601     }
4602   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4603   CODING_ISO_BOL (coding) = bol_designation;
4604   coding->produced_char += produced_chars;
4605   coding->produced = dst - coding->destination;
4606   return 0;
4607 }
4608
4609 \f
4610 /*** 8,9. SJIS and BIG5 handlers ***/
4611
4612 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4613    quite widely.  So, for the moment, Emacs supports them in the bare
4614    C code.  But, in the future, they may be supported only by CCL.  */
4615
4616 /* SJIS is a coding system encoding three character sets: ASCII, right
4617    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4618    as is.  A character of charset katakana-jisx0201 is encoded by
4619    "position-code + 0x80".  A character of charset japanese-jisx0208
4620    is encoded in 2-byte but two position-codes are divided and shifted
4621    so that it fit in the range below.
4622
4623    --- CODE RANGE of SJIS ---
4624    (character set)      (range)
4625    ASCII                0x00 .. 0x7F
4626    KATAKANA-JISX0201    0xA0 .. 0xDF
4627    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4628             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4629    -------------------------------
4630
4631 */
4632
4633 /* BIG5 is a coding system encoding two character sets: ASCII and
4634    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4635    character set and is encoded in two-byte.
4636
4637    --- CODE RANGE of BIG5 ---
4638    (character set)      (range)
4639    ASCII                0x00 .. 0x7F
4640    Big5 (1st byte)      0xA1 .. 0xFE
4641         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4642    --------------------------
4643
4644   */
4645
4646 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4647    Return true if a text is encoded in SJIS.  */
4648
4649 static bool
4650 detect_coding_sjis (struct coding_system *coding,
4651                     struct coding_detection_info *detect_info)
4652 {
4653   const unsigned char *src = coding->source, *src_base;
4654   const unsigned char *src_end = coding->source + coding->src_bytes;
4655   bool multibytep = coding->src_multibyte;
4656   ptrdiff_t consumed_chars = 0;
4657   int found = 0;
4658   int c;
4659   Lisp_Object attrs, charset_list;
4660   int max_first_byte_of_2_byte_code;
4661
4662   CODING_GET_INFO (coding, attrs, charset_list);
4663   max_first_byte_of_2_byte_code
4664     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4665
4666   detect_info->checked |= CATEGORY_MASK_SJIS;
4667   /* A coding system of this category is always ASCII compatible.  */
4668   src += coding->head_ascii;
4669
4670   while (1)
4671     {
4672       src_base = src;
4673       ONE_MORE_BYTE (c);
4674       if (c < 0x80)
4675         continue;
4676       if ((c >= 0x81 && c <= 0x9F)
4677           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4678         {
4679           ONE_MORE_BYTE (c);
4680           if (c < 0x40 || c == 0x7F || c > 0xFC)
4681             break;
4682           found = CATEGORY_MASK_SJIS;
4683         }
4684       else if (c >= 0xA0 && c < 0xE0)
4685         found = CATEGORY_MASK_SJIS;
4686       else
4687         break;
4688     }
4689   detect_info->rejected |= CATEGORY_MASK_SJIS;
4690   return 0;
4691
4692  no_more_source:
4693   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4694     {
4695       detect_info->rejected |= CATEGORY_MASK_SJIS;
4696       return 0;
4697     }
4698   detect_info->found |= found;
4699   return 1;
4700 }
4701
4702 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4703    Return true if a text is encoded in BIG5.  */
4704
4705 static bool
4706 detect_coding_big5 (struct coding_system *coding,
4707                     struct coding_detection_info *detect_info)
4708 {
4709   const unsigned char *src = coding->source, *src_base;
4710   const unsigned char *src_end = coding->source + coding->src_bytes;
4711   bool multibytep = coding->src_multibyte;
4712   ptrdiff_t consumed_chars = 0;
4713   int found = 0;
4714   int c;
4715
4716   detect_info->checked |= CATEGORY_MASK_BIG5;
4717   /* A coding system of this category is always ASCII compatible.  */
4718   src += coding->head_ascii;
4719
4720   while (1)
4721     {
4722       src_base = src;
4723       ONE_MORE_BYTE (c);
4724       if (c < 0x80)
4725         continue;
4726       if (c >= 0xA1)
4727         {
4728           ONE_MORE_BYTE (c);
4729           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4730             return 0;
4731           found = CATEGORY_MASK_BIG5;
4732         }
4733       else
4734         break;
4735     }
4736   detect_info->rejected |= CATEGORY_MASK_BIG5;
4737   return 0;
4738
4739  no_more_source:
4740   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4741     {
4742       detect_info->rejected |= CATEGORY_MASK_BIG5;
4743       return 0;
4744     }
4745   detect_info->found |= found;
4746   return 1;
4747 }
4748
4749 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4750
4751 static void
4752 decode_coding_sjis (struct coding_system *coding)
4753 {
4754   const unsigned char *src = coding->source + coding->consumed;
4755   const unsigned char *src_end = coding->source + coding->src_bytes;
4756   const unsigned char *src_base;
4757   int *charbuf = coding->charbuf + coding->charbuf_used;
4758   /* We may produce one charset annotation in one loop and one more at
4759      the end.  */
4760   int *charbuf_end
4761     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4762   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4763   bool multibytep = coding->src_multibyte;
4764   struct charset *charset_roman, *charset_kanji, *charset_kana;
4765   struct charset *charset_kanji2;
4766   Lisp_Object attrs, charset_list, val;
4767   ptrdiff_t char_offset = coding->produced_char;
4768   ptrdiff_t last_offset = char_offset;
4769   int last_id = charset_ascii;
4770   bool eol_dos
4771     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4772   int byte_after_cr = -1;
4773
4774   CODING_GET_INFO (coding, attrs, charset_list);
4775
4776   val = charset_list;
4777   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4778   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4779   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4780   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4781
4782   while (1)
4783     {
4784       int c, c1;
4785       struct charset *charset;
4786
4787       src_base = src;
4788       consumed_chars_base = consumed_chars;
4789
4790       if (charbuf >= charbuf_end)
4791         {
4792           if (byte_after_cr >= 0)
4793             src_base--;
4794           break;
4795         }
4796
4797       if (byte_after_cr >= 0)
4798         c = byte_after_cr, byte_after_cr = -1;
4799       else
4800         ONE_MORE_BYTE (c);
4801       if (c < 0)
4802         goto invalid_code;
4803       if (c < 0x80)
4804         {
4805           if (eol_dos && c == '\r')
4806             ONE_MORE_BYTE (byte_after_cr);
4807           charset = charset_roman;
4808         }
4809       else if (c == 0x80 || c == 0xA0)
4810         goto invalid_code;
4811       else if (c >= 0xA1 && c <= 0xDF)
4812         {
4813           /* SJIS -> JISX0201-Kana */
4814           c &= 0x7F;
4815           charset = charset_kana;
4816         }
4817       else if (c <= 0xEF)
4818         {
4819           /* SJIS -> JISX0208 */
4820           ONE_MORE_BYTE (c1);
4821           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4822             goto invalid_code;
4823           c = (c << 8) | c1;
4824           SJIS_TO_JIS (c);
4825           charset = charset_kanji;
4826         }
4827       else if (c <= 0xFC && charset_kanji2)
4828         {
4829           /* SJIS -> JISX0213-2 */
4830           ONE_MORE_BYTE (c1);
4831           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4832             goto invalid_code;
4833           c = (c << 8) | c1;
4834           SJIS_TO_JIS2 (c);
4835           charset = charset_kanji2;
4836         }
4837       else
4838         goto invalid_code;
4839       if (charset->id != charset_ascii
4840           && last_id != charset->id)
4841         {
4842           if (last_id != charset_ascii)
4843             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4844           last_id = charset->id;
4845           last_offset = char_offset;
4846         }
4847       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4848       *charbuf++ = c;
4849       char_offset++;
4850       continue;
4851
4852     invalid_code:
4853       src = src_base;
4854       consumed_chars = consumed_chars_base;
4855       ONE_MORE_BYTE (c);
4856       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4857       char_offset++;
4858       coding->errors++;
4859     }
4860
4861  no_more_source:
4862   if (last_id != charset_ascii)
4863     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4864   coding->consumed_char += consumed_chars_base;
4865   coding->consumed = src_base - coding->source;
4866   coding->charbuf_used = charbuf - coding->charbuf;
4867 }
4868
4869 static void
4870 decode_coding_big5 (struct coding_system *coding)
4871 {
4872   const unsigned char *src = coding->source + coding->consumed;
4873   const unsigned char *src_end = coding->source + coding->src_bytes;
4874   const unsigned char *src_base;
4875   int *charbuf = coding->charbuf + coding->charbuf_used;
4876   /* We may produce one charset annotation in one loop and one more at
4877      the end.  */
4878   int *charbuf_end
4879     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4880   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4881   bool multibytep = coding->src_multibyte;
4882   struct charset *charset_roman, *charset_big5;
4883   Lisp_Object attrs, charset_list, val;
4884   ptrdiff_t char_offset = coding->produced_char;
4885   ptrdiff_t last_offset = char_offset;
4886   int last_id = charset_ascii;
4887   bool eol_dos
4888     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4889   int byte_after_cr = -1;
4890
4891   CODING_GET_INFO (coding, attrs, charset_list);
4892   val = charset_list;
4893   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4894   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4895
4896   while (1)
4897     {
4898       int c, c1;
4899       struct charset *charset;
4900
4901       src_base = src;
4902       consumed_chars_base = consumed_chars;
4903
4904       if (charbuf >= charbuf_end)
4905         {
4906           if (byte_after_cr >= 0)
4907             src_base--;
4908           break;
4909         }
4910
4911       if (byte_after_cr >= 0)
4912         c = byte_after_cr, byte_after_cr = -1;
4913       else
4914         ONE_MORE_BYTE (c);
4915
4916       if (c < 0)
4917         goto invalid_code;
4918       if (c < 0x80)
4919         {
4920           if (eol_dos && c == '\r')
4921             ONE_MORE_BYTE (byte_after_cr);
4922           charset = charset_roman;
4923         }
4924       else
4925         {
4926           /* BIG5 -> Big5 */
4927           if (c < 0xA1 || c > 0xFE)
4928             goto invalid_code;
4929           ONE_MORE_BYTE (c1);
4930           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4931             goto invalid_code;
4932           c = c << 8 | c1;
4933           charset = charset_big5;
4934         }
4935       if (charset->id != charset_ascii
4936           && last_id != charset->id)
4937         {
4938           if (last_id != charset_ascii)
4939             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4940           last_id = charset->id;
4941           last_offset = char_offset;
4942         }
4943       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4944       *charbuf++ = c;
4945       char_offset++;
4946       continue;
4947
4948     invalid_code:
4949       src = src_base;
4950       consumed_chars = consumed_chars_base;
4951       ONE_MORE_BYTE (c);
4952       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4953       char_offset++;
4954       coding->errors++;
4955     }
4956
4957  no_more_source:
4958   if (last_id != charset_ascii)
4959     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4960   coding->consumed_char += consumed_chars_base;
4961   coding->consumed = src_base - coding->source;
4962   coding->charbuf_used = charbuf - coding->charbuf;
4963 }
4964
4965 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4966    This function can encode charsets `ascii', `katakana-jisx0201',
4967    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4968    are sure that all these charsets are registered as official charset
4969    (i.e. do not have extended leading-codes).  Characters of other
4970    charsets are produced without any encoding.  */
4971
4972 static bool
4973 encode_coding_sjis (struct coding_system *coding)
4974 {
4975   bool multibytep = coding->dst_multibyte;
4976   int *charbuf = coding->charbuf;
4977   int *charbuf_end = charbuf + coding->charbuf_used;
4978   unsigned char *dst = coding->destination + coding->produced;
4979   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4980   int safe_room = 4;
4981   ptrdiff_t produced_chars = 0;
4982   Lisp_Object attrs, charset_list, val;
4983   bool ascii_compatible;
4984   struct charset *charset_kanji, *charset_kana;
4985   struct charset *charset_kanji2;
4986   int c;
4987
4988   CODING_GET_INFO (coding, attrs, charset_list);
4989   val = XCDR (charset_list);
4990   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4991   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4992   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4993
4994   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4995
4996   while (charbuf < charbuf_end)
4997     {
4998       ASSURE_DESTINATION (safe_room);
4999       c = *charbuf++;
5000       /* Now encode the character C.  */
5001       if (ASCII_CHAR_P (c) && ascii_compatible)
5002         EMIT_ONE_ASCII_BYTE (c);
5003       else if (CHAR_BYTE8_P (c))
5004         {
5005           c = CHAR_TO_BYTE8 (c);
5006           EMIT_ONE_BYTE (c);
5007         }
5008       else
5009         {
5010           unsigned code;
5011           struct charset *charset;
5012           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5013                                &code, charset);
5014
5015           if (!charset)
5016             {
5017               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5018                 {
5019                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5020                   charset = CHARSET_FROM_ID (charset_ascii);
5021                 }
5022               else
5023                 {
5024                   c = coding->default_char;
5025                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5026                                        charset_list, &code, charset);
5027                 }
5028             }
5029           if (code == CHARSET_INVALID_CODE (charset))
5030             emacs_abort ();
5031           if (charset == charset_kanji)
5032             {
5033               int c1, c2;
5034               JIS_TO_SJIS (code);
5035               c1 = code >> 8, c2 = code & 0xFF;
5036               EMIT_TWO_BYTES (c1, c2);
5037             }
5038           else if (charset == charset_kana)
5039             EMIT_ONE_BYTE (code | 0x80);
5040           else if (charset_kanji2 && charset == charset_kanji2)
5041             {
5042               int c1, c2;
5043
5044               c1 = code >> 8;
5045               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5046                   || c1 == 0x28
5047                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5048                 {
5049                   JIS_TO_SJIS2 (code);
5050                   c1 = code >> 8, c2 = code & 0xFF;
5051                   EMIT_TWO_BYTES (c1, c2);
5052                 }
5053               else
5054                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5055             }
5056           else
5057             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5058         }
5059     }
5060   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5061   coding->produced_char += produced_chars;
5062   coding->produced = dst - coding->destination;
5063   return 0;
5064 }
5065
5066 static bool
5067 encode_coding_big5 (struct coding_system *coding)
5068 {
5069   bool multibytep = coding->dst_multibyte;
5070   int *charbuf = coding->charbuf;
5071   int *charbuf_end = charbuf + coding->charbuf_used;
5072   unsigned char *dst = coding->destination + coding->produced;
5073   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5074   int safe_room = 4;
5075   ptrdiff_t produced_chars = 0;
5076   Lisp_Object attrs, charset_list, val;
5077   bool ascii_compatible;
5078   struct charset *charset_big5;
5079   int c;
5080
5081   CODING_GET_INFO (coding, attrs, charset_list);
5082   val = XCDR (charset_list);
5083   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5084   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5085
5086   while (charbuf < charbuf_end)
5087     {
5088       ASSURE_DESTINATION (safe_room);
5089       c = *charbuf++;
5090       /* Now encode the character C.  */
5091       if (ASCII_CHAR_P (c) && ascii_compatible)
5092         EMIT_ONE_ASCII_BYTE (c);
5093       else if (CHAR_BYTE8_P (c))
5094         {
5095           c = CHAR_TO_BYTE8 (c);
5096           EMIT_ONE_BYTE (c);
5097         }
5098       else
5099         {
5100           unsigned code;
5101           struct charset *charset;
5102           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5103                                &code, charset);
5104
5105           if (! charset)
5106             {
5107               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5108                 {
5109                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5110                   charset = CHARSET_FROM_ID (charset_ascii);
5111                 }
5112               else
5113                 {
5114                   c = coding->default_char;
5115                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5116                                        charset_list, &code, charset);
5117                 }
5118             }
5119           if (code == CHARSET_INVALID_CODE (charset))
5120             emacs_abort ();
5121           if (charset == charset_big5)
5122             {
5123               int c1, c2;
5124
5125               c1 = code >> 8, c2 = code & 0xFF;
5126               EMIT_TWO_BYTES (c1, c2);
5127             }
5128           else
5129             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5130         }
5131     }
5132   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5133   coding->produced_char += produced_chars;
5134   coding->produced = dst - coding->destination;
5135   return 0;
5136 }
5137
5138 \f
5139 /*** 10. CCL handlers ***/
5140
5141 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5142    Return true if a text is encoded in a coding system of which
5143    encoder/decoder are written in CCL program.  */
5144
5145 static bool
5146 detect_coding_ccl (struct coding_system *coding,
5147                    struct coding_detection_info *detect_info)
5148 {
5149   const unsigned char *src = coding->source, *src_base;
5150   const unsigned char *src_end = coding->source + coding->src_bytes;
5151   bool multibytep = coding->src_multibyte;
5152   ptrdiff_t consumed_chars = 0;
5153   int found = 0;
5154   unsigned char *valids;
5155   ptrdiff_t head_ascii = coding->head_ascii;
5156   Lisp_Object attrs;
5157
5158   detect_info->checked |= CATEGORY_MASK_CCL;
5159
5160   coding = &coding_categories[coding_category_ccl];
5161   valids = CODING_CCL_VALIDS (coding);
5162   attrs = CODING_ID_ATTRS (coding->id);
5163   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5164     src += head_ascii;
5165
5166   while (1)
5167     {
5168       int c;
5169
5170       src_base = src;
5171       ONE_MORE_BYTE (c);
5172       if (c < 0 || ! valids[c])
5173         break;
5174       if ((valids[c] > 1))
5175         found = CATEGORY_MASK_CCL;
5176     }
5177   detect_info->rejected |= CATEGORY_MASK_CCL;
5178   return 0;
5179
5180  no_more_source:
5181   detect_info->found |= found;
5182   return 1;
5183 }
5184
5185 static void
5186 decode_coding_ccl (struct coding_system *coding)
5187 {
5188   const unsigned char *src = coding->source + coding->consumed;
5189   const unsigned char *src_end = coding->source + coding->src_bytes;
5190   int *charbuf = coding->charbuf + coding->charbuf_used;
5191   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5192   ptrdiff_t consumed_chars = 0;
5193   bool multibytep = coding->src_multibyte;
5194   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5195   int source_charbuf[1024];
5196   int source_byteidx[1025];
5197   Lisp_Object attrs, charset_list;
5198
5199   CODING_GET_INFO (coding, attrs, charset_list);
5200
5201   while (1)
5202     {
5203       const unsigned char *p = src;
5204       ptrdiff_t offset;
5205       int i = 0;
5206
5207       if (multibytep)
5208         {
5209           while (i < 1024 && p < src_end)
5210             {
5211               source_byteidx[i] = p - src;
5212               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5213             }
5214           source_byteidx[i] = p - src;
5215         }
5216       else
5217         while (i < 1024 && p < src_end)
5218           source_charbuf[i++] = *p++;
5219
5220       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5221         ccl->last_block = true;
5222       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5223       charset_map_loaded = 0;
5224       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5225                   charset_list);
5226       if (charset_map_loaded
5227           && (offset = coding_change_source (coding)))
5228         {
5229           p += offset;
5230           src += offset;
5231           src_end += offset;
5232         }
5233       charbuf += ccl->produced;
5234       if (multibytep)
5235         src += source_byteidx[ccl->consumed];
5236       else
5237         src += ccl->consumed;
5238       consumed_chars += ccl->consumed;
5239       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5240         break;
5241     }
5242
5243   switch (ccl->status)
5244     {
5245     case CCL_STAT_SUSPEND_BY_SRC:
5246       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5247       break;
5248     case CCL_STAT_SUSPEND_BY_DST:
5249       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5250       break;
5251     case CCL_STAT_QUIT:
5252     case CCL_STAT_INVALID_CMD:
5253       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5254       break;
5255     default:
5256       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5257       break;
5258     }
5259   coding->consumed_char += consumed_chars;
5260   coding->consumed = src - coding->source;
5261   coding->charbuf_used = charbuf - coding->charbuf;
5262 }
5263
5264 static bool
5265 encode_coding_ccl (struct coding_system *coding)
5266 {
5267   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5268   bool multibytep = coding->dst_multibyte;
5269   int *charbuf = coding->charbuf;
5270   int *charbuf_end = charbuf + coding->charbuf_used;
5271   unsigned char *dst = coding->destination + coding->produced;
5272   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5273   int destination_charbuf[1024];
5274   ptrdiff_t produced_chars = 0;
5275   int i;
5276   Lisp_Object attrs, charset_list;
5277
5278   CODING_GET_INFO (coding, attrs, charset_list);
5279   if (coding->consumed_char == coding->src_chars
5280       && coding->mode & CODING_MODE_LAST_BLOCK)
5281     ccl->last_block = true;
5282
5283   do
5284     {
5285       ptrdiff_t offset;
5286
5287       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5288       charset_map_loaded = 0;
5289       ccl_driver (ccl, charbuf, destination_charbuf,
5290                   charbuf_end - charbuf, 1024, charset_list);
5291       if (charset_map_loaded
5292           && (offset = coding_change_destination (coding)))
5293         dst += offset;
5294       if (multibytep)
5295         {
5296           ASSURE_DESTINATION (ccl->produced * 2);
5297           for (i = 0; i < ccl->produced; i++)
5298             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5299         }
5300       else
5301         {
5302           ASSURE_DESTINATION (ccl->produced);
5303           for (i = 0; i < ccl->produced; i++)
5304             *dst++ = destination_charbuf[i] & 0xFF;
5305           produced_chars += ccl->produced;
5306         }
5307       charbuf += ccl->consumed;
5308       if (ccl->status == CCL_STAT_QUIT
5309           || ccl->status == CCL_STAT_INVALID_CMD)
5310         break;
5311     }
5312   while (charbuf < charbuf_end);
5313
5314   switch (ccl->status)
5315     {
5316     case CCL_STAT_SUSPEND_BY_SRC:
5317       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5318       break;
5319     case CCL_STAT_SUSPEND_BY_DST:
5320       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5321       break;
5322     case CCL_STAT_QUIT:
5323     case CCL_STAT_INVALID_CMD:
5324       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5325       break;
5326     default:
5327       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5328       break;
5329     }
5330
5331   coding->produced_char += produced_chars;
5332   coding->produced = dst - coding->destination;
5333   return 0;
5334 }
5335
5336 \f
5337 /*** 10, 11. no-conversion handlers ***/
5338
5339 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5340
5341 static void
5342 decode_coding_raw_text (struct coding_system *coding)
5343 {
5344   bool eol_dos
5345     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5346
5347   coding->chars_at_source = 1;
5348   coding->consumed_char = coding->src_chars;
5349   coding->consumed = coding->src_bytes;
5350   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5351     {
5352       coding->consumed_char--;
5353       coding->consumed--;
5354       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5355     }
5356   else
5357     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5358 }
5359
5360 static bool
5361 encode_coding_raw_text (struct coding_system *coding)
5362 {
5363   bool multibytep = coding->dst_multibyte;
5364   int *charbuf = coding->charbuf;
5365   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5366   unsigned char *dst = coding->destination + coding->produced;
5367   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5368   ptrdiff_t produced_chars = 0;
5369   int c;
5370
5371   if (multibytep)
5372     {
5373       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5374
5375       if (coding->src_multibyte)
5376         while (charbuf < charbuf_end)
5377           {
5378             ASSURE_DESTINATION (safe_room);
5379             c = *charbuf++;
5380             if (ASCII_CHAR_P (c))
5381               EMIT_ONE_ASCII_BYTE (c);
5382             else if (CHAR_BYTE8_P (c))
5383               {
5384                 c = CHAR_TO_BYTE8 (c);
5385                 EMIT_ONE_BYTE (c);
5386               }
5387             else
5388               {
5389                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5390
5391                 CHAR_STRING_ADVANCE (c, p1);
5392                 do
5393                   {
5394                     EMIT_ONE_BYTE (*p0);
5395                     p0++;
5396                   }
5397                 while (p0 < p1);
5398               }
5399           }
5400       else
5401         while (charbuf < charbuf_end)
5402           {
5403             ASSURE_DESTINATION (safe_room);
5404             c = *charbuf++;
5405             EMIT_ONE_BYTE (c);
5406           }
5407     }
5408   else
5409     {
5410       if (coding->src_multibyte)
5411         {
5412           int safe_room = MAX_MULTIBYTE_LENGTH;
5413
5414           while (charbuf < charbuf_end)
5415             {
5416               ASSURE_DESTINATION (safe_room);
5417               c = *charbuf++;
5418               if (ASCII_CHAR_P (c))
5419                 *dst++ = c;
5420               else if (CHAR_BYTE8_P (c))
5421                 *dst++ = CHAR_TO_BYTE8 (c);
5422               else
5423                 CHAR_STRING_ADVANCE (c, dst);
5424             }
5425         }
5426       else
5427         {
5428           ASSURE_DESTINATION (charbuf_end - charbuf);
5429           while (charbuf < charbuf_end && dst < dst_end)
5430             *dst++ = *charbuf++;
5431         }
5432       produced_chars = dst - (coding->destination + coding->produced);
5433     }
5434   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5435   coding->produced_char += produced_chars;
5436   coding->produced = dst - coding->destination;
5437   return 0;
5438 }
5439
5440 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5441    Return true if a text is encoded in a charset-based coding system.  */
5442
5443 static bool
5444 detect_coding_charset (struct coding_system *coding,
5445                        struct coding_detection_info *detect_info)
5446 {
5447   const unsigned char *src = coding->source, *src_base;
5448   const unsigned char *src_end = coding->source + coding->src_bytes;
5449   bool multibytep = coding->src_multibyte;
5450   ptrdiff_t consumed_chars = 0;
5451   Lisp_Object attrs, valids, name;
5452   int found = 0;
5453   ptrdiff_t head_ascii = coding->head_ascii;
5454   bool check_latin_extra = 0;
5455
5456   detect_info->checked |= CATEGORY_MASK_CHARSET;
5457
5458   coding = &coding_categories[coding_category_charset];
5459   attrs = CODING_ID_ATTRS (coding->id);
5460   valids = AREF (attrs, coding_attr_charset_valids);
5461   name = CODING_ID_NAME (coding->id);
5462   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5463                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5464       || strncmp (SSDATA (SYMBOL_NAME (name)),
5465                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5466     check_latin_extra = 1;
5467
5468   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5469     src += head_ascii;
5470
5471   while (1)
5472     {
5473       int c;
5474       Lisp_Object val;
5475       struct charset *charset;
5476       int dim, idx;
5477
5478       src_base = src;
5479       ONE_MORE_BYTE (c);
5480       if (c < 0)
5481         continue;
5482       val = AREF (valids, c);
5483       if (NILP (val))
5484         break;
5485       if (c >= 0x80)
5486         {
5487           if (c < 0xA0
5488               && check_latin_extra
5489               && (!VECTORP (Vlatin_extra_code_table)
5490                   || NILP (AREF (Vlatin_extra_code_table, c))))
5491             break;
5492           found = CATEGORY_MASK_CHARSET;
5493         }
5494       if (INTEGERP (val))
5495         {
5496           charset = CHARSET_FROM_ID (XFASTINT (val));
5497           dim = CHARSET_DIMENSION (charset);
5498           for (idx = 1; idx < dim; idx++)
5499             {
5500               if (src == src_end)
5501                 goto too_short;
5502               ONE_MORE_BYTE (c);
5503               if (c < charset->code_space[(dim - 1 - idx) * 4]
5504                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5505                 break;
5506             }
5507           if (idx < dim)
5508             break;
5509         }
5510       else
5511         {
5512           idx = 1;
5513           for (; CONSP (val); val = XCDR (val))
5514             {
5515               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5516               dim = CHARSET_DIMENSION (charset);
5517               while (idx < dim)
5518                 {
5519                   if (src == src_end)
5520                     goto too_short;
5521                   ONE_MORE_BYTE (c);
5522                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5523                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5524                     break;
5525                   idx++;
5526                 }
5527               if (idx == dim)
5528                 {
5529                   val = Qnil;
5530                   break;
5531                 }
5532             }
5533           if (CONSP (val))
5534             break;
5535         }
5536     }
5537  too_short:
5538   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5539   return 0;
5540
5541  no_more_source:
5542   detect_info->found |= found;
5543   return 1;
5544 }
5545
5546 static void
5547 decode_coding_charset (struct coding_system *coding)
5548 {
5549   const unsigned char *src = coding->source + coding->consumed;
5550   const unsigned char *src_end = coding->source + coding->src_bytes;
5551   const unsigned char *src_base;
5552   int *charbuf = coding->charbuf + coding->charbuf_used;
5553   /* We may produce one charset annotation in one loop and one more at
5554      the end.  */
5555   int *charbuf_end
5556     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5557   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5558   bool multibytep = coding->src_multibyte;
5559   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5560   Lisp_Object valids;
5561   ptrdiff_t char_offset = coding->produced_char;
5562   ptrdiff_t last_offset = char_offset;
5563   int last_id = charset_ascii;
5564   bool eol_dos
5565     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5566   int byte_after_cr = -1;
5567
5568   valids = AREF (attrs, coding_attr_charset_valids);
5569
5570   while (1)
5571     {
5572       int c;
5573       Lisp_Object val;
5574       struct charset *charset;
5575       int dim;
5576       int len = 1;
5577       unsigned code;
5578
5579       src_base = src;
5580       consumed_chars_base = consumed_chars;
5581
5582       if (charbuf >= charbuf_end)
5583         {
5584           if (byte_after_cr >= 0)
5585             src_base--;
5586           break;
5587         }
5588
5589       if (byte_after_cr >= 0)
5590         {
5591           c = byte_after_cr;
5592           byte_after_cr = -1;
5593         }
5594       else
5595         {
5596           ONE_MORE_BYTE (c);
5597           if (eol_dos && c == '\r')
5598             ONE_MORE_BYTE (byte_after_cr);
5599         }
5600       if (c < 0)
5601         goto invalid_code;
5602       code = c;
5603
5604       val = AREF (valids, c);
5605       if (! INTEGERP (val) && ! CONSP (val))
5606         goto invalid_code;
5607       if (INTEGERP (val))
5608         {
5609           charset = CHARSET_FROM_ID (XFASTINT (val));
5610           dim = CHARSET_DIMENSION (charset);
5611           while (len < dim)
5612             {
5613               ONE_MORE_BYTE (c);
5614               code = (code << 8) | c;
5615               len++;
5616             }
5617           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5618                               charset, code, c);
5619         }
5620       else
5621         {
5622           /* VAL is a list of charset IDs.  It is assured that the
5623              list is sorted by charset dimensions (smaller one
5624              comes first).  */
5625           while (CONSP (val))
5626             {
5627               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5628               dim = CHARSET_DIMENSION (charset);
5629               while (len < dim)
5630                 {
5631                   ONE_MORE_BYTE (c);
5632                   code = (code << 8) | c;
5633                   len++;
5634                 }
5635               CODING_DECODE_CHAR (coding, src, src_base,
5636                                   src_end, charset, code, c);
5637               if (c >= 0)
5638                 break;
5639               val = XCDR (val);
5640             }
5641         }
5642       if (c < 0)
5643         goto invalid_code;
5644       if (charset->id != charset_ascii
5645           && last_id != charset->id)
5646         {
5647           if (last_id != charset_ascii)
5648             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5649           last_id = charset->id;
5650           last_offset = char_offset;
5651         }
5652
5653       *charbuf++ = c;
5654       char_offset++;
5655       continue;
5656
5657     invalid_code:
5658       src = src_base;
5659       consumed_chars = consumed_chars_base;
5660       ONE_MORE_BYTE (c);
5661       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5662       char_offset++;
5663       coding->errors++;
5664     }
5665
5666  no_more_source:
5667   if (last_id != charset_ascii)
5668     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5669   coding->consumed_char += consumed_chars_base;
5670   coding->consumed = src_base - coding->source;
5671   coding->charbuf_used = charbuf - coding->charbuf;
5672 }
5673
5674 static bool
5675 encode_coding_charset (struct coding_system *coding)
5676 {
5677   bool multibytep = coding->dst_multibyte;
5678   int *charbuf = coding->charbuf;
5679   int *charbuf_end = charbuf + coding->charbuf_used;
5680   unsigned char *dst = coding->destination + coding->produced;
5681   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5682   int safe_room = MAX_MULTIBYTE_LENGTH;
5683   ptrdiff_t produced_chars = 0;
5684   Lisp_Object attrs, charset_list;
5685   bool ascii_compatible;
5686   int c;
5687
5688   CODING_GET_INFO (coding, attrs, charset_list);
5689   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5690
5691   while (charbuf < charbuf_end)
5692     {
5693       struct charset *charset;
5694       unsigned code;
5695
5696       ASSURE_DESTINATION (safe_room);
5697       c = *charbuf++;
5698       if (ascii_compatible && ASCII_CHAR_P (c))
5699         EMIT_ONE_ASCII_BYTE (c);
5700       else if (CHAR_BYTE8_P (c))
5701         {
5702           c = CHAR_TO_BYTE8 (c);
5703           EMIT_ONE_BYTE (c);
5704         }
5705       else
5706         {
5707           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5708                                &code, charset);
5709
5710           if (charset)
5711             {
5712               if (CHARSET_DIMENSION (charset) == 1)
5713                 EMIT_ONE_BYTE (code);
5714               else if (CHARSET_DIMENSION (charset) == 2)
5715                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5716               else if (CHARSET_DIMENSION (charset) == 3)
5717                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5718               else
5719                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5720                                  (code >> 8) & 0xFF, code & 0xFF);
5721             }
5722           else
5723             {
5724               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5725                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5726               else
5727                 c = coding->default_char;
5728               EMIT_ONE_BYTE (c);
5729             }
5730         }
5731     }
5732
5733   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5734   coding->produced_char += produced_chars;
5735   coding->produced = dst - coding->destination;
5736   return 0;
5737 }
5738
5739 \f
5740 /*** 7. C library functions ***/
5741
5742 /* Setup coding context CODING from information about CODING_SYSTEM.
5743    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5744    CODING_SYSTEM is invalid, signal an error.  */
5745
5746 void
5747 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5748 {
5749   Lisp_Object attrs;
5750   Lisp_Object eol_type;
5751   Lisp_Object coding_type;
5752   Lisp_Object val;
5753
5754   if (NILP (coding_system))
5755     coding_system = Qundecided;
5756
5757   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5758
5759   attrs = CODING_ID_ATTRS (coding->id);
5760   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5761
5762   coding->mode = 0;
5763   if (VECTORP (eol_type))
5764     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5765                             | CODING_REQUIRE_DETECTION_MASK);
5766   else if (! EQ (eol_type, Qunix))
5767     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5768                             | CODING_REQUIRE_ENCODING_MASK);
5769   else
5770     coding->common_flags = 0;
5771   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5772     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5773   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5774     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5775   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5776     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5777
5778   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5779   coding->max_charset_id = SCHARS (val) - 1;
5780   coding->safe_charsets = SDATA (val);
5781   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5782   coding->carryover_bytes = 0;
5783   coding->raw_destination = 0;
5784
5785   coding_type = CODING_ATTR_TYPE (attrs);
5786   if (EQ (coding_type, Qundecided))
5787     {
5788       coding->detector = NULL;
5789       coding->decoder = decode_coding_raw_text;
5790       coding->encoder = encode_coding_raw_text;
5791       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5792       coding->spec.undecided.inhibit_nbd
5793         = (encode_inhibit_flag
5794            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5795       coding->spec.undecided.inhibit_ied
5796         = (encode_inhibit_flag
5797            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5798       coding->spec.undecided.prefer_utf_8
5799         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5800     }
5801   else if (EQ (coding_type, Qiso_2022))
5802     {
5803       int i;
5804       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5805
5806       /* Invoke graphic register 0 to plane 0.  */
5807       CODING_ISO_INVOCATION (coding, 0) = 0;
5808       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5809       CODING_ISO_INVOCATION (coding, 1)
5810         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5811       /* Setup the initial status of designation.  */
5812       for (i = 0; i < 4; i++)
5813         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5814       /* Not single shifting initially.  */
5815       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5816       /* Beginning of buffer should also be regarded as bol. */
5817       CODING_ISO_BOL (coding) = 1;
5818       coding->detector = detect_coding_iso_2022;
5819       coding->decoder = decode_coding_iso_2022;
5820       coding->encoder = encode_coding_iso_2022;
5821       if (flags & CODING_ISO_FLAG_SAFE)
5822         coding->mode |= CODING_MODE_SAFE_ENCODING;
5823       coding->common_flags
5824         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5825             | CODING_REQUIRE_FLUSHING_MASK);
5826       if (flags & CODING_ISO_FLAG_COMPOSITION)
5827         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5828       if (flags & CODING_ISO_FLAG_DESIGNATION)
5829         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5830       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5831         {
5832           setup_iso_safe_charsets (attrs);
5833           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5834           coding->max_charset_id = SCHARS (val) - 1;
5835           coding->safe_charsets = SDATA (val);
5836         }
5837       CODING_ISO_FLAGS (coding) = flags;
5838       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5839       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5840       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5841       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5842     }
5843   else if (EQ (coding_type, Qcharset))
5844     {
5845       coding->detector = detect_coding_charset;
5846       coding->decoder = decode_coding_charset;
5847       coding->encoder = encode_coding_charset;
5848       coding->common_flags
5849         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5850     }
5851   else if (EQ (coding_type, Qutf_8))
5852     {
5853       val = AREF (attrs, coding_attr_utf_bom);
5854       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5855                                    : EQ (val, Qt) ? utf_with_bom
5856                                    : utf_without_bom);
5857       coding->detector = detect_coding_utf_8;
5858       coding->decoder = decode_coding_utf_8;
5859       coding->encoder = encode_coding_utf_8;
5860       coding->common_flags
5861         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5862       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5863         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5864     }
5865   else if (EQ (coding_type, Qutf_16))
5866     {
5867       val = AREF (attrs, coding_attr_utf_bom);
5868       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5869                                     : EQ (val, Qt) ? utf_with_bom
5870                                     : utf_without_bom);
5871       val = AREF (attrs, coding_attr_utf_16_endian);
5872       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5873                                        : utf_16_little_endian);
5874       CODING_UTF_16_SURROGATE (coding) = 0;
5875       coding->detector = detect_coding_utf_16;
5876       coding->decoder = decode_coding_utf_16;
5877       coding->encoder = encode_coding_utf_16;
5878       coding->common_flags
5879         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5880       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5881         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5882     }
5883   else if (EQ (coding_type, Qccl))
5884     {
5885       coding->detector = detect_coding_ccl;
5886       coding->decoder = decode_coding_ccl;
5887       coding->encoder = encode_coding_ccl;
5888       coding->common_flags
5889         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5890             | CODING_REQUIRE_FLUSHING_MASK);
5891     }
5892   else if (EQ (coding_type, Qemacs_mule))
5893     {
5894       coding->detector = detect_coding_emacs_mule;
5895       coding->decoder = decode_coding_emacs_mule;
5896       coding->encoder = encode_coding_emacs_mule;
5897       coding->common_flags
5898         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5899       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5900           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5901         {
5902           Lisp_Object tail, safe_charsets;
5903           int max_charset_id = 0;
5904
5905           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5906                tail = XCDR (tail))
5907             if (max_charset_id < XFASTINT (XCAR (tail)))
5908               max_charset_id = XFASTINT (XCAR (tail));
5909           safe_charsets = make_uninit_string (max_charset_id + 1);
5910           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5911           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5912                tail = XCDR (tail))
5913             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5914           coding->max_charset_id = max_charset_id;
5915           coding->safe_charsets = SDATA (safe_charsets);
5916         }
5917       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5918       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5919     }
5920   else if (EQ (coding_type, Qshift_jis))
5921     {
5922       coding->detector = detect_coding_sjis;
5923       coding->decoder = decode_coding_sjis;
5924       coding->encoder = encode_coding_sjis;
5925       coding->common_flags
5926         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5927     }
5928   else if (EQ (coding_type, Qbig5))
5929     {
5930       coding->detector = detect_coding_big5;
5931       coding->decoder = decode_coding_big5;
5932       coding->encoder = encode_coding_big5;
5933       coding->common_flags
5934         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5935     }
5936   else                          /* EQ (coding_type, Qraw_text) */
5937     {
5938       coding->detector = NULL;
5939       coding->decoder = decode_coding_raw_text;
5940       coding->encoder = encode_coding_raw_text;
5941       if (! EQ (eol_type, Qunix))
5942         {
5943           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5944           if (! VECTORP (eol_type))
5945             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5946         }
5947
5948     }
5949
5950   return;
5951 }
5952
5953 /* Return a list of charsets supported by CODING.  */
5954
5955 Lisp_Object
5956 coding_charset_list (struct coding_system *coding)
5957 {
5958   Lisp_Object attrs, charset_list;
5959
5960   CODING_GET_INFO (coding, attrs, charset_list);
5961   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5962     {
5963       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5964
5965       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5966         charset_list = Viso_2022_charset_list;
5967     }
5968   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5969     {
5970       charset_list = Vemacs_mule_charset_list;
5971     }
5972   return charset_list;
5973 }
5974
5975
5976 /* Return a list of charsets supported by CODING-SYSTEM.  */
5977
5978 Lisp_Object
5979 coding_system_charset_list (Lisp_Object coding_system)
5980 {
5981   ptrdiff_t id;
5982   Lisp_Object attrs, charset_list;
5983
5984   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5985   attrs = CODING_ID_ATTRS (id);
5986
5987   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5988     {
5989       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5990
5991       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5992         charset_list = Viso_2022_charset_list;
5993       else
5994         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5995     }
5996   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5997     {
5998       charset_list = Vemacs_mule_charset_list;
5999     }
6000   else
6001     {
6002       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6003     }
6004   return charset_list;
6005 }
6006
6007
6008 /* Return raw-text or one of its subsidiaries that has the same
6009    eol_type as CODING-SYSTEM.  */
6010
6011 Lisp_Object
6012 raw_text_coding_system (Lisp_Object coding_system)
6013 {
6014   Lisp_Object spec, attrs;
6015   Lisp_Object eol_type, raw_text_eol_type;
6016
6017   if (NILP (coding_system))
6018     return Qraw_text;
6019   spec = CODING_SYSTEM_SPEC (coding_system);
6020   attrs = AREF (spec, 0);
6021
6022   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6023     return coding_system;
6024
6025   eol_type = AREF (spec, 2);
6026   if (VECTORP (eol_type))
6027     return Qraw_text;
6028   spec = CODING_SYSTEM_SPEC (Qraw_text);
6029   raw_text_eol_type = AREF (spec, 2);
6030   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6031           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6032           : AREF (raw_text_eol_type, 2));
6033 }
6034
6035
6036 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6037    the subsidiary that has the same eol-spec as PARENT (if it is not
6038    nil and specifies end-of-line format) or the system's setting
6039    (system_eol_type).  */
6040
6041 Lisp_Object
6042 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6043 {
6044   Lisp_Object spec, eol_type;
6045
6046   if (NILP (coding_system))
6047     coding_system = Qraw_text;
6048   spec = CODING_SYSTEM_SPEC (coding_system);
6049   eol_type = AREF (spec, 2);
6050   if (VECTORP (eol_type))
6051     {
6052       Lisp_Object parent_eol_type;
6053
6054       if (! NILP (parent))
6055         {
6056           Lisp_Object parent_spec;
6057
6058           parent_spec = CODING_SYSTEM_SPEC (parent);
6059           parent_eol_type = AREF (parent_spec, 2);
6060           if (VECTORP (parent_eol_type))
6061             parent_eol_type = system_eol_type;
6062         }
6063       else
6064         parent_eol_type = system_eol_type;
6065       if (EQ (parent_eol_type, Qunix))
6066         coding_system = AREF (eol_type, 0);
6067       else if (EQ (parent_eol_type, Qdos))
6068         coding_system = AREF (eol_type, 1);
6069       else if (EQ (parent_eol_type, Qmac))
6070         coding_system = AREF (eol_type, 2);
6071     }
6072   return coding_system;
6073 }
6074
6075
6076 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6077    decided for writing to a process.  If not, complement them, and
6078    return a new coding system.  */
6079
6080 Lisp_Object
6081 complement_process_encoding_system (Lisp_Object coding_system)
6082 {
6083   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6084   Lisp_Object spec, attrs;
6085   int i;
6086
6087   for (i = 0; i < 3; i++)
6088     {
6089       if (i == 1)
6090         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6091       else if (i == 2)
6092         coding_system = preferred_coding_system ();
6093       spec = CODING_SYSTEM_SPEC (coding_system);
6094       if (NILP (spec))
6095         continue;
6096       attrs = AREF (spec, 0);
6097       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6098         coding_base = CODING_ATTR_BASE_NAME (attrs);
6099       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6100         eol_base = coding_system;
6101       if (! NILP (coding_base) && ! NILP (eol_base))
6102         break;
6103     }
6104
6105   if (i > 0)
6106     /* The original CODING_SYSTEM didn't specify text-conversion or
6107        eol-conversion.  Be sure that we return a fully complemented
6108        coding system.  */
6109     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6110   return coding_system;
6111 }
6112
6113
6114 /* Emacs has a mechanism to automatically detect a coding system if it
6115    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6116    it's impossible to distinguish some coding systems accurately
6117    because they use the same range of codes.  So, at first, coding
6118    systems are categorized into 7, those are:
6119
6120    o coding-category-emacs-mule
6121
6122         The category for a coding system which has the same code range
6123         as Emacs' internal format.  Assigned the coding-system (Lisp
6124         symbol) `emacs-mule' by default.
6125
6126    o coding-category-sjis
6127
6128         The category for a coding system which has the same code range
6129         as SJIS.  Assigned the coding-system (Lisp
6130         symbol) `japanese-shift-jis' by default.
6131
6132    o coding-category-iso-7
6133
6134         The category for a coding system which has the same code range
6135         as ISO2022 of 7-bit environment.  This doesn't use any locking
6136         shift and single shift functions.  This can encode/decode all
6137         charsets.  Assigned the coding-system (Lisp symbol)
6138         `iso-2022-7bit' by default.
6139
6140    o coding-category-iso-7-tight
6141
6142         Same as coding-category-iso-7 except that this can
6143         encode/decode only the specified charsets.
6144
6145    o coding-category-iso-8-1
6146
6147         The category for a coding system which has the same code range
6148         as ISO2022 of 8-bit environment and graphic plane 1 used only
6149         for DIMENSION1 charset.  This doesn't use any locking shift
6150         and single shift functions.  Assigned the coding-system (Lisp
6151         symbol) `iso-latin-1' by default.
6152
6153    o coding-category-iso-8-2
6154
6155         The category for a coding system which has the same code range
6156         as ISO2022 of 8-bit environment and graphic plane 1 used only
6157         for DIMENSION2 charset.  This doesn't use any locking shift
6158         and single shift functions.  Assigned the coding-system (Lisp
6159         symbol) `japanese-iso-8bit' by default.
6160
6161    o coding-category-iso-7-else
6162
6163         The category for a coding system which has the same code range
6164         as ISO2022 of 7-bit environment but uses locking shift or
6165         single shift functions.  Assigned the coding-system (Lisp
6166         symbol) `iso-2022-7bit-lock' by default.
6167
6168    o coding-category-iso-8-else
6169
6170         The category for a coding system which has the same code range
6171         as ISO2022 of 8-bit environment but uses locking shift or
6172         single shift functions.  Assigned the coding-system (Lisp
6173         symbol) `iso-2022-8bit-ss2' by default.
6174
6175    o coding-category-big5
6176
6177         The category for a coding system which has the same code range
6178         as BIG5.  Assigned the coding-system (Lisp symbol)
6179         `cn-big5' by default.
6180
6181    o coding-category-utf-8
6182
6183         The category for a coding system which has the same code range
6184         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6185         symbol) `utf-8' by default.
6186
6187    o coding-category-utf-16-be
6188
6189         The category for a coding system in which a text has an
6190         Unicode signature (cf. Unicode Standard) in the order of BIG
6191         endian at the head.  Assigned the coding-system (Lisp symbol)
6192         `utf-16-be' by default.
6193
6194    o coding-category-utf-16-le
6195
6196         The category for a coding system in which a text has an
6197         Unicode signature (cf. Unicode Standard) in the order of
6198         LITTLE endian at the head.  Assigned the coding-system (Lisp
6199         symbol) `utf-16-le' by default.
6200
6201    o coding-category-ccl
6202
6203         The category for a coding system of which encoder/decoder is
6204         written in CCL programs.  The default value is nil, i.e., no
6205         coding system is assigned.
6206
6207    o coding-category-binary
6208
6209         The category for a coding system not categorized in any of the
6210         above.  Assigned the coding-system (Lisp symbol)
6211         `no-conversion' by default.
6212
6213    Each of them is a Lisp symbol and the value is an actual
6214    `coding-system's (this is also a Lisp symbol) assigned by a user.
6215    What Emacs does actually is to detect a category of coding system.
6216    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6217    decide only one possible category, it selects a category of the
6218    highest priority.  Priorities of categories are also specified by a
6219    user in a Lisp variable `coding-category-list'.
6220
6221 */
6222
6223 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6224                                            int eol_seen);
6225
6226
6227 /* Return the number of ASCII characters at the head of the source.
6228    By side effects, set coding->head_ascii and update
6229    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6230    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6231    reliable only when all the source bytes are ASCII.  */
6232
6233 static ptrdiff_t
6234 check_ascii (struct coding_system *coding)
6235 {
6236   const unsigned char *src, *end;
6237   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6238   int eol_seen = coding->eol_seen;
6239
6240   coding_set_source (coding);
6241   src = coding->source;
6242   end = src + coding->src_bytes;
6243
6244   if (inhibit_eol_conversion
6245       || SYMBOLP (eol_type))
6246     {
6247       /* We don't have to check EOL format.  */
6248       while (src < end && !( *src & 0x80))
6249         {
6250           if (*src++ == '\n')
6251             eol_seen |= EOL_SEEN_LF;
6252         }
6253     }
6254   else
6255     {
6256       end--;                /* We look ahead one byte for "CR LF".  */
6257       while (src < end)
6258         {
6259           int c = *src;
6260
6261           if (c & 0x80)
6262             break;
6263           src++;
6264           if (c == '\r')
6265             {
6266               if (*src == '\n')
6267                 {
6268                   eol_seen |= EOL_SEEN_CRLF;
6269                   src++;
6270                 }
6271               else
6272                 eol_seen |= EOL_SEEN_CR;
6273             }
6274           else if (c == '\n')
6275             eol_seen |= EOL_SEEN_LF;
6276         }
6277       if (src == end)
6278         {
6279           int c = *src;
6280
6281           /* All bytes but the last one C are ASCII.  */
6282           if (! (c & 0x80))
6283             {
6284               if (c == '\r')
6285                 eol_seen |= EOL_SEEN_CR;
6286               else if (c  == '\n')
6287                 eol_seen |= EOL_SEEN_LF;
6288               src++;
6289             }
6290         }
6291     }
6292   coding->head_ascii = src - coding->source;
6293   coding->eol_seen = eol_seen;
6294   return (coding->head_ascii);
6295 }
6296
6297
6298 /* Return the number of characters at the source if all the bytes are
6299    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6300    effects, update coding->eol_seen.  The value of coding->eol_seen is
6301    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6302    the value is reliable only when all the source bytes are valid
6303    UTF-8.  */
6304
6305 static ptrdiff_t
6306 check_utf_8 (struct coding_system *coding)
6307 {
6308   const unsigned char *src, *end;
6309   int eol_seen;
6310   ptrdiff_t nchars = coding->head_ascii;
6311
6312   if (coding->head_ascii < 0)
6313     check_ascii (coding);
6314   else
6315     coding_set_source (coding);
6316   src = coding->source + coding->head_ascii;
6317   /* We look ahead one byte for CR LF.  */
6318   end = coding->source + coding->src_bytes - 1;
6319   eol_seen = coding->eol_seen;
6320   while (src < end)
6321     {
6322       int c = *src;
6323
6324       if (UTF_8_1_OCTET_P (*src))
6325         {
6326           src++;
6327           if (c < 0x20)
6328             {
6329               if (c == '\r')
6330                 {
6331                   if (*src == '\n')
6332                     {
6333                       eol_seen |= EOL_SEEN_CRLF;
6334                       src++;
6335                       nchars++;
6336                     }
6337                   else
6338                     eol_seen |= EOL_SEEN_CR;
6339                 }
6340               else if (c == '\n')
6341                 eol_seen |= EOL_SEEN_LF;
6342             }
6343         }
6344       else if (UTF_8_2_OCTET_LEADING_P (c))
6345         {
6346           if (c < 0xC2          /* overlong sequence */
6347               || src + 1 >= end
6348               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6349             return -1;
6350           src += 2;
6351         }
6352       else if (UTF_8_3_OCTET_LEADING_P (c))
6353         {
6354           if (src + 2 >= end
6355               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6356                     && UTF_8_EXTRA_OCTET_P (src[2])))
6357             return -1;
6358           c = (((c & 0xF) << 12)
6359                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6360           if (c < 0x800                       /* overlong sequence */
6361               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6362             return -1;
6363           src += 3;
6364         }
6365       else if (UTF_8_4_OCTET_LEADING_P (c))
6366         {
6367           if (src + 3 >= end
6368               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6369                     && UTF_8_EXTRA_OCTET_P (src[2])
6370                     && UTF_8_EXTRA_OCTET_P (src[3])))
6371             return -1;
6372           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6373                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6374           if (c < 0x10000       /* overlong sequence */
6375               || c >= 0x110000) /* non-Unicode character  */
6376             return -1;
6377           src += 4;
6378         }
6379       else
6380         return -1;
6381       nchars++;
6382     }
6383
6384   if (src == end)
6385     {
6386       if (! UTF_8_1_OCTET_P (*src))
6387         return -1;
6388       nchars++;
6389       if (*src == '\r')
6390         eol_seen |= EOL_SEEN_CR;
6391       else if (*src  == '\n')
6392         eol_seen |= EOL_SEEN_LF;
6393     }
6394   coding->eol_seen = eol_seen;
6395   return nchars;
6396 }
6397
6398
6399 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6400    SOURCE is encoded.  If CATEGORY is one of
6401    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6402    two-byte, else they are encoded by one-byte.
6403
6404    Return one of EOL_SEEN_XXX.  */
6405
6406 #define MAX_EOL_CHECK_COUNT 3
6407
6408 static int
6409 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6410             enum coding_category category)
6411 {
6412   const unsigned char *src = source, *src_end = src + src_bytes;
6413   unsigned char c;
6414   int total  = 0;
6415   int eol_seen = EOL_SEEN_NONE;
6416
6417   if ((1 << category) & CATEGORY_MASK_UTF_16)
6418     {
6419       bool msb = category == (coding_category_utf_16_le
6420                               | coding_category_utf_16_le_nosig);
6421       bool lsb = !msb;
6422
6423       while (src + 1 < src_end)
6424         {
6425           c = src[lsb];
6426           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6427             {
6428               int this_eol;
6429
6430               if (c == '\n')
6431                 this_eol = EOL_SEEN_LF;
6432               else if (src + 3 >= src_end
6433                        || src[msb + 2] != 0
6434                        || src[lsb + 2] != '\n')
6435                 this_eol = EOL_SEEN_CR;
6436               else
6437                 {
6438                   this_eol = EOL_SEEN_CRLF;
6439                   src += 2;
6440                 }
6441
6442               if (eol_seen == EOL_SEEN_NONE)
6443                 /* This is the first end-of-line.  */
6444                 eol_seen = this_eol;
6445               else if (eol_seen != this_eol)
6446                 {
6447                   /* The found type is different from what found before.
6448                      Allow for stray ^M characters in DOS EOL files.  */
6449                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6450                       || (eol_seen == EOL_SEEN_CRLF
6451                           && this_eol == EOL_SEEN_CR))
6452                     eol_seen = EOL_SEEN_CRLF;
6453                   else
6454                     {
6455                       eol_seen = EOL_SEEN_LF;
6456                       break;
6457                     }
6458                 }
6459               if (++total == MAX_EOL_CHECK_COUNT)
6460                 break;
6461             }
6462           src += 2;
6463         }
6464     }
6465   else
6466     while (src < src_end)
6467       {
6468         c = *src++;
6469         if (c == '\n' || c == '\r')
6470           {
6471             int this_eol;
6472
6473             if (c == '\n')
6474               this_eol = EOL_SEEN_LF;
6475             else if (src >= src_end || *src != '\n')
6476               this_eol = EOL_SEEN_CR;
6477             else
6478               this_eol = EOL_SEEN_CRLF, src++;
6479
6480             if (eol_seen == EOL_SEEN_NONE)
6481               /* This is the first end-of-line.  */
6482               eol_seen = this_eol;
6483             else if (eol_seen != this_eol)
6484               {
6485                 /* The found type is different from what found before.
6486                    Allow for stray ^M characters in DOS EOL files.  */
6487                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6488                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6489                   eol_seen = EOL_SEEN_CRLF;
6490                 else
6491                   {
6492                     eol_seen = EOL_SEEN_LF;
6493                     break;
6494                   }
6495               }
6496             if (++total == MAX_EOL_CHECK_COUNT)
6497               break;
6498           }
6499       }
6500   return eol_seen;
6501 }
6502
6503
6504 static Lisp_Object
6505 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6506 {
6507   Lisp_Object eol_type;
6508
6509   eol_type = CODING_ID_EOL_TYPE (coding->id);
6510   if (! VECTORP (eol_type))
6511     /* Already adjusted.  */
6512     return eol_type;
6513   if (eol_seen & EOL_SEEN_LF)
6514     {
6515       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6516       eol_type = Qunix;
6517     }
6518   else if (eol_seen & EOL_SEEN_CRLF)
6519     {
6520       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6521       eol_type = Qdos;
6522     }
6523   else if (eol_seen & EOL_SEEN_CR)
6524     {
6525       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6526       eol_type = Qmac;
6527     }
6528   return eol_type;
6529 }
6530
6531 /* Detect how a text specified in CODING is encoded.  If a coding
6532    system is detected, update fields of CODING by the detected coding
6533    system.  */
6534
6535 static void
6536 detect_coding (struct coding_system *coding)
6537 {
6538   const unsigned char *src, *src_end;
6539   unsigned int saved_mode = coding->mode;
6540   Lisp_Object found = Qnil;
6541   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6542
6543   coding->consumed = coding->consumed_char = 0;
6544   coding->produced = coding->produced_char = 0;
6545   coding_set_source (coding);
6546
6547   src_end = coding->source + coding->src_bytes;
6548
6549   coding->eol_seen = EOL_SEEN_NONE;
6550   /* If we have not yet decided the text encoding type, detect it
6551      now.  */
6552   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6553     {
6554       int c, i;
6555       struct coding_detection_info detect_info;
6556       bool null_byte_found = 0, eight_bit_found = 0;
6557       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6558                                        inhibit_null_byte_detection);
6559       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6560                                        inhibit_iso_escape_detection);
6561       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6562
6563       coding->head_ascii = 0;
6564       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6565       for (src = coding->source; src < src_end; src++)
6566         {
6567           c = *src;
6568           if (c & 0x80)
6569             {
6570               eight_bit_found = 1;
6571               if (null_byte_found)
6572                 break;
6573             }
6574           else if (c < 0x20)
6575             {
6576               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6577                   && ! inhibit_ied
6578                   && ! detect_info.checked)
6579                 {
6580                   if (detect_coding_iso_2022 (coding, &detect_info))
6581                     {
6582                       /* We have scanned the whole data.  */
6583                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6584                         {
6585                           /* We didn't find an 8-bit code.  We may
6586                              have found a null-byte, but it's very
6587                              rare that a binary file conforms to
6588                              ISO-2022.  */
6589                           src = src_end;
6590                           coding->head_ascii = src - coding->source;
6591                         }
6592                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6593                       break;
6594                     }
6595                 }
6596               else if (! c && !inhibit_nbd)
6597                 {
6598                   null_byte_found = 1;
6599                   if (eight_bit_found)
6600                     break;
6601                 }
6602               else if (! disable_ascii_optimization
6603                        && ! inhibit_eol_conversion)
6604                 {
6605                   if (c == '\r')
6606                     {
6607                       if (src < src_end && src[1] == '\n')
6608                         {
6609                           coding->eol_seen |= EOL_SEEN_CRLF;
6610                           src++;
6611                           if (! eight_bit_found)
6612                             coding->head_ascii++;
6613                         }
6614                       else
6615                         coding->eol_seen |= EOL_SEEN_CR;
6616                     }
6617                   else if (c == '\n')
6618                     {
6619                       coding->eol_seen |= EOL_SEEN_LF;
6620                     }
6621                 }
6622
6623               if (! eight_bit_found)
6624                 coding->head_ascii++;
6625             }
6626           else if (! eight_bit_found)
6627             coding->head_ascii++;
6628         }
6629
6630       if (null_byte_found || eight_bit_found
6631           || coding->head_ascii < coding->src_bytes
6632           || detect_info.found)
6633         {
6634           enum coding_category category;
6635           struct coding_system *this;
6636
6637           if (coding->head_ascii == coding->src_bytes)
6638             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6639             for (i = 0; i < coding_category_raw_text; i++)
6640               {
6641                 category = coding_priorities[i];
6642                 this = coding_categories + category;
6643                 if (detect_info.found & (1 << category))
6644                   break;
6645               }
6646           else
6647             {
6648               if (null_byte_found)
6649                 {
6650                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6651                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6652                 }
6653               else if (prefer_utf_8
6654                        && detect_coding_utf_8 (coding, &detect_info))
6655                 {
6656                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6657                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6658                 }
6659               for (i = 0; i < coding_category_raw_text; i++)
6660                 {
6661                   category = coding_priorities[i];
6662                   this = coding_categories + category;
6663                   /* Some of this->detector (e.g. detect_coding_sjis)
6664                      require this information.  */
6665                   coding->id = this->id;
6666                   if (this->id < 0)
6667                     {
6668                       /* No coding system of this category is defined.  */
6669                       detect_info.rejected |= (1 << category);
6670                     }
6671                   else if (category >= coding_category_raw_text)
6672                     continue;
6673                   else if (detect_info.checked & (1 << category))
6674                     {
6675                       if (detect_info.found & (1 << category))
6676                         break;
6677                     }
6678                   else if ((*(this->detector)) (coding, &detect_info)
6679                            && detect_info.found & (1 << category))
6680                     break;
6681                 }
6682             }
6683
6684           if (i < coding_category_raw_text)
6685             {
6686               if (category == coding_category_utf_8_auto)
6687                 {
6688                   Lisp_Object coding_systems;
6689
6690                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6691                                          coding_attr_utf_bom);
6692                   if (CONSP (coding_systems))
6693                     {
6694                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6695                         found = XCAR (coding_systems);
6696                       else
6697                         found = XCDR (coding_systems);
6698                     }
6699                   else
6700                     found = CODING_ID_NAME (this->id);
6701                 }
6702               else if (category == coding_category_utf_16_auto)
6703                 {
6704                   Lisp_Object coding_systems;
6705
6706                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6707                                          coding_attr_utf_bom);
6708                   if (CONSP (coding_systems))
6709                     {
6710                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6711                         found = XCAR (coding_systems);
6712                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6713                         found = XCDR (coding_systems);
6714                     }
6715                   else
6716                     found = CODING_ID_NAME (this->id);
6717                 }
6718               else
6719                 found = CODING_ID_NAME (this->id);
6720             }
6721           else if (null_byte_found)
6722             found = Qno_conversion;
6723           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6724                    == CATEGORY_MASK_ANY)
6725             found = Qraw_text;
6726           else if (detect_info.rejected)
6727             for (i = 0; i < coding_category_raw_text; i++)
6728               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6729                 {
6730                   this = coding_categories + coding_priorities[i];
6731                   found = CODING_ID_NAME (this->id);
6732                   break;
6733                 }
6734         }
6735     }
6736   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6737            == coding_category_utf_8_auto)
6738     {
6739       Lisp_Object coding_systems;
6740       struct coding_detection_info detect_info;
6741
6742       coding_systems
6743         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6744       detect_info.found = detect_info.rejected = 0;
6745       if (check_ascii (coding) == coding->src_bytes)
6746         {
6747           if (CONSP (coding_systems))
6748             found = XCDR (coding_systems);
6749         }
6750       else
6751         {
6752           if (CONSP (coding_systems)
6753               && detect_coding_utf_8 (coding, &detect_info))
6754             {
6755               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6756                 found = XCAR (coding_systems);
6757               else
6758                 found = XCDR (coding_systems);
6759             }
6760         }
6761     }
6762   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6763            == coding_category_utf_16_auto)
6764     {
6765       Lisp_Object coding_systems;
6766       struct coding_detection_info detect_info;
6767
6768       coding_systems
6769         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6770       detect_info.found = detect_info.rejected = 0;
6771       coding->head_ascii = 0;
6772       if (CONSP (coding_systems)
6773           && detect_coding_utf_16 (coding, &detect_info))
6774         {
6775           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6776             found = XCAR (coding_systems);
6777           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6778             found = XCDR (coding_systems);
6779         }
6780     }
6781
6782   if (! NILP (found))
6783     {
6784       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6785                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6786                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6787                            : EOL_SEEN_LF);
6788
6789       setup_coding_system (found, coding);
6790       if (specified_eol != EOL_SEEN_NONE)
6791         adjust_coding_eol_type (coding, specified_eol);
6792     }
6793
6794   coding->mode = saved_mode;
6795 }
6796
6797
6798 static void
6799 decode_eol (struct coding_system *coding)
6800 {
6801   Lisp_Object eol_type;
6802   unsigned char *p, *pbeg, *pend;
6803
6804   eol_type = CODING_ID_EOL_TYPE (coding->id);
6805   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6806     return;
6807
6808   if (NILP (coding->dst_object))
6809     pbeg = coding->destination;
6810   else
6811     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6812   pend = pbeg + coding->produced;
6813
6814   if (VECTORP (eol_type))
6815     {
6816       int eol_seen = EOL_SEEN_NONE;
6817
6818       for (p = pbeg; p < pend; p++)
6819         {
6820           if (*p == '\n')
6821             eol_seen |= EOL_SEEN_LF;
6822           else if (*p == '\r')
6823             {
6824               if (p + 1 < pend && *(p + 1) == '\n')
6825                 {
6826                   eol_seen |= EOL_SEEN_CRLF;
6827                   p++;
6828                 }
6829               else
6830                 eol_seen |= EOL_SEEN_CR;
6831             }
6832         }
6833       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6834       if ((eol_seen & EOL_SEEN_CRLF) != 0
6835           && (eol_seen & EOL_SEEN_CR) != 0
6836           && (eol_seen & EOL_SEEN_LF) == 0)
6837         eol_seen = EOL_SEEN_CRLF;
6838       else if (eol_seen != EOL_SEEN_NONE
6839           && eol_seen != EOL_SEEN_LF
6840           && eol_seen != EOL_SEEN_CRLF
6841           && eol_seen != EOL_SEEN_CR)
6842         eol_seen = EOL_SEEN_LF;
6843       if (eol_seen != EOL_SEEN_NONE)
6844         eol_type = adjust_coding_eol_type (coding, eol_seen);
6845     }
6846
6847   if (EQ (eol_type, Qmac))
6848     {
6849       for (p = pbeg; p < pend; p++)
6850         if (*p == '\r')
6851           *p = '\n';
6852     }
6853   else if (EQ (eol_type, Qdos))
6854     {
6855       ptrdiff_t n = 0;
6856
6857       if (NILP (coding->dst_object))
6858         {
6859           /* Start deleting '\r' from the tail to minimize the memory
6860              movement.  */
6861           for (p = pend - 2; p >= pbeg; p--)
6862             if (*p == '\r')
6863               {
6864                 memmove (p, p + 1, pend-- - p - 1);
6865                 n++;
6866               }
6867         }
6868       else
6869         {
6870           ptrdiff_t pos_byte = coding->dst_pos_byte;
6871           ptrdiff_t pos = coding->dst_pos;
6872           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6873
6874           while (pos < pos_end)
6875             {
6876               p = BYTE_POS_ADDR (pos_byte);
6877               if (*p == '\r' && p[1] == '\n')
6878                 {
6879                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6880                   n++;
6881                   pos_end--;
6882                 }
6883               pos++;
6884               if (coding->dst_multibyte)
6885                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6886               else
6887                 pos_byte++;
6888             }
6889         }
6890       coding->produced -= n;
6891       coding->produced_char -= n;
6892     }
6893 }
6894
6895
6896 /* Return a translation table (or list of them) from coding system
6897    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6898    not ENCODEP). */
6899
6900 static Lisp_Object
6901 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6902 {
6903   Lisp_Object standard, translation_table;
6904   Lisp_Object val;
6905
6906   if (NILP (Venable_character_translation))
6907     {
6908       if (max_lookup)
6909         *max_lookup = 0;
6910       return Qnil;
6911     }
6912   if (encodep)
6913     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6914       standard = Vstandard_translation_table_for_encode;
6915   else
6916     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6917       standard = Vstandard_translation_table_for_decode;
6918   if (NILP (translation_table))
6919     translation_table = standard;
6920   else
6921     {
6922       if (SYMBOLP (translation_table))
6923         translation_table = Fget (translation_table, Qtranslation_table);
6924       else if (CONSP (translation_table))
6925         {
6926           translation_table = Fcopy_sequence (translation_table);
6927           for (val = translation_table; CONSP (val); val = XCDR (val))
6928             if (SYMBOLP (XCAR (val)))
6929               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6930         }
6931       if (CHAR_TABLE_P (standard))
6932         {
6933           if (CONSP (translation_table))
6934             translation_table = nconc2 (translation_table, list1 (standard));
6935           else
6936             translation_table = list2 (translation_table, standard);
6937         }
6938     }
6939
6940   if (max_lookup)
6941     {
6942       *max_lookup = 1;
6943       if (CHAR_TABLE_P (translation_table)
6944           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6945         {
6946           val = XCHAR_TABLE (translation_table)->extras[1];
6947           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6948             *max_lookup = XFASTINT (val);
6949         }
6950       else if (CONSP (translation_table))
6951         {
6952           Lisp_Object tail;
6953
6954           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6955             if (CHAR_TABLE_P (XCAR (tail))
6956                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6957               {
6958                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6959                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6960                   *max_lookup = XFASTINT (tailval);
6961               }
6962         }
6963     }
6964   return translation_table;
6965 }
6966
6967 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6968   do {                                                          \
6969     trans = Qnil;                                               \
6970     if (CHAR_TABLE_P (table))                                   \
6971       {                                                         \
6972         trans = CHAR_TABLE_REF (table, c);                      \
6973         if (CHARACTERP (trans))                                 \
6974           c = XFASTINT (trans), trans = Qnil;                   \
6975       }                                                         \
6976     else if (CONSP (table))                                     \
6977       {                                                         \
6978         Lisp_Object tail;                                       \
6979                                                                 \
6980         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6981           if (CHAR_TABLE_P (XCAR (tail)))                       \
6982             {                                                   \
6983               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6984               if (CHARACTERP (trans))                           \
6985                 c = XFASTINT (trans), trans = Qnil;             \
6986               else if (! NILP (trans))                          \
6987                 break;                                          \
6988             }                                                   \
6989       }                                                         \
6990   } while (0)
6991
6992
6993 /* Return a translation of character(s) at BUF according to TRANS.
6994    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6995    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6996    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6997    translation is found, and Qnil if not found..
6998    If BUF is too short to lookup characters in FROM, return Qt.  */
6999
7000 static Lisp_Object
7001 get_translation (Lisp_Object trans, int *buf, int *buf_end)
7002 {
7003
7004   if (INTEGERP (trans))
7005     return trans;
7006   for (; CONSP (trans); trans = XCDR (trans))
7007     {
7008       Lisp_Object val = XCAR (trans);
7009       Lisp_Object from = XCAR (val);
7010       ptrdiff_t len = ASIZE (from);
7011       ptrdiff_t i;
7012
7013       for (i = 0; i < len; i++)
7014         {
7015           if (buf + i == buf_end)
7016             return Qt;
7017           if (XINT (AREF (from, i)) != buf[i])
7018             break;
7019         }
7020       if (i == len)
7021         return val;
7022     }
7023   return Qnil;
7024 }
7025
7026
7027 static int
7028 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7029                bool last_block)
7030 {
7031   unsigned char *dst = coding->destination + coding->produced;
7032   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7033   ptrdiff_t produced;
7034   ptrdiff_t produced_chars = 0;
7035   int carryover = 0;
7036
7037   if (! coding->chars_at_source)
7038     {
7039       /* Source characters are in coding->charbuf.  */
7040       int *buf = coding->charbuf;
7041       int *buf_end = buf + coding->charbuf_used;
7042
7043       if (EQ (coding->src_object, coding->dst_object))
7044         {
7045           coding_set_source (coding);
7046           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7047         }
7048
7049       while (buf < buf_end)
7050         {
7051           int c = *buf;
7052           ptrdiff_t i;
7053
7054           if (c >= 0)
7055             {
7056               ptrdiff_t from_nchars = 1, to_nchars = 1;
7057               Lisp_Object trans = Qnil;
7058
7059               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7060               if (! NILP (trans))
7061                 {
7062                   trans = get_translation (trans, buf, buf_end);
7063                   if (INTEGERP (trans))
7064                     c = XINT (trans);
7065                   else if (CONSP (trans))
7066                     {
7067                       from_nchars = ASIZE (XCAR (trans));
7068                       trans = XCDR (trans);
7069                       if (INTEGERP (trans))
7070                         c = XINT (trans);
7071                       else
7072                         {
7073                           to_nchars = ASIZE (trans);
7074                           c = XINT (AREF (trans, 0));
7075                         }
7076                     }
7077                   else if (EQ (trans, Qt) && ! last_block)
7078                     break;
7079                 }
7080
7081               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7082                 {
7083                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7084                        / MAX_MULTIBYTE_LENGTH)
7085                       < to_nchars)
7086                     memory_full (SIZE_MAX);
7087                   dst = alloc_destination (coding,
7088                                            buf_end - buf
7089                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7090                                            dst);
7091                   if (EQ (coding->src_object, coding->dst_object))
7092                     {
7093                       coding_set_source (coding);
7094                       dst_end = (((unsigned char *) coding->source)
7095                                  + coding->consumed);
7096                     }
7097                   else
7098                     dst_end = coding->destination + coding->dst_bytes;
7099                 }
7100
7101               for (i = 0; i < to_nchars; i++)
7102                 {
7103                   if (i > 0)
7104                     c = XINT (AREF (trans, i));
7105                   if (coding->dst_multibyte
7106                       || ! CHAR_BYTE8_P (c))
7107                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7108                   else
7109                     *dst++ = CHAR_TO_BYTE8 (c);
7110                 }
7111               produced_chars += to_nchars;
7112               buf += from_nchars;
7113             }
7114           else
7115             /* This is an annotation datum.  (-C) is the length.  */
7116             buf += -c;
7117         }
7118       carryover = buf_end - buf;
7119     }
7120   else
7121     {
7122       /* Source characters are at coding->source.  */
7123       const unsigned char *src = coding->source;
7124       const unsigned char *src_end = src + coding->consumed;
7125
7126       if (EQ (coding->dst_object, coding->src_object))
7127         dst_end = (unsigned char *) src;
7128       if (coding->src_multibyte != coding->dst_multibyte)
7129         {
7130           if (coding->src_multibyte)
7131             {
7132               bool multibytep = 1;
7133               ptrdiff_t consumed_chars = 0;
7134
7135               while (1)
7136                 {
7137                   const unsigned char *src_base = src;
7138                   int c;
7139
7140                   ONE_MORE_BYTE (c);
7141                   if (dst == dst_end)
7142                     {
7143                       if (EQ (coding->src_object, coding->dst_object))
7144                         dst_end = (unsigned char *) src;
7145                       if (dst == dst_end)
7146                         {
7147                           ptrdiff_t offset = src - coding->source;
7148
7149                           dst = alloc_destination (coding, src_end - src + 1,
7150                                                    dst);
7151                           dst_end = coding->destination + coding->dst_bytes;
7152                           coding_set_source (coding);
7153                           src = coding->source + offset;
7154                           src_end = coding->source + coding->consumed;
7155                           if (EQ (coding->src_object, coding->dst_object))
7156                             dst_end = (unsigned char *) src;
7157                         }
7158                     }
7159                   *dst++ = c;
7160                   produced_chars++;
7161                 }
7162             no_more_source:
7163               ;
7164             }
7165           else
7166             while (src < src_end)
7167               {
7168                 bool multibytep = 1;
7169                 int c = *src++;
7170
7171                 if (dst >= dst_end - 1)
7172                   {
7173                     if (EQ (coding->src_object, coding->dst_object))
7174                       dst_end = (unsigned char *) src;
7175                     if (dst >= dst_end - 1)
7176                       {
7177                         ptrdiff_t offset = src - coding->source;
7178                         ptrdiff_t more_bytes;
7179
7180                         if (EQ (coding->src_object, coding->dst_object))
7181                           more_bytes = ((src_end - src) / 2) + 2;
7182                         else
7183                           more_bytes = src_end - src + 2;
7184                         dst = alloc_destination (coding, more_bytes, dst);
7185                         dst_end = coding->destination + coding->dst_bytes;
7186                         coding_set_source (coding);
7187                         src = coding->source + offset;
7188                         src_end = coding->source + coding->consumed;
7189                         if (EQ (coding->src_object, coding->dst_object))
7190                           dst_end = (unsigned char *) src;
7191                       }
7192                   }
7193                 EMIT_ONE_BYTE (c);
7194               }
7195         }
7196       else
7197         {
7198           if (!EQ (coding->src_object, coding->dst_object))
7199             {
7200               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7201
7202               if (require > 0)
7203                 {
7204                   ptrdiff_t offset = src - coding->source;
7205
7206                   dst = alloc_destination (coding, require, dst);
7207                   coding_set_source (coding);
7208                   src = coding->source + offset;
7209                   src_end = coding->source + coding->consumed;
7210                 }
7211             }
7212           produced_chars = coding->consumed_char;
7213           while (src < src_end)
7214             *dst++ = *src++;
7215         }
7216     }
7217
7218   produced = dst - (coding->destination + coding->produced);
7219   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7220     insert_from_gap (produced_chars, produced, 0);
7221   coding->produced += produced;
7222   coding->produced_char += produced_chars;
7223   return carryover;
7224 }
7225
7226 /* Compose text in CODING->object according to the annotation data at
7227    CHARBUF.  CHARBUF is an array:
7228      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7229  */
7230
7231 static void
7232 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7233 {
7234   int len;
7235   ptrdiff_t to;
7236   enum composition_method method;
7237   Lisp_Object components;
7238
7239   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7240   to = pos + charbuf[2];
7241   method = (enum composition_method) (charbuf[4]);
7242
7243   if (method == COMPOSITION_RELATIVE)
7244     components = Qnil;
7245   else
7246     {
7247       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7248       int i, j;
7249
7250       if (method == COMPOSITION_WITH_RULE)
7251         len = charbuf[2] * 3 - 2;
7252       charbuf += MAX_ANNOTATION_LENGTH;
7253       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7254       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7255         {
7256           if (charbuf[i] >= 0)
7257             args[j] = make_number (charbuf[i]);
7258           else
7259             {
7260               i++;
7261               args[j] = make_number (charbuf[i] % 0x100);
7262             }
7263         }
7264       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7265     }
7266   compose_text (pos, to, components, Qnil, coding->dst_object);
7267 }
7268
7269
7270 /* Put `charset' property on text in CODING->object according to
7271    the annotation data at CHARBUF.  CHARBUF is an array:
7272      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7273  */
7274
7275 static void
7276 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7277 {
7278   ptrdiff_t from = pos - charbuf[2];
7279   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7280
7281   Fput_text_property (make_number (from), make_number (pos),
7282                       Qcharset, CHARSET_NAME (charset),
7283                       coding->dst_object);
7284 }
7285
7286
7287 #define CHARBUF_SIZE 0x4000
7288
7289 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
7290   do {                                                                  \
7291     coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));        \
7292     coding->charbuf_size = CHARBUF_SIZE;                                \
7293   } while (0)
7294
7295
7296 static void
7297 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7298 {
7299   int *charbuf = coding->charbuf;
7300   int *charbuf_end = charbuf + coding->charbuf_used;
7301
7302   if (NILP (coding->dst_object))
7303     return;
7304
7305   while (charbuf < charbuf_end)
7306     {
7307       if (*charbuf >= 0)
7308         pos++, charbuf++;
7309       else
7310         {
7311           int len = -*charbuf;
7312
7313           if (len > 2)
7314             switch (charbuf[1])
7315               {
7316               case CODING_ANNOTATE_COMPOSITION_MASK:
7317                 produce_composition (coding, charbuf, pos);
7318                 break;
7319               case CODING_ANNOTATE_CHARSET_MASK:
7320                 produce_charset (coding, charbuf, pos);
7321                 break;
7322               }
7323           charbuf += len;
7324         }
7325     }
7326 }
7327
7328 /* Decode the data at CODING->src_object into CODING->dst_object.
7329    CODING->src_object is a buffer, a string, or nil.
7330    CODING->dst_object is a buffer.
7331
7332    If CODING->src_object is a buffer, it must be the current buffer.
7333    In this case, if CODING->src_pos is positive, it is a position of
7334    the source text in the buffer, otherwise, the source text is in the
7335    gap area of the buffer, and CODING->src_pos specifies the offset of
7336    the text from GPT (which must be the same as PT).  If this is the
7337    same buffer as CODING->dst_object, CODING->src_pos must be
7338    negative.
7339
7340    If CODING->src_object is a string, CODING->src_pos is an index to
7341    that string.
7342
7343    If CODING->src_object is nil, CODING->source must already point to
7344    the non-relocatable memory area.  In this case, CODING->src_pos is
7345    an offset from CODING->source.
7346
7347    The decoded data is inserted at the current point of the buffer
7348    CODING->dst_object.
7349 */
7350
7351 static void
7352 decode_coding (struct coding_system *coding)
7353 {
7354   Lisp_Object attrs;
7355   Lisp_Object undo_list;
7356   Lisp_Object translation_table;
7357   struct ccl_spec cclspec;
7358   int carryover;
7359   int i;
7360
7361   USE_SAFE_ALLOCA;
7362
7363   if (BUFFERP (coding->src_object)
7364       && coding->src_pos > 0
7365       && coding->src_pos < GPT
7366       && coding->src_pos + coding->src_chars > GPT)
7367     move_gap_both (coding->src_pos, coding->src_pos_byte);
7368
7369   undo_list = Qt;
7370   if (BUFFERP (coding->dst_object))
7371     {
7372       set_buffer_internal (XBUFFER (coding->dst_object));
7373       if (GPT != PT)
7374         move_gap_both (PT, PT_BYTE);
7375
7376       /* We must disable undo_list in order to record the whole insert
7377          transaction via record_insert at the end.  But doing so also
7378          disables the recording of the first change to the undo_list.
7379          Therefore we check for first change here and record it via
7380          record_first_change if needed.  */
7381       if (MODIFF <= SAVE_MODIFF)
7382         record_first_change ();
7383
7384       undo_list = BVAR (current_buffer, undo_list);
7385       bset_undo_list (current_buffer, Qt);
7386     }
7387
7388   coding->consumed = coding->consumed_char = 0;
7389   coding->produced = coding->produced_char = 0;
7390   coding->chars_at_source = 0;
7391   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7392   coding->errors = 0;
7393
7394   ALLOC_CONVERSION_WORK_AREA (coding);
7395
7396   attrs = CODING_ID_ATTRS (coding->id);
7397   translation_table = get_translation_table (attrs, 0, NULL);
7398
7399   carryover = 0;
7400   if (coding->decoder == decode_coding_ccl)
7401     {
7402       coding->spec.ccl = &cclspec;
7403       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7404     }
7405   do
7406     {
7407       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7408
7409       coding_set_source (coding);
7410       coding->annotated = 0;
7411       coding->charbuf_used = carryover;
7412       (*(coding->decoder)) (coding);
7413       coding_set_destination (coding);
7414       carryover = produce_chars (coding, translation_table, 0);
7415       if (coding->annotated)
7416         produce_annotation (coding, pos);
7417       for (i = 0; i < carryover; i++)
7418         coding->charbuf[i]
7419           = coding->charbuf[coding->charbuf_used - carryover + i];
7420     }
7421   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7422          || (coding->consumed < coding->src_bytes
7423              && (coding->result == CODING_RESULT_SUCCESS
7424                  || coding->result == CODING_RESULT_INVALID_SRC)));
7425
7426   if (carryover > 0)
7427     {
7428       coding_set_destination (coding);
7429       coding->charbuf_used = carryover;
7430       produce_chars (coding, translation_table, 1);
7431     }
7432
7433   coding->carryover_bytes = 0;
7434   if (coding->consumed < coding->src_bytes)
7435     {
7436       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7437       const unsigned char *src;
7438
7439       coding_set_source (coding);
7440       coding_set_destination (coding);
7441       src = coding->source + coding->consumed;
7442
7443       if (coding->mode & CODING_MODE_LAST_BLOCK)
7444         {
7445           /* Flush out unprocessed data as binary chars.  We are sure
7446              that the number of data is less than the size of
7447              coding->charbuf.  */
7448           coding->charbuf_used = 0;
7449           coding->chars_at_source = 0;
7450
7451           while (nbytes-- > 0)
7452             {
7453               int c = *src++;
7454
7455               if (c & 0x80)
7456                 c = BYTE8_TO_CHAR (c);
7457               coding->charbuf[coding->charbuf_used++] = c;
7458             }
7459           produce_chars (coding, Qnil, 1);
7460         }
7461       else
7462         {
7463           /* Record unprocessed bytes in coding->carryover.  We are
7464              sure that the number of data is less than the size of
7465              coding->carryover.  */
7466           unsigned char *p = coding->carryover;
7467
7468           if (nbytes > sizeof coding->carryover)
7469             nbytes = sizeof coding->carryover;
7470           coding->carryover_bytes = nbytes;
7471           while (nbytes-- > 0)
7472             *p++ = *src++;
7473         }
7474       coding->consumed = coding->src_bytes;
7475     }
7476
7477   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7478       && !inhibit_eol_conversion)
7479     decode_eol (coding);
7480   if (BUFFERP (coding->dst_object))
7481     {
7482       bset_undo_list (current_buffer, undo_list);
7483       record_insert (coding->dst_pos, coding->produced_char);
7484     }
7485
7486   SAFE_FREE ();
7487 }
7488
7489
7490 /* Extract an annotation datum from a composition starting at POS and
7491    ending before LIMIT of CODING->src_object (buffer or string), store
7492    the data in BUF, set *STOP to a starting position of the next
7493    composition (if any) or to LIMIT, and return the address of the
7494    next element of BUF.
7495
7496    If such an annotation is not found, set *STOP to a starting
7497    position of a composition after POS (if any) or to LIMIT, and
7498    return BUF.  */
7499
7500 static int *
7501 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7502                                struct coding_system *coding, int *buf,
7503                                ptrdiff_t *stop)
7504 {
7505   ptrdiff_t start, end;
7506   Lisp_Object prop;
7507
7508   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7509       || end > limit)
7510     *stop = limit;
7511   else if (start > pos)
7512     *stop = start;
7513   else
7514     {
7515       if (start == pos)
7516         {
7517           /* We found a composition.  Store the corresponding
7518              annotation data in BUF.  */
7519           int *head = buf;
7520           enum composition_method method = composition_method (prop);
7521           int nchars = COMPOSITION_LENGTH (prop);
7522
7523           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7524           if (method != COMPOSITION_RELATIVE)
7525             {
7526               Lisp_Object components;
7527               ptrdiff_t i, len, i_byte;
7528
7529               components = COMPOSITION_COMPONENTS (prop);
7530               if (VECTORP (components))
7531                 {
7532                   len = ASIZE (components);
7533                   for (i = 0; i < len; i++)
7534                     *buf++ = XINT (AREF (components, i));
7535                 }
7536               else if (STRINGP (components))
7537                 {
7538                   len = SCHARS (components);
7539                   i = i_byte = 0;
7540                   while (i < len)
7541                     {
7542                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7543                       buf++;
7544                     }
7545                 }
7546               else if (INTEGERP (components))
7547                 {
7548                   len = 1;
7549                   *buf++ = XINT (components);
7550                 }
7551               else if (CONSP (components))
7552                 {
7553                   for (len = 0; CONSP (components);
7554                        len++, components = XCDR (components))
7555                     *buf++ = XINT (XCAR (components));
7556                 }
7557               else
7558                 emacs_abort ();
7559               *head -= len;
7560             }
7561         }
7562
7563       if (find_composition (end, limit, &start, &end, &prop,
7564                             coding->src_object)
7565           && end <= limit)
7566         *stop = start;
7567       else
7568         *stop = limit;
7569     }
7570   return buf;
7571 }
7572
7573
7574 /* Extract an annotation datum from a text property `charset' at POS of
7575    CODING->src_object (buffer of string), store the data in BUF, set
7576    *STOP to the position where the value of `charset' property changes
7577    (limiting by LIMIT), and return the address of the next element of
7578    BUF.
7579
7580    If the property value is nil, set *STOP to the position where the
7581    property value is non-nil (limiting by LIMIT), and return BUF.  */
7582
7583 static int *
7584 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7585                            struct coding_system *coding, int *buf,
7586                            ptrdiff_t *stop)
7587 {
7588   Lisp_Object val, next;
7589   int id;
7590
7591   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7592   if (! NILP (val) && CHARSETP (val))
7593     id = XINT (CHARSET_SYMBOL_ID (val));
7594   else
7595     id = -1;
7596   ADD_CHARSET_DATA (buf, 0, id);
7597   next = Fnext_single_property_change (make_number (pos), Qcharset,
7598                                        coding->src_object,
7599                                        make_number (limit));
7600   *stop = XINT (next);
7601   return buf;
7602 }
7603
7604
7605 static void
7606 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7607                int max_lookup)
7608 {
7609   int *buf = coding->charbuf;
7610   int *buf_end = coding->charbuf + coding->charbuf_size;
7611   const unsigned char *src = coding->source + coding->consumed;
7612   const unsigned char *src_end = coding->source + coding->src_bytes;
7613   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7614   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7615   bool multibytep = coding->src_multibyte;
7616   Lisp_Object eol_type;
7617   int c;
7618   ptrdiff_t stop, stop_composition, stop_charset;
7619   int *lookup_buf = NULL;
7620
7621   if (! NILP (translation_table))
7622     lookup_buf = alloca (sizeof (int) * max_lookup);
7623
7624   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7625   if (VECTORP (eol_type))
7626     eol_type = Qunix;
7627
7628   /* Note: composition handling is not yet implemented.  */
7629   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7630
7631   if (NILP (coding->src_object))
7632     stop = stop_composition = stop_charset = end_pos;
7633   else
7634     {
7635       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7636         stop = stop_composition = pos;
7637       else
7638         stop = stop_composition = end_pos;
7639       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7640         stop = stop_charset = pos;
7641       else
7642         stop_charset = end_pos;
7643     }
7644
7645   /* Compensate for CRLF and conversion.  */
7646   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7647   while (buf < buf_end)
7648     {
7649       Lisp_Object trans;
7650
7651       if (pos == stop)
7652         {
7653           if (pos == end_pos)
7654             break;
7655           if (pos == stop_composition)
7656             buf = handle_composition_annotation (pos, end_pos, coding,
7657                                                  buf, &stop_composition);
7658           if (pos == stop_charset)
7659             buf = handle_charset_annotation (pos, end_pos, coding,
7660                                              buf, &stop_charset);
7661           stop = (stop_composition < stop_charset
7662                   ? stop_composition : stop_charset);
7663         }
7664
7665       if (! multibytep)
7666         {
7667           int bytes;
7668
7669           if (coding->encoder == encode_coding_raw_text
7670               || coding->encoder == encode_coding_ccl)
7671             c = *src++, pos++;
7672           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7673             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7674           else
7675             c = BYTE8_TO_CHAR (*src), src++, pos++;
7676         }
7677       else
7678         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7679       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7680         c = '\n';
7681       if (! EQ (eol_type, Qunix))
7682         {
7683           if (c == '\n')
7684             {
7685               if (EQ (eol_type, Qdos))
7686                 *buf++ = '\r';
7687               else
7688                 c = '\r';
7689             }
7690         }
7691
7692       trans = Qnil;
7693       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7694       if (NILP (trans))
7695         *buf++ = c;
7696       else
7697         {
7698           ptrdiff_t from_nchars = 1, to_nchars = 1;
7699           int *lookup_buf_end;
7700           const unsigned char *p = src;
7701           int i;
7702
7703           lookup_buf[0] = c;
7704           for (i = 1; i < max_lookup && p < src_end; i++)
7705             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7706           lookup_buf_end = lookup_buf + i;
7707           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7708           if (INTEGERP (trans))
7709             c = XINT (trans);
7710           else if (CONSP (trans))
7711             {
7712               from_nchars = ASIZE (XCAR (trans));
7713               trans = XCDR (trans);
7714               if (INTEGERP (trans))
7715                 c = XINT (trans);
7716               else
7717                 {
7718                   to_nchars = ASIZE (trans);
7719                   if (buf_end - buf < to_nchars)
7720                     break;
7721                   c = XINT (AREF (trans, 0));
7722                 }
7723             }
7724           else
7725             break;
7726           *buf++ = c;
7727           for (i = 1; i < to_nchars; i++)
7728             *buf++ = XINT (AREF (trans, i));
7729           for (i = 1; i < from_nchars; i++, pos++)
7730             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7731         }
7732     }
7733
7734   coding->consumed = src - coding->source;
7735   coding->consumed_char = pos - coding->src_pos;
7736   coding->charbuf_used = buf - coding->charbuf;
7737   coding->chars_at_source = 0;
7738 }
7739
7740
7741 /* Encode the text at CODING->src_object into CODING->dst_object.
7742    CODING->src_object is a buffer or a string.
7743    CODING->dst_object is a buffer or nil.
7744
7745    If CODING->src_object is a buffer, it must be the current buffer.
7746    In this case, if CODING->src_pos is positive, it is a position of
7747    the source text in the buffer, otherwise. the source text is in the
7748    gap area of the buffer, and coding->src_pos specifies the offset of
7749    the text from GPT (which must be the same as PT).  If this is the
7750    same buffer as CODING->dst_object, CODING->src_pos must be
7751    negative and CODING should not have `pre-write-conversion'.
7752
7753    If CODING->src_object is a string, CODING should not have
7754    `pre-write-conversion'.
7755
7756    If CODING->dst_object is a buffer, the encoded data is inserted at
7757    the current point of that buffer.
7758
7759    If CODING->dst_object is nil, the encoded data is placed at the
7760    memory area specified by CODING->destination.  */
7761
7762 static void
7763 encode_coding (struct coding_system *coding)
7764 {
7765   Lisp_Object attrs;
7766   Lisp_Object translation_table;
7767   int max_lookup;
7768   struct ccl_spec cclspec;
7769
7770   USE_SAFE_ALLOCA;
7771
7772   attrs = CODING_ID_ATTRS (coding->id);
7773   if (coding->encoder == encode_coding_raw_text)
7774     translation_table = Qnil, max_lookup = 0;
7775   else
7776     translation_table = get_translation_table (attrs, 1, &max_lookup);
7777
7778   if (BUFFERP (coding->dst_object))
7779     {
7780       set_buffer_internal (XBUFFER (coding->dst_object));
7781       coding->dst_multibyte
7782         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7783     }
7784
7785   coding->consumed = coding->consumed_char = 0;
7786   coding->produced = coding->produced_char = 0;
7787   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7788   coding->errors = 0;
7789
7790   ALLOC_CONVERSION_WORK_AREA (coding);
7791
7792   if (coding->encoder == encode_coding_ccl)
7793     {
7794       coding->spec.ccl = &cclspec;
7795       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7796     }
7797   do {
7798     coding_set_source (coding);
7799     consume_chars (coding, translation_table, max_lookup);
7800     coding_set_destination (coding);
7801     (*(coding->encoder)) (coding);
7802   } while (coding->consumed_char < coding->src_chars);
7803
7804   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7805     insert_from_gap (coding->produced_char, coding->produced, 0);
7806
7807   SAFE_FREE ();
7808 }
7809
7810
7811 /* Name (or base name) of work buffer for code conversion.  */
7812 static Lisp_Object Vcode_conversion_workbuf_name;
7813
7814 /* A working buffer used by the top level conversion.  Once it is
7815    created, it is never destroyed.  It has the name
7816    Vcode_conversion_workbuf_name.  The other working buffers are
7817    destroyed after the use is finished, and their names are modified
7818    versions of Vcode_conversion_workbuf_name.  */
7819 static Lisp_Object Vcode_conversion_reused_workbuf;
7820
7821 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7822 static bool reused_workbuf_in_use;
7823
7824
7825 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7826    multibyteness of returning buffer.  */
7827
7828 static Lisp_Object
7829 make_conversion_work_buffer (bool multibyte)
7830 {
7831   Lisp_Object name, workbuf;
7832   struct buffer *current;
7833
7834   if (reused_workbuf_in_use)
7835     {
7836       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7837       workbuf = Fget_buffer_create (name);
7838     }
7839   else
7840     {
7841       reused_workbuf_in_use = 1;
7842       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7843         Vcode_conversion_reused_workbuf
7844           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7845       workbuf = Vcode_conversion_reused_workbuf;
7846     }
7847   current = current_buffer;
7848   set_buffer_internal (XBUFFER (workbuf));
7849   /* We can't allow modification hooks to run in the work buffer.  For
7850      instance, directory_files_internal assumes that file decoding
7851      doesn't compile new regexps.  */
7852   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7853   Ferase_buffer ();
7854   bset_undo_list (current_buffer, Qt);
7855   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7856   set_buffer_internal (current);
7857   return workbuf;
7858 }
7859
7860
7861 static void
7862 code_conversion_restore (Lisp_Object arg)
7863 {
7864   Lisp_Object current, workbuf;
7865   struct gcpro gcpro1;
7866
7867   GCPRO1 (arg);
7868   current = XCAR (arg);
7869   workbuf = XCDR (arg);
7870   if (! NILP (workbuf))
7871     {
7872       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7873         reused_workbuf_in_use = 0;
7874       else
7875         Fkill_buffer (workbuf);
7876     }
7877   set_buffer_internal (XBUFFER (current));
7878   UNGCPRO;
7879 }
7880
7881 Lisp_Object
7882 code_conversion_save (bool with_work_buf, bool multibyte)
7883 {
7884   Lisp_Object workbuf = Qnil;
7885
7886   if (with_work_buf)
7887     workbuf = make_conversion_work_buffer (multibyte);
7888   record_unwind_protect (code_conversion_restore,
7889                          Fcons (Fcurrent_buffer (), workbuf));
7890   return workbuf;
7891 }
7892
7893 void
7894 decode_coding_gap (struct coding_system *coding,
7895                    ptrdiff_t chars, ptrdiff_t bytes)
7896 {
7897   ptrdiff_t count = SPECPDL_INDEX ();
7898   Lisp_Object attrs;
7899
7900   coding->src_object = Fcurrent_buffer ();
7901   coding->src_chars = chars;
7902   coding->src_bytes = bytes;
7903   coding->src_pos = -chars;
7904   coding->src_pos_byte = -bytes;
7905   coding->src_multibyte = chars < bytes;
7906   coding->dst_object = coding->src_object;
7907   coding->dst_pos = PT;
7908   coding->dst_pos_byte = PT_BYTE;
7909   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7910
7911   coding->head_ascii = -1;
7912   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7913   coding->eol_seen = EOL_SEEN_NONE;
7914   if (CODING_REQUIRE_DETECTION (coding))
7915     detect_coding (coding);
7916   attrs = CODING_ID_ATTRS (coding->id);
7917   if (! disable_ascii_optimization
7918       && ! coding->src_multibyte
7919       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7920       && NILP (CODING_ATTR_POST_READ (attrs))
7921       && NILP (get_translation_table (attrs, 0, NULL)))
7922     {
7923       chars = coding->head_ascii;
7924       if (chars < 0)
7925         chars = check_ascii (coding);
7926       if (chars != bytes)
7927         {
7928           /* There exists a non-ASCII byte.  */
7929           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7930               && coding->detected_utf8_bytes == coding->src_bytes)
7931             {
7932               if (coding->detected_utf8_chars >= 0)
7933                 chars = coding->detected_utf8_chars;
7934               else
7935                 chars = check_utf_8 (coding);
7936               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7937                   && coding->head_ascii == 0
7938                   && coding->source[0] == UTF_8_BOM_1
7939                   && coding->source[1] == UTF_8_BOM_2
7940                   && coding->source[2] == UTF_8_BOM_3)
7941                 {
7942                   chars--;
7943                   bytes -= 3;
7944                   coding->src_bytes -= 3;
7945                 }
7946             }
7947           else
7948             chars = -1;
7949         }
7950       if (chars >= 0)
7951         {
7952           Lisp_Object eol_type;
7953
7954           eol_type = CODING_ID_EOL_TYPE (coding->id);
7955           if (VECTORP (eol_type))
7956             {
7957               if (coding->eol_seen != EOL_SEEN_NONE)
7958                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7959             }
7960           if (EQ (eol_type, Qmac))
7961             {
7962               unsigned char *src_end = GAP_END_ADDR;
7963               unsigned char *src = src_end - coding->src_bytes;
7964
7965               while (src < src_end)
7966                 {
7967                   if (*src++ == '\r')
7968                     src[-1] = '\n';
7969                 }
7970             }
7971           else if (EQ (eol_type, Qdos))
7972             {
7973               unsigned char *src = GAP_END_ADDR;
7974               unsigned char *src_beg = src - coding->src_bytes;
7975               unsigned char *dst = src;
7976               ptrdiff_t diff;
7977
7978               while (src_beg < src)
7979                 {
7980                   *--dst = *--src;
7981                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7982                     src--;
7983                 }
7984               diff = dst - src;
7985               bytes -= diff;
7986               chars -= diff;
7987             }
7988           coding->produced = bytes;
7989           coding->produced_char = chars;
7990           insert_from_gap (chars, bytes, 1);
7991           return;
7992         }
7993     }
7994   code_conversion_save (0, 0);
7995
7996   coding->mode |= CODING_MODE_LAST_BLOCK;
7997   current_buffer->text->inhibit_shrinking = 1;
7998   decode_coding (coding);
7999   current_buffer->text->inhibit_shrinking = 0;
8000
8001   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8002     {
8003       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8004       Lisp_Object val;
8005
8006       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8007       val = call1 (CODING_ATTR_POST_READ (attrs),
8008                    make_number (coding->produced_char));
8009       CHECK_NATNUM (val);
8010       coding->produced_char += Z - prev_Z;
8011       coding->produced += Z_BYTE - prev_Z_BYTE;
8012     }
8013
8014   unbind_to (count, Qnil);
8015 }
8016
8017
8018 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
8019    SRC_OBJECT into DST_OBJECT by coding context CODING.
8020
8021    SRC_OBJECT is a buffer, a string, or Qnil.
8022
8023    If it is a buffer, the text is at point of the buffer.  FROM and TO
8024    are positions in the buffer.
8025
8026    If it is a string, the text is at the beginning of the string.
8027    FROM and TO are indices to the string.
8028
8029    If it is nil, the text is at coding->source.  FROM and TO are
8030    indices to coding->source.
8031
8032    DST_OBJECT is a buffer, Qt, or Qnil.
8033
8034    If it is a buffer, the decoded text is inserted at point of the
8035    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8036    is deleted.
8037
8038    If it is Qt, a string is made from the decoded text, and
8039    set in CODING->dst_object.
8040
8041    If it is Qnil, the decoded text is stored at CODING->destination.
8042    The caller must allocate CODING->dst_bytes bytes at
8043    CODING->destination by xmalloc.  If the decoded text is longer than
8044    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8045  */
8046
8047 void
8048 decode_coding_object (struct coding_system *coding,
8049                       Lisp_Object src_object,
8050                       ptrdiff_t from, ptrdiff_t from_byte,
8051                       ptrdiff_t to, ptrdiff_t to_byte,
8052                       Lisp_Object dst_object)
8053 {
8054   ptrdiff_t count = SPECPDL_INDEX ();
8055   unsigned char *destination IF_LINT (= NULL);
8056   ptrdiff_t dst_bytes IF_LINT (= 0);
8057   ptrdiff_t chars = to - from;
8058   ptrdiff_t bytes = to_byte - from_byte;
8059   Lisp_Object attrs;
8060   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8061   bool need_marker_adjustment = 0;
8062   Lisp_Object old_deactivate_mark;
8063
8064   old_deactivate_mark = Vdeactivate_mark;
8065
8066   if (NILP (dst_object))
8067     {
8068       destination = coding->destination;
8069       dst_bytes = coding->dst_bytes;
8070     }
8071
8072   coding->src_object = src_object;
8073   coding->src_chars = chars;
8074   coding->src_bytes = bytes;
8075   coding->src_multibyte = chars < bytes;
8076
8077   if (STRINGP (src_object))
8078     {
8079       coding->src_pos = from;
8080       coding->src_pos_byte = from_byte;
8081     }
8082   else if (BUFFERP (src_object))
8083     {
8084       set_buffer_internal (XBUFFER (src_object));
8085       if (from != GPT)
8086         move_gap_both (from, from_byte);
8087       if (EQ (src_object, dst_object))
8088         {
8089           struct Lisp_Marker *tail;
8090
8091           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8092             {
8093               tail->need_adjustment
8094                 = tail->charpos == (tail->insertion_type ? from : to);
8095               need_marker_adjustment |= tail->need_adjustment;
8096             }
8097           saved_pt = PT, saved_pt_byte = PT_BYTE;
8098           TEMP_SET_PT_BOTH (from, from_byte);
8099           current_buffer->text->inhibit_shrinking = 1;
8100           del_range_both (from, from_byte, to, to_byte, 1);
8101           coding->src_pos = -chars;
8102           coding->src_pos_byte = -bytes;
8103         }
8104       else
8105         {
8106           coding->src_pos = from;
8107           coding->src_pos_byte = from_byte;
8108         }
8109     }
8110
8111   if (CODING_REQUIRE_DETECTION (coding))
8112     detect_coding (coding);
8113   attrs = CODING_ID_ATTRS (coding->id);
8114
8115   if (EQ (dst_object, Qt)
8116       || (! NILP (CODING_ATTR_POST_READ (attrs))
8117           && NILP (dst_object)))
8118     {
8119       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8120       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8121       coding->dst_pos = BEG;
8122       coding->dst_pos_byte = BEG_BYTE;
8123     }
8124   else if (BUFFERP (dst_object))
8125     {
8126       code_conversion_save (0, 0);
8127       coding->dst_object = dst_object;
8128       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8129       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8130       coding->dst_multibyte
8131         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8132     }
8133   else
8134     {
8135       code_conversion_save (0, 0);
8136       coding->dst_object = Qnil;
8137       /* Most callers presume this will return a multibyte result, and they
8138          won't use `binary' or `raw-text' anyway, so let's not worry about
8139          CODING_FOR_UNIBYTE.  */
8140       coding->dst_multibyte = 1;
8141     }
8142
8143   decode_coding (coding);
8144
8145   if (BUFFERP (coding->dst_object))
8146     set_buffer_internal (XBUFFER (coding->dst_object));
8147
8148   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8149     {
8150       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8151       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8152       Lisp_Object val;
8153
8154       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8155       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8156               old_deactivate_mark);
8157       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8158                         make_number (coding->produced_char));
8159       UNGCPRO;
8160       CHECK_NATNUM (val);
8161       coding->produced_char += Z - prev_Z;
8162       coding->produced += Z_BYTE - prev_Z_BYTE;
8163     }
8164
8165   if (EQ (dst_object, Qt))
8166     {
8167       coding->dst_object = Fbuffer_string ();
8168     }
8169   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8170     {
8171       set_buffer_internal (XBUFFER (coding->dst_object));
8172       if (dst_bytes < coding->produced)
8173         {
8174           eassert (coding->produced > 0);
8175           destination = xrealloc (destination, coding->produced);
8176           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8177             move_gap_both (BEGV, BEGV_BYTE);
8178           memcpy (destination, BEGV_ADDR, coding->produced);
8179           coding->destination = destination;
8180         }
8181     }
8182
8183   if (saved_pt >= 0)
8184     {
8185       /* This is the case of:
8186          (BUFFERP (src_object) && EQ (src_object, dst_object))
8187          As we have moved PT while replacing the original buffer
8188          contents, we must recover it now.  */
8189       set_buffer_internal (XBUFFER (src_object));
8190       current_buffer->text->inhibit_shrinking = 0;
8191       if (saved_pt < from)
8192         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8193       else if (saved_pt < from + chars)
8194         TEMP_SET_PT_BOTH (from, from_byte);
8195       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8196         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8197                           saved_pt_byte + (coding->produced - bytes));
8198       else
8199         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8200                           saved_pt_byte + (coding->produced - bytes));
8201
8202       if (need_marker_adjustment)
8203         {
8204           struct Lisp_Marker *tail;
8205
8206           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8207             if (tail->need_adjustment)
8208               {
8209                 tail->need_adjustment = 0;
8210                 if (tail->insertion_type)
8211                   {
8212                     tail->bytepos = from_byte;
8213                     tail->charpos = from;
8214                   }
8215                 else
8216                   {
8217                     tail->bytepos = from_byte + coding->produced;
8218                     tail->charpos
8219                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8220                          ? tail->bytepos : from + coding->produced_char);
8221                   }
8222               }
8223         }
8224     }
8225
8226   Vdeactivate_mark = old_deactivate_mark;
8227   unbind_to (count, coding->dst_object);
8228 }
8229
8230
8231 void
8232 encode_coding_object (struct coding_system *coding,
8233                       Lisp_Object src_object,
8234                       ptrdiff_t from, ptrdiff_t from_byte,
8235                       ptrdiff_t to, ptrdiff_t to_byte,
8236                       Lisp_Object dst_object)
8237 {
8238   ptrdiff_t count = SPECPDL_INDEX ();
8239   ptrdiff_t chars = to - from;
8240   ptrdiff_t bytes = to_byte - from_byte;
8241   Lisp_Object attrs;
8242   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8243   bool need_marker_adjustment = 0;
8244   bool kill_src_buffer = 0;
8245   Lisp_Object old_deactivate_mark;
8246
8247   old_deactivate_mark = Vdeactivate_mark;
8248
8249   coding->src_object = src_object;
8250   coding->src_chars = chars;
8251   coding->src_bytes = bytes;
8252   coding->src_multibyte = chars < bytes;
8253
8254   attrs = CODING_ID_ATTRS (coding->id);
8255
8256   if (EQ (src_object, dst_object))
8257     {
8258       struct Lisp_Marker *tail;
8259
8260       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8261         {
8262           tail->need_adjustment
8263             = tail->charpos == (tail->insertion_type ? from : to);
8264           need_marker_adjustment |= tail->need_adjustment;
8265         }
8266     }
8267
8268   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8269     {
8270       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8271       set_buffer_internal (XBUFFER (coding->src_object));
8272       if (STRINGP (src_object))
8273         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8274       else if (BUFFERP (src_object))
8275         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8276       else
8277         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8278
8279       if (EQ (src_object, dst_object))
8280         {
8281           set_buffer_internal (XBUFFER (src_object));
8282           saved_pt = PT, saved_pt_byte = PT_BYTE;
8283           del_range_both (from, from_byte, to, to_byte, 1);
8284           set_buffer_internal (XBUFFER (coding->src_object));
8285         }
8286
8287       {
8288         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8289
8290         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8291                 old_deactivate_mark);
8292         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8293                     make_number (BEG), make_number (Z));
8294         UNGCPRO;
8295       }
8296       if (XBUFFER (coding->src_object) != current_buffer)
8297         kill_src_buffer = 1;
8298       coding->src_object = Fcurrent_buffer ();
8299       if (BEG != GPT)
8300         move_gap_both (BEG, BEG_BYTE);
8301       coding->src_chars = Z - BEG;
8302       coding->src_bytes = Z_BYTE - BEG_BYTE;
8303       coding->src_pos = BEG;
8304       coding->src_pos_byte = BEG_BYTE;
8305       coding->src_multibyte = Z < Z_BYTE;
8306     }
8307   else if (STRINGP (src_object))
8308     {
8309       code_conversion_save (0, 0);
8310       coding->src_pos = from;
8311       coding->src_pos_byte = from_byte;
8312     }
8313   else if (BUFFERP (src_object))
8314     {
8315       code_conversion_save (0, 0);
8316       set_buffer_internal (XBUFFER (src_object));
8317       if (EQ (src_object, dst_object))
8318         {
8319           saved_pt = PT, saved_pt_byte = PT_BYTE;
8320           coding->src_object = del_range_1 (from, to, 1, 1);
8321           coding->src_pos = 0;
8322           coding->src_pos_byte = 0;
8323         }
8324       else
8325         {
8326           if (from < GPT && to >= GPT)
8327             move_gap_both (from, from_byte);
8328           coding->src_pos = from;
8329           coding->src_pos_byte = from_byte;
8330         }
8331     }
8332   else
8333     code_conversion_save (0, 0);
8334
8335   if (BUFFERP (dst_object))
8336     {
8337       coding->dst_object = dst_object;
8338       if (EQ (src_object, dst_object))
8339         {
8340           coding->dst_pos = from;
8341           coding->dst_pos_byte = from_byte;
8342         }
8343       else
8344         {
8345           struct buffer *current = current_buffer;
8346
8347           set_buffer_temp (XBUFFER (dst_object));
8348           coding->dst_pos = PT;
8349           coding->dst_pos_byte = PT_BYTE;
8350           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8351           set_buffer_temp (current);
8352         }
8353       coding->dst_multibyte
8354         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8355     }
8356   else if (EQ (dst_object, Qt))
8357     {
8358       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8359       coding->dst_object = Qnil;
8360       coding->destination = xmalloc (dst_bytes);
8361       coding->dst_bytes = dst_bytes;
8362       coding->dst_multibyte = 0;
8363     }
8364   else
8365     {
8366       coding->dst_object = Qnil;
8367       coding->dst_multibyte = 0;
8368     }
8369
8370   encode_coding (coding);
8371
8372   if (EQ (dst_object, Qt))
8373     {
8374       if (BUFFERP (coding->dst_object))
8375         coding->dst_object = Fbuffer_string ();
8376       else if (coding->raw_destination)
8377         /* This is used to avoid creating huge Lisp string.
8378            NOTE: caller who sets `raw_destination' is also
8379            responsible for freeing `destination' buffer.  */
8380         coding->dst_object = Qnil;
8381       else
8382         {
8383           coding->dst_object
8384             = make_unibyte_string ((char *) coding->destination,
8385                                    coding->produced);
8386           xfree (coding->destination);
8387         }
8388     }
8389
8390   if (saved_pt >= 0)
8391     {
8392       /* This is the case of:
8393          (BUFFERP (src_object) && EQ (src_object, dst_object))
8394          As we have moved PT while replacing the original buffer
8395          contents, we must recover it now.  */
8396       set_buffer_internal (XBUFFER (src_object));
8397       if (saved_pt < from)
8398         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8399       else if (saved_pt < from + chars)
8400         TEMP_SET_PT_BOTH (from, from_byte);
8401       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8402         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8403                           saved_pt_byte + (coding->produced - bytes));
8404       else
8405         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8406                           saved_pt_byte + (coding->produced - bytes));
8407
8408       if (need_marker_adjustment)
8409         {
8410           struct Lisp_Marker *tail;
8411
8412           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8413             if (tail->need_adjustment)
8414               {
8415                 tail->need_adjustment = 0;
8416                 if (tail->insertion_type)
8417                   {
8418                     tail->bytepos = from_byte;
8419                     tail->charpos = from;
8420                   }
8421                 else
8422                   {
8423                     tail->bytepos = from_byte + coding->produced;
8424                     tail->charpos
8425                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8426                          ? tail->bytepos : from + coding->produced_char);
8427                   }
8428               }
8429         }
8430     }
8431
8432   if (kill_src_buffer)
8433     Fkill_buffer (coding->src_object);
8434
8435   Vdeactivate_mark = old_deactivate_mark;
8436   unbind_to (count, Qnil);
8437 }
8438
8439
8440 Lisp_Object
8441 preferred_coding_system (void)
8442 {
8443   int id = coding_categories[coding_priorities[0]].id;
8444
8445   return CODING_ID_NAME (id);
8446 }
8447
8448 #if defined (WINDOWSNT) || defined (CYGWIN)
8449
8450 Lisp_Object
8451 from_unicode (Lisp_Object str)
8452 {
8453   CHECK_STRING (str);
8454   if (!STRING_MULTIBYTE (str) &&
8455       SBYTES (str) & 1)
8456     {
8457       str = Fsubstring (str, make_number (0), make_number (-1));
8458     }
8459
8460   return code_convert_string_norecord (str, Qutf_16le, 0);
8461 }
8462
8463 Lisp_Object
8464 from_unicode_buffer (const wchar_t* wstr)
8465 {
8466     return from_unicode (
8467         make_unibyte_string (
8468             (char*) wstr,
8469             /* we get one of the two final 0 bytes for free. */
8470             1 + sizeof (wchar_t) * wcslen (wstr)));
8471 }
8472
8473 wchar_t *
8474 to_unicode (Lisp_Object str, Lisp_Object *buf)
8475 {
8476   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8477   /* We need to make another copy (in addition to the one made by
8478      code_convert_string_norecord) to ensure that the final string is
8479      _doubly_ zero terminated --- that is, that the string is
8480      terminated by two zero bytes and one utf-16le null character.
8481      Because strings are already terminated with a single zero byte,
8482      we just add one additional zero. */
8483   str = make_uninit_string (SBYTES (*buf) + 1);
8484   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8485   SDATA (str) [SBYTES (*buf)] = '\0';
8486   *buf = str;
8487   return WCSDATA (*buf);
8488 }
8489
8490 #endif /* WINDOWSNT || CYGWIN */
8491
8492 \f
8493 #ifdef emacs
8494 /*** 8. Emacs Lisp library functions ***/
8495
8496 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8497        doc: /* Return t if OBJECT is nil or a coding-system.
8498 See the documentation of `define-coding-system' for information
8499 about coding-system objects.  */)
8500   (Lisp_Object object)
8501 {
8502   if (NILP (object)
8503       || CODING_SYSTEM_ID (object) >= 0)
8504     return Qt;
8505   if (! SYMBOLP (object)
8506       || NILP (Fget (object, Qcoding_system_define_form)))
8507     return Qnil;
8508   return Qt;
8509 }
8510
8511 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8512        Sread_non_nil_coding_system, 1, 1, 0,
8513        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8514   (Lisp_Object prompt)
8515 {
8516   Lisp_Object val;
8517   do
8518     {
8519       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8520                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8521     }
8522   while (SCHARS (val) == 0);
8523   return (Fintern (val, Qnil));
8524 }
8525
8526 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8527        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8528 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8529 Ignores case when completing coding systems (all Emacs coding systems
8530 are lower-case).  */)
8531   (Lisp_Object prompt, Lisp_Object default_coding_system)
8532 {
8533   Lisp_Object val;
8534   ptrdiff_t count = SPECPDL_INDEX ();
8535
8536   if (SYMBOLP (default_coding_system))
8537     default_coding_system = SYMBOL_NAME (default_coding_system);
8538   specbind (Qcompletion_ignore_case, Qt);
8539   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8540                           Qt, Qnil, Qcoding_system_history,
8541                           default_coding_system, Qnil);
8542   unbind_to (count, Qnil);
8543   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8544 }
8545
8546 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8547        1, 1, 0,
8548        doc: /* Check validity of CODING-SYSTEM.
8549 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8550 It is valid if it is nil or a symbol defined as a coding system by the
8551 function `define-coding-system'.  */)
8552   (Lisp_Object coding_system)
8553 {
8554   Lisp_Object define_form;
8555
8556   define_form = Fget (coding_system, Qcoding_system_define_form);
8557   if (! NILP (define_form))
8558     {
8559       Fput (coding_system, Qcoding_system_define_form, Qnil);
8560       safe_eval (define_form);
8561     }
8562   if (!NILP (Fcoding_system_p (coding_system)))
8563     return coding_system;
8564   xsignal1 (Qcoding_system_error, coding_system);
8565 }
8566
8567 \f
8568 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8569    HIGHEST, return the coding system of the highest
8570    priority among the detected coding systems.  Otherwise return a
8571    list of detected coding systems sorted by their priorities.  If
8572    MULTIBYTEP, it is assumed that the bytes are in correct
8573    multibyte form but contains only ASCII and eight-bit chars.
8574    Otherwise, the bytes are raw bytes.
8575
8576    CODING-SYSTEM controls the detection as below:
8577
8578    If it is nil, detect both text-format and eol-format.  If the
8579    text-format part of CODING-SYSTEM is already specified
8580    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8581    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8582    detect only text-format.  */
8583
8584 Lisp_Object
8585 detect_coding_system (const unsigned char *src,
8586                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8587                       bool highest, bool multibytep,
8588                       Lisp_Object coding_system)
8589 {
8590   const unsigned char *src_end = src + src_bytes;
8591   Lisp_Object attrs, eol_type;
8592   Lisp_Object val = Qnil;
8593   struct coding_system coding;
8594   ptrdiff_t id;
8595   struct coding_detection_info detect_info;
8596   enum coding_category base_category;
8597   bool null_byte_found = 0, eight_bit_found = 0;
8598
8599   if (NILP (coding_system))
8600     coding_system = Qundecided;
8601   setup_coding_system (coding_system, &coding);
8602   attrs = CODING_ID_ATTRS (coding.id);
8603   eol_type = CODING_ID_EOL_TYPE (coding.id);
8604   coding_system = CODING_ATTR_BASE_NAME (attrs);
8605
8606   coding.source = src;
8607   coding.src_chars = src_chars;
8608   coding.src_bytes = src_bytes;
8609   coding.src_multibyte = multibytep;
8610   coding.consumed = 0;
8611   coding.mode |= CODING_MODE_LAST_BLOCK;
8612   coding.head_ascii = 0;
8613
8614   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8615
8616   /* At first, detect text-format if necessary.  */
8617   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8618   if (base_category == coding_category_undecided)
8619     {
8620       enum coding_category category IF_LINT (= 0);
8621       struct coding_system *this IF_LINT (= NULL);
8622       int c, i;
8623       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8624                                        inhibit_null_byte_detection);
8625       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8626                                        inhibit_iso_escape_detection);
8627       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8628
8629       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8630       for (; src < src_end; src++)
8631         {
8632           c = *src;
8633           if (c & 0x80)
8634             {
8635               eight_bit_found = 1;
8636               if (null_byte_found)
8637                 break;
8638             }
8639           else if (c < 0x20)
8640             {
8641               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8642                   && ! inhibit_ied
8643                   && ! detect_info.checked)
8644                 {
8645                   if (detect_coding_iso_2022 (&coding, &detect_info))
8646                     {
8647                       /* We have scanned the whole data.  */
8648                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8649                         {
8650                           /* We didn't find an 8-bit code.  We may
8651                              have found a null-byte, but it's very
8652                              rare that a binary file confirm to
8653                              ISO-2022.  */
8654                           src = src_end;
8655                           coding.head_ascii = src - coding.source;
8656                         }
8657                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8658                       break;
8659                     }
8660                 }
8661               else if (! c && !inhibit_nbd)
8662                 {
8663                   null_byte_found = 1;
8664                   if (eight_bit_found)
8665                     break;
8666                 }
8667               if (! eight_bit_found)
8668                 coding.head_ascii++;
8669             }
8670           else if (! eight_bit_found)
8671             coding.head_ascii++;
8672         }
8673
8674       if (null_byte_found || eight_bit_found
8675           || coding.head_ascii < coding.src_bytes
8676           || detect_info.found)
8677         {
8678           if (coding.head_ascii == coding.src_bytes)
8679             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8680             for (i = 0; i < coding_category_raw_text; i++)
8681               {
8682                 category = coding_priorities[i];
8683                 this = coding_categories + category;
8684                 if (detect_info.found & (1 << category))
8685                   break;
8686               }
8687           else
8688             {
8689               if (null_byte_found)
8690                 {
8691                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8692                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8693                 }
8694               else if (prefer_utf_8
8695                        && detect_coding_utf_8 (&coding, &detect_info))
8696                 {
8697                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8698                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8699                 }
8700               for (i = 0; i < coding_category_raw_text; i++)
8701                 {
8702                   category = coding_priorities[i];
8703                   this = coding_categories + category;
8704
8705                   if (this->id < 0)
8706                     {
8707                       /* No coding system of this category is defined.  */
8708                       detect_info.rejected |= (1 << category);
8709                     }
8710                   else if (category >= coding_category_raw_text)
8711                     continue;
8712                   else if (detect_info.checked & (1 << category))
8713                     {
8714                       if (highest
8715                           && (detect_info.found & (1 << category)))
8716                         break;
8717                     }
8718                   else if ((*(this->detector)) (&coding, &detect_info)
8719                            && highest
8720                            && (detect_info.found & (1 << category)))
8721                     {
8722                       if (category == coding_category_utf_16_auto)
8723                         {
8724                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8725                             category = coding_category_utf_16_le;
8726                           else
8727                             category = coding_category_utf_16_be;
8728                         }
8729                       break;
8730                     }
8731                 }
8732             }
8733         }
8734
8735       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8736           || null_byte_found)
8737         {
8738           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8739           id = CODING_SYSTEM_ID (Qno_conversion);
8740           val = list1 (make_number (id));
8741         }
8742       else if (! detect_info.rejected && ! detect_info.found)
8743         {
8744           detect_info.found = CATEGORY_MASK_ANY;
8745           id = coding_categories[coding_category_undecided].id;
8746           val = list1 (make_number (id));
8747         }
8748       else if (highest)
8749         {
8750           if (detect_info.found)
8751             {
8752               detect_info.found = 1 << category;
8753               val = list1 (make_number (this->id));
8754             }
8755           else
8756             for (i = 0; i < coding_category_raw_text; i++)
8757               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8758                 {
8759                   detect_info.found = 1 << coding_priorities[i];
8760                   id = coding_categories[coding_priorities[i]].id;
8761                   val = list1 (make_number (id));
8762                   break;
8763                 }
8764         }
8765       else
8766         {
8767           int mask = detect_info.rejected | detect_info.found;
8768           int found = 0;
8769
8770           for (i = coding_category_raw_text - 1; i >= 0; i--)
8771             {
8772               category = coding_priorities[i];
8773               if (! (mask & (1 << category)))
8774                 {
8775                   found |= 1 << category;
8776                   id = coding_categories[category].id;
8777                   if (id >= 0)
8778                     val = list1 (make_number (id));
8779                 }
8780             }
8781           for (i = coding_category_raw_text - 1; i >= 0; i--)
8782             {
8783               category = coding_priorities[i];
8784               if (detect_info.found & (1 << category))
8785                 {
8786                   id = coding_categories[category].id;
8787                   val = Fcons (make_number (id), val);
8788                 }
8789             }
8790           detect_info.found |= found;
8791         }
8792     }
8793   else if (base_category == coding_category_utf_8_auto)
8794     {
8795       if (detect_coding_utf_8 (&coding, &detect_info))
8796         {
8797           struct coding_system *this;
8798
8799           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8800             this = coding_categories + coding_category_utf_8_sig;
8801           else
8802             this = coding_categories + coding_category_utf_8_nosig;
8803           val = list1 (make_number (this->id));
8804         }
8805     }
8806   else if (base_category == coding_category_utf_16_auto)
8807     {
8808       if (detect_coding_utf_16 (&coding, &detect_info))
8809         {
8810           struct coding_system *this;
8811
8812           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8813             this = coding_categories + coding_category_utf_16_le;
8814           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8815             this = coding_categories + coding_category_utf_16_be;
8816           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8817             this = coding_categories + coding_category_utf_16_be_nosig;
8818           else
8819             this = coding_categories + coding_category_utf_16_le_nosig;
8820           val = list1 (make_number (this->id));
8821         }
8822     }
8823   else
8824     {
8825       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8826       val = list1 (make_number (coding.id));
8827     }
8828
8829   /* Then, detect eol-format if necessary.  */
8830   {
8831     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8832     Lisp_Object tail;
8833
8834     if (VECTORP (eol_type))
8835       {
8836         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8837           {
8838             if (null_byte_found)
8839               normal_eol = EOL_SEEN_LF;
8840             else
8841               normal_eol = detect_eol (coding.source, src_bytes,
8842                                        coding_category_raw_text);
8843           }
8844         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8845                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8846           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8847                                       coding_category_utf_16_be);
8848         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8849                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8850           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8851                                       coding_category_utf_16_le);
8852       }
8853     else
8854       {
8855         if (EQ (eol_type, Qunix))
8856           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8857         else if (EQ (eol_type, Qdos))
8858           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8859         else
8860           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8861       }
8862
8863     for (tail = val; CONSP (tail); tail = XCDR (tail))
8864       {
8865         enum coding_category category;
8866         int this_eol;
8867
8868         id = XINT (XCAR (tail));
8869         attrs = CODING_ID_ATTRS (id);
8870         category = XINT (CODING_ATTR_CATEGORY (attrs));
8871         eol_type = CODING_ID_EOL_TYPE (id);
8872         if (VECTORP (eol_type))
8873           {
8874             if (category == coding_category_utf_16_be
8875                 || category == coding_category_utf_16_be_nosig)
8876               this_eol = utf_16_be_eol;
8877             else if (category == coding_category_utf_16_le
8878                      || category == coding_category_utf_16_le_nosig)
8879               this_eol = utf_16_le_eol;
8880             else
8881               this_eol = normal_eol;
8882
8883             if (this_eol == EOL_SEEN_LF)
8884               XSETCAR (tail, AREF (eol_type, 0));
8885             else if (this_eol == EOL_SEEN_CRLF)
8886               XSETCAR (tail, AREF (eol_type, 1));
8887             else if (this_eol == EOL_SEEN_CR)
8888               XSETCAR (tail, AREF (eol_type, 2));
8889             else
8890               XSETCAR (tail, CODING_ID_NAME (id));
8891           }
8892         else
8893           XSETCAR (tail, CODING_ID_NAME (id));
8894       }
8895   }
8896
8897   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8898 }
8899
8900
8901 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8902        2, 3, 0,
8903        doc: /* Detect coding system of the text in the region between START and END.
8904 Return a list of possible coding systems ordered by priority.
8905 The coding systems to try and their priorities follows what
8906 the function `coding-system-priority-list' (which see) returns.
8907
8908 If only ASCII characters are found (except for such ISO-2022 control
8909 characters as ESC), it returns a list of single element `undecided'
8910 or its subsidiary coding system according to a detected end-of-line
8911 format.
8912
8913 If optional argument HIGHEST is non-nil, return the coding system of
8914 highest priority.  */)
8915   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8916 {
8917   ptrdiff_t from, to;
8918   ptrdiff_t from_byte, to_byte;
8919
8920   validate_region (&start, &end);
8921   from = XINT (start), to = XINT (end);
8922   from_byte = CHAR_TO_BYTE (from);
8923   to_byte = CHAR_TO_BYTE (to);
8924
8925   if (from < GPT && to >= GPT)
8926     move_gap_both (to, to_byte);
8927
8928   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8929                                to - from, to_byte - from_byte,
8930                                !NILP (highest),
8931                                !NILP (BVAR (current_buffer
8932                                       , enable_multibyte_characters)),
8933                                Qnil);
8934 }
8935
8936 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8937        1, 2, 0,
8938        doc: /* Detect coding system of the text in STRING.
8939 Return a list of possible coding systems ordered by priority.
8940 The coding systems to try and their priorities follows what
8941 the function `coding-system-priority-list' (which see) returns.
8942
8943 If only ASCII characters are found (except for such ISO-2022 control
8944 characters as ESC), it returns a list of single element `undecided'
8945 or its subsidiary coding system according to a detected end-of-line
8946 format.
8947
8948 If optional argument HIGHEST is non-nil, return the coding system of
8949 highest priority.  */)
8950   (Lisp_Object string, Lisp_Object highest)
8951 {
8952   CHECK_STRING (string);
8953
8954   return detect_coding_system (SDATA (string),
8955                                SCHARS (string), SBYTES (string),
8956                                !NILP (highest), STRING_MULTIBYTE (string),
8957                                Qnil);
8958 }
8959
8960
8961 static bool
8962 char_encodable_p (int c, Lisp_Object attrs)
8963 {
8964   Lisp_Object tail;
8965   struct charset *charset;
8966   Lisp_Object translation_table;
8967
8968   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8969   if (! NILP (translation_table))
8970     c = translate_char (translation_table, c);
8971   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8972        CONSP (tail); tail = XCDR (tail))
8973     {
8974       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8975       if (CHAR_CHARSET_P (c, charset))
8976         break;
8977     }
8978   return (! NILP (tail));
8979 }
8980
8981
8982 /* Return a list of coding systems that safely encode the text between
8983    START and END.  If EXCLUDE is non-nil, it is a list of coding
8984    systems not to check.  The returned list doesn't contain any such
8985    coding systems.  In any case, if the text contains only ASCII or is
8986    unibyte, return t.  */
8987
8988 DEFUN ("find-coding-systems-region-internal",
8989        Ffind_coding_systems_region_internal,
8990        Sfind_coding_systems_region_internal, 2, 3, 0,
8991        doc: /* Internal use only.  */)
8992   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8993 {
8994   Lisp_Object coding_attrs_list, safe_codings;
8995   ptrdiff_t start_byte, end_byte;
8996   const unsigned char *p, *pbeg, *pend;
8997   int c;
8998   Lisp_Object tail, elt, work_table;
8999
9000   if (STRINGP (start))
9001     {
9002       if (!STRING_MULTIBYTE (start)
9003           || SCHARS (start) == SBYTES (start))
9004         return Qt;
9005       start_byte = 0;
9006       end_byte = SBYTES (start);
9007     }
9008   else
9009     {
9010       CHECK_NUMBER_COERCE_MARKER (start);
9011       CHECK_NUMBER_COERCE_MARKER (end);
9012       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9013         args_out_of_range (start, end);
9014       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9015         return Qt;
9016       start_byte = CHAR_TO_BYTE (XINT (start));
9017       end_byte = CHAR_TO_BYTE (XINT (end));
9018       if (XINT (end) - XINT (start) == end_byte - start_byte)
9019         return Qt;
9020
9021       if (XINT (start) < GPT && XINT (end) > GPT)
9022         {
9023           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9024             move_gap_both (XINT (start), start_byte);
9025           else
9026             move_gap_both (XINT (end), end_byte);
9027         }
9028     }
9029
9030   coding_attrs_list = Qnil;
9031   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9032     if (NILP (exclude)
9033         || NILP (Fmemq (XCAR (tail), exclude)))
9034       {
9035         Lisp_Object attrs;
9036
9037         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9038         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9039           {
9040             ASET (attrs, coding_attr_trans_tbl,
9041                   get_translation_table (attrs, 1, NULL));
9042             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9043           }
9044       }
9045
9046   if (STRINGP (start))
9047     p = pbeg = SDATA (start);
9048   else
9049     p = pbeg = BYTE_POS_ADDR (start_byte);
9050   pend = p + (end_byte - start_byte);
9051
9052   while (p < pend && ASCII_BYTE_P (*p)) p++;
9053   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9054
9055   work_table = Fmake_char_table (Qnil, Qnil);
9056   while (p < pend)
9057     {
9058       if (ASCII_BYTE_P (*p))
9059         p++;
9060       else
9061         {
9062           c = STRING_CHAR_ADVANCE (p);
9063           if (!NILP (char_table_ref (work_table, c)))
9064             /* This character was already checked.  Ignore it.  */
9065             continue;
9066
9067           charset_map_loaded = 0;
9068           for (tail = coding_attrs_list; CONSP (tail);)
9069             {
9070               elt = XCAR (tail);
9071               if (NILP (elt))
9072                 tail = XCDR (tail);
9073               else if (char_encodable_p (c, elt))
9074                 tail = XCDR (tail);
9075               else if (CONSP (XCDR (tail)))
9076                 {
9077                   XSETCAR (tail, XCAR (XCDR (tail)));
9078                   XSETCDR (tail, XCDR (XCDR (tail)));
9079                 }
9080               else
9081                 {
9082                   XSETCAR (tail, Qnil);
9083                   tail = XCDR (tail);
9084                 }
9085             }
9086           if (charset_map_loaded)
9087             {
9088               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9089
9090               if (STRINGP (start))
9091                 pbeg = SDATA (start);
9092               else
9093                 pbeg = BYTE_POS_ADDR (start_byte);
9094               p = pbeg + p_offset;
9095               pend = pbeg + pend_offset;
9096             }
9097           char_table_set (work_table, c, Qt);
9098         }
9099     }
9100
9101   safe_codings = list2 (Qraw_text, Qno_conversion);
9102   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9103     if (! NILP (XCAR (tail)))
9104       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9105
9106   return safe_codings;
9107 }
9108
9109
9110 DEFUN ("unencodable-char-position", Funencodable_char_position,
9111        Sunencodable_char_position, 3, 5, 0,
9112        doc: /*
9113 Return position of first un-encodable character in a region.
9114 START and END specify the region and CODING-SYSTEM specifies the
9115 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9116
9117 If optional 4th argument COUNT is non-nil, it specifies at most how
9118 many un-encodable characters to search.  In this case, the value is a
9119 list of positions.
9120
9121 If optional 5th argument STRING is non-nil, it is a string to search
9122 for un-encodable characters.  In that case, START and END are indexes
9123 to the string.  */)
9124   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
9125 {
9126   EMACS_INT n;
9127   struct coding_system coding;
9128   Lisp_Object attrs, charset_list, translation_table;
9129   Lisp_Object positions;
9130   ptrdiff_t from, to;
9131   const unsigned char *p, *stop, *pend;
9132   bool ascii_compatible;
9133
9134   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9135   attrs = CODING_ID_ATTRS (coding.id);
9136   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9137     return Qnil;
9138   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9139   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9140   translation_table = get_translation_table (attrs, 1, NULL);
9141
9142   if (NILP (string))
9143     {
9144       validate_region (&start, &end);
9145       from = XINT (start);
9146       to = XINT (end);
9147       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9148           || (ascii_compatible
9149               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9150         return Qnil;
9151       p = CHAR_POS_ADDR (from);
9152       pend = CHAR_POS_ADDR (to);
9153       if (from < GPT && to >= GPT)
9154         stop = GPT_ADDR;
9155       else
9156         stop = pend;
9157     }
9158   else
9159     {
9160       CHECK_STRING (string);
9161       CHECK_NATNUM (start);
9162       CHECK_NATNUM (end);
9163       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9164         args_out_of_range_3 (string, start, end);
9165       from = XINT (start);
9166       to = XINT (end);
9167       if (! STRING_MULTIBYTE (string))
9168         return Qnil;
9169       p = SDATA (string) + string_char_to_byte (string, from);
9170       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9171       if (ascii_compatible && (to - from) == (pend - p))
9172         return Qnil;
9173     }
9174
9175   if (NILP (count))
9176     n = 1;
9177   else
9178     {
9179       CHECK_NATNUM (count);
9180       n = XINT (count);
9181     }
9182
9183   positions = Qnil;
9184   charset_map_loaded = 0;
9185   while (1)
9186     {
9187       int c;
9188
9189       if (ascii_compatible)
9190         while (p < stop && ASCII_BYTE_P (*p))
9191           p++, from++;
9192       if (p >= stop)
9193         {
9194           if (p >= pend)
9195             break;
9196           stop = pend;
9197           p = GAP_END_ADDR;
9198         }
9199
9200       c = STRING_CHAR_ADVANCE (p);
9201       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9202           && ! char_charset (translate_char (translation_table, c),
9203                              charset_list, NULL))
9204         {
9205           positions = Fcons (make_number (from), positions);
9206           n--;
9207           if (n == 0)
9208             break;
9209         }
9210
9211       from++;
9212       if (charset_map_loaded && NILP (string))
9213         {
9214           p = CHAR_POS_ADDR (from);
9215           pend = CHAR_POS_ADDR (to);
9216           if (from < GPT && to >= GPT)
9217             stop = GPT_ADDR;
9218           else
9219             stop = pend;
9220           charset_map_loaded = 0;
9221         }
9222     }
9223
9224   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9225 }
9226
9227
9228 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9229        Scheck_coding_systems_region, 3, 3, 0,
9230        doc: /* Check if the region is encodable by coding systems.
9231
9232 START and END are buffer positions specifying the region.
9233 CODING-SYSTEM-LIST is a list of coding systems to check.
9234
9235 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9236 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9237 whole region, POS0, POS1, ... are buffer positions where non-encodable
9238 characters are found.
9239
9240 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9241 value is nil.
9242
9243 START may be a string.  In that case, check if the string is
9244 encodable, and the value contains indices to the string instead of
9245 buffer positions.  END is ignored.
9246
9247 If the current buffer (or START if it is a string) is unibyte, the value
9248 is nil.  */)
9249   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9250 {
9251   Lisp_Object list;
9252   ptrdiff_t start_byte, end_byte;
9253   ptrdiff_t pos;
9254   const unsigned char *p, *pbeg, *pend;
9255   int c;
9256   Lisp_Object tail, elt, attrs;
9257
9258   if (STRINGP (start))
9259     {
9260       if (!STRING_MULTIBYTE (start)
9261           || SCHARS (start) == SBYTES (start))
9262         return Qnil;
9263       start_byte = 0;
9264       end_byte = SBYTES (start);
9265       pos = 0;
9266     }
9267   else
9268     {
9269       CHECK_NUMBER_COERCE_MARKER (start);
9270       CHECK_NUMBER_COERCE_MARKER (end);
9271       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9272         args_out_of_range (start, end);
9273       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9274         return Qnil;
9275       start_byte = CHAR_TO_BYTE (XINT (start));
9276       end_byte = CHAR_TO_BYTE (XINT (end));
9277       if (XINT (end) - XINT (start) == end_byte - start_byte)
9278         return Qnil;
9279
9280       if (XINT (start) < GPT && XINT (end) > GPT)
9281         {
9282           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9283             move_gap_both (XINT (start), start_byte);
9284           else
9285             move_gap_both (XINT (end), end_byte);
9286         }
9287       pos = XINT (start);
9288     }
9289
9290   list = Qnil;
9291   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9292     {
9293       elt = XCAR (tail);
9294       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9295       ASET (attrs, coding_attr_trans_tbl,
9296             get_translation_table (attrs, 1, NULL));
9297       list = Fcons (list2 (elt, attrs), list);
9298     }
9299
9300   if (STRINGP (start))
9301     p = pbeg = SDATA (start);
9302   else
9303     p = pbeg = BYTE_POS_ADDR (start_byte);
9304   pend = p + (end_byte - start_byte);
9305
9306   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9307   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9308
9309   while (p < pend)
9310     {
9311       if (ASCII_BYTE_P (*p))
9312         p++;
9313       else
9314         {
9315           c = STRING_CHAR_ADVANCE (p);
9316
9317           charset_map_loaded = 0;
9318           for (tail = list; CONSP (tail); tail = XCDR (tail))
9319             {
9320               elt = XCDR (XCAR (tail));
9321               if (! char_encodable_p (c, XCAR (elt)))
9322                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9323             }
9324           if (charset_map_loaded)
9325             {
9326               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9327
9328               if (STRINGP (start))
9329                 pbeg = SDATA (start);
9330               else
9331                 pbeg = BYTE_POS_ADDR (start_byte);
9332               p = pbeg + p_offset;
9333               pend = pbeg + pend_offset;
9334             }
9335         }
9336       pos++;
9337     }
9338
9339   tail = list;
9340   list = Qnil;
9341   for (; CONSP (tail); tail = XCDR (tail))
9342     {
9343       elt = XCAR (tail);
9344       if (CONSP (XCDR (XCDR (elt))))
9345         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9346                       list);
9347     }
9348
9349   return list;
9350 }
9351
9352
9353 static Lisp_Object
9354 code_convert_region (Lisp_Object start, Lisp_Object end,
9355                      Lisp_Object coding_system, Lisp_Object dst_object,
9356                      bool encodep, bool norecord)
9357 {
9358   struct coding_system coding;
9359   ptrdiff_t from, from_byte, to, to_byte;
9360   Lisp_Object src_object;
9361
9362   if (NILP (coding_system))
9363     coding_system = Qno_conversion;
9364   else
9365     CHECK_CODING_SYSTEM (coding_system);
9366   src_object = Fcurrent_buffer ();
9367   if (NILP (dst_object))
9368     dst_object = src_object;
9369   else if (! EQ (dst_object, Qt))
9370     CHECK_BUFFER (dst_object);
9371
9372   validate_region (&start, &end);
9373   from = XFASTINT (start);
9374   from_byte = CHAR_TO_BYTE (from);
9375   to = XFASTINT (end);
9376   to_byte = CHAR_TO_BYTE (to);
9377
9378   setup_coding_system (coding_system, &coding);
9379   coding.mode |= CODING_MODE_LAST_BLOCK;
9380
9381   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9382     {
9383       struct buffer *buf = XBUFFER (dst_object);
9384       ptrdiff_t buf_pt = BUF_PT (buf);
9385
9386       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9387     }
9388
9389   if (encodep)
9390     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9391                           dst_object);
9392   else
9393     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9394                           dst_object);
9395   if (! norecord)
9396     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9397
9398   return (BUFFERP (dst_object)
9399           ? make_number (coding.produced_char)
9400           : coding.dst_object);
9401 }
9402
9403
9404 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9405        3, 4, "r\nzCoding system: ",
9406        doc: /* Decode the current region from the specified coding system.
9407 When called from a program, takes four arguments:
9408         START, END, CODING-SYSTEM, and DESTINATION.
9409 START and END are buffer positions.
9410
9411 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9412 If nil, the region between START and END is replaced by the decoded text.
9413 If buffer, the decoded text is inserted in that buffer after point (point
9414 does not move).
9415 In those cases, the length of the decoded text is returned.
9416 If DESTINATION is t, the decoded text is returned.
9417
9418 This function sets `last-coding-system-used' to the precise coding system
9419 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9420 not fully specified.)  */)
9421   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9422 {
9423   return code_convert_region (start, end, coding_system, destination, 0, 0);
9424 }
9425
9426 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9427        3, 4, "r\nzCoding system: ",
9428        doc: /* Encode the current region by specified coding system.
9429 When called from a program, takes four arguments:
9430         START, END, CODING-SYSTEM and DESTINATION.
9431 START and END are buffer positions.
9432
9433 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9434 If nil, the region between START and END is replace by the encoded text.
9435 If buffer, the encoded text is inserted in that buffer after point (point
9436 does not move).
9437 In those cases, the length of the encoded text is returned.
9438 If DESTINATION is t, the encoded text is returned.
9439
9440 This function sets `last-coding-system-used' to the precise coding system
9441 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9442 not fully specified.)  */)
9443   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9444 {
9445   return code_convert_region (start, end, coding_system, destination, 1, 0);
9446 }
9447
9448 Lisp_Object
9449 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9450                      Lisp_Object dst_object, bool encodep, bool nocopy,
9451                      bool norecord)
9452 {
9453   struct coding_system coding;
9454   ptrdiff_t chars, bytes;
9455
9456   CHECK_STRING (string);
9457   if (NILP (coding_system))
9458     {
9459       if (! norecord)
9460         Vlast_coding_system_used = Qno_conversion;
9461       if (NILP (dst_object))
9462         return (nocopy ? Fcopy_sequence (string) : string);
9463     }
9464
9465   if (NILP (coding_system))
9466     coding_system = Qno_conversion;
9467   else
9468     CHECK_CODING_SYSTEM (coding_system);
9469   if (NILP (dst_object))
9470     dst_object = Qt;
9471   else if (! EQ (dst_object, Qt))
9472     CHECK_BUFFER (dst_object);
9473
9474   setup_coding_system (coding_system, &coding);
9475   coding.mode |= CODING_MODE_LAST_BLOCK;
9476   chars = SCHARS (string);
9477   bytes = SBYTES (string);
9478
9479   if (BUFFERP (dst_object))
9480     {
9481       struct buffer *buf = XBUFFER (dst_object);
9482       ptrdiff_t buf_pt = BUF_PT (buf);
9483
9484       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9485     }
9486
9487   if (encodep)
9488     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9489   else
9490     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9491   if (! norecord)
9492     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9493
9494   return (BUFFERP (dst_object)
9495           ? make_number (coding.produced_char)
9496           : coding.dst_object);
9497 }
9498
9499
9500 /* Encode or decode STRING according to CODING_SYSTEM.
9501    Do not set Vlast_coding_system_used.
9502
9503    This function is called only from macros DECODE_FILE and
9504    ENCODE_FILE, thus we ignore character composition.  */
9505
9506 Lisp_Object
9507 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9508                               bool encodep)
9509 {
9510   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9511 }
9512
9513 /* Encode or decode a file name, to or from a unibyte string suitable
9514    for passing to C library functions.  */
9515 Lisp_Object
9516 decode_file_name (Lisp_Object fname)
9517 {
9518 #ifdef WINDOWSNT
9519   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9520      converts the file names either to UTF-16LE or to the system ANSI
9521      codepage internally, depending on the underlying OS; see w32.c.  */
9522   if (! NILP (Fcoding_system_p (Qutf_8)))
9523     return code_convert_string_norecord (fname, Qutf_8, 0);
9524   return fname;
9525 #else  /* !WINDOWSNT */
9526   if (! NILP (Vfile_name_coding_system))
9527     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9528   else if (! NILP (Vdefault_file_name_coding_system))
9529     return code_convert_string_norecord (fname,
9530                                          Vdefault_file_name_coding_system, 0);
9531   else
9532     return fname;
9533 #endif
9534 }
9535
9536 Lisp_Object
9537 encode_file_name (Lisp_Object fname)
9538 {
9539   /* This is especially important during bootstrap and dumping, when
9540      file-name encoding is not yet known, and therefore any non-ASCII
9541      file names are unibyte strings, and could only be thrashed if we
9542      try to encode them.  */
9543   if (!STRING_MULTIBYTE (fname))
9544     return fname;
9545 #ifdef WINDOWSNT
9546   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9547      converts the file names either to UTF-16LE or to the system ANSI
9548      codepage internally, depending on the underlying OS; see w32.c.  */
9549   if (! NILP (Fcoding_system_p (Qutf_8)))
9550     return code_convert_string_norecord (fname, Qutf_8, 1);
9551   return fname;
9552 #else  /* !WINDOWSNT */
9553   if (! NILP (Vfile_name_coding_system))
9554     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9555   else if (! NILP (Vdefault_file_name_coding_system))
9556     return code_convert_string_norecord (fname,
9557                                          Vdefault_file_name_coding_system, 1);
9558   else
9559     return fname;
9560 #endif
9561 }
9562
9563 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9564        2, 4, 0,
9565        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9566
9567 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9568 if the decoding operation is trivial.
9569
9570 Optional fourth arg BUFFER non-nil means that the decoded text is
9571 inserted in that buffer after point (point does not move).  In this
9572 case, the return value is the length of the decoded text.
9573
9574 This function sets `last-coding-system-used' to the precise coding system
9575 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9576 not fully specified.)  */)
9577   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9578 {
9579   return code_convert_string (string, coding_system, buffer,
9580                               0, ! NILP (nocopy), 0);
9581 }
9582
9583 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9584        2, 4, 0,
9585        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9586
9587 Optional third arg NOCOPY non-nil means it is OK to return STRING
9588 itself if the encoding operation is trivial.
9589
9590 Optional fourth arg BUFFER non-nil means that the encoded text is
9591 inserted in that buffer after point (point does not move).  In this
9592 case, the return value is the length of the encoded text.
9593
9594 This function sets `last-coding-system-used' to the precise coding system
9595 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9596 not fully specified.)  */)
9597   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9598 {
9599   return code_convert_string (string, coding_system, buffer,
9600                               1, ! NILP (nocopy), 0);
9601 }
9602
9603 \f
9604 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9605        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9606 Return the corresponding character.  */)
9607   (Lisp_Object code)
9608 {
9609   Lisp_Object spec, attrs, val;
9610   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9611   EMACS_INT ch;
9612   int c;
9613
9614   CHECK_NATNUM (code);
9615   ch = XFASTINT (code);
9616   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9617   attrs = AREF (spec, 0);
9618
9619   if (ASCII_BYTE_P (ch)
9620       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9621     return code;
9622
9623   val = CODING_ATTR_CHARSET_LIST (attrs);
9624   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9625   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9626   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9627
9628   if (ch <= 0x7F)
9629     {
9630       c = ch;
9631       charset = charset_roman;
9632     }
9633   else if (ch >= 0xA0 && ch < 0xDF)
9634     {
9635       c = ch - 0x80;
9636       charset = charset_kana;
9637     }
9638   else
9639     {
9640       EMACS_INT c1 = ch >> 8;
9641       int c2 = ch & 0xFF;
9642
9643       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9644           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9645         error ("Invalid code: %"pI"d", ch);
9646       c = ch;
9647       SJIS_TO_JIS (c);
9648       charset = charset_kanji;
9649     }
9650   c = DECODE_CHAR (charset, c);
9651   if (c < 0)
9652     error ("Invalid code: %"pI"d", ch);
9653   return make_number (c);
9654 }
9655
9656
9657 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9658        doc: /* Encode a Japanese character CH to shift_jis encoding.
9659 Return the corresponding code in SJIS.  */)
9660   (Lisp_Object ch)
9661 {
9662   Lisp_Object spec, attrs, charset_list;
9663   int c;
9664   struct charset *charset;
9665   unsigned code;
9666
9667   CHECK_CHARACTER (ch);
9668   c = XFASTINT (ch);
9669   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9670   attrs = AREF (spec, 0);
9671
9672   if (ASCII_CHAR_P (c)
9673       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9674     return ch;
9675
9676   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9677   charset = char_charset (c, charset_list, &code);
9678   if (code == CHARSET_INVALID_CODE (charset))
9679     error ("Can't encode by shift_jis encoding: %c", c);
9680   JIS_TO_SJIS (code);
9681
9682   return make_number (code);
9683 }
9684
9685 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9686        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9687 Return the corresponding character.  */)
9688   (Lisp_Object code)
9689 {
9690   Lisp_Object spec, attrs, val;
9691   struct charset *charset_roman, *charset_big5, *charset;
9692   EMACS_INT ch;
9693   int c;
9694
9695   CHECK_NATNUM (code);
9696   ch = XFASTINT (code);
9697   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9698   attrs = AREF (spec, 0);
9699
9700   if (ASCII_BYTE_P (ch)
9701       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9702     return code;
9703
9704   val = CODING_ATTR_CHARSET_LIST (attrs);
9705   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9706   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9707
9708   if (ch <= 0x7F)
9709     {
9710       c = ch;
9711       charset = charset_roman;
9712     }
9713   else
9714     {
9715       EMACS_INT b1 = ch >> 8;
9716       int b2 = ch & 0x7F;
9717       if (b1 < 0xA1 || b1 > 0xFE
9718           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9719         error ("Invalid code: %"pI"d", ch);
9720       c = ch;
9721       charset = charset_big5;
9722     }
9723   c = DECODE_CHAR (charset, c);
9724   if (c < 0)
9725     error ("Invalid code: %"pI"d", ch);
9726   return make_number (c);
9727 }
9728
9729 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9730        doc: /* Encode the Big5 character CH to BIG5 coding system.
9731 Return the corresponding character code in Big5.  */)
9732   (Lisp_Object ch)
9733 {
9734   Lisp_Object spec, attrs, charset_list;
9735   struct charset *charset;
9736   int c;
9737   unsigned code;
9738
9739   CHECK_CHARACTER (ch);
9740   c = XFASTINT (ch);
9741   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9742   attrs = AREF (spec, 0);
9743   if (ASCII_CHAR_P (c)
9744       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9745     return ch;
9746
9747   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9748   charset = char_charset (c, charset_list, &code);
9749   if (code == CHARSET_INVALID_CODE (charset))
9750     error ("Can't encode by Big5 encoding: %c", c);
9751
9752   return make_number (code);
9753 }
9754
9755 \f
9756 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9757        Sset_terminal_coding_system_internal, 1, 2, 0,
9758        doc: /* Internal use only.  */)
9759   (Lisp_Object coding_system, Lisp_Object terminal)
9760 {
9761   struct terminal *term = get_terminal (terminal, 1);
9762   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9763   CHECK_SYMBOL (coding_system);
9764   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9765   /* We had better not send unsafe characters to terminal.  */
9766   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9767   /* Character composition should be disabled.  */
9768   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9769   terminal_coding->src_multibyte = 1;
9770   terminal_coding->dst_multibyte = 0;
9771   tset_charset_list
9772     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9773             ? coding_charset_list (terminal_coding)
9774             : list1 (make_number (charset_ascii))));
9775   return Qnil;
9776 }
9777
9778 DEFUN ("set-safe-terminal-coding-system-internal",
9779        Fset_safe_terminal_coding_system_internal,
9780        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9781        doc: /* Internal use only.  */)
9782   (Lisp_Object coding_system)
9783 {
9784   CHECK_SYMBOL (coding_system);
9785   setup_coding_system (Fcheck_coding_system (coding_system),
9786                        &safe_terminal_coding);
9787   /* Character composition should be disabled.  */
9788   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9789   safe_terminal_coding.src_multibyte = 1;
9790   safe_terminal_coding.dst_multibyte = 0;
9791   return Qnil;
9792 }
9793
9794 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9795        Sterminal_coding_system, 0, 1, 0,
9796        doc: /* Return coding system specified for terminal output on the given terminal.
9797 TERMINAL may be a terminal object, a frame, or nil for the selected
9798 frame's terminal device.  */)
9799   (Lisp_Object terminal)
9800 {
9801   struct coding_system *terminal_coding
9802     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9803   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9804
9805   /* For backward compatibility, return nil if it is `undecided'.  */
9806   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9807 }
9808
9809 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9810        Sset_keyboard_coding_system_internal, 1, 2, 0,
9811        doc: /* Internal use only.  */)
9812   (Lisp_Object coding_system, Lisp_Object terminal)
9813 {
9814   struct terminal *t = get_terminal (terminal, 1);
9815   CHECK_SYMBOL (coding_system);
9816   if (NILP (coding_system))
9817     coding_system = Qno_conversion;
9818   else
9819     Fcheck_coding_system (coding_system);
9820   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9821   /* Character composition should be disabled.  */
9822   TERMINAL_KEYBOARD_CODING (t)->common_flags
9823     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9824   return Qnil;
9825 }
9826
9827 DEFUN ("keyboard-coding-system",
9828        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9829        doc: /* Return coding system specified for decoding keyboard input.  */)
9830   (Lisp_Object terminal)
9831 {
9832   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9833                          (get_terminal (terminal, 1))->id);
9834 }
9835
9836 \f
9837 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9838        Sfind_operation_coding_system,  1, MANY, 0,
9839        doc: /* Choose a coding system for an operation based on the target name.
9840 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9841 DECODING-SYSTEM is the coding system to use for decoding
9842 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9843 for encoding (in case OPERATION does encoding).
9844
9845 The first argument OPERATION specifies an I/O primitive:
9846   For file I/O, `insert-file-contents' or `write-region'.
9847   For process I/O, `call-process', `call-process-region', or `start-process'.
9848   For network I/O, `open-network-stream'.
9849
9850 The remaining arguments should be the same arguments that were passed
9851 to the primitive.  Depending on which primitive, one of those arguments
9852 is selected as the TARGET.  For example, if OPERATION does file I/O,
9853 whichever argument specifies the file name is TARGET.
9854
9855 TARGET has a meaning which depends on OPERATION:
9856   For file I/O, TARGET is a file name (except for the special case below).
9857   For process I/O, TARGET is a process name.
9858   For network I/O, TARGET is a service name or a port number.
9859
9860 This function looks up what is specified for TARGET in
9861 `file-coding-system-alist', `process-coding-system-alist',
9862 or `network-coding-system-alist' depending on OPERATION.
9863 They may specify a coding system, a cons of coding systems,
9864 or a function symbol to call.
9865 In the last case, we call the function with one argument,
9866 which is a list of all the arguments given to this function.
9867 If the function can't decide a coding system, it can return
9868 `undecided' so that the normal code-detection is performed.
9869
9870 If OPERATION is `insert-file-contents', the argument corresponding to
9871 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9872 file name to look up, and BUFFER is a buffer that contains the file's
9873 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9874 function to call for FILENAME, that function should examine the
9875 contents of BUFFER instead of reading the file.
9876
9877 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9878   (ptrdiff_t nargs, Lisp_Object *args)
9879 {
9880   Lisp_Object operation, target_idx, target, val;
9881   register Lisp_Object chain;
9882
9883   if (nargs < 2)
9884     error ("Too few arguments");
9885   operation = args[0];
9886   if (!SYMBOLP (operation)
9887       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9888     error ("Invalid first argument");
9889   if (nargs <= 1 + XFASTINT (target_idx))
9890     error ("Too few arguments for operation `%s'",
9891            SDATA (SYMBOL_NAME (operation)));
9892   target = args[XFASTINT (target_idx) + 1];
9893   if (!(STRINGP (target)
9894         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9895             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9896         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9897     error ("Invalid argument %"pI"d of operation `%s'",
9898            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9899   if (CONSP (target))
9900     target = XCAR (target);
9901
9902   chain = ((EQ (operation, Qinsert_file_contents)
9903             || EQ (operation, Qwrite_region))
9904            ? Vfile_coding_system_alist
9905            : (EQ (operation, Qopen_network_stream)
9906               ? Vnetwork_coding_system_alist
9907               : Vprocess_coding_system_alist));
9908   if (NILP (chain))
9909     return Qnil;
9910
9911   for (; CONSP (chain); chain = XCDR (chain))
9912     {
9913       Lisp_Object elt;
9914
9915       elt = XCAR (chain);
9916       if (CONSP (elt)
9917           && ((STRINGP (target)
9918                && STRINGP (XCAR (elt))
9919                && fast_string_match (XCAR (elt), target) >= 0)
9920               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9921         {
9922           val = XCDR (elt);
9923           /* Here, if VAL is both a valid coding system and a valid
9924              function symbol, we return VAL as a coding system.  */
9925           if (CONSP (val))
9926             return val;
9927           if (! SYMBOLP (val))
9928             return Qnil;
9929           if (! NILP (Fcoding_system_p (val)))
9930             return Fcons (val, val);
9931           if (! NILP (Ffboundp (val)))
9932             {
9933               /* We use call1 rather than safe_call1
9934                  so as to get bug reports about functions called here
9935                  which don't handle the current interface.  */
9936               val = call1 (val, Flist (nargs, args));
9937               if (CONSP (val))
9938                 return val;
9939               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9940                 return Fcons (val, val);
9941             }
9942           return Qnil;
9943         }
9944     }
9945   return Qnil;
9946 }
9947
9948 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9949        Sset_coding_system_priority, 0, MANY, 0,
9950        doc: /* Assign higher priority to the coding systems given as arguments.
9951 If multiple coding systems belong to the same category,
9952 all but the first one are ignored.
9953
9954 usage: (set-coding-system-priority &rest coding-systems)  */)
9955   (ptrdiff_t nargs, Lisp_Object *args)
9956 {
9957   ptrdiff_t i, j;
9958   bool changed[coding_category_max];
9959   enum coding_category priorities[coding_category_max];
9960
9961   memset (changed, 0, sizeof changed);
9962
9963   for (i = j = 0; i < nargs; i++)
9964     {
9965       enum coding_category category;
9966       Lisp_Object spec, attrs;
9967
9968       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9969       attrs = AREF (spec, 0);
9970       category = XINT (CODING_ATTR_CATEGORY (attrs));
9971       if (changed[category])
9972         /* Ignore this coding system because a coding system of the
9973            same category already had a higher priority.  */
9974         continue;
9975       changed[category] = 1;
9976       priorities[j++] = category;
9977       if (coding_categories[category].id >= 0
9978           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9979         setup_coding_system (args[i], &coding_categories[category]);
9980       Fset (AREF (Vcoding_category_table, category), args[i]);
9981     }
9982
9983   /* Now we have decided top J priorities.  Reflect the order of the
9984      original priorities to the remaining priorities.  */
9985
9986   for (i = j, j = 0; i < coding_category_max; i++, j++)
9987     {
9988       while (j < coding_category_max
9989              && changed[coding_priorities[j]])
9990         j++;
9991       if (j == coding_category_max)
9992         emacs_abort ();
9993       priorities[i] = coding_priorities[j];
9994     }
9995
9996   memcpy (coding_priorities, priorities, sizeof priorities);
9997
9998   /* Update `coding-category-list'.  */
9999   Vcoding_category_list = Qnil;
10000   for (i = coding_category_max; i-- > 0; )
10001     Vcoding_category_list
10002       = Fcons (AREF (Vcoding_category_table, priorities[i]),
10003                Vcoding_category_list);
10004
10005   return Qnil;
10006 }
10007
10008 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
10009        Scoding_system_priority_list, 0, 1, 0,
10010        doc: /* Return a list of coding systems ordered by their priorities.
10011 The list contains a subset of coding systems; i.e. coding systems
10012 assigned to each coding category (see `coding-category-list').
10013
10014 HIGHESTP non-nil means just return the highest priority one.  */)
10015   (Lisp_Object highestp)
10016 {
10017   int i;
10018   Lisp_Object val;
10019
10020   for (i = 0, val = Qnil; i < coding_category_max; i++)
10021     {
10022       enum coding_category category = coding_priorities[i];
10023       int id = coding_categories[category].id;
10024       Lisp_Object attrs;
10025
10026       if (id < 0)
10027         continue;
10028       attrs = CODING_ID_ATTRS (id);
10029       if (! NILP (highestp))
10030         return CODING_ATTR_BASE_NAME (attrs);
10031       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10032     }
10033   return Fnreverse (val);
10034 }
10035
10036 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10037
10038 static Lisp_Object
10039 make_subsidiaries (Lisp_Object base)
10040 {
10041   Lisp_Object subsidiaries;
10042   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10043   char *buf = alloca (base_name_len + 6);
10044   int i;
10045
10046   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10047   subsidiaries = make_uninit_vector (3);
10048   for (i = 0; i < 3; i++)
10049     {
10050       strcpy (buf + base_name_len, suffixes[i]);
10051       ASET (subsidiaries, i, intern (buf));
10052     }
10053   return subsidiaries;
10054 }
10055
10056
10057 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10058        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10059        doc: /* For internal use only.
10060 usage: (define-coding-system-internal ...)  */)
10061   (ptrdiff_t nargs, Lisp_Object *args)
10062 {
10063   Lisp_Object name;
10064   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10065   Lisp_Object attrs;            /* Vector of attributes.  */
10066   Lisp_Object eol_type;
10067   Lisp_Object aliases;
10068   Lisp_Object coding_type, charset_list, safe_charsets;
10069   enum coding_category category;
10070   Lisp_Object tail, val;
10071   int max_charset_id = 0;
10072   int i;
10073
10074   if (nargs < coding_arg_max)
10075     goto short_args;
10076
10077   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10078
10079   name = args[coding_arg_name];
10080   CHECK_SYMBOL (name);
10081   ASET (attrs, coding_attr_base_name, name);
10082
10083   val = args[coding_arg_mnemonic];
10084   if (! STRINGP (val))
10085     CHECK_CHARACTER (val);
10086   ASET (attrs, coding_attr_mnemonic, val);
10087
10088   coding_type = args[coding_arg_coding_type];
10089   CHECK_SYMBOL (coding_type);
10090   ASET (attrs, coding_attr_type, coding_type);
10091
10092   charset_list = args[coding_arg_charset_list];
10093   if (SYMBOLP (charset_list))
10094     {
10095       if (EQ (charset_list, Qiso_2022))
10096         {
10097           if (! EQ (coding_type, Qiso_2022))
10098             error ("Invalid charset-list");
10099           charset_list = Viso_2022_charset_list;
10100         }
10101       else if (EQ (charset_list, Qemacs_mule))
10102         {
10103           if (! EQ (coding_type, Qemacs_mule))
10104             error ("Invalid charset-list");
10105           charset_list = Vemacs_mule_charset_list;
10106         }
10107       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10108         {
10109           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10110             error ("Invalid charset-list");
10111           if (max_charset_id < XFASTINT (XCAR (tail)))
10112             max_charset_id = XFASTINT (XCAR (tail));
10113         }
10114     }
10115   else
10116     {
10117       charset_list = Fcopy_sequence (charset_list);
10118       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10119         {
10120           struct charset *charset;
10121
10122           val = XCAR (tail);
10123           CHECK_CHARSET_GET_CHARSET (val, charset);
10124           if (EQ (coding_type, Qiso_2022)
10125               ? CHARSET_ISO_FINAL (charset) < 0
10126               : EQ (coding_type, Qemacs_mule)
10127               ? CHARSET_EMACS_MULE_ID (charset) < 0
10128               : 0)
10129             error ("Can't handle charset `%s'",
10130                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10131
10132           XSETCAR (tail, make_number (charset->id));
10133           if (max_charset_id < charset->id)
10134             max_charset_id = charset->id;
10135         }
10136     }
10137   ASET (attrs, coding_attr_charset_list, charset_list);
10138
10139   safe_charsets = make_uninit_string (max_charset_id + 1);
10140   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10141   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10142     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10143   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10144
10145   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10146
10147   val = args[coding_arg_decode_translation_table];
10148   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10149     CHECK_SYMBOL (val);
10150   ASET (attrs, coding_attr_decode_tbl, val);
10151
10152   val = args[coding_arg_encode_translation_table];
10153   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10154     CHECK_SYMBOL (val);
10155   ASET (attrs, coding_attr_encode_tbl, val);
10156
10157   val = args[coding_arg_post_read_conversion];
10158   CHECK_SYMBOL (val);
10159   ASET (attrs, coding_attr_post_read, val);
10160
10161   val = args[coding_arg_pre_write_conversion];
10162   CHECK_SYMBOL (val);
10163   ASET (attrs, coding_attr_pre_write, val);
10164
10165   val = args[coding_arg_default_char];
10166   if (NILP (val))
10167     ASET (attrs, coding_attr_default_char, make_number (' '));
10168   else
10169     {
10170       CHECK_CHARACTER (val);
10171       ASET (attrs, coding_attr_default_char, val);
10172     }
10173
10174   val = args[coding_arg_for_unibyte];
10175   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10176
10177   val = args[coding_arg_plist];
10178   CHECK_LIST (val);
10179   ASET (attrs, coding_attr_plist, val);
10180
10181   if (EQ (coding_type, Qcharset))
10182     {
10183       /* Generate a lisp vector of 256 elements.  Each element is nil,
10184          integer, or a list of charset IDs.
10185
10186          If Nth element is nil, the byte code N is invalid in this
10187          coding system.
10188
10189          If Nth element is a number NUM, N is the first byte of a
10190          charset whose ID is NUM.
10191
10192          If Nth element is a list of charset IDs, N is the first byte
10193          of one of them.  The list is sorted by dimensions of the
10194          charsets.  A charset of smaller dimension comes first. */
10195       val = Fmake_vector (make_number (256), Qnil);
10196
10197       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10198         {
10199           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10200           int dim = CHARSET_DIMENSION (charset);
10201           int idx = (dim - 1) * 4;
10202
10203           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10204             ASET (attrs, coding_attr_ascii_compat, Qt);
10205
10206           for (i = charset->code_space[idx];
10207                i <= charset->code_space[idx + 1]; i++)
10208             {
10209               Lisp_Object tmp, tmp2;
10210               int dim2;
10211
10212               tmp = AREF (val, i);
10213               if (NILP (tmp))
10214                 tmp = XCAR (tail);
10215               else if (NUMBERP (tmp))
10216                 {
10217                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10218                   if (dim < dim2)
10219                     tmp = list2 (XCAR (tail), tmp);
10220                   else
10221                     tmp = list2 (tmp, XCAR (tail));
10222                 }
10223               else
10224                 {
10225                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10226                     {
10227                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10228                       if (dim < dim2)
10229                         break;
10230                     }
10231                   if (NILP (tmp2))
10232                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10233                   else
10234                     {
10235                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10236                       XSETCAR (tmp2, XCAR (tail));
10237                     }
10238                 }
10239               ASET (val, i, tmp);
10240             }
10241         }
10242       ASET (attrs, coding_attr_charset_valids, val);
10243       category = coding_category_charset;
10244     }
10245   else if (EQ (coding_type, Qccl))
10246     {
10247       Lisp_Object valids;
10248
10249       if (nargs < coding_arg_ccl_max)
10250         goto short_args;
10251
10252       val = args[coding_arg_ccl_decoder];
10253       CHECK_CCL_PROGRAM (val);
10254       if (VECTORP (val))
10255         val = Fcopy_sequence (val);
10256       ASET (attrs, coding_attr_ccl_decoder, val);
10257
10258       val = args[coding_arg_ccl_encoder];
10259       CHECK_CCL_PROGRAM (val);
10260       if (VECTORP (val))
10261         val = Fcopy_sequence (val);
10262       ASET (attrs, coding_attr_ccl_encoder, val);
10263
10264       val = args[coding_arg_ccl_valids];
10265       valids = Fmake_string (make_number (256), make_number (0));
10266       for (tail = val; CONSP (tail); tail = XCDR (tail))
10267         {
10268           int from, to;
10269
10270           val = XCAR (tail);
10271           if (INTEGERP (val))
10272             {
10273               if (! (0 <= XINT (val) && XINT (val) <= 255))
10274                 args_out_of_range_3 (val, make_number (0), make_number (255));
10275               from = to = XINT (val);
10276             }
10277           else
10278             {
10279               CHECK_CONS (val);
10280               CHECK_NATNUM_CAR (val);
10281               CHECK_NUMBER_CDR (val);
10282               if (XINT (XCAR (val)) > 255)
10283                 args_out_of_range_3 (XCAR (val),
10284                                      make_number (0), make_number (255));
10285               from = XINT (XCAR (val));
10286               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10287                 args_out_of_range_3 (XCDR (val),
10288                                      XCAR (val), make_number (255));
10289               to = XINT (XCDR (val));
10290             }
10291           for (i = from; i <= to; i++)
10292             SSET (valids, i, 1);
10293         }
10294       ASET (attrs, coding_attr_ccl_valids, valids);
10295
10296       category = coding_category_ccl;
10297     }
10298   else if (EQ (coding_type, Qutf_16))
10299     {
10300       Lisp_Object bom, endian;
10301
10302       ASET (attrs, coding_attr_ascii_compat, Qnil);
10303
10304       if (nargs < coding_arg_utf16_max)
10305         goto short_args;
10306
10307       bom = args[coding_arg_utf16_bom];
10308       if (! NILP (bom) && ! EQ (bom, Qt))
10309         {
10310           CHECK_CONS (bom);
10311           val = XCAR (bom);
10312           CHECK_CODING_SYSTEM (val);
10313           val = XCDR (bom);
10314           CHECK_CODING_SYSTEM (val);
10315         }
10316       ASET (attrs, coding_attr_utf_bom, bom);
10317
10318       endian = args[coding_arg_utf16_endian];
10319       CHECK_SYMBOL (endian);
10320       if (NILP (endian))
10321         endian = Qbig;
10322       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10323         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10324       ASET (attrs, coding_attr_utf_16_endian, endian);
10325
10326       category = (CONSP (bom)
10327                   ? coding_category_utf_16_auto
10328                   : NILP (bom)
10329                   ? (EQ (endian, Qbig)
10330                      ? coding_category_utf_16_be_nosig
10331                      : coding_category_utf_16_le_nosig)
10332                   : (EQ (endian, Qbig)
10333                      ? coding_category_utf_16_be
10334                      : coding_category_utf_16_le));
10335     }
10336   else if (EQ (coding_type, Qiso_2022))
10337     {
10338       Lisp_Object initial, reg_usage, request, flags;
10339
10340       if (nargs < coding_arg_iso2022_max)
10341         goto short_args;
10342
10343       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10344       CHECK_VECTOR (initial);
10345       for (i = 0; i < 4; i++)
10346         {
10347           val = AREF (initial, i);
10348           if (! NILP (val))
10349             {
10350               struct charset *charset;
10351
10352               CHECK_CHARSET_GET_CHARSET (val, charset);
10353               ASET (initial, i, make_number (CHARSET_ID (charset)));
10354               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10355                 ASET (attrs, coding_attr_ascii_compat, Qt);
10356             }
10357           else
10358             ASET (initial, i, make_number (-1));
10359         }
10360
10361       reg_usage = args[coding_arg_iso2022_reg_usage];
10362       CHECK_CONS (reg_usage);
10363       CHECK_NUMBER_CAR (reg_usage);
10364       CHECK_NUMBER_CDR (reg_usage);
10365
10366       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10367       for (tail = request; CONSP (tail); tail = XCDR (tail))
10368         {
10369           int id;
10370           Lisp_Object tmp1;
10371
10372           val = XCAR (tail);
10373           CHECK_CONS (val);
10374           tmp1 = XCAR (val);
10375           CHECK_CHARSET_GET_ID (tmp1, id);
10376           CHECK_NATNUM_CDR (val);
10377           if (XINT (XCDR (val)) >= 4)
10378             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10379           XSETCAR (val, make_number (id));
10380         }
10381
10382       flags = args[coding_arg_iso2022_flags];
10383       CHECK_NATNUM (flags);
10384       i = XINT (flags) & INT_MAX;
10385       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10386         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10387       flags = make_number (i);
10388
10389       ASET (attrs, coding_attr_iso_initial, initial);
10390       ASET (attrs, coding_attr_iso_usage, reg_usage);
10391       ASET (attrs, coding_attr_iso_request, request);
10392       ASET (attrs, coding_attr_iso_flags, flags);
10393       setup_iso_safe_charsets (attrs);
10394
10395       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10396         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10397                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10398                     ? coding_category_iso_7_else
10399                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10400                     ? coding_category_iso_7
10401                     : coding_category_iso_7_tight);
10402       else
10403         {
10404           int id = XINT (AREF (initial, 1));
10405
10406           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10407                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10408                        || id < 0)
10409                       ? coding_category_iso_8_else
10410                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10411                       ? coding_category_iso_8_1
10412                       : coding_category_iso_8_2);
10413         }
10414       if (category != coding_category_iso_8_1
10415           && category != coding_category_iso_8_2)
10416         ASET (attrs, coding_attr_ascii_compat, Qnil);
10417     }
10418   else if (EQ (coding_type, Qemacs_mule))
10419     {
10420       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10421         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10422       ASET (attrs, coding_attr_ascii_compat, Qt);
10423       category = coding_category_emacs_mule;
10424     }
10425   else if (EQ (coding_type, Qshift_jis))
10426     {
10427
10428       struct charset *charset;
10429
10430       if (XINT (Flength (charset_list)) != 3
10431           && XINT (Flength (charset_list)) != 4)
10432         error ("There should be three or four charsets");
10433
10434       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10435       if (CHARSET_DIMENSION (charset) != 1)
10436         error ("Dimension of charset %s is not one",
10437                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10438       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10439         ASET (attrs, coding_attr_ascii_compat, Qt);
10440
10441       charset_list = XCDR (charset_list);
10442       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10443       if (CHARSET_DIMENSION (charset) != 1)
10444         error ("Dimension of charset %s is not one",
10445                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10446
10447       charset_list = XCDR (charset_list);
10448       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10449       if (CHARSET_DIMENSION (charset) != 2)
10450         error ("Dimension of charset %s is not two",
10451                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10452
10453       charset_list = XCDR (charset_list);
10454       if (! NILP (charset_list))
10455         {
10456           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10457           if (CHARSET_DIMENSION (charset) != 2)
10458             error ("Dimension of charset %s is not two",
10459                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10460         }
10461
10462       category = coding_category_sjis;
10463       Vsjis_coding_system = name;
10464     }
10465   else if (EQ (coding_type, Qbig5))
10466     {
10467       struct charset *charset;
10468
10469       if (XINT (Flength (charset_list)) != 2)
10470         error ("There should be just two charsets");
10471
10472       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10473       if (CHARSET_DIMENSION (charset) != 1)
10474         error ("Dimension of charset %s is not one",
10475                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10476       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10477         ASET (attrs, coding_attr_ascii_compat, Qt);
10478
10479       charset_list = XCDR (charset_list);
10480       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10481       if (CHARSET_DIMENSION (charset) != 2)
10482         error ("Dimension of charset %s is not two",
10483                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10484
10485       category = coding_category_big5;
10486       Vbig5_coding_system = name;
10487     }
10488   else if (EQ (coding_type, Qraw_text))
10489     {
10490       category = coding_category_raw_text;
10491       ASET (attrs, coding_attr_ascii_compat, Qt);
10492     }
10493   else if (EQ (coding_type, Qutf_8))
10494     {
10495       Lisp_Object bom;
10496
10497       if (nargs < coding_arg_utf8_max)
10498         goto short_args;
10499
10500       bom = args[coding_arg_utf8_bom];
10501       if (! NILP (bom) && ! EQ (bom, Qt))
10502         {
10503           CHECK_CONS (bom);
10504           val = XCAR (bom);
10505           CHECK_CODING_SYSTEM (val);
10506           val = XCDR (bom);
10507           CHECK_CODING_SYSTEM (val);
10508         }
10509       ASET (attrs, coding_attr_utf_bom, bom);
10510       if (NILP (bom))
10511         ASET (attrs, coding_attr_ascii_compat, Qt);
10512
10513       category = (CONSP (bom) ? coding_category_utf_8_auto
10514                   : NILP (bom) ? coding_category_utf_8_nosig
10515                   : coding_category_utf_8_sig);
10516     }
10517   else if (EQ (coding_type, Qundecided))
10518     {
10519       if (nargs < coding_arg_undecided_max)
10520         goto short_args;
10521       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10522             args[coding_arg_undecided_inhibit_null_byte_detection]);
10523       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10524             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10525       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10526             args[coding_arg_undecided_prefer_utf_8]);
10527       category = coding_category_undecided;
10528     }
10529   else
10530     error ("Invalid coding system type: %s",
10531            SDATA (SYMBOL_NAME (coding_type)));
10532
10533   ASET (attrs, coding_attr_category, make_number (category));
10534   ASET (attrs, coding_attr_plist,
10535         Fcons (QCcategory,
10536                Fcons (AREF (Vcoding_category_table, category),
10537                       CODING_ATTR_PLIST (attrs))));
10538   ASET (attrs, coding_attr_plist,
10539         Fcons (QCascii_compatible_p,
10540                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10541                       CODING_ATTR_PLIST (attrs))));
10542
10543   eol_type = args[coding_arg_eol_type];
10544   if (! NILP (eol_type)
10545       && ! EQ (eol_type, Qunix)
10546       && ! EQ (eol_type, Qdos)
10547       && ! EQ (eol_type, Qmac))
10548     error ("Invalid eol-type");
10549
10550   aliases = list1 (name);
10551
10552   if (NILP (eol_type))
10553     {
10554       eol_type = make_subsidiaries (name);
10555       for (i = 0; i < 3; i++)
10556         {
10557           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10558
10559           this_name = AREF (eol_type, i);
10560           this_aliases = list1 (this_name);
10561           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10562           this_spec = make_uninit_vector (3);
10563           ASET (this_spec, 0, attrs);
10564           ASET (this_spec, 1, this_aliases);
10565           ASET (this_spec, 2, this_eol_type);
10566           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10567           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10568           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10569           if (NILP (val))
10570             Vcoding_system_alist
10571               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10572                        Vcoding_system_alist);
10573         }
10574     }
10575
10576   spec_vec = make_uninit_vector (3);
10577   ASET (spec_vec, 0, attrs);
10578   ASET (spec_vec, 1, aliases);
10579   ASET (spec_vec, 2, eol_type);
10580
10581   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10582   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10583   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10584   if (NILP (val))
10585     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10586                                   Vcoding_system_alist);
10587
10588   {
10589     int id = coding_categories[category].id;
10590
10591     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10592       setup_coding_system (name, &coding_categories[category]);
10593   }
10594
10595   return Qnil;
10596
10597  short_args:
10598   return Fsignal (Qwrong_number_of_arguments,
10599                   Fcons (intern ("define-coding-system-internal"),
10600                          make_number (nargs)));
10601 }
10602
10603
10604 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10605        3, 3, 0,
10606        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10607   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10608 {
10609   Lisp_Object spec, attrs;
10610
10611   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10612   attrs = AREF (spec, 0);
10613   if (EQ (prop, QCmnemonic))
10614     {
10615       if (! STRINGP (val))
10616         CHECK_CHARACTER (val);
10617       ASET (attrs, coding_attr_mnemonic, val);
10618     }
10619   else if (EQ (prop, QCdefault_char))
10620     {
10621       if (NILP (val))
10622         val = make_number (' ');
10623       else
10624         CHECK_CHARACTER (val);
10625       ASET (attrs, coding_attr_default_char, val);
10626     }
10627   else if (EQ (prop, QCdecode_translation_table))
10628     {
10629       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10630         CHECK_SYMBOL (val);
10631       ASET (attrs, coding_attr_decode_tbl, val);
10632     }
10633   else if (EQ (prop, QCencode_translation_table))
10634     {
10635       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10636         CHECK_SYMBOL (val);
10637       ASET (attrs, coding_attr_encode_tbl, val);
10638     }
10639   else if (EQ (prop, QCpost_read_conversion))
10640     {
10641       CHECK_SYMBOL (val);
10642       ASET (attrs, coding_attr_post_read, val);
10643     }
10644   else if (EQ (prop, QCpre_write_conversion))
10645     {
10646       CHECK_SYMBOL (val);
10647       ASET (attrs, coding_attr_pre_write, val);
10648     }
10649   else if (EQ (prop, QCascii_compatible_p))
10650     {
10651       ASET (attrs, coding_attr_ascii_compat, val);
10652     }
10653
10654   ASET (attrs, coding_attr_plist,
10655         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10656   return val;
10657 }
10658
10659
10660 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10661        Sdefine_coding_system_alias, 2, 2, 0,
10662        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10663   (Lisp_Object alias, Lisp_Object coding_system)
10664 {
10665   Lisp_Object spec, aliases, eol_type, val;
10666
10667   CHECK_SYMBOL (alias);
10668   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10669   aliases = AREF (spec, 1);
10670   /* ALIASES should be a list of length more than zero, and the first
10671      element is a base coding system.  Append ALIAS at the tail of the
10672      list.  */
10673   while (!NILP (XCDR (aliases)))
10674     aliases = XCDR (aliases);
10675   XSETCDR (aliases, list1 (alias));
10676
10677   eol_type = AREF (spec, 2);
10678   if (VECTORP (eol_type))
10679     {
10680       Lisp_Object subsidiaries;
10681       int i;
10682
10683       subsidiaries = make_subsidiaries (alias);
10684       for (i = 0; i < 3; i++)
10685         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10686                                      AREF (eol_type, i));
10687     }
10688
10689   Fputhash (alias, spec, Vcoding_system_hash_table);
10690   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10691   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10692   if (NILP (val))
10693     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10694                                   Vcoding_system_alist);
10695
10696   return Qnil;
10697 }
10698
10699 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10700        1, 1, 0,
10701        doc: /* Return the base of CODING-SYSTEM.
10702 Any alias or subsidiary coding system is not a base coding system.  */)
10703   (Lisp_Object coding_system)
10704 {
10705   Lisp_Object spec, attrs;
10706
10707   if (NILP (coding_system))
10708     return (Qno_conversion);
10709   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10710   attrs = AREF (spec, 0);
10711   return CODING_ATTR_BASE_NAME (attrs);
10712 }
10713
10714 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10715        1, 1, 0,
10716        doc: "Return the property list of CODING-SYSTEM.")
10717   (Lisp_Object coding_system)
10718 {
10719   Lisp_Object spec, attrs;
10720
10721   if (NILP (coding_system))
10722     coding_system = Qno_conversion;
10723   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10724   attrs = AREF (spec, 0);
10725   return CODING_ATTR_PLIST (attrs);
10726 }
10727
10728
10729 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10730        1, 1, 0,
10731        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10732   (Lisp_Object coding_system)
10733 {
10734   Lisp_Object spec;
10735
10736   if (NILP (coding_system))
10737     coding_system = Qno_conversion;
10738   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10739   return AREF (spec, 1);
10740 }
10741
10742 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10743        Scoding_system_eol_type, 1, 1, 0,
10744        doc: /* Return eol-type of CODING-SYSTEM.
10745 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10746
10747 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10748 and CR respectively.
10749
10750 A vector value indicates that a format of end-of-line should be
10751 detected automatically.  Nth element of the vector is the subsidiary
10752 coding system whose eol-type is N.  */)
10753   (Lisp_Object coding_system)
10754 {
10755   Lisp_Object spec, eol_type;
10756   int n;
10757
10758   if (NILP (coding_system))
10759     coding_system = Qno_conversion;
10760   if (! CODING_SYSTEM_P (coding_system))
10761     return Qnil;
10762   spec = CODING_SYSTEM_SPEC (coding_system);
10763   eol_type = AREF (spec, 2);
10764   if (VECTORP (eol_type))
10765     return Fcopy_sequence (eol_type);
10766   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10767   return make_number (n);
10768 }
10769
10770 #endif /* emacs */
10771
10772 \f
10773 /*** 9. Post-amble ***/
10774
10775 void
10776 init_coding_once (void)
10777 {
10778   int i;
10779
10780   for (i = 0; i < coding_category_max; i++)
10781     {
10782       coding_categories[i].id = -1;
10783       coding_priorities[i] = i;
10784     }
10785
10786   /* ISO2022 specific initialize routine.  */
10787   for (i = 0; i < 0x20; i++)
10788     iso_code_class[i] = ISO_control_0;
10789   for (i = 0x21; i < 0x7F; i++)
10790     iso_code_class[i] = ISO_graphic_plane_0;
10791   for (i = 0x80; i < 0xA0; i++)
10792     iso_code_class[i] = ISO_control_1;
10793   for (i = 0xA1; i < 0xFF; i++)
10794     iso_code_class[i] = ISO_graphic_plane_1;
10795   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10796   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10797   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10798   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10799   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10800   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10801   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10802   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10803   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10804
10805   for (i = 0; i < 256; i++)
10806     {
10807       emacs_mule_bytes[i] = 1;
10808     }
10809   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10810   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10811   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10812   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10813 }
10814
10815 #ifdef emacs
10816
10817 void
10818 syms_of_coding (void)
10819 {
10820   staticpro (&Vcoding_system_hash_table);
10821   {
10822     Lisp_Object args[2];
10823     args[0] = QCtest;
10824     args[1] = Qeq;
10825     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10826   }
10827
10828   staticpro (&Vsjis_coding_system);
10829   Vsjis_coding_system = Qnil;
10830
10831   staticpro (&Vbig5_coding_system);
10832   Vbig5_coding_system = Qnil;
10833
10834   staticpro (&Vcode_conversion_reused_workbuf);
10835   Vcode_conversion_reused_workbuf = Qnil;
10836
10837   staticpro (&Vcode_conversion_workbuf_name);
10838   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10839
10840   reused_workbuf_in_use = 0;
10841
10842   DEFSYM (Qcharset, "charset");
10843   DEFSYM (Qtarget_idx, "target-idx");
10844   DEFSYM (Qcoding_system_history, "coding-system-history");
10845   Fset (Qcoding_system_history, Qnil);
10846
10847   /* Target FILENAME is the first argument.  */
10848   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10849   /* Target FILENAME is the third argument.  */
10850   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10851
10852   DEFSYM (Qcall_process, "call-process");
10853   /* Target PROGRAM is the first argument.  */
10854   Fput (Qcall_process, Qtarget_idx, make_number (0));
10855
10856   DEFSYM (Qcall_process_region, "call-process-region");
10857   /* Target PROGRAM is the third argument.  */
10858   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10859
10860   DEFSYM (Qstart_process, "start-process");
10861   /* Target PROGRAM is the third argument.  */
10862   Fput (Qstart_process, Qtarget_idx, make_number (2));
10863
10864   DEFSYM (Qopen_network_stream, "open-network-stream");
10865   /* Target SERVICE is the fourth argument.  */
10866   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10867
10868   DEFSYM (Qcoding_system, "coding-system");
10869   DEFSYM (Qcoding_aliases, "coding-aliases");
10870
10871   DEFSYM (Qeol_type, "eol-type");
10872   DEFSYM (Qunix, "unix");
10873   DEFSYM (Qdos, "dos");
10874   DEFSYM (Qmac, "mac");
10875
10876   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10877   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10878   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10879   DEFSYM (Qdefault_char, "default-char");
10880   DEFSYM (Qundecided, "undecided");
10881   DEFSYM (Qno_conversion, "no-conversion");
10882   DEFSYM (Qraw_text, "raw-text");
10883
10884   DEFSYM (Qiso_2022, "iso-2022");
10885
10886   DEFSYM (Qutf_8, "utf-8");
10887   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10888
10889 #if defined (WINDOWSNT) || defined (CYGWIN)
10890   /* No, not utf-16-le: that one has a BOM.  */
10891   DEFSYM (Qutf_16le, "utf-16le");
10892 #endif
10893
10894   DEFSYM (Qutf_16, "utf-16");
10895   DEFSYM (Qbig, "big");
10896   DEFSYM (Qlittle, "little");
10897
10898   DEFSYM (Qshift_jis, "shift-jis");
10899   DEFSYM (Qbig5, "big5");
10900
10901   DEFSYM (Qcoding_system_p, "coding-system-p");
10902
10903   DEFSYM (Qcoding_system_error, "coding-system-error");
10904   Fput (Qcoding_system_error, Qerror_conditions,
10905         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10906   Fput (Qcoding_system_error, Qerror_message,
10907         build_pure_c_string ("Invalid coding system"));
10908
10909   DEFSYM (Qtranslation_table, "translation-table");
10910   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10911   DEFSYM (Qtranslation_table_id, "translation-table-id");
10912   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10913   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10914
10915   DEFSYM (Qvalid_codes, "valid-codes");
10916
10917   DEFSYM (Qemacs_mule, "emacs-mule");
10918
10919   DEFSYM (QCcategory, ":category");
10920   DEFSYM (QCmnemonic, ":mnemonic");
10921   DEFSYM (QCdefault_char, ":default-char");
10922   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10923   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10924   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10925   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10926   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10927
10928   Vcoding_category_table
10929     = Fmake_vector (make_number (coding_category_max), Qnil);
10930   staticpro (&Vcoding_category_table);
10931   /* Followings are target of code detection.  */
10932   ASET (Vcoding_category_table, coding_category_iso_7,
10933         intern_c_string ("coding-category-iso-7"));
10934   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10935         intern_c_string ("coding-category-iso-7-tight"));
10936   ASET (Vcoding_category_table, coding_category_iso_8_1,
10937         intern_c_string ("coding-category-iso-8-1"));
10938   ASET (Vcoding_category_table, coding_category_iso_8_2,
10939         intern_c_string ("coding-category-iso-8-2"));
10940   ASET (Vcoding_category_table, coding_category_iso_7_else,
10941         intern_c_string ("coding-category-iso-7-else"));
10942   ASET (Vcoding_category_table, coding_category_iso_8_else,
10943         intern_c_string ("coding-category-iso-8-else"));
10944   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10945         intern_c_string ("coding-category-utf-8-auto"));
10946   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10947         intern_c_string ("coding-category-utf-8"));
10948   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10949         intern_c_string ("coding-category-utf-8-sig"));
10950   ASET (Vcoding_category_table, coding_category_utf_16_be,
10951         intern_c_string ("coding-category-utf-16-be"));
10952   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10953         intern_c_string ("coding-category-utf-16-auto"));
10954   ASET (Vcoding_category_table, coding_category_utf_16_le,
10955         intern_c_string ("coding-category-utf-16-le"));
10956   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10957         intern_c_string ("coding-category-utf-16-be-nosig"));
10958   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10959         intern_c_string ("coding-category-utf-16-le-nosig"));
10960   ASET (Vcoding_category_table, coding_category_charset,
10961         intern_c_string ("coding-category-charset"));
10962   ASET (Vcoding_category_table, coding_category_sjis,
10963         intern_c_string ("coding-category-sjis"));
10964   ASET (Vcoding_category_table, coding_category_big5,
10965         intern_c_string ("coding-category-big5"));
10966   ASET (Vcoding_category_table, coding_category_ccl,
10967         intern_c_string ("coding-category-ccl"));
10968   ASET (Vcoding_category_table, coding_category_emacs_mule,
10969         intern_c_string ("coding-category-emacs-mule"));
10970   /* Followings are NOT target of code detection.  */
10971   ASET (Vcoding_category_table, coding_category_raw_text,
10972         intern_c_string ("coding-category-raw-text"));
10973   ASET (Vcoding_category_table, coding_category_undecided,
10974         intern_c_string ("coding-category-undecided"));
10975
10976   DEFSYM (Qinsufficient_source, "insufficient-source");
10977   DEFSYM (Qinvalid_source, "invalid-source");
10978   DEFSYM (Qinterrupted, "interrupted");
10979   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10980
10981   defsubr (&Scoding_system_p);
10982   defsubr (&Sread_coding_system);
10983   defsubr (&Sread_non_nil_coding_system);
10984   defsubr (&Scheck_coding_system);
10985   defsubr (&Sdetect_coding_region);
10986   defsubr (&Sdetect_coding_string);
10987   defsubr (&Sfind_coding_systems_region_internal);
10988   defsubr (&Sunencodable_char_position);
10989   defsubr (&Scheck_coding_systems_region);
10990   defsubr (&Sdecode_coding_region);
10991   defsubr (&Sencode_coding_region);
10992   defsubr (&Sdecode_coding_string);
10993   defsubr (&Sencode_coding_string);
10994   defsubr (&Sdecode_sjis_char);
10995   defsubr (&Sencode_sjis_char);
10996   defsubr (&Sdecode_big5_char);
10997   defsubr (&Sencode_big5_char);
10998   defsubr (&Sset_terminal_coding_system_internal);
10999   defsubr (&Sset_safe_terminal_coding_system_internal);
11000   defsubr (&Sterminal_coding_system);
11001   defsubr (&Sset_keyboard_coding_system_internal);
11002   defsubr (&Skeyboard_coding_system);
11003   defsubr (&Sfind_operation_coding_system);
11004   defsubr (&Sset_coding_system_priority);
11005   defsubr (&Sdefine_coding_system_internal);
11006   defsubr (&Sdefine_coding_system_alias);
11007   defsubr (&Scoding_system_put);
11008   defsubr (&Scoding_system_base);
11009   defsubr (&Scoding_system_plist);
11010   defsubr (&Scoding_system_aliases);
11011   defsubr (&Scoding_system_eol_type);
11012   defsubr (&Scoding_system_priority_list);
11013
11014   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
11015                doc: /* List of coding systems.
11016
11017 Do not alter the value of this variable manually.  This variable should be
11018 updated by the functions `define-coding-system' and
11019 `define-coding-system-alias'.  */);
11020   Vcoding_system_list = Qnil;
11021
11022   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
11023                doc: /* Alist of coding system names.
11024 Each element is one element list of coding system name.
11025 This variable is given to `completing-read' as COLLECTION argument.
11026
11027 Do not alter the value of this variable manually.  This variable should be
11028 updated by the functions `make-coding-system' and
11029 `define-coding-system-alias'.  */);
11030   Vcoding_system_alist = Qnil;
11031
11032   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11033                doc: /* List of coding-categories (symbols) ordered by priority.
11034
11035 On detecting a coding system, Emacs tries code detection algorithms
11036 associated with each coding-category one by one in this order.  When
11037 one algorithm agrees with a byte sequence of source text, the coding
11038 system bound to the corresponding coding-category is selected.
11039
11040 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11041   {
11042     int i;
11043
11044     Vcoding_category_list = Qnil;
11045     for (i = coding_category_max - 1; i >= 0; i--)
11046       Vcoding_category_list
11047         = Fcons (AREF (Vcoding_category_table, i),
11048                  Vcoding_category_list);
11049   }
11050
11051   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11052                doc: /* Specify the coding system for read operations.
11053 It is useful to bind this variable with `let', but do not set it globally.
11054 If the value is a coding system, it is used for decoding on read operation.
11055 If not, an appropriate element is used from one of the coding system alists.
11056 There are three such tables: `file-coding-system-alist',
11057 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11058   Vcoding_system_for_read = Qnil;
11059
11060   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11061                doc: /* Specify the coding system for write operations.
11062 Programs bind this variable with `let', but you should not set it globally.
11063 If the value is a coding system, it is used for encoding of output,
11064 when writing it to a file and when sending it to a file or subprocess.
11065
11066 If this does not specify a coding system, an appropriate element
11067 is used from one of the coding system alists.
11068 There are three such tables: `file-coding-system-alist',
11069 `process-coding-system-alist', and `network-coding-system-alist'.
11070 For output to files, if the above procedure does not specify a coding system,
11071 the value of `buffer-file-coding-system' is used.  */);
11072   Vcoding_system_for_write = Qnil;
11073
11074   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11075                doc: /*
11076 Coding system used in the latest file or process I/O.  */);
11077   Vlast_coding_system_used = Qnil;
11078
11079   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11080                doc: /*
11081 Error status of the last code conversion.
11082
11083 When an error was detected in the last code conversion, this variable
11084 is set to one of the following symbols.
11085   `insufficient-source'
11086   `inconsistent-eol'
11087   `invalid-source'
11088   `interrupted'
11089   `insufficient-memory'
11090 When no error was detected, the value doesn't change.  So, to check
11091 the error status of a code conversion by this variable, you must
11092 explicitly set this variable to nil before performing code
11093 conversion.  */);
11094   Vlast_code_conversion_error = Qnil;
11095
11096   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11097                doc: /*
11098 *Non-nil means always inhibit code conversion of end-of-line format.
11099 See info node `Coding Systems' and info node `Text and Binary' concerning
11100 such conversion.  */);
11101   inhibit_eol_conversion = 0;
11102
11103   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11104                doc: /*
11105 Non-nil means process buffer inherits coding system of process output.
11106 Bind it to t if the process output is to be treated as if it were a file
11107 read from some filesystem.  */);
11108   inherit_process_coding_system = 0;
11109
11110   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11111                doc: /*
11112 Alist to decide a coding system to use for a file I/O operation.
11113 The format is ((PATTERN . VAL) ...),
11114 where PATTERN is a regular expression matching a file name,
11115 VAL is a coding system, a cons of coding systems, or a function symbol.
11116 If VAL is a coding system, it is used for both decoding and encoding
11117 the file contents.
11118 If VAL is a cons of coding systems, the car part is used for decoding,
11119 and the cdr part is used for encoding.
11120 If VAL is a function symbol, the function must return a coding system
11121 or a cons of coding systems which are used as above.  The function is
11122 called with an argument that is a list of the arguments with which
11123 `find-operation-coding-system' was called.  If the function can't decide
11124 a coding system, it can return `undecided' so that the normal
11125 code-detection is performed.
11126
11127 See also the function `find-operation-coding-system'
11128 and the variable `auto-coding-alist'.  */);
11129   Vfile_coding_system_alist = Qnil;
11130
11131   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11132                doc: /*
11133 Alist to decide a coding system to use for a process I/O operation.
11134 The format is ((PATTERN . VAL) ...),
11135 where PATTERN is a regular expression matching a program name,
11136 VAL is a coding system, a cons of coding systems, or a function symbol.
11137 If VAL is a coding system, it is used for both decoding what received
11138 from the program and encoding what sent to the program.
11139 If VAL is a cons of coding systems, the car part is used for decoding,
11140 and the cdr part is used for encoding.
11141 If VAL is a function symbol, the function must return a coding system
11142 or a cons of coding systems which are used as above.
11143
11144 See also the function `find-operation-coding-system'.  */);
11145   Vprocess_coding_system_alist = Qnil;
11146
11147   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11148                doc: /*
11149 Alist to decide a coding system to use for a network I/O operation.
11150 The format is ((PATTERN . VAL) ...),
11151 where PATTERN is a regular expression matching a network service name
11152 or is a port number to connect to,
11153 VAL is a coding system, a cons of coding systems, or a function symbol.
11154 If VAL is a coding system, it is used for both decoding what received
11155 from the network stream and encoding what sent to the network stream.
11156 If VAL is a cons of coding systems, the car part is used for decoding,
11157 and the cdr part is used for encoding.
11158 If VAL is a function symbol, the function must return a coding system
11159 or a cons of coding systems which are used as above.
11160
11161 See also the function `find-operation-coding-system'.  */);
11162   Vnetwork_coding_system_alist = Qnil;
11163
11164   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11165                doc: /* Coding system to use with system messages.
11166 Also used for decoding keyboard input on X Window system.  */);
11167   Vlocale_coding_system = Qnil;
11168
11169   /* The eol mnemonics are reset in startup.el system-dependently.  */
11170   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11171                doc: /*
11172 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11173   eol_mnemonic_unix = build_pure_c_string (":");
11174
11175   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11176                doc: /*
11177 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11178   eol_mnemonic_dos = build_pure_c_string ("\\");
11179
11180   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11181                doc: /*
11182 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11183   eol_mnemonic_mac = build_pure_c_string ("/");
11184
11185   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11186                doc: /*
11187 *String displayed in mode line when end-of-line format is not yet determined.  */);
11188   eol_mnemonic_undecided = build_pure_c_string (":");
11189
11190   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11191                doc: /*
11192 *Non-nil enables character translation while encoding and decoding.  */);
11193   Venable_character_translation = Qt;
11194
11195   DEFVAR_LISP ("standard-translation-table-for-decode",
11196                Vstandard_translation_table_for_decode,
11197                doc: /* Table for translating characters while decoding.  */);
11198   Vstandard_translation_table_for_decode = Qnil;
11199
11200   DEFVAR_LISP ("standard-translation-table-for-encode",
11201                Vstandard_translation_table_for_encode,
11202                doc: /* Table for translating characters while encoding.  */);
11203   Vstandard_translation_table_for_encode = Qnil;
11204
11205   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11206                doc: /* Alist of charsets vs revision numbers.
11207 While encoding, if a charset (car part of an element) is found,
11208 designate it with the escape sequence identifying revision (cdr part
11209 of the element).  */);
11210   Vcharset_revision_table = Qnil;
11211
11212   DEFVAR_LISP ("default-process-coding-system",
11213                Vdefault_process_coding_system,
11214                doc: /* Cons of coding systems used for process I/O by default.
11215 The car part is used for decoding a process output,
11216 the cdr part is used for encoding a text to be sent to a process.  */);
11217   Vdefault_process_coding_system = Qnil;
11218
11219   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11220                doc: /*
11221 Table of extra Latin codes in the range 128..159 (inclusive).
11222 This is a vector of length 256.
11223 If Nth element is non-nil, the existence of code N in a file
11224 \(or output of subprocess) doesn't prevent it to be detected as
11225 a coding system of ISO 2022 variant which has a flag
11226 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11227 or reading output of a subprocess.
11228 Only 128th through 159th elements have a meaning.  */);
11229   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11230
11231   DEFVAR_LISP ("select-safe-coding-system-function",
11232                Vselect_safe_coding_system_function,
11233                doc: /*
11234 Function to call to select safe coding system for encoding a text.
11235
11236 If set, this function is called to force a user to select a proper
11237 coding system which can encode the text in the case that a default
11238 coding system used in each operation can't encode the text.  The
11239 function should take care that the buffer is not modified while
11240 the coding system is being selected.
11241
11242 The default value is `select-safe-coding-system' (which see).  */);
11243   Vselect_safe_coding_system_function = Qnil;
11244
11245   DEFVAR_BOOL ("coding-system-require-warning",
11246                coding_system_require_warning,
11247                doc: /* Internal use only.
11248 If non-nil, on writing a file, `select-safe-coding-system-function' is
11249 called even if `coding-system-for-write' is non-nil.  The command
11250 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11251   coding_system_require_warning = 0;
11252
11253
11254   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11255                inhibit_iso_escape_detection,
11256                doc: /*
11257 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11258
11259 When Emacs reads text, it tries to detect how the text is encoded.
11260 This code detection is sensitive to escape sequences.  If Emacs sees
11261 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11262 of the ISO2022 encodings, and decodes text by the corresponding coding
11263 system (e.g. `iso-2022-7bit').
11264
11265 However, there may be a case that you want to read escape sequences in
11266 a file as is.  In such a case, you can set this variable to non-nil.
11267 Then the code detection will ignore any escape sequences, and no text is
11268 detected as encoded in some ISO-2022 encoding.  The result is that all
11269 escape sequences become visible in a buffer.
11270
11271 The default value is nil, and it is strongly recommended not to change
11272 it.  That is because many Emacs Lisp source files that contain
11273 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11274 in Emacs's distribution, and they won't be decoded correctly on
11275 reading if you suppress escape sequence detection.
11276
11277 The other way to read escape sequences in a file without decoding is
11278 to explicitly specify some coding system that doesn't use ISO-2022
11279 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11280   inhibit_iso_escape_detection = 0;
11281
11282   DEFVAR_BOOL ("inhibit-null-byte-detection",
11283                inhibit_null_byte_detection,
11284                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11285 By default, Emacs treats it as binary data, and does not attempt to
11286 decode it.  The effect is as if you specified `no-conversion' for
11287 reading that text.
11288
11289 Set this to non-nil when a regular text happens to include null bytes.
11290 Examples are Index nodes of Info files and null-byte delimited output
11291 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11292 decode text as usual.  */);
11293   inhibit_null_byte_detection = 0;
11294
11295   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11296                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11297 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11298   disable_ascii_optimization = 0;
11299
11300   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11301                doc: /* Char table for translating self-inserting characters.
11302 This is applied to the result of input methods, not their input.
11303 See also `keyboard-translate-table'.
11304
11305 Use of this variable for character code unification was rendered
11306 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11307 internal character representation.  */);
11308     Vtranslation_table_for_input = Qnil;
11309
11310   {
11311     Lisp_Object args[coding_arg_undecided_max];
11312     Lisp_Object plist[16];
11313     int i;
11314
11315     for (i = 0; i < coding_arg_undecided_max; i++)
11316       args[i] = Qnil;
11317
11318     plist[0] = intern_c_string (":name");
11319     plist[1] = args[coding_arg_name] = Qno_conversion;
11320     plist[2] = intern_c_string (":mnemonic");
11321     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11322     plist[4] = intern_c_string (":coding-type");
11323     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11324     plist[6] = intern_c_string (":ascii-compatible-p");
11325     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11326     plist[8] = intern_c_string (":default-char");
11327     plist[9] = args[coding_arg_default_char] = make_number (0);
11328     plist[10] = intern_c_string (":for-unibyte");
11329     plist[11] = args[coding_arg_for_unibyte] = Qt;
11330     plist[12] = intern_c_string (":docstring");
11331     plist[13] = build_pure_c_string ("Do no conversion.\n\
11332 \n\
11333 When you visit a file with this coding, the file is read into a\n\
11334 unibyte buffer as is, thus each byte of a file is treated as a\n\
11335 character.");
11336     plist[14] = intern_c_string (":eol-type");
11337     plist[15] = args[coding_arg_eol_type] = Qunix;
11338     args[coding_arg_plist] = Flist (16, plist);
11339     Fdefine_coding_system_internal (coding_arg_max, args);
11340
11341     plist[1] = args[coding_arg_name] = Qundecided;
11342     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11343     plist[5] = args[coding_arg_coding_type] = Qundecided;
11344     /* This is already set.
11345        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11346     plist[8] = intern_c_string (":charset-list");
11347     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11348     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11349     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11350     plist[15] = args[coding_arg_eol_type] = Qnil;
11351     args[coding_arg_plist] = Flist (16, plist);
11352     args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11353     args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11354     Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11355   }
11356
11357   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11358
11359   {
11360     int i;
11361
11362     for (i = 0; i < coding_category_max; i++)
11363       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11364   }
11365 #if defined (DOS_NT)
11366   system_eol_type = Qdos;
11367 #else
11368   system_eol_type = Qunix;
11369 #endif
11370   staticpro (&system_eol_type);
11371 }
11372
11373 char *
11374 emacs_strerror (int error_number)
11375 {
11376   char *str;
11377
11378   synchronize_system_messages_locale ();
11379   str = strerror (error_number);
11380
11381   if (! NILP (Vlocale_coding_system))
11382     {
11383       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11384                                                       Vlocale_coding_system,
11385                                                       0);
11386       str = SSDATA (dec);
11387     }
11388
11389   return str;
11390 }
11391
11392 #endif /* emacs */