src/coding.h

   1 /* Header for coding system handler.
   2    Copyright (C) 2001-2014 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 #ifndef EMACS_CODING_H
  27 #define EMACS_CODING_H
  28
  29 /* Indices of the arguments of Fdefine_coding_system_internal.  */
  30
  31 enum define_coding_system_arg_index
  32   {
  33     coding_arg_name,
  34     coding_arg_mnemonic,
  35     coding_arg_coding_type,
  36     coding_arg_charset_list,
  37     coding_arg_ascii_compatible_p,
  38     coding_arg_decode_translation_table,
  39     coding_arg_encode_translation_table,
  40     coding_arg_post_read_conversion,
  41     coding_arg_pre_write_conversion,
  42     coding_arg_default_char,
  43     coding_arg_for_unibyte,
  44     coding_arg_plist,
  45     coding_arg_eol_type,
  46     coding_arg_max              /* Count of the above; the type-specific enums start here.  */
  47   };
  48 /* Argument indices of the iso2022 group; numbering starts at coding_arg_max.  */
  49 enum define_coding_iso2022_arg_index
  50   {
  51     coding_arg_iso2022_initial = coding_arg_max,
  52     coding_arg_iso2022_reg_usage,
  53     coding_arg_iso2022_request,
  54     coding_arg_iso2022_flags,
  55     coding_arg_iso2022_max
  56   };
  57 /* Argument indices of the utf8 group; numbering starts at coding_arg_max.  */
  58 enum define_coding_utf8_arg_index
  59   {
  60     coding_arg_utf8_bom = coding_arg_max,
  61     coding_arg_utf8_max
  62   };
  63 /* Argument indices of the utf16 group; numbering starts at coding_arg_max.  */
  64 enum define_coding_utf16_arg_index
  65   {
  66     coding_arg_utf16_bom = coding_arg_max,
  67     coding_arg_utf16_endian,
  68     coding_arg_utf16_max
  69   };
  70 /* Argument indices of the ccl group; numbering starts at coding_arg_max.  */
  71 enum define_coding_ccl_arg_index
  72   {
  73     coding_arg_ccl_decoder = coding_arg_max,
  74     coding_arg_ccl_encoder,
  75     coding_arg_ccl_valids,
  76     coding_arg_ccl_max
  77   };
  78 /* Argument indices of the undecided group; numbering starts at coding_arg_max.  */
  79 enum define_coding_undecided_arg_index
  80   {
  81     coding_arg_undecided_inhibit_null_byte_detection = coding_arg_max,
  82     coding_arg_undecided_inhibit_iso_escape_detection,
  83     coding_arg_undecided_prefer_utf_8,
  84     coding_arg_undecided_max
  85   };
  86
  87 /* Hash table for all coding systems.  Keys are coding system symbols
  88    and values are spec vectors of the corresponding coding system.  A
  89    spec vector has the form [ ATTRS ALIASES EOL-TYPE ].  ATTRS is a
  90    vector of attributes of the coding system.  ALIASES is a list of
  91    aliases (symbols) of the coding system.  EOL-TYPE is `unix', `dos',
  92    `mac' or a vector of coding systems (symbols).  */
  93
  94 extern Lisp_Object Vcoding_system_hash_table;
  95
  96
  97 /* Enumeration of coding system types.  */
  98
  99 enum coding_system_type
 100   {
 101     coding_type_charset,
 102     coding_type_utf_8,
 103     coding_type_utf_16,
 104     coding_type_iso_2022,
 105     coding_type_emacs_mule,
 106     coding_type_sjis,
 107     coding_type_ccl,
 108     coding_type_raw_text,
 109     coding_type_undecided,
 110     coding_type_max             /* Number of types listed above.  */
 111   };
 112
 113
 114 /* Enumeration of end-of-line format types.  */
 115
 116 enum end_of_line_type
 117   {
 118     eol_lf,             /* Line-feed only, same as Emacs' internal
 119                            format.  */
 120     eol_crlf,           /* Sequence of carriage-return and
 121                            line-feed.  */
 122     eol_cr,             /* Carriage-return only.  */
 123     eol_any,            /* Accept any of above.  Produce line-feed
 124                            only.  */
 125     eol_undecided,      /* This value is used to denote that the
 126                            eol-type is not yet decided.  */
 127     eol_type_max        /* Number of values listed above.  */
 128   };
 129
 130 /* Enumeration of indices into the attribute vector of a coding system.  */
 131
 132 enum coding_attr_index
 133   {
 134     coding_attr_base_name,
 135     coding_attr_docstring,
 136     coding_attr_mnemonic,
 137     coding_attr_type,
 138     coding_attr_charset_list,
 139     coding_attr_ascii_compat,
 140     coding_attr_decode_tbl,
 141     coding_attr_encode_tbl,
 142     coding_attr_trans_tbl,
 143     coding_attr_post_read,
 144     coding_attr_pre_write,
 145     coding_attr_default_char,
 146     coding_attr_for_unibyte,
 147     coding_attr_plist,
 148
 149     coding_attr_category,
 150     coding_attr_safe_charsets,
 151
 152     /* The following are extra attributes for each type.  */
 153     coding_attr_charset_valids,
 154
 155     coding_attr_ccl_decoder,
 156     coding_attr_ccl_encoder,
 157     coding_attr_ccl_valids,
 158
 159     coding_attr_iso_initial,
 160     coding_attr_iso_usage,
 161     coding_attr_iso_request,
 162     coding_attr_iso_flags,
 163
 164     coding_attr_utf_bom,
 165     coding_attr_utf_16_endian,
 166
 167     coding_attr_emacs_mule_full,
 168
 169     coding_attr_undecided_inhibit_null_byte_detection,
 170     coding_attr_undecided_inhibit_iso_escape_detection,
 171     coding_attr_undecided_prefer_utf_8,
 172
 173     coding_attr_last_index      /* Count of the indices above.  */
 174   };
 175
 176
 177 /* Macros to access an element of an attribute vector.  Each expands to
 178    AREF on ATTRS with the matching index from enum coding_attr_index.  */
 179 #define CODING_ATTR_BASE_NAME(attrs)    AREF (attrs, coding_attr_base_name)
 180 #define CODING_ATTR_TYPE(attrs)         AREF (attrs, coding_attr_type)
 181 #define CODING_ATTR_CHARSET_LIST(attrs) AREF (attrs, coding_attr_charset_list)
 182 #define CODING_ATTR_MNEMONIC(attrs)     AREF (attrs, coding_attr_mnemonic)
 183 #define CODING_ATTR_DOCSTRING(attrs)    AREF (attrs, coding_attr_docstring)
 184 #define CODING_ATTR_ASCII_COMPAT(attrs) AREF (attrs, coding_attr_ascii_compat)
 185 #define CODING_ATTR_DECODE_TBL(attrs)   AREF (attrs, coding_attr_decode_tbl)
 186 #define CODING_ATTR_ENCODE_TBL(attrs)   AREF (attrs, coding_attr_encode_tbl)
 187 #define CODING_ATTR_TRANS_TBL(attrs)    AREF (attrs, coding_attr_trans_tbl)
 188 #define CODING_ATTR_POST_READ(attrs)    AREF (attrs, coding_attr_post_read)
 189 #define CODING_ATTR_PRE_WRITE(attrs)    AREF (attrs, coding_attr_pre_write)
 190 #define CODING_ATTR_DEFAULT_CHAR(attrs) AREF (attrs, coding_attr_default_char)
 191 #define CODING_ATTR_FOR_UNIBYTE(attrs)  AREF (attrs, coding_attr_for_unibyte)
 192 #define CODING_ATTR_PLIST(attrs)        AREF (attrs, coding_attr_plist)
 193 #define CODING_ATTR_CATEGORY(attrs)     AREF (attrs, coding_attr_category)
 194 #define CODING_ATTR_SAFE_CHARSETS(attrs)AREF (attrs, coding_attr_safe_charsets)
 195
 196
 197 /* Return the name (the hash table key) of a coding system specified by ID.  */
 198 #define CODING_ID_NAME(id) \
 199   (HASH_KEY (XHASH_TABLE (Vcoding_system_hash_table), id))
 200
 201 /* Return the attribute vector (element 0 of the hash value) of a coding system specified by ID.  */
 202
 203 #define CODING_ID_ATTRS(id)     \
 204   (AREF (HASH_VALUE (XHASH_TABLE (Vcoding_system_hash_table), id), 0))
 205
 206 /* Return the list of aliases (element 1 of the hash value) of a coding system specified by ID.  */
 207
 208 #define CODING_ID_ALIASES(id)   \
 209   (AREF (HASH_VALUE (XHASH_TABLE (Vcoding_system_hash_table), id), 1))
 210
 211 /* Return the eol-type (element 2 of the hash value) of a coding system specified by ID.  */
 212
 213 #define CODING_ID_EOL_TYPE(id)  \
 214   (AREF (HASH_VALUE (XHASH_TABLE (Vcoding_system_hash_table), id), 2))
 215
 216
 217 /* Return the spec vector of CODING_SYSTEM_SYMBOL, or Qnil if the hash table has no entry for it.  */
 218
 219 #define CODING_SYSTEM_SPEC(coding_system_symbol)        \
 220   (Fgethash (coding_system_symbol, Vcoding_system_hash_table, Qnil))
 221
 222
 223 /* Return the ID of CODING_SYSTEM_SYMBOL, as given by hash_lookup.  Users in this file treat a negative ID as "not found".  */
 224
 225 #define CODING_SYSTEM_ID(coding_system_symbol)                  \
 226   hash_lookup (XHASH_TABLE (Vcoding_system_hash_table),         \
 227                coding_system_symbol, NULL)
 228
 229 /* Return true if CODING_SYSTEM_SYMBOL is a coding system.  A non-nil symbol
 230    missing from the hash table is retried via Fcoding_system_p.  */
 231 #define CODING_SYSTEM_P(coding_system_symbol)           \
 232   (CODING_SYSTEM_ID (coding_system_symbol) >= 0         \
 233    || (! NILP (coding_system_symbol)                    \
 234        && ! NILP (Fcoding_system_p (coding_system_symbol))))
 235
 236 /* Check if X is a coding system or not; call wrong_type_argument if both the
 237    hash lookup and Fcheck_coding_system fail.  X is evaluated more than once.  */
 238 #define CHECK_CODING_SYSTEM(x)                          \
 239   do {                                                  \
 240     if (CODING_SYSTEM_ID (x) < 0                        \
 241         && NILP (Fcheck_coding_system (x)))             \
 242       wrong_type_argument (Qcoding_system_p, (x));      \
 243   } while (false)
 244
 245
 246 /* Check if X is a coding system or not.  If it is, set SPEC to the
 247    spec vector of the coding system.  The lookup is retried once after
 248    calling Fcheck_coding_system; if SPEC is still nil, wrong_type_argument is called.  */
 249 #define CHECK_CODING_SYSTEM_GET_SPEC(x, spec)           \
 250   do {                                                  \
 251     spec = CODING_SYSTEM_SPEC (x);                      \
 252     if (NILP (spec))                                    \
 253       {                                                 \
 254         Fcheck_coding_system (x);                       \
 255         spec = CODING_SYSTEM_SPEC (x);                  \
 256       }                                                 \
 257     if (NILP (spec))                                    \
 258       wrong_type_argument (Qcoding_system_p, (x));      \
 259   } while (false)
 260
 261
 262 /* Check if X is a coding system or not.  If it is, set ID to the
 263    ID of the coding system.  The lookup is retried once after calling
 264    Fcheck_coding_system; if ID is still negative, wrong_type_argument is called.  */
 265 #define CHECK_CODING_SYSTEM_GET_ID(x, id)                       \
 266   do                                                            \
 267     {                                                           \
 268       id = CODING_SYSTEM_ID (x);                                \
 269       if (id < 0)                                               \
 270         {                                                       \
 271           Fcheck_coding_system (x);                             \
 272           id = CODING_SYSTEM_ID (x);                            \
 273         }                                                       \
 274       if (id < 0)                                               \
 275         wrong_type_argument (Qcoding_system_p, (x));    \
 276     } while (false)
 277
 278
 279 /*** GENERAL section ***/
 280
 281 /* Enumeration of result codes of code conversion (type of the `result' member of struct coding_system).  */
 282 enum coding_result_code
 283   {
 284     CODING_RESULT_SUCCESS,
 285     CODING_RESULT_INSUFFICIENT_SRC,
 286     CODING_RESULT_INSUFFICIENT_DST,
 287     CODING_RESULT_INVALID_SRC,
 288     CODING_RESULT_INTERRUPT
 289   };
 290
 291
 292 /* Bit masks used for the member `mode' of the struct coding_system.  */
 293
 294 /* If set, the decoding/encoding routines treat the current data as
 295    the last block of the whole text to be converted, and do the
 296    appropriate finishing job.  */
 297 #define CODING_MODE_LAST_BLOCK                  0x01
 298
 299 /* If set, it means that the current source text is in a buffer which
 300    enables selective display.  */
 301 #define CODING_MODE_SELECTIVE_DISPLAY           0x02
 302
 303 /* This flag is used by the decoding/encoding routines on the fly.  If
 304    set, it means that right-to-left text is being processed.  */
 305 #define CODING_MODE_DIRECTION                   0x04
 306 /* NOTE(review): the next bit is not documented here; confirm its meaning in coding.c.  */
 307 #define CODING_MODE_FIXED_DESTINATION           0x08
 308
 309 /* If set, it means that the encoding routines produce some safe
 310    ASCII characters (usually '?') for unsupported characters.  */
 311 #define CODING_MODE_SAFE_ENCODING               0x10
 312
 313   /* For handling composition sequence.  */
 314 #include "composite.h"
 315 /* Type of the `state' member of struct composition_status.  */
 316 enum composition_state
 317   {
 318     COMPOSING_NO,
 319     COMPOSING_CHAR,
 320     COMPOSING_RULE,
 321     COMPOSING_COMPONENT_CHAR,
 322     COMPOSING_COMPONENT_RULE
 323   };
 324
 325 /* Structure for the current composition status.  */
 326 struct composition_status
 327 {
 328   enum composition_state state;
 329   enum composition_method method;
 330   bool old_form;          /* true if pre-21 form */
 331   int length;             /* number of elements produced in charbuf */
 332   int nchars;             /* number of characters composed */
 333   int ncomps;             /* number of composition components */
 334   /* Maximum carryover is for the case of COMPOSITION_WITH_RULE_ALTCHARS.
 335      See the comment in coding.c.  Total: 4 * MAX_COMPOSITION_COMPONENTS + 4 ints.  */
 336   int carryover[4               /* annotation header */
 337                 + MAX_COMPOSITION_COMPONENTS * 3 - 2 /* ALTs and RULEs */
 338                 + 2                                  /* intermediate -1 -1 */
 339                 + MAX_COMPOSITION_COMPONENTS         /* CHARs */
 340                 ];
 341 };
 342
 343
 344 /* Structure of the field `spec.iso_2022' in the structure
 345    `coding_system'.  */
 346 struct iso_2022_spec
 347 {
 348   /* Bit-wise-or of CODING_ISO_FLAG_XXX.  */
 349   unsigned flags;
 350
 351   /* The current graphic register invoked to each graphic plane.  */
 352   int current_invocation[2];
 353
 354   /* The current charset designated to each graphic register.  The
 355      value -1 means that no charset is designated, -2 means that
 356      there was an invalid designation previously.  */
 357   int current_designation[4];
 358
 359   /* If positive, we are now scanning CTEXT extended segment.  */
 360   int ctext_extended_segment_len;
 361
 362   /* True temporarily only when graphic register 2 or 3 is invoked by
 363      single-shift while encoding.  */
 364   bool_bf single_shifting : 1;
 365
 366   /* True temporarily only when processing at beginning of line.  */
 367   bool_bf bol : 1;
 368
 369   /* If true, we are now scanning embedded UTF-8 sequence.  */
 370   bool_bf embedded_utf_8 : 1;
 371
 372   /* The current composition.  */
 373   struct composition_status cmp_status;
 374 };
 375 /* Structure of the field `spec.emacs_mule' in the structure `coding_system'.  */
 376 struct emacs_mule_spec
 377 {
 378   struct composition_status cmp_status;
 379 };
 380 /* Structure of the field `spec.undecided' in the structure `coding_system'.  */
 381 struct undecided_spec
 382 {
 383   /* Inhibit null byte detection.  1 means always inhibit,
 384      -1 means do not inhibit, 0 means rely on user variable.  */
 385   int inhibit_nbd;
 386
 387   /* Inhibit ISO escape detection.  -1, 0, 1 as above.  */
 388   int inhibit_ied;
 389
 390   /* Prefer UTF-8 when the input could be other encodings.  */
 391   bool prefer_utf_8;
 392 };
 393 /* Type of the `bom' member of struct utf_16_spec and of `spec.utf_8_bom' in struct coding_system.  */
 394 enum utf_bom_type
 395   {
 396     utf_detect_bom,
 397     utf_without_bom,
 398     utf_with_bom
 399   };
 400 /* Type of the `endian' member of struct utf_16_spec.  */
 401 enum utf_16_endian_type
 402   {
 403     utf_16_big_endian,
 404     utf_16_little_endian
 405   };
 406 /* Structure of the field `spec.utf_16' in the structure `coding_system'.  */
 407 struct utf_16_spec
 408 {
 409   enum utf_bom_type bom;
 410   enum utf_16_endian_type endian;
 411   int surrogate;
 412 };
 413 /* Detection bookkeeping; a pointer to it is passed to the `detector' member of struct coding_system.  */
 414 struct coding_detection_info
 415 {
 416   /* Values of these members are bitwise-OR of CATEGORY_MASK_XXXs.  */
 417   /* Which categories are already checked.  */
 418   int checked;
 419   /* Which categories are strongly found.  */
 420   int found;
 421   /* Which categories are rejected.  */
 422   int rejected;
 423 };
 424
 425 /* The coding context: per-conversion state plus the detector, decoder and encoder routines.  */
 426 struct coding_system
 427 {
 428   /* ID number of the coding system.  This is an index to
 429      Vcoding_system_hash_table.  This value is set by
 430      setup_coding_system.  At the early stage of building time, this
 431      value is -1 in the array coding_categories to indicate that no
 432      coding-system of that category is yet defined.  */
 433   ptrdiff_t id;
 434
 435   /* Flag bits of the coding system.  The meaning of each bit is common
 436      to all types of coding systems.  */
 437   int common_flags;
 438
 439   /* Mode bits of the coding system.  See the comments of the macros
 440      CODING_MODE_XXX.  */
 441   unsigned int mode;
 442
 443   /* Detailed information specific to each type of coding system.  */
 444   union
 445     {
 446       struct iso_2022_spec iso_2022;
 447       struct ccl_spec *ccl;     /* Defined in ccl.h.  */
 448       struct utf_16_spec utf_16;
 449       enum utf_bom_type utf_8_bom;
 450       struct emacs_mule_spec emacs_mule;
 451       struct undecided_spec undecided;
 452     } spec;
 453
 454   int max_charset_id;
 455   unsigned char *safe_charsets;
 456
 457   /* The following two members specify how binary 8-bit code 128..255
 458      are represented in source and destination text respectively.  True
 459      means they are represented by 2-byte sequence, false means they are
 460      represented by 1-byte as is (see the comment in character.h).  */
 461   bool_bf src_multibyte : 1;
 462   bool_bf dst_multibyte : 1;
 463
 464   /* How many heading bytes we can skip for decoding.  This is set to
 465      -1 in setup_coding_system, and updated by detect_coding.  So,
 466      when this is equal to the byte length of the text being
 467      converted, we can skip the actual conversion process except for
 468      the eol format.  */
 469   ptrdiff_t head_ascii;
 470
 471   /* How many bytes/chars at the source are detected as valid utf-8
 472      sequence.  Set by detect_coding_utf_8.  */
 473   ptrdiff_t detected_utf8_bytes, detected_utf8_chars;
 474
 475   /* Used internally in coding.c.  See the comment of detect_ascii.  */
 476   int eol_seen;
 477
 478   /* The following members are set by encoding/decoding routine.  */
 479   ptrdiff_t produced, produced_char, consumed, consumed_char;
 480
 481   /* Number of error source data found in a decoding routine.  */
 482   ptrdiff_t errors;
 483
 484   /* Store the positions of error source data.  */
 485   ptrdiff_t *error_positions;
 486
 487   /* Finish status of code conversion.  */
 488   enum coding_result_code result;
 489
 490   ptrdiff_t src_pos, src_pos_byte, src_chars, src_bytes;
 491   Lisp_Object src_object;
 492   const unsigned char *source;
 493
 494   ptrdiff_t dst_pos, dst_pos_byte, dst_bytes;
 495   Lisp_Object dst_object;
 496   unsigned char *destination;
 497
 498   /* If an element is non-negative, it is a character code.
 499
 500      If it is in the range -128..-1, it is a 8-bit character code
 501      minus 256.
 502
 503      If it is less than -128, it specifies the start of an annotation
 504      chunk.  The length of the chunk is -128 minus the value of the
 505      element.  The following elements are OFFSET, ANNOTATION-TYPE, and
 506      a sequence of actual data for the annotation.  OFFSET is a
 507      character position offset from dst_pos or src_pos,
 508      ANNOTATION-TYPE specifies the meaning of the annotation and how to
 509      handle the following data.  */
 510   int *charbuf;
 511   int charbuf_size, charbuf_used;
 512
 513   /* True if the source of conversion is not in the member
 514      `charbuf', but at `src_object'.  */
 515   bool_bf chars_at_source : 1;
 516
 517   /* True if the result of conversion is in `destination'
 518      buffer rather than in `dst_object'.  */
 519   bool_bf raw_destination : 1;
 520
 521   /* Set to true if charbuf contains an annotation.  */
 522   bool_bf annotated : 1;
 523
 524   unsigned char carryover[64];
 525   int carryover_bytes;
 526
 527   int default_char;
 528   /* Detection, decoding and encoding routines; each is passed a pointer to a struct coding_system.  */
 529   bool (*detector) (struct coding_system *, struct coding_detection_info *);
 530   void (*decoder) (struct coding_system *);
 531   bool (*encoder) (struct coding_system *);
 532 };
 533
 534 /* Meanings of bits in the member `common_flags' of the structure
 535    coding_system.  The lowest 8 bits are reserved for various kinds of
 536    annotations (currently two of them are used).  */
 537 #define CODING_ANNOTATION_MASK                  0x00FF
 538 #define CODING_ANNOTATE_COMPOSITION_MASK        0x0001
 539 #define CODING_ANNOTATE_DIRECTION_MASK          0x0002
 540 #define CODING_ANNOTATE_CHARSET_MASK            0x0003  /* NOTE(review): equals COMPOSITION | DIRECTION, not a bit of its own; confirm this is intended.  */
 541 #define CODING_FOR_UNIBYTE_MASK                 0x0100
 542 #define CODING_REQUIRE_FLUSHING_MASK            0x0200
 543 #define CODING_REQUIRE_DECODING_MASK            0x0400
 544 #define CODING_REQUIRE_ENCODING_MASK            0x0800
 545 #define CODING_REQUIRE_DETECTION_MASK           0x1000
 546 #define CODING_RESET_AT_BOL_MASK                0x2000
 547
 548 /* Return nonzero if the coding context CODING requires annotation
 549    handling, i.e. any bit of CODING_ANNOTATION_MASK is set.  */
 550 #define CODING_REQUIRE_ANNOTATION(coding) \
 551   ((coding)->common_flags & CODING_ANNOTATION_MASK)
 552
 553 /* Return nonzero if the coding context CODING prefers decoding into
 554    unibyte.  */
 555 #define CODING_FOR_UNIBYTE(coding) \
 556   ((coding)->common_flags & CODING_FOR_UNIBYTE_MASK)
 557
 558 /* Return nonzero if the coding context CODING requires specific code to be
 559    attached at the tail of converted text.  */
 560 #define CODING_REQUIRE_FLUSHING(coding) \
 561   ((coding)->common_flags & CODING_REQUIRE_FLUSHING_MASK)
 562
 563 /* Return nonzero if the coding context CODING requires code conversion on
 564    decoding; this is always so when dst_multibyte is set.  */
 565 #define CODING_REQUIRE_DECODING(coding) \
 566   ((coding)->dst_multibyte              \
 567    || (coding)->common_flags & CODING_REQUIRE_DECODING_MASK)
 568
 569
 570 /* Return nonzero if the coding context CODING requires code conversion on
 571    encoding (src_multibyte, the flag bit, or selective display mode is set).
 572    The non-multibyte part of the condition is to support encoding of
 573    unibyte strings/buffers generated by string-as-unibyte or
 574    (set-buffer-multibyte nil) from multibyte strings/buffers.  */
 575 #define CODING_REQUIRE_ENCODING(coding)                         \
 576   ((coding)->src_multibyte                                      \
 577    || (coding)->common_flags & CODING_REQUIRE_ENCODING_MASK     \
 578    || (coding)->mode & CODING_MODE_SELECTIVE_DISPLAY)
 579
 580
 581 /* Return nonzero if the coding context CODING requires some kind of code
 582    detection.  */
 583 #define CODING_REQUIRE_DETECTION(coding) \
 584   ((coding)->common_flags & CODING_REQUIRE_DETECTION_MASK)
 585
 586 /* Return nonzero if the coding context CODING requires code conversion on
 587    decoding or some kind of code detection.  */
 588 #define CODING_MAY_REQUIRE_DECODING(coding)     \
 589   (CODING_REQUIRE_DECODING (coding)             \
 590    || CODING_REQUIRE_DETECTION (coding))
 591
 /* Macros to convert a two-byte JISX0208 code between its SJIS and JIS
    forms, in place.  CODE must be a modifiable integer lvalue holding
    the first position-code in bits 8 and up and the second one in the
    low 8 bits; it is read twice and then overwritten with the
    converted pair.  The temporaries carry a `cv_' prefix and a trailing
    underscore so that an argument spelled s1, s2, j1 or j2 is not
    captured by the expansion.  */

 /* Convert CODE from SJIS to JIS.  */
 #define SJIS_TO_JIS(code)                                              \
   do {                                                                 \
     int cv_s1_ = (code) >> 8, cv_s2_ = (code) & 0xFF;                  \
     int cv_j1_, cv_j2_;                                                \
                                                                        \
     if (cv_s2_ >= 0x9F)                                                \
       {                                                                \
         cv_j1_ = cv_s1_ * 2 - (cv_s1_ >= 0xE0 ? 0x160 : 0xE0);         \
         cv_j2_ = cv_s2_ - 0x7E;                                        \
       }                                                                \
     else                                                               \
       {                                                                \
         cv_j1_ = cv_s1_ * 2 - (cv_s1_ >= 0xE0 ? 0x161 : 0xE1);         \
         cv_j2_ = cv_s2_ - (cv_s2_ >= 0x7F ? 0x20 : 0x1F);              \
       }                                                                \
     (code) = (cv_j1_ << 8) | cv_j2_;                                   \
   } while (false)
 611
 /* Convert CODE, a two-byte SJIS code whose first byte is 0xF0 or
    above, to the corresponding two-byte JIS-style code, in place.
    CODE must be a modifiable integer lvalue; it is read twice and then
    overwritten.  Temporaries are named so that an argument spelled s1,
    s2, j1 or j2 is not captured by the expansion.  */
 #define SJIS_TO_JIS2(code)                                             \
   do {                                                                 \
     int cv_s1_ = (code) >> 8, cv_s2_ = (code) & 0xFF;                  \
     int cv_j1_, cv_j2_;                                                \
                                                                        \
     if (cv_s2_ >= 0x9F)                                                \
       {                                                                \
         cv_j1_ = (cv_s1_ == 0xF0 ? 0x28                                \
                   : cv_s1_ == 0xF1 ? 0x24                              \
                   : cv_s1_ == 0xF2 ? 0x2C                              \
                   : cv_s1_ == 0xF3 ? 0x2E                              \
                   : 0x6E + (cv_s1_ - 0xF4) * 2);                       \
         cv_j2_ = cv_s2_ - 0x7E;                                        \
       }                                                                \
     else                                                               \
       {                                                                \
         cv_j1_ = (cv_s1_ <= 0xF2 ? 0x21 + (cv_s1_ - 0xF0) * 2          \
                   : cv_s1_ <= 0xF4 ? 0x2D + (cv_s1_ - 0xF3) * 2        \
                   : 0x6F + (cv_s1_ - 0xF5) * 2);                       \
         cv_j2_ = cv_s2_ - (cv_s2_ >= 0x7F ? 0x20 : 0x1F);              \
       }                                                                \
     (code) = (cv_j1_ << 8) | cv_j2_;                                   \
   } while (false)
 636
 637
 /* Convert CODE, a two-byte JISX0208 code in JIS form, to SJIS form, in
    place.  CODE must be a modifiable integer lvalue; it is read twice
    and then overwritten.  Temporaries are named so that an argument
    spelled s1, s2, j1 or j2 is not captured by the expansion.  */
 #define JIS_TO_SJIS(code)                                              \
   do {                                                                 \
     int cv_j1_ = (code) >> 8, cv_j2_ = (code) & 0xFF;                  \
     int cv_s1_, cv_s2_;                                                \
                                                                        \
     if (cv_j1_ & 1)                                                    \
       {                                                                \
         cv_s1_ = cv_j1_ / 2 + (cv_j1_ < 0x5F ? 0x71 : 0xB1);           \
         cv_s2_ = cv_j2_ + (cv_j2_ >= 0x60 ? 0x20 : 0x1F);              \
       }                                                                \
     else                                                               \
       {                                                                \
         cv_s1_ = cv_j1_ / 2 + (cv_j1_ < 0x5F ? 0x70 : 0xB0);           \
         cv_s2_ = cv_j2_ + 0x7E;                                        \
       }                                                                \
     (code) = (cv_s1_ << 8) | cv_s2_;                                   \
   } while (false)
 651
 /* Convert CODE, a two-byte JIS-style code, to the SJIS code whose
    first byte is 0xF0 or above, in place (the inverse of the table in
    SJIS_TO_JIS2).  CODE must be a modifiable integer lvalue; it is read
    twice and then overwritten.  Temporaries are named so that an
    argument spelled s1, s2, j1 or j2 is not captured by the
    expansion.  */
 #define JIS_TO_SJIS2(code)                                             \
   do {                                                                 \
     int cv_j1_ = (code) >> 8, cv_j2_ = (code) & 0xFF;                  \
     int cv_s1_, cv_s2_;                                                \
                                                                        \
     if (cv_j1_ & 1)                                                    \
       {                                                                \
         cv_s1_ = (cv_j1_ <= 0x25 ? 0xF0 + (cv_j1_ - 0x21) / 2          \
                   : cv_j1_ <= 0x2F ? 0xF3 + (cv_j1_ - 0x2D) / 2        \
                   : 0xF5 + (cv_j1_ - 0x6F) / 2);                       \
         cv_s2_ = cv_j2_ + (cv_j2_ >= 0x60 ? 0x20 : 0x1F);              \
       }                                                                \
     else                                                               \
       {                                                                \
         cv_s1_ = (cv_j1_ == 0x28 ? 0xF0                                \
                   : cv_j1_ == 0x24 ? 0xF1                              \
                   : cv_j1_ == 0x2C ? 0xF2                              \
                   : cv_j1_ == 0x2E ? 0xF3                              \
                   : 0xF4 + (cv_j1_ - 0x6E) / 2);                       \
         cv_s2_ = cv_j2_ + 0x7E;                                        \
       }                                                                \
     (code) = (cv_s1_ << 8) | cv_s2_;                                   \
   } while (false)
 675
 676 /* Encode the file name NAME using the specified coding system
 677    for file names, if any.  Expands to a call of encode_file_name.  */
 678 #define ENCODE_FILE(NAME)  encode_file_name (NAME)
 679
 680 /* Decode the file name NAME using the specified coding system
 681    for file names, if any.  Expands to a call of decode_file_name.  */
 682 #define DECODE_FILE(NAME)  decode_file_name (NAME)
 683
 684 /* Encode the string STR using Vlocale_coding_system, the coding system
 685    for system functions; yield STR unchanged when that variable is nil.  */
 686 #define ENCODE_SYSTEM(str)                                                 \
 687   (! NILP (Vlocale_coding_system)                                          \
 688    ? code_convert_string_norecord (str, Vlocale_coding_system, true)       \
 689    : str)
 690
 691 /* Decode the string STR using Vlocale_coding_system, the coding system
 692    for system functions; yield STR unchanged when that variable is nil.  */
 693 #define DECODE_SYSTEM(str)                                                 \
 694   (! NILP (Vlocale_coding_system)                                          \
 695    ? code_convert_string_norecord (str, Vlocale_coding_system, false)      \
 696    : str)
 697
 698 /* Note that this encodes utf-8, not utf-8-emacs, so it's not a no-op.  */
 699 #define ENCODE_UTF_8(str) code_convert_string_norecord (str, Qutf_8, true)
 700
 701 /* Extern declarations.  The ENCODE_xxx/DECODE_xxx macros above expand to calls of some of these.  */
 702 extern Lisp_Object code_conversion_save (bool, bool);
 703 extern void setup_coding_system (Lisp_Object, struct coding_system *);
 704 extern Lisp_Object coding_charset_list (struct coding_system *);
 705 extern Lisp_Object coding_system_charset_list (Lisp_Object);
 706 extern Lisp_Object code_convert_string (Lisp_Object, Lisp_Object,
 707                                         Lisp_Object, bool, bool, bool);
 708 extern Lisp_Object code_convert_string_norecord (Lisp_Object, Lisp_Object,
 709                                                  bool);
 710 extern Lisp_Object encode_file_name (Lisp_Object);
 711 extern Lisp_Object decode_file_name (Lisp_Object);
 712 extern Lisp_Object raw_text_coding_system (Lisp_Object);
 713 extern Lisp_Object coding_inherit_eol_type (Lisp_Object, Lisp_Object);
 714 extern Lisp_Object complement_process_encoding_system (Lisp_Object);
 715
 716 extern void decode_coding_gap (struct coding_system *,
 717                                ptrdiff_t, ptrdiff_t);
 718 extern void decode_coding_object (struct coding_system *,
 719                                   Lisp_Object, ptrdiff_t, ptrdiff_t,
 720                                   ptrdiff_t, ptrdiff_t, Lisp_Object);
 721 extern void encode_coding_object (struct coding_system *,
 722                                   Lisp_Object, ptrdiff_t, ptrdiff_t,
 723                                   ptrdiff_t, ptrdiff_t, Lisp_Object);
 724
 725 #if defined (WINDOWSNT) || defined (CYGWIN)
 726
 727 /* These functions use Lisp string objects to store the UTF-16LE
 728    strings that modern versions of Windows expect.  These strings are
 729    not particularly useful to Lisp, and all Lisp strings should be
 730    native Emacs multibyte.  */
 731
 732 /* Access the wide-character string stored in a Lisp string object.  */
 733 #define WCSDATA(x) ((wchar_t *) SDATA (x))
 734
 735 /* Convert the multi-byte string in STR to a UTF-16LE encoded unibyte
 736    string, and store it in *BUF.  BUF may safely point to STR on entry.  */
 737 extern wchar_t *to_unicode (Lisp_Object str, Lisp_Object *buf);
 738
 739 /* Convert STR, a UTF-16LE encoded string embedded in a unibyte string
 740    object, to a multi-byte Emacs string and return it.  This function
 741    calls code_convert_string_norecord internally and has all its
 742    failure modes.  STR itself is not modified.  */
 743 extern Lisp_Object from_unicode (Lisp_Object str);
 744
 745 /* Convert the wide-character string WSTR to an Emacs string.  */
 746 extern Lisp_Object from_unicode_buffer (const wchar_t* wstr);
 747
 748 #endif /* WINDOWSNT || CYGWIN */
 749
 750 /* Macros for backward compatibility.  encode_coding_string yields STRING itself when it is
 751    not multibyte; NOCOPY is unused, and CODING and STRING are evaluated more than once.  */
 752 #define encode_coding_string(coding, string, nocopy)                    \
 753   (STRING_MULTIBYTE(string) ?                                           \
 754     (encode_coding_object (coding, string, 0, 0, SCHARS (string),       \
 755                            SBYTES (string), Qt),                        \
 756      (coding)->dst_object) : (string))
 757
 758 /* Point CODING at the BYTES bytes at SRC, then call decode_coding_object with DST_OBJECT.  Args are evaluated repeatedly.  */
 759 #define decode_coding_c_string(coding, src, bytes, dst_object)          \
 760   do {                                                                  \
 761     (coding)->source = (src);                                           \
 762     (coding)->src_chars = (coding)->src_bytes = (bytes);                \
 763     decode_coding_object ((coding), Qnil, 0, 0, (bytes), (bytes),       \
 764                           (dst_object));                                \
 765   } while (false)
 766
 767 /* The function and the Lisp_Object variables declared below are defined outside this header.  */
 768 extern Lisp_Object preferred_coding_system (void);
 769
 770
 771 extern Lisp_Object Qutf_8, Qutf_8_emacs;
 772
 773 extern Lisp_Object Qcoding_category_index;
 774 extern Lisp_Object Qcoding_system_p;
 775 extern Lisp_Object Qraw_text, Qemacs_mule, Qno_conversion, Qundecided;
 776 extern Lisp_Object Qbuffer_file_coding_system;
 777
 778 extern Lisp_Object Qunix, Qdos;
 779
 780 extern Lisp_Object Qtranslation_table;
 781 extern Lisp_Object Qtranslation_table_id;
 782
 783 #ifdef emacs
 784 extern Lisp_Object Qfile_coding_system;
 785 extern Lisp_Object Qcall_process, Qcall_process_region;
 786 extern Lisp_Object Qstart_process, Qopen_network_stream;
 787 extern Lisp_Object Qwrite_region;
 788
 789 extern char *emacs_strerror (int);
 790
 791 /* Coding system to be used to encode text for terminal display when
 792    the terminal coding system is nil.  */
 793 extern struct coding_system safe_terminal_coding;
 794
 795 #endif /* emacs */
 796
 797 /* Error signaled when there's a problem with detecting a coding system.  */
 798 extern Lisp_Object Qcoding_system_error;
 799 /* Table with 256 entries.  NOTE(review): presumably indexed by byte value; confirm where it is defined.  */
 800 extern char emacs_mule_bytes[256];
 801
 802 #endif /* EMACS_CODING_H */