1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
25 2. Emacs' internal format (emacs-mule) handlers
27 4. Shift-JIS and BIG5 handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
36 /*** GENERAL NOTE on CODING SYSTEM ***
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
41 Emacs' internal format (emacs-mule), and when we say "encode",
42 it means converting the coding system emacs-mule to some other
45 0. Emacs' internal format (emacs-mule)
47 Emacs itself holds a multi-lingual character in a buffer and a string
48 in a special format. Details are described in section 2.
52 The most famous coding system for multiple character sets. X's
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
78 If a user wants to read/write a text encoded in a coding system not
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
85 information about it is set in a structure of type `struct
86 coding_system' for rapid processing. See section 6 for more details.
90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
94 whereas DOS's format is two-byte sequence of `carriage-return' and
95 `line-feed' codes. MacOS's format is usually one byte of
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
100 any format of end-of-line. So, Emacs has information of format of
101 end-of-line in each coding-system. See section 6 for more details.
105 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
107 These functions check if a text between SRC and SRC_END is encoded
108 in the coding system category XXX. Each returns an integer value in
109 which the appropriate flag bits for the category XXX are set. The flag
110 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
111 template of these functions. */
114 detect_coding_emacs_mule (src
, src_end
)
115 unsigned char *src
, *src_end
;
121 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
123 These functions decode SRC_BYTES length text at SOURCE encoded in
124 CODING to Emacs' internal format (emacs-mule). The resulting text
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
133 DST_BYTES zero means that the source area and the destination area
134 overlap, which means that we can produce decoded text only until it
135 reaches the head of the not-yet-decoded source text.
137 Below is a template of these functions. */
139 decode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
140 struct coding_system
*coding
;
141 unsigned char *source
, *destination
;
142 int src_bytes
, dst_bytes
;
148 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
152 a place pointed to by DESTINATION, the length of which should not
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
160 DST_BYTES zero means that the source area and the destination area
161 overlap, which means that we can produce encoded text only until it
162 reaches the head of the not-yet-encoded source text.
164 Below is a template of these functions. */
166 encode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
167 struct coding_system
*coding
;
168 unsigned char *source
, *destination
;
169 int src_bytes
, dst_bytes
;
175 /*** COMMONLY USED MACROS ***/
177 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
178 THREE_MORE_BYTES safely get one, two, and three bytes from the
179 source text respectively. If there are not enough bytes in the
180 source, they jump to `label_end_of_loop'. The caller should set
181 variables `src' and `src_end' to appropriate areas in advance. */
/* NOTE(review): the embedded numbering jumps inside each macro below
   (183 -> 188, 191 -> 193, 194 -> 196, ...), so some continuation lines
   are missing from this copy; in particular no bounds test is visible
   for ONE_MORE_BYTE.  Confirm the full expansion against a pristine
   copy before relying on it.  */
183 #define ONE_MORE_BYTE(c1) \
188 goto label_end_of_loop; \
191 #define TWO_MORE_BYTES(c1, c2) \
193 if (src + 1 < src_end) /* src[0] and src[1] are both readable */ \
194 c1 = *src++, c2 = *src++; /* fetch in order, advancing src by 2 */ \
196 goto label_end_of_loop; /* label must exist in the expanding function */ \
199 #define THREE_MORE_BYTES(c1, c2, c3) \
201 if (src + 2 < src_end) /* src[0] .. src[2] are all readable */ \
202 c1 = *src++, c2 = *src++, c3 = *src++; /* advance src by 3 */ \
204 goto label_end_of_loop; \
207 /* The following three macros DECODE_CHARACTER_ASCII,
208 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
209 the multi-byte form of a character of each class at the place
210 pointed by `dst'. The caller should set the variable `dst' to
211 point to an appropriate area and the variable `coding' to point to
212 the coding-system of the currently decoding text in advance. */
214 /* Decode one ASCII character C. */
/* Writes through `dst' and updates counters in `coding'; both names are
   taken from the expanding function's scope.  While composing, the
   character is stored as the two bytes 0xA0, (C | 0x80) and counted in
   composed_chars; the later statements store (C & 0x7F) and count it
   in produced_char.  NOTE(review): the line separating the two paths
   (presumably an `else') is not present in this copy.  */
216 #define DECODE_CHARACTER_ASCII(c) \
218 if (COMPOSING_P (coding->composing)) \
220 *dst++ = 0xA0, *dst++ = (c) | 0x80; /* 2-byte component form */ \
221 coding->composed_chars++; \
222 if (((c) | 0x80) < 0xA0) /* stored byte fell below 0xA0 */ \
223 coding->fake_multibyte = 1; \
227 /* If ASCII charset is invoked to GR, \
228 we must reset MSB now. */ \
229 *dst++ = (c) & 0x7F; \
230 coding->produced_char++; \
234 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
235 position-code is C. */
/* Emits the base leading code (raised by 0x20 while composing), then
   the extended leading code when CHARSET_LEADING_CODE_EXT yields a
   nonzero value, then the position code with its MSB set.  Declares a
   local `leading_code', so each expansion needs its own block scope.
   NOTE(review): the line separating the composing and non-composing
   stores (presumably an `else') is not present in this copy.  */
237 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
239 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
240 if (COMPOSING_P (coding->composing)) \
242 *dst++ = leading_code + 0x20; /* component form of the leading code */ \
243 coding->composed_chars++; \
247 *dst++ = leading_code; \
248 coding->produced_char++; \
250 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) /* assignment is intended */ \
251 *dst++ = leading_code; \
252 *dst++ = (c) | 0x80; \
253 if (((c) | 0x80) < 0xA0) /* stored byte fell below 0xA0 */ \
254 coding->fake_multibyte = 1; \
257 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
258 position-codes are C1 and C2. */
/* Reuses DECODE_CHARACTER_DIMENSION1 for the leading code(s) and the
   first position code, then appends the second position code with its
   MSB set.  Uses `dst' and `coding' from the expanding scope.  */
260 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
262 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
263 *dst++ = (c2) | 0x80; \
264 if (((c2) | 0x80) < 0xA0) /* stored byte fell below 0xA0 */ \
265 coding->fake_multibyte = 1; \
269 /*** 1. Preamble ***/
286 #else /* not emacs */
290 #endif /* not emacs */
/* Lisp objects naming coding-system related properties and values.
   NOTE(review): going by the Q prefix these presumably hold Lisp
   symbols; where they are initialized is not visible in this part of
   the file.  */
292 Lisp_Object Qcoding_system
, Qeol_type
;
293 Lisp_Object Qbuffer_file_coding_system
;
294 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
295 Lisp_Object Qno_conversion
, Qundecided
;
296 Lisp_Object Qcoding_system_history
;
297 Lisp_Object Qsafe_charsets
;
298 Lisp_Object Qvalid_codes
;
/* These two are only declared here (`extern'); they are defined in
   another file.  */
300 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
/* NOTE(review): presumably symbols naming the process and network I/O
   operations for which a coding system can be selected -- confirm
   against the code that uses them.  */
301 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
302 Lisp_Object Qstart_process
, Qopen_network_stream
;
303 Lisp_Object Qtarget_idx
;
/* NOTE(review): going by the V prefix and the name, presumably a Lisp
   variable holding a function -- confirm where it is defined for
   Lisp.  */
305 Lisp_Object Vselect_safe_coding_system_function
;
307 /* Mnemonic string for each format of end-of-line. */
308 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
309 /* Mnemonic string to indicate format of end-of-line is not yet decided. */
311 Lisp_Object eol_mnemonic_undecided
;
313 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
314 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
319 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
321 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
323 /* Coding system emacs-mule and raw-text are for converting only
324 end-of-line format. */
325 Lisp_Object Qemacs_mule
, Qraw_text
;
327 /* Coding-systems are handed between Emacs Lisp programs and C internal
328 routines by the following three variables. */
329 /* Coding-system for reading files and receiving data from process. */
330 Lisp_Object Vcoding_system_for_read
;
331 /* Coding-system for writing files and sending data to process. */
332 Lisp_Object Vcoding_system_for_write
;
333 /* Coding-system actually used in the latest I/O. */
334 Lisp_Object Vlast_coding_system_used
;
336 /* A vector of length 256 which contains information about special
337 Latin codes (especially for dealing with Microsoft codes). */
338 Lisp_Object Vlatin_extra_code_table
;
340 /* Flag to inhibit code conversion of end-of-line format. */
341 int inhibit_eol_conversion
;
343 /* Flag to make buffer-file-coding-system inherit from process-coding. */
344 int inherit_process_coding_system
;
346 /* Coding system to be used to encode text for terminal display. */
347 struct coding_system terminal_coding
;
349 /* Coding system to be used to encode text for terminal display when
350 terminal coding system is nil. */
351 struct coding_system safe_terminal_coding
;
353 /* Coding system of what is sent from terminal keyboard. */
354 struct coding_system keyboard_coding
;
356 /* Default coding system to be used to write a file. */
357 struct coding_system default_buffer_file_coding
;
359 Lisp_Object Vfile_coding_system_alist
;
360 Lisp_Object Vprocess_coding_system_alist
;
361 Lisp_Object Vnetwork_coding_system_alist
;
363 Lisp_Object Vlocale_coding_system
;
367 Lisp_Object Qcoding_category
, Qcoding_category_index
;
369 /* List of symbols `coding-category-xxx' ordered by priority. */
370 Lisp_Object Vcoding_category_list
;
372 /* Table of coding categories (Lisp symbols). */
373 Lisp_Object Vcoding_category_table
;
375 /* Table of names of symbol for each coding-category. */
376 char *coding_category_name
[CODING_CATEGORY_IDX_MAX
] = {
377 "coding-category-emacs-mule",
378 "coding-category-sjis",
379 "coding-category-iso-7",
380 "coding-category-iso-7-tight",
381 "coding-category-iso-8-1",
382 "coding-category-iso-8-2",
383 "coding-category-iso-7-else",
384 "coding-category-iso-8-else",
385 "coding-category-ccl",
386 "coding-category-big5",
387 "coding-category-raw-text",
388 "coding-category-binary"
391 /* Table of pointers to coding systems corresponding to each coding category. */
393 struct coding_system
*coding_system_table
[CODING_CATEGORY_IDX_MAX
];
395 /* Table of coding category masks. Nth element is the mask of the coding
396 category whose priority is Nth. */
398 int coding_priorities
[CODING_CATEGORY_IDX_MAX
];
400 /* Flag to tell if we look up translation table on character code conversion. */
402 Lisp_Object Venable_character_translation
;
403 /* Standard translation table to look up on decoding (reading). */
404 Lisp_Object Vstandard_translation_table_for_decode
;
405 /* Standard translation table to look up on encoding (writing). */
406 Lisp_Object Vstandard_translation_table_for_encode
;
408 Lisp_Object Qtranslation_table
;
409 Lisp_Object Qtranslation_table_id
;
410 Lisp_Object Qtranslation_table_for_decode
;
411 Lisp_Object Qtranslation_table_for_encode
;
413 /* Alist of charsets vs revision number. */
414 Lisp_Object Vcharset_revision_alist
;
416 /* Default coding systems used for process I/O. */
417 Lisp_Object Vdefault_process_coding_system
;
419 /* Global flag to tell that we can't call post-read-conversion and
420 pre-write-conversion functions. Usually the value is zero, but it
421 is set to 1 temporarily while such functions are running. This is
422 to avoid infinite recursive call. */
423 static int inhibit_pre_post_conversion
;
426 /*** 2. Emacs internal format (emacs-mule) handlers ***/
428 /* Emacs' internal format for encoding multiple character sets is a
429 kind of multi-byte encoding, i.e. characters are encoded by
430 variable-length sequences of one-byte codes. ASCII characters
431 and control characters (e.g. `tab', `newline') are represented by
432 one-byte sequences which are their ASCII codes, in the range 0x00
433 through 0x7F. The other characters are represented by a sequence
434 of `base leading-code', optional `extended leading-code', and one
435 or two `position-code's. The length of the sequence is determined
436 by the base leading-code. Leading-code takes the range 0x80
437 through 0x9F, whereas extended leading-code and position-code take
438 the range 0xA0 through 0xFF. See `charset.h' for more details
439 about leading-code and position-code.
441 There's one exception to this rule. Special leading-code
442 `leading-code-composition' denotes that the following several
443 characters should be composed into one character. Leading-codes of
444 components (except for ASCII) have 0x20 added to them. An ASCII character
445 component is represented by a 2-byte sequence of `0xA0' and
446 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
447 details of composite character. Hence, we can summarize the code
450 --- CODE RANGE of Emacs' internal format ---
451 (character set) (range)
453 ELSE (1st byte) 0x80 .. 0x9F
454 (rest bytes) 0xA0 .. 0xFF
455 ---------------------------------------------
459 enum emacs_code_class_type emacs_code_class
[256];
461 /* Go to the next statement only if *SRC is accessible and the code is
462 0xA0 or greater (the test below rejects only bytes less than 0xA0).
SRC is advanced past the byte that was examined. */
463 #define CHECK_CODE_RANGE_A0_FF \
465 if (src >= src_end) /* input exhausted */ \
466 goto label_end_of_switch; /* label must exist in the expanding function */ \
467 else if (*src++ < 0xA0) /* consumes the byte; action for a bad byte is not visible in this copy */ \
471 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
472 Check if a text is encoded in Emacs' internal format. If it is,
473 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
/* SRC and SRC_END delimit the bytes to examine.  NOTE(review): the
   embedded numbering skips many lines here (braces, the fetch of `c',
   the rejecting returns and `break's), so only the skeleton of the
   classification is visible in this copy.  */
476 detect_coding_emacs_mule (src
, src_end
)
477 unsigned char *src
, *src_end
;
/* Walk the text, one character per iteration.  */
482 while (src
< src_end
)
/* Classify the byte held in `c' through the 256-entry table.  */
494 switch (emacs_code_class
[c
])
496 case EMACS_ascii_code
:
497 case EMACS_linefeed_code
:
500 case EMACS_control_code
:
/* ESC, SI and SO are tested specially; the action taken is not visible
   here (presumably they disqualify the text, being ISO2022 controls).  */
501 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
505 case EMACS_invalid_code
:
508 case EMACS_leading_code_composition
: /* c == 0x80 */
510 CHECK_CODE_RANGE_A0_FF
;
/* The three cases below fall through, so a leading_code_4 byte has the
   range check applied three times, leading_code_3 twice and
   leading_code_2 once.  */
515 case EMACS_leading_code_4
:
516 CHECK_CODE_RANGE_A0_FF
;
517 /* fall down to check it two more times ... */
519 case EMACS_leading_code_3
:
520 CHECK_CODE_RANGE_A0_FF
;
521 /* fall down to check it one more time ... */
523 case EMACS_leading_code_2
:
524 CHECK_CODE_RANGE_A0_FF
;
/* Final statement visible in this function: report the emacs-mule
   category.  */
532 return CODING_CATEGORY_MASK_EMACS_MULE
;
536 /*** 3. ISO2022 handlers ***/
538 /* The following note describes the coding system ISO2022 briefly.
539 Since the intention of this note is to help understand the
540 functions in this file, some parts are NOT ACCURATE or OVERLY
541 SIMPLIFIED. For thorough understanding, please refer to the
542 original document of ISO2022.
544 ISO2022 provides many mechanisms to encode several character sets
545 in 7-bit and 8-bit environments. For 7-bit environments, all text
546 is encoded using bytes less than 128. This may make the encoded
547 text a little bit longer, but the text passes more easily through
548 several gateways, some of which strip off the MSB (Most Significant Bit).
550 There are two kinds of character sets: control character set and
551 graphic character set. The former contains control characters such
552 as `newline' and `escape' to provide control functions (control
553 functions are also provided by escape sequences). The latter
554 contains graphic characters such as 'A' and '-'. Emacs recognizes
555 two control character sets and many graphic character sets.
557 Graphic character sets are classified into one of the following
558 four classes, according to the number of bytes (DIMENSION) and
559 number of characters in one dimension (CHARS) of the set:
565 In addition, each character set is assigned an identification tag,
566 unique for each set, called "final character" (denoted as <F>
567 hereafter). The <F> of each character set is decided by ECMA(*)
568 when it is registered in ISO. The code range of <F> is 0x30..0x7F
569 (0x30..0x3F are for private use only).
571 Note (*): ECMA = European Computer Manufacturers Association
573 Here are examples of graphic character set [NAME(<F>)]:
574 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
575 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
576 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
577 o DIMENSION2_CHARS96 -- none for the moment
579 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
580 C0 [0x00..0x1F] -- control character plane 0
581 GL [0x20..0x7F] -- graphic character plane 0
582 C1 [0x80..0x9F] -- control character plane 1
583 GR [0xA0..0xFF] -- graphic character plane 1
585 A control character set is directly designated and invoked to C0 or
586 C1 by an escape sequence. The most common case is that:
587 - ISO646's control character set is designated/invoked to C0, and
588 - ISO6429's control character set is designated/invoked to C1,
589 and usually these designations/invocations are omitted in encoded
590 text. In a 7-bit environment, only C0 can be used, and a control
591 character for C1 is encoded by an appropriate escape sequence to
592 fit into the environment. All control characters for C1 are
593 defined to have corresponding escape sequences.
595 A graphic character set is at first designated to one of four
596 graphic registers (G0 through G3), then these graphic registers are
597 invoked to GL or GR. These designations and invocations can be
598 done independently. The most common case is that G0 is invoked to
599 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
600 these invocations and designations are omitted in encoded text.
601 In a 7-bit environment, only GL can be used.
603 When a graphic character set of CHARS94 is invoked to GL, codes
604 0x20 and 0x7F of the GL area work as control characters SPACE and
605 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
608 There are two ways of invocation: locking-shift and single-shift.
609 With locking-shift, the invocation lasts until the next different
610 invocation, whereas with single-shift, the invocation affects the
611 following character only and doesn't affect the locking-shift
612 state. Invocations are done by the following control characters or
615 ----------------------------------------------------------------------
616 abbrev function cntrl escape seq description
617 ----------------------------------------------------------------------
618 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
619 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
620 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
621 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
622 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
623 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
624 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
625 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
626 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
627 ----------------------------------------------------------------------
628 (*) These are not used by any known coding system.
630 Control characters for these functions are defined by macros
631 ISO_CODE_XXX in `coding.h'.
633 Designations are done by the following escape sequences:
634 ----------------------------------------------------------------------
635 escape sequence description
636 ----------------------------------------------------------------------
637 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
638 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
639 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
640 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
641 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
642 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
643 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
644 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
645 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
646 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
647 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
648 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
649 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
650 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
651 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
652 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
653 ----------------------------------------------------------------------
655 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
656 of dimension 1, chars 94, and final character <F>, etc...
658 Note (*): Although these designations are not allowed in ISO2022,
659 Emacs accepts them on decoding, and produces them on encoding
660 CHARS96 character sets in a coding system which is characterized as
661 7-bit environment, non-locking-shift, and non-single-shift.
663 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
664 '(' can be omitted. We refer to this as "short-form" hereafter.
666 Now you may notice that there are a lot of ways for encoding the
667 same multilingual text in ISO2022. Actually, there exist many
668 coding systems such as Compound Text (used in X11's inter-client
669 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
670 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
671 localized platforms), and all of these are variants of ISO2022.
673 In addition to the above, Emacs handles two more kinds of escape
674 sequences: ISO6429's direction specification and Emacs' private
675 sequence for specifying character composition.
677 ISO6429's direction specification takes the following form:
678 o CSI ']' -- end of the current direction
679 o CSI '0' ']' -- end of the current direction
680 o CSI '1' ']' -- start of left-to-right text
681 o CSI '2' ']' -- start of right-to-left text
682 The control character CSI (0x9B: control sequence introducer) is
683 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
685 Character composition specification takes the following form:
686 o ESC '0' -- start character composition
687 o ESC '1' -- end character composition
688 Since these are not standard escape sequences of any ISO standard,
689 their use with these meanings is restricted to Emacs only. */
691 enum iso_code_class_type iso_code_class
[256];
/* Nonzero if a coding system is registered in slot IDX of
   coding_system_table and that coding system either lists CHARSET in
   its safe_charsets or has a requested designation for CHARSET (i.e.
   something other than CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION).
   IDX is evaluated up to three times, CHARSET up to twice.  */
693 #define CHARSET_OK(idx, charset) \
694 (coding_system_table[idx] \
695 && (coding_system_table[idx]->safe_charsets[charset] \
696 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION \
697 (coding_system_table[idx], charset) \
698 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
/* Nonzero if the coding system in slot IDX of coding_system_table has
   a non-negative initial designation for graphic register 1.
   NOTE(review): unlike CHARSET_OK this does not test the slot for a
   null pointer first -- confirm that callers only use it on slots that
   are known to be filled.  */
700 #define SHIFT_OUT_OK(idx) \
701 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
703 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
704 Check if a text is encoded in ISO2022. If it is, returns an
705 integer in which appropriate flag bits any of:
706 CODING_CATEGORY_MASK_ISO_7
707 CODING_CATEGORY_MASK_ISO_7_TIGHT
708 CODING_CATEGORY_MASK_ISO_8_1
709 CODING_CATEGORY_MASK_ISO_8_2
710 CODING_CATEGORY_MASK_ISO_7_ELSE
711 CODING_CATEGORY_MASK_ISO_8_ELSE
712 are set. If a code which should never appear in ISO2022 is found,
716 detect_coding_iso2022 (src
, src_end
)
717 unsigned char *src
, *src_end
;
719 int mask
= CODING_CATEGORY_MASK_ISO
;
721 int reg
[4], shift_out
= 0, single_shifting
= 0;
722 int c
, c1
, i
, charset
;
724 reg
[0] = CHARSET_ASCII
, reg
[1] = reg
[2] = reg
[3] = -1;
725 while (mask
&& src
< src_end
)
735 if (c
>= '(' && c
<= '/')
737 /* Designation sequence for a charset of dimension 1. */
/* NOTE(review): the embedded numbering skips lines throughout this
   function (the fetches of `c' and `c1', the `case' labels, braces,
   `else's, `break's and the declaration of `mask_found'), so the
   comments below describe only the statements that are visible.  */
/* Reject C1 outside 0x20..0x7F, or one that has no entry in the
   dimension-1 table.  The middle index (c >= ',') is 1 for the
   intermediates ',' '-' '.' '/' and 0 for '(' ')' '*' '+'.  */
741 if (c1
< ' ' || c1
>= 0x80
742 || (charset
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
743 /* Invalid designation sequence. Just ignore. */
/* Both groups of four intermediates map onto registers 0..3.  */
745 reg
[(c
- '(') % 4] = charset
;
749 /* Designation sequence for a charset of dimension 2. */
753 if (c
>= '@' && c
<= 'B')
754 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
755 reg
[0] = charset
= iso_charset_table
[1][0][c
];
756 else if (c
>= '(' && c
<= '/')
/* Same validity test as for dimension 1, against the dimension-2
   table.  */
761 if (c1
< ' ' || c1
>= 0x80
762 || (charset
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
763 /* Invalid designation sequence. Just ignore. */
765 reg
[(c
- '(') % 4] = charset
;
768 /* Invalid designation sequence. Just ignore. */
771 else if (c
== 'N' || c
== 'O')
773 /* ESC <Fe> for SS2 or SS3. */
/* Of all categories still in MASK, only ISO_7_ELSE is kept.  */
774 mask
&= CODING_CATEGORY_MASK_ISO_7_ELSE
;
777 else if (c
== '0' || c
== '1' || c
== '2')
778 /* ESC <Fp> for start/end composition. Just ignore. */
781 /* Invalid escape sequence. Just ignore. */
784 /* We found a valid designation sequence for CHARSET. */
/* Drop the ISO_8BIT categories, then test CHARSET against each of the
   four categories below.  NOTE(review): the numbering skips 788, 792,
   796 and 800, presumably `else' lines, which would make each
   `mask &= ~...' the rejecting branch of the preceding CHARSET_OK
   test -- confirm.  */
785 mask
&= ~CODING_CATEGORY_MASK_ISO_8BIT
;
786 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7
, charset
))
787 mask_found
|= CODING_CATEGORY_MASK_ISO_7
;
789 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
790 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT
, charset
))
791 mask_found
|= CODING_CATEGORY_MASK_ISO_7_TIGHT
;
793 mask
&= ~CODING_CATEGORY_MASK_ISO_7_TIGHT
;
794 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
, charset
))
795 mask_found
|= CODING_CATEGORY_MASK_ISO_7_ELSE
;
797 mask
&= ~CODING_CATEGORY_MASK_ISO_7_ELSE
;
798 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
, charset
))
799 mask_found
|= CODING_CATEGORY_MASK_ISO_8_ELSE
;
801 mask
&= ~CODING_CATEGORY_MASK_ISO_8_ELSE
;
808 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
)
809 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
)))
811 /* Locking shift out. */
/* The ISO_7BIT categories are removed from MASK and the ISO_SHIFT
   categories are recorded as found.  */
812 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
813 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
821 /* Locking shift in. */
822 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
823 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
/* NEWMASK collects the 8-bit categories compatible with the byte in
   `c'; it starts with ISO_8_ELSE.  */
832 int newmask
= CODING_CATEGORY_MASK_ISO_8_ELSE
;
/* For a byte other than CSI, add ISO_8_1 / ISO_8_2 when the coding
   system registered for that category has the single-shift flag.
   NOTE(review): the table slots are dereferenced here without a null
   test -- confirm they are always filled.  */
834 if (c
!= ISO_CODE_CSI
)
836 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
837 & CODING_FLAG_ISO_SINGLE_SHIFT
)
838 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
839 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
840 & CODING_FLAG_ISO_SINGLE_SHIFT
)
841 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
/* If the extra-code table is a vector with a non-nil entry for `c',
   the categories whose coding system has the Latin-extra flag are
   added as well.  */
844 if (VECTORP (Vlatin_extra_code_table
)
845 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
847 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
848 & CODING_FLAG_ISO_LATIN_EXTRA
)
849 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
850 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
851 & CODING_FLAG_ISO_LATIN_EXTRA
)
852 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
855 mask_found
|= newmask
;
/* Second Latin-extra test.  NOTE(review): the lines introducing this
   branch and its own `newmask' (numbers 856-872) are mostly absent
   from this copy.  */
868 if (VECTORP (Vlatin_extra_code_table
)
869 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
873 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
874 & CODING_FLAG_ISO_LATIN_EXTRA
)
875 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
876 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
877 & CODING_FLAG_ISO_LATIN_EXTRA
)
878 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
880 mask_found
|= newmask
;
/* Remember where the current run of bytes starts.  */
887 unsigned char *src_begin
= src
;
/* The ISO_7BIT categories and ISO_7_ELSE are removed from MASK, and
   ISO_8_1 is recorded as found.  */
889 mask
&= ~(CODING_CATEGORY_MASK_ISO_7BIT
890 | CODING_CATEGORY_MASK_ISO_7_ELSE
);
891 mask_found
|= CODING_CATEGORY_MASK_ISO_8_1
;
892 /* Check the length of succeeding codes of the range
893 0xA0..0xFF. If the byte length is odd, we exclude
894 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
895 when we are not single shifting. */
896 if (!single_shifting
)
898 while (src
< src_end
&& *src
>= 0xA0)
900 if ((src
- src_begin
- 1) & 1 && src
< src_end
)
901 mask
&= ~CODING_CATEGORY_MASK_ISO_8_2
;
903 mask_found
|= CODING_CATEGORY_MASK_ISO_8_2
;
/* Result: the categories that were both positively found and never
   ruled out.  */
910 return (mask
& mask_found
);
913 /* Decode a character of which charset is CHARSET and the 1st position
914 code is C1. If dimension of CHARSET is 2, the 2nd position code is
915 fetched from SRC and set to C2. If CHARSET is negative, it means
916 that we are decoding ill formed text, and what we can do is just to
919 #define DECODE_ISO_CHARACTER(charset, c1) \
921 int c_alt, charset_alt = (charset); \
922 if (COMPOSING_HEAD_P (coding->composing)) \
924 *dst++ = LEADING_CODE_COMPOSITION; \
925 if (COMPOSING_WITH_RULE_P (coding->composing)) \
926 /* To tell composition rules are embedded. */ \
928 coding->composing += 2; \
930 if (charset_alt >= 0) /* negative means ill-formed input */ \
932 if (CHARSET_DIMENSION (charset_alt) == 2) \
934 ONE_MORE_BYTE (c2); /* second position code; c2 comes from the expanding scope */ \
935 if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F \
936 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0) \
939 charset_alt = CHARSET_ASCII; /* NOTE(review): lines 937-938 are missing from this copy */ \
942 if (!NILP (translation_table) /* translation_table comes from the expanding scope */ \
943 && ((c_alt = translate_char (translation_table, \
944 -1, charset_alt, c1, c2)) >= 0)) \
945 SPLIT_CHAR (c_alt, charset_alt, c1, c2); /* c1 is passed to SPLIT_CHAR, so it must be a variable */ \
947 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
948 DECODE_CHARACTER_ASCII (c1); \
949 else if (CHARSET_DIMENSION (charset_alt) == 1) \
950 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
952 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
953 if (COMPOSING_WITH_RULE_P (coding->composing)) \
954 /* To tell a composition rule follows. */ \
955 coding->composing = COMPOSING_WITH_RULE_RULE; \
958 /* Set designation state into CODING. */
/* Looks up the charset identified by DIMENSION, CHARS and FINAL_CHAR
   and, if the designation is acceptable, records it for register REG
   with CODING_SPEC_ISO_DESIGNATION.  Every rejection path jumps to
   `label_invalid_code', which the expanding function must provide;
   `charset' and `coding' also come from the expanding scope.
   FINAL_CHAR and REG are expanded without parentheses, so pass plain
   variables or constants.  NOTE(review): the embedded numbering skips
   960-962, 968, 971, 973, 975, 980 and 986-988, so the `if'/`else'
   structure joining the statements below is not fully visible in this
   copy.  */
959 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
963 if (final_char < '0' || final_char >= 128) /* outside the range allowed for a final character */ \
964 goto label_invalid_code; \
965 charset = ISO_CHARSET_TABLE (make_number (dimension), \
966 make_number (chars), \
967 make_number (final_char)); \
969 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
970 || coding->safe_charsets[charset])) \
972 if (coding->spec.iso2022.last_invalid_designation_register == 0 \
974 && charset == CHARSET_ASCII) \
976 /* We should insert this designation sequence as is so \
977 that it is surely written back to a file. */ \
978 coding->spec.iso2022.last_invalid_designation_register = -1; \
979 goto label_invalid_code; \
981 coding->spec.iso2022.last_invalid_designation_register = -1; \
982 if ((coding->mode & CODING_MODE_DIRECTION) \
983 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
984 charset = CHARSET_REVERSE_CHARSET (charset); /* substitute the reverse charset when one exists */ \
985 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
989 coding->spec.iso2022.last_invalid_designation_register = reg; /* remember the register of the rejected designation */ \
990 goto label_invalid_code; \
994 /* Return 0 if there's a valid composing sequence starting at SRC and
995 ending before SRC_END, else return -1. */
998 check_composing_code (coding
, src
, src_end
)
999 struct coding_system
*coding
;
1000 unsigned char *src
, *src_end
;
1002 int charset
, c
, c1
, dim
;
1004 while (src
< src_end
)
1009 if (c
!= ISO_CODE_ESC
|| src
>= src_end
)
1012 if (c
== '1') /* end of composition */
1014 if (src
+ 2 >= src_end
/* The flag test must be parenthesized: `!coding->flags & FLAG' parses
   as `(!coding->flags) & FLAG', which does not test whether the
   designation flag is clear.  */
1015 || !(coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1020 c
= (*src
>= '@' && *src
<= 'B') ? '(' : *src
++;
1021 if (c
>= '(' && c
<= '/')
1024 if ((c1
< ' ' || c1
>= 0x80)
1025 || (charset
= iso_charset_table
[dim
][c
>= ','][c1
]) < 0
1026 || ! coding
->safe_charsets
[charset
]
1027 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
1028 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
))
1035 /* We have not found the sequence "ESC 1". */
1039 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1042 decode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
1043 struct coding_system
*coding
;
1044 unsigned char *source
, *destination
;
1045 int src_bytes
, dst_bytes
;
1047 unsigned char *src
= source
;
1048 unsigned char *src_end
= source
+ src_bytes
;
1049 unsigned char *dst
= destination
;
1050 unsigned char *dst_end
= destination
+ dst_bytes
;
1051 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1052 from DST_END to assure that overflow checking is necessary only
1053 at the head of loop. */
1054 unsigned char *adjusted_dst_end
= dst_end
- 6;
1056 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1057 int charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1058 int charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1059 Lisp_Object translation_table
1060 = coding
->translation_table_for_decode
;
1061 int result
= CODING_FINISH_NORMAL
;
1063 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
1064 translation_table
= Vstandard_translation_table_for_decode
;
1066 coding
->produced_char
= 0;
1067 coding
->fake_multibyte
= 0;
1068 while (src
< src_end
&& (dst_bytes
1069 ? (dst
< adjusted_dst_end
)
1072 /* SRC_BASE remembers the start position in source in each loop.
1073 The loop will be exited when there's not enough source text
1074 to analyze long escape sequence or 2-byte code (within macros
1075 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1076 to SRC_BASE before exiting. */
1077 unsigned char *src_base
= src
;
1078 int c1
= *src
++, c2
;
1080 switch (iso_code_class
[c1
])
1082 case ISO_0x20_or_0x7F
:
1083 if (!coding
->composing
1084 && (charset0
< 0 || CHARSET_CHARS (charset0
) == 94))
1086 /* This is SPACE or DEL. */
1088 coding
->produced_char
++;
1091 /* This is a graphic character, we fall down ... */
1093 case ISO_graphic_plane_0
:
1094 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1096 /* This is a composition rule. */
1098 coding
->composing
= COMPOSING_WITH_RULE_TAIL
;
1101 DECODE_ISO_CHARACTER (charset0
, c1
);
1104 case ISO_0xA0_or_0xFF
:
1105 if (charset1
< 0 || CHARSET_CHARS (charset1
) == 94
1106 || coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
1107 goto label_invalid_code
;
1108 /* This is a graphic character, we fall down ... */
1110 case ISO_graphic_plane_1
:
1111 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
1112 goto label_invalid_code
;
1114 DECODE_ISO_CHARACTER (charset1
, c1
);
1117 case ISO_control_code
:
1118 /* All ISO2022 control characters in this class have the
1119 same representation in Emacs internal format. */
1121 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1122 && (coding
->eol_type
== CODING_EOL_CR
1123 || coding
->eol_type
== CODING_EOL_CRLF
))
1125 result
= CODING_FINISH_INCONSISTENT_EOL
;
1126 goto label_end_of_loop_2
;
1129 coding
->produced_char
++;
1131 coding
->fake_multibyte
= 1;
1134 case ISO_carriage_return
:
1135 if (coding
->eol_type
== CODING_EOL_CR
)
1137 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1140 if (c1
== ISO_CODE_LF
)
1144 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1146 result
= CODING_FINISH_INCONSISTENT_EOL
;
1147 goto label_end_of_loop_2
;
1155 coding
->produced_char
++;
1159 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1160 || CODING_SPEC_ISO_DESIGNATION (coding
, 1) < 0)
1161 goto label_invalid_code
;
1162 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 1;
1163 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1167 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
1168 goto label_invalid_code
;
1169 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
1170 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1173 case ISO_single_shift_2_7
:
1174 case ISO_single_shift_2
:
1175 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1176 goto label_invalid_code
;
1177 /* SS2 is handled as an escape sequence of ESC 'N' */
1179 goto label_escape_sequence
;
1181 case ISO_single_shift_3
:
1182 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1183 goto label_invalid_code
;
1184 /* SS3 is handled as an escape sequence of ESC 'O' */
1186 goto label_escape_sequence
;
1188 case ISO_control_sequence_introducer
:
1189 /* CSI is handled as an escape sequence of ESC '[' ... */
1191 goto label_escape_sequence
;
1195 label_escape_sequence
:
1196 /* Escape sequences handled by Emacs are invocation,
1197 designation, direction specification, and character
1198 composition specification. */
1201 case '&': /* revision of following character set */
1203 if (!(c1
>= '@' && c1
<= '~'))
1204 goto label_invalid_code
;
1206 if (c1
!= ISO_CODE_ESC
)
1207 goto label_invalid_code
;
1209 goto label_escape_sequence
;
1211 case '$': /* designation of 2-byte character set */
1212 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1213 goto label_invalid_code
;
1215 if (c1
>= '@' && c1
<= 'B')
1216 { /* designation of JISX0208.1978, GB2312.1980,
1218 DECODE_DESIGNATION (0, 2, 94, c1
);
1220 else if (c1
>= 0x28 && c1
<= 0x2B)
1221 { /* designation of DIMENSION2_CHARS94 character set */
1223 DECODE_DESIGNATION (c1
- 0x28, 2, 94, c2
);
1225 else if (c1
>= 0x2C && c1
<= 0x2F)
1226 { /* designation of DIMENSION2_CHARS96 character set */
1228 DECODE_DESIGNATION (c1
- 0x2C, 2, 96, c2
);
1231 goto label_invalid_code
;
1234 case 'n': /* invocation of locking-shift-2 */
1235 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1236 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1237 goto label_invalid_code
;
1238 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 2;
1239 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1242 case 'o': /* invocation of locking-shift-3 */
1243 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1244 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1245 goto label_invalid_code
;
1246 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 3;
1247 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1250 case 'N': /* invocation of single-shift-2 */
1251 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1252 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1253 goto label_invalid_code
;
1255 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 2);
1256 DECODE_ISO_CHARACTER (charset
, c1
);
1259 case 'O': /* invocation of single-shift-3 */
1260 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1261 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1262 goto label_invalid_code
;
1264 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 3);
1265 DECODE_ISO_CHARACTER (charset
, c1
);
1268 case '0': case '2': /* start composing */
1269 /* Before processing composing, we must be sure that all
1270 characters being composed are supported by CODING.
1271 If not, we must give up composing. */
1272 if (check_composing_code (coding
, src
, src_end
) == 0)
1274 /* We are looking at a valid composition sequence. */
1275 coding
->composing
= (c1
== '0'
1276 ? COMPOSING_NO_RULE_HEAD
1277 : COMPOSING_WITH_RULE_HEAD
);
1278 coding
->composed_chars
= 0;
1282 *dst
++ = ISO_CODE_ESC
;
1284 coding
->produced_char
+= 2;
1288 case '1': /* end composing */
1289 if (!coding
->composing
)
1291 *dst
++ = ISO_CODE_ESC
;
1293 coding
->produced_char
+= 2;
1297 if (coding
->composed_chars
> 0)
1299 if (coding
->composed_chars
== 1)
1301 unsigned char *this_char_start
= dst
;
1304 /* Only one character is in the composing
1305 sequence. Make it a normal character. */
1306 while (*--this_char_start
!= LEADING_CODE_COMPOSITION
);
1307 dst
= (this_char_start
1308 + (coding
->composing
== COMPOSING_NO_RULE_TAIL
1313 this_bytes
= BYTES_BY_CHAR_HEAD (*dst
);
1314 while (this_bytes
--) *this_char_start
++ = *dst
++;
1315 dst
= this_char_start
;
1317 coding
->produced_char
++;
1319 coding
->composing
= COMPOSING_NO
;
1322 case '[': /* specification of direction */
1323 if (coding
->flags
& CODING_FLAG_ISO_NO_DIRECTION
)
1324 goto label_invalid_code
;
1325 /* For the moment, nested direction is not supported.
1326 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1327 left-to-right, and nonzero means right-to-left. */
1331 case ']': /* end of the current direction */
1332 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1334 case '0': /* end of the current direction */
1335 case '1': /* start of left-to-right direction */
1338 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1340 goto label_invalid_code
;
1343 case '2': /* start of right-to-left direction */
1346 coding
->mode
|= CODING_MODE_DIRECTION
;
1348 goto label_invalid_code
;
1352 goto label_invalid_code
;
1357 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1358 goto label_invalid_code
;
1359 if (c1
>= 0x28 && c1
<= 0x2B)
1360 { /* designation of DIMENSION1_CHARS94 character set */
1362 DECODE_DESIGNATION (c1
- 0x28, 1, 94, c2
);
1364 else if (c1
>= 0x2C && c1
<= 0x2F)
1365 { /* designation of DIMENSION1_CHARS96 character set */
1367 DECODE_DESIGNATION (c1
- 0x2C, 1, 96, c2
);
1371 goto label_invalid_code
;
1374 /* We must update these variables now. */
1375 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1376 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1380 while (src_base
< src
)
1381 *dst
++ = *src_base
++;
1382 coding
->fake_multibyte
= 1;
1387 result
= CODING_FINISH_INSUFFICIENT_SRC
;
1388 label_end_of_loop_2
:
1395 if (result
== CODING_FINISH_NORMAL
)
1396 result
= CODING_FINISH_INSUFFICIENT_DST
;
1397 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
1398 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
1400 /* This is the last block of the text to be decoded. We had
1401 better just flush out all remaining codes in the text
1402 although they are not valid characters. */
1403 src_bytes
= src_end
- src
;
1404 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
1405 src_bytes
= dst_end
- dst
;
1406 bcopy (src
, dst
, src_bytes
);
1409 coding
->fake_multibyte
= 1;
1413 coding
->consumed
= coding
->consumed_char
= src
- source
;
1414 coding
->produced
= dst
- destination
;
1418 /* ISO2022 encoding stuff. */
1421 It is not enough to say just "ISO2022" on encoding, we have to
1422 specify more details. In Emacs, each coding system of ISO2022
1423 variant has the following specifications:
1424 1. Initial designation to G0 thru G3.
1425 2. Allows short-form designation?
1426 3. ASCII should be designated to G0 before control characters?
1427 4. ASCII should be designated to G0 at end of line?
1428 5. 7-bit environment or 8-bit environment?
1429 6. Use locking-shift?
1430 7. Use Single-shift?
1431 And the following two are only for Japanese:
1432 8. Use ASCII in place of JIS0201-1976-Roman?
1433 9. Use JISX0208-1983 in place of JISX0208-1978?
1434 These specifications are encoded in `coding->flags' as flag bits
1435 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1439 /* Produce codes (escape sequence) for designating CHARSET to graphic
1440 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1441 the coding system CODING allows, produce designation sequence of
1444 #define ENCODE_DESIGNATION(charset, reg, coding) \
1446 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1447 char *intermediate_char_94 = "()*+"; \
1448 char *intermediate_char_96 = ",-./"; \
1449 int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
1450 if (revision < 255) \
1452 *dst++ = ISO_CODE_ESC; \
1454 *dst++ = '@' + revision; \
1456 *dst++ = ISO_CODE_ESC; \
1457 if (CHARSET_DIMENSION (charset) == 1) \
1459 if (CHARSET_CHARS (charset) == 94) \
1460 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1462 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1467 if (CHARSET_CHARS (charset) == 94) \
1469 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1471 || final_char < '@' || final_char > 'B') \
1472 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1475 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1477 *dst++ = final_char; \
1478 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1481 /* The following two macros produce codes (control character or escape
1482 sequence) for ISO2022 single-shift functions (single-shift-2 and
1485 #define ENCODE_SINGLE_SHIFT_2 \
1487 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1488 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1491 *dst++ = ISO_CODE_SS2; \
1492 coding->fake_multibyte = 1; \
1494 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1497 #define ENCODE_SINGLE_SHIFT_3 \
1499 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1500 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1503 *dst++ = ISO_CODE_SS3; \
1504 coding->fake_multibyte = 1; \
1506 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1509 /* The following four macros produce codes (control character or
1510 escape sequence) for ISO2022 locking-shift functions (shift-in,
1511 shift-out, locking-shift-2, and locking-shift-3). */
1513 #define ENCODE_SHIFT_IN \
1515 *dst++ = ISO_CODE_SI; \
1516 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1519 #define ENCODE_SHIFT_OUT \
1521 *dst++ = ISO_CODE_SO; \
1522 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1525 #define ENCODE_LOCKING_SHIFT_2 \
1527 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1528 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1531 #define ENCODE_LOCKING_SHIFT_3 \
1533 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1534 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1537 /* Produce codes for a DIMENSION1 character whose character set is
1538 CHARSET and whose position-code is C1. Designation and invocation
1539 sequences are also produced in advance if necessary. */
1542 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1544 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1546 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1547 *dst++ = c1 & 0x7F; \
1549 *dst++ = c1 | 0x80; \
1550 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1553 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1555 *dst++ = c1 & 0x7F; \
1558 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1560 *dst++ = c1 | 0x80; \
1563 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1564 && !coding->safe_charsets[charset]) \
1566 /* We should not encode this character, instead produce one or \
1568 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1569 if (CHARSET_WIDTH (charset) == 2) \
1570 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1574 /* Since CHARSET is not yet invoked to any graphic planes, we \
1575 must invoke it, or, at first, designate it to some graphic \
1576 register. Then repeat the loop to actually produce the \
1578 dst = encode_invocation_designation (charset, coding, dst); \
1581 /* Produce codes for a DIMENSION2 character whose character set is
1582 CHARSET and whose position-codes are C1 and C2. Designation and
1583 invocation codes are also produced in advance if necessary. */
1585 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1587 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1589 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1590 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1592 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1593 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1596 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1598 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1601 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1603 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1606 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1607 && !coding->safe_charsets[charset]) \
1609 /* We should not encode this character, instead produce one or \
1611 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1612 if (CHARSET_WIDTH (charset) == 2) \
1613 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1617 /* Since CHARSET is not yet invoked to any graphic planes, we \
1618 must invoke it, or, at first, designate it to some graphic \
1619 register. Then repeat the loop to actually produce the \
1621 dst = encode_invocation_designation (charset, coding, dst); \
1624 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1626 int c_alt, charset_alt; \
1627 if (!NILP (translation_table) \
1628 && ((c_alt = translate_char (translation_table, -1, \
1631 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1633 charset_alt = charset; \
1634 if (CHARSET_DEFINED_P (charset_alt)) \
1636 if (CHARSET_DIMENSION (charset_alt) == 1) \
1638 if (charset == CHARSET_ASCII \
1639 && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
1640 charset_alt = charset_latin_jisx0201; \
1641 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1645 if (charset == charset_jisx0208 \
1646 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
1647 charset_alt = charset_jisx0208_1978; \
1648 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1653 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1655 *dst++ = charset & 0x7f; \
1656 *dst++ = c1 & 0x7f; \
1658 *dst++ = c2 & 0x7f; \
1668 if (! COMPOSING_P (coding->composing)) \
1669 coding->consumed_char++; \
1672 /* Produce designation and invocation codes at a place pointed by DST
1673 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1677 encode_invocation_designation (charset
, coding
, dst
)
1679 struct coding_system
*coding
;
1682 int reg
; /* graphic register number */
1684 /* At first, check designations. */
1685 for (reg
= 0; reg
< 4; reg
++)
1686 if (charset
== CODING_SPEC_ISO_DESIGNATION (coding
, reg
))
1691 /* CHARSET is not yet designated to any graphic registers. */
1692 /* At first check the requested designation. */
1693 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1694 if (reg
== CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1695 /* Since CHARSET requests no special designation, designate it
1696 to graphic register 0. */
1699 ENCODE_DESIGNATION (charset
, reg
, coding
);
1702 if (CODING_SPEC_ISO_INVOCATION (coding
, 0) != reg
1703 && CODING_SPEC_ISO_INVOCATION (coding
, 1) != reg
)
1705 /* Since the graphic register REG is not invoked to any graphic
1706 planes, invoke it to graphic plane 0. */
1709 case 0: /* graphic register 0 */
1713 case 1: /* graphic register 1 */
1717 case 2: /* graphic register 2 */
1718 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1719 ENCODE_SINGLE_SHIFT_2
;
1721 ENCODE_LOCKING_SHIFT_2
;
1724 case 3: /* graphic register 3 */
1725 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1726 ENCODE_SINGLE_SHIFT_3
;
1728 ENCODE_LOCKING_SHIFT_3
;
1735 /* The following three macros produce codes for indicating composition. */
1736 #define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1737 #define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1738 #define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1740 /* The following three macros produce codes for indicating direction
1742 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
/* Test the seven-bit flag as a bit, as ENCODE_SINGLE_SHIFT_2/3 do. */ \
/* Comparing the whole flag word with `==' matched only when it was */ \
/* the sole flag set in coding->flags. */ \
1744 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1745 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1747 *dst++ = ISO_CODE_CSI; \
1750 #define ENCODE_DIRECTION_R2L \
1751 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1753 #define ENCODE_DIRECTION_L2R \
1754 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1756 /* Produce codes for designation and invocation to reset the graphic
1757 planes and registers to initial state. */
1758 #define ENCODE_RESET_PLANE_AND_REGISTER \
1761 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1763 for (reg = 0; reg < 4; reg++) \
1764 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1765 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1766 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1767 ENCODE_DESIGNATION \
1768 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1771 /* Produce designation sequences of charsets in the line started from
1772 SRC to a place pointed by *DSTP, and update DSTP.
1774 If the current block ends before any end-of-line, we may fail to
1775 find all the necessary designations. */
1778 encode_designation_at_bol (coding
, table
, src
, src_end
, dstp
)
1779 struct coding_system
*coding
;
1781 unsigned char *src
, *src_end
, **dstp
;
1783 int charset
, c
, found
= 0, reg
;
1784 /* Table of charsets to be designated to each graphic register. */
1786 unsigned char *dst
= *dstp
;
1788 for (reg
= 0; reg
< 4; reg
++)
1791 while (src
< src_end
&& *src
!= '\n' && found
< 4)
1793 int bytes
= BYTES_BY_CHAR_HEAD (*src
);
1796 charset
= CHARSET_AT (src
);
1800 unsigned char c1
, c2
;
1802 SPLIT_STRING(src
, bytes
, charset
, c1
, c2
);
1803 if ((c_alt
= translate_char (table
, -1, charset
, c1
, c2
)) >= 0)
1804 charset
= CHAR_CHARSET (c_alt
);
1807 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1808 if (reg
!= CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
&& r
[reg
] < 0)
1819 for (reg
= 0; reg
< 4; reg
++)
1821 && CODING_SPEC_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
1822 ENCODE_DESIGNATION (r
[reg
], reg
, coding
);
1827 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1830 encode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
1831 struct coding_system
*coding
;
1832 unsigned char *source
, *destination
;
1833 int src_bytes
, dst_bytes
;
1835 unsigned char *src
= source
;
1836 unsigned char *src_end
= source
+ src_bytes
;
1837 unsigned char *dst
= destination
;
1838 unsigned char *dst_end
= destination
+ dst_bytes
;
1839 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1840 from DST_END to assure overflow checking is necessary only at the
1842 unsigned char *adjusted_dst_end
= dst_end
- 19;
1843 Lisp_Object translation_table
1844 = coding
->translation_table_for_encode
;
1845 int result
= CODING_FINISH_NORMAL
;
1847 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
1848 translation_table
= Vstandard_translation_table_for_encode
;
1850 coding
->consumed_char
= 0;
1851 coding
->fake_multibyte
= 0;
1852 while (src
< src_end
&& (dst_bytes
1853 ? (dst
< adjusted_dst_end
)
1854 : (dst
< src
- 19)))
1856 /* SRC_BASE remembers the start position in source in each loop.
1857 The loop will be exited when there's not enough source text
1858 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1859 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1860 reset to SRC_BASE before exiting. */
1861 unsigned char *src_base
= src
;
1862 int charset
, c1
, c2
, c3
, c4
;
1864 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
1865 && CODING_SPEC_ISO_BOL (coding
))
1867 /* We have to produce designation sequences if any now. */
1868 encode_designation_at_bol (coding
, translation_table
,
1869 src
, src_end
, &dst
);
1870 CODING_SPEC_ISO_BOL (coding
) = 0;
1874 /* If we are seeing a component of a composite character, we are
1875 seeing a leading-code encoded irregularly for composition, or
1876 a composition rule if composing with rule. We must set C1 to
1877 a normal leading-code or an ASCII code. If we are not seeing
1878 a composite character, we must reset composition,
1879 designation, and invocation states. */
1880 if (COMPOSING_P (coding
->composing
))
1884 /* We are not in a composite character any longer. */
1885 coding
->composing
= COMPOSING_NO
;
1886 ENCODE_RESET_PLANE_AND_REGISTER
;
1887 ENCODE_COMPOSITION_END
;
1891 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1894 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1897 else if (coding
->composing
== COMPOSING_WITH_RULE_HEAD
)
1898 coding
->composing
= COMPOSING_WITH_RULE_RULE
;
1901 /* This is an ASCII component. */
1906 /* This is a leading-code of non ASCII component. */
1911 /* Now encode one character. C1 is a control character, an
1912 ASCII character, or a leading-code of multi-byte character. */
1913 switch (emacs_code_class
[c1
])
1915 case EMACS_ascii_code
:
1917 ENCODE_ISO_CHARACTER (CHARSET_ASCII
, c1
, /* dummy */ c2
);
1920 case EMACS_control_code
:
1921 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1922 ENCODE_RESET_PLANE_AND_REGISTER
;
1924 coding
->consumed_char
++;
1927 case EMACS_carriage_return_code
:
1928 if (! (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
1930 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1931 ENCODE_RESET_PLANE_AND_REGISTER
;
1933 coding
->consumed_char
++;
1936 /* fall down to treat '\r' as '\n' ... */
1938 case EMACS_linefeed_code
:
1939 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_EOL
)
1940 ENCODE_RESET_PLANE_AND_REGISTER
;
1941 if (coding
->flags
& CODING_FLAG_ISO_INIT_AT_BOL
)
1942 bcopy (coding
->spec
.iso2022
.initial_designation
,
1943 coding
->spec
.iso2022
.current_designation
,
1944 sizeof coding
->spec
.iso2022
.initial_designation
);
1945 if (coding
->eol_type
== CODING_EOL_LF
1946 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
1947 *dst
++ = ISO_CODE_LF
;
1948 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1949 *dst
++ = ISO_CODE_CR
, *dst
++ = ISO_CODE_LF
;
1951 *dst
++ = ISO_CODE_CR
;
1952 CODING_SPEC_ISO_BOL (coding
) = 1;
1953 coding
->consumed_char
++;
1956 case EMACS_leading_code_2
:
1961 /* invalid sequence */
1964 coding
->consumed_char
++;
1967 ENCODE_ISO_CHARACTER (c1
, c2
, /* dummy */ c3
);
1970 case EMACS_leading_code_3
:
1971 TWO_MORE_BYTES (c2
, c3
);
1973 if (c2
< 0xA0 || c3
< 0xA0)
1975 /* invalid sequence */
1978 coding
->consumed_char
++;
1980 else if (c1
< LEADING_CODE_PRIVATE_11
)
1981 ENCODE_ISO_CHARACTER (c1
, c2
, c3
);
1983 ENCODE_ISO_CHARACTER (c2
, c3
, /* dummy */ c4
);
1986 case EMACS_leading_code_4
:
1987 THREE_MORE_BYTES (c2
, c3
, c4
);
1988 if (c2
< 0xA0 || c3
< 0xA0 || c4
< 0xA0)
1990 /* invalid sequence */
1993 coding
->consumed_char
++;
1996 ENCODE_ISO_CHARACTER (c2
, c3
, c4
);
1999 case EMACS_leading_code_composition
:
2003 /* invalid sequence */
2006 coding
->consumed_char
++;
2008 else if (c2
== 0xFF)
2010 ENCODE_RESET_PLANE_AND_REGISTER
;
2011 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
2012 ENCODE_COMPOSITION_WITH_RULE_START
;
2013 coding
->consumed_char
++;
2017 ENCODE_RESET_PLANE_AND_REGISTER
;
2018 /* Rewind one byte because it is a character code of
2019 composition elements. */
2021 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
2022 ENCODE_COMPOSITION_NO_RULE_START
;
2023 coding
->consumed_char
++;
2027 case EMACS_invalid_code
:
2028 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
2029 ENCODE_RESET_PLANE_AND_REGISTER
;
2031 coding
->consumed_char
++;
2036 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2041 if (src
< src_end
&& result
== CODING_FINISH_NORMAL
)
2042 result
= CODING_FINISH_INSUFFICIENT_DST
;
2044 /* If this is the last block of the text to be encoded, we must
2045 reset graphic planes and registers to the initial state, and
2046 flush out the carryover if any. */
2047 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
2049 ENCODE_RESET_PLANE_AND_REGISTER
;
2050 if (COMPOSING_P (coding
->composing
))
2051 ENCODE_COMPOSITION_END
;
2052 if (result
== CODING_FINISH_INSUFFICIENT_SRC
)
2054 while (src
< src_end
&& dst
< dst_end
)
2058 coding
->consumed
= src
- source
;
2059 coding
->produced
= coding
->produced_char
= dst
- destination
;
2064 /*** 4. SJIS and BIG5 handlers ***/
2066 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2067 quite widely. So, for the moment, Emacs supports them in the bare
2068 C code. But, in the future, they may be supported only by CCL. */
2070 /* SJIS is a coding system encoding three character sets: ASCII, right
2071 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2072 as is. A character of charset katakana-jisx0201 is encoded by
2073 "position-code + 0x80". A character of charset japanese-jisx0208
2074 is encoded in 2 bytes, but the two position-codes are divided and shifted
2075 so that they fit in the range below.
2077 --- CODE RANGE of SJIS ---
2078 (character set) (range)
2080 KATAKANA-JISX0201 0xA0 .. 0xDF
2081 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2082 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2083 -------------------------------
2087 /* BIG5 is a coding system encoding two character sets: ASCII and
2088 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2089 character set and is encoded in two bytes.
2091 --- CODE RANGE of BIG5 ---
2092 (character set) (range)
2094 Big5 (1st byte) 0xA1 .. 0xFE
2095 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2096 --------------------------
2098 Since the number of characters in Big5 is larger than maximum
2099 characters in Emacs' charset (96x96), it can't be handled as one
2100 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2101 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2102 contains frequently used characters and the latter contains less
2103 frequently used characters. */
2105 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2106 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2107 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2108 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2110 /* Number of Big5 characters which have the same code in 1st byte. */
2111 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2113 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
2116 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
2118 charset = charset_big5_1; \
2121 charset = charset_big5_2; \
2122 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
2124 c1 = temp / (0xFF - 0xA1) + 0x21; \
2125 c2 = temp % (0xFF - 0xA1) + 0x21; \
2128 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \
2130 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
2131 if (charset == charset_big5_2) \
2132 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2133 b1 = temp / BIG5_SAME_ROW + 0xA1; \
2134 b2 = temp % BIG5_SAME_ROW; \
2135 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2138 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
2140 int c_alt, charset_alt = (charset); \
2141 if (!NILP (translation_table) \
2142 && ((c_alt = translate_char (translation_table, \
2143 -1, (charset), c1, c2)) >= 0)) \
2144 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
2145 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
2146 DECODE_CHARACTER_ASCII (c1); \
2147 else if (CHARSET_DIMENSION (charset_alt) == 1) \
2148 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
2150 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
/* Encode the character (CHARSET, C1, C2) at DST, as SJIS when the
   enclosing function's `sjis_p' is nonzero and as BIG5 otherwise.
   The character is first passed through `translation_table' when that
   is non-nil.  Characters of charsets that the target encoding cannot
   represent are written out in their internal multibyte form and
   coding->fake_multibyte is set.  Always counts one consumed
   character.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt;                                             \
    if (!NILP (translation_table)                                       \
        && ((c_alt = translate_char (translation_table, -1,             \
                                     charset, c1, c2))                  \
            >= 0))                                                      \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
    else                                                                \
      charset_alt = charset;                                            \
    if (charset_alt == charset_ascii)                                   \
      *dst++ = c1;                                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)         \
          *dst++ = c1;                                                  \
        else if (sjis_p && charset_alt == charset_latin_jisx0201)       \
          *dst++ = c1 & 0x7F;                                           \
        else                                                            \
          {                                                             \
            *dst++ = charset_alt, *dst++ = c1;                          \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        c1 &= 0x7F, c2 &= 0x7F;                                         \
        if (sjis_p && (charset_alt == charset_jisx0208                  \
                       || charset_alt == charset_jisx0208_1978))        \
          {                                                             \
            unsigned char s1, s2;                                       \
                                                                        \
            ENCODE_SJIS (c1, c2, s1, s2);                               \
            *dst++ = s1, *dst++ = s2;                                   \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
        else if (!sjis_p                                                \
                 && (charset_alt == charset_big5_1                      \
                     || charset_alt == charset_big5_2))                 \
          {                                                             \
            unsigned char b1, b2;                                       \
                                                                        \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                  \
            *dst++ = b1, *dst++ = b2;                                   \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;             \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
      }                                                                 \
    coding->consumed_char++;                                            \
  } while (0)
2207 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2208 Check if a text is encoded in SJIS. If it is, return
2209 CODING_CATEGORY_MASK_SJIS, else return 0. */
2212 detect_coding_sjis (src
, src_end
)
2213 unsigned char *src
, *src_end
;
2217 while (src
< src_end
)
2220 if ((c
>= 0x80 && c
< 0xA0) || c
>= 0xE0)
2222 if (src
< src_end
&& *src
++ < 0x40)
2226 return CODING_CATEGORY_MASK_SJIS
;
2229 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2230 Check if a text is encoded in BIG5. If it is, return
2231 CODING_CATEGORY_MASK_BIG5, else return 0. */
2234 detect_coding_big5 (src
, src_end
)
2235 unsigned char *src
, *src_end
;
2239 while (src
< src_end
)
2247 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
2251 return CODING_CATEGORY_MASK_BIG5
;
2254 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2255 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2258 decode_coding_sjis_big5 (coding
, source
, destination
,
2259 src_bytes
, dst_bytes
, sjis_p
)
2260 struct coding_system
*coding
;
2261 unsigned char *source
, *destination
;
2262 int src_bytes
, dst_bytes
;
2265 unsigned char *src
= source
;
2266 unsigned char *src_end
= source
+ src_bytes
;
2267 unsigned char *dst
= destination
;
2268 unsigned char *dst_end
= destination
+ dst_bytes
;
2269 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2270 from DST_END to assure overflow checking is necessary only at the
2272 unsigned char *adjusted_dst_end
= dst_end
- 3;
2273 Lisp_Object translation_table
2274 = coding
->translation_table_for_decode
;
2275 int result
= CODING_FINISH_NORMAL
;
2277 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
2278 translation_table
= Vstandard_translation_table_for_decode
;
2280 coding
->produced_char
= 0;
2281 coding
->fake_multibyte
= 0;
2282 while (src
< src_end
&& (dst_bytes
2283 ? (dst
< adjusted_dst_end
)
2286 /* SRC_BASE remembers the start position in source in each loop.
2287 The loop will be exited when there's not enough source text
2288 to analyze two-byte character (within macro ONE_MORE_BYTE).
2289 In that case, SRC is reset to SRC_BASE before exiting. */
2290 unsigned char *src_base
= src
;
2291 unsigned char c1
= *src
++, c2
, c3
, c4
;
2297 if (coding
->eol_type
== CODING_EOL_CRLF
)
2302 else if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2304 result
= CODING_FINISH_INCONSISTENT_EOL
;
2305 goto label_end_of_loop_2
;
2308 /* To process C2 again, SRC is subtracted by 1. */
2311 else if (coding
->eol_type
== CODING_EOL_CR
)
2317 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2318 && (coding
->eol_type
== CODING_EOL_CR
2319 || coding
->eol_type
== CODING_EOL_CRLF
))
2321 result
= CODING_FINISH_INCONSISTENT_EOL
;
2322 goto label_end_of_loop_2
;
2326 coding
->produced_char
++;
2330 c2
= 0; /* avoid warning */
2331 DECODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
2337 if (c1
< 0xA0 || (c1
>= 0xE0 && c1
< 0xF0))
2339 /* SJIS -> JISX0208 */
2341 if (c2
>= 0x40 && c2
!= 0x7F && c2
<= 0xFC)
2343 DECODE_SJIS (c1
, c2
, c3
, c4
);
2344 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208
, c3
, c4
);
2347 goto label_invalid_code_2
;
2350 /* SJIS -> JISX0201-Kana */
2352 c2
= 0; /* avoid warning */
2353 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201
, c1
,
2357 goto label_invalid_code_1
;
2362 if (c1
>= 0xA1 && c1
<= 0xFE)
2365 if ((c2
>= 0x40 && c2
<= 0x7E) || (c2
>= 0xA1 && c2
<= 0xFE))
2369 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
2370 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
2373 goto label_invalid_code_2
;
2376 goto label_invalid_code_1
;
2381 label_invalid_code_1
:
2383 coding
->produced_char
++;
2384 coding
->fake_multibyte
= 1;
2387 label_invalid_code_2
:
2388 *dst
++ = c1
; *dst
++= c2
;
2389 coding
->produced_char
+= 2;
2390 coding
->fake_multibyte
= 1;
2394 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2395 label_end_of_loop_2
:
2402 if (result
== CODING_FINISH_NORMAL
)
2403 result
= CODING_FINISH_INSUFFICIENT_DST
;
2404 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
2405 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
2407 src_bytes
= src_end
- src
;
2408 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
2409 src_bytes
= dst_end
- dst
;
2410 bcopy (dst
, src
, src_bytes
);
2413 coding
->fake_multibyte
= 1;
2417 coding
->consumed
= coding
->consumed_char
= src
- source
;
2418 coding
->produced
= dst
- destination
;
2422 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2423 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2424 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2425 sure that all these charsets are registered as official charset
2426 (i.e. do not have extended leading-codes). Characters of other
2427 charsets are produced without any encoding. If SJIS_P is 1, encode
2428 SJIS text, else encode BIG5 text. */
2431 encode_coding_sjis_big5 (coding
, source
, destination
,
2432 src_bytes
, dst_bytes
, sjis_p
)
2433 struct coding_system
*coding
;
2434 unsigned char *source
, *destination
;
2435 int src_bytes
, dst_bytes
;
2438 unsigned char *src
= source
;
2439 unsigned char *src_end
= source
+ src_bytes
;
2440 unsigned char *dst
= destination
;
2441 unsigned char *dst_end
= destination
+ dst_bytes
;
2442 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2443 from DST_END to assure overflow checking is necessary only at the
2445 unsigned char *adjusted_dst_end
= dst_end
- 1;
2446 Lisp_Object translation_table
2447 = coding
->translation_table_for_encode
;
2448 int result
= CODING_FINISH_NORMAL
;
2450 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
2451 translation_table
= Vstandard_translation_table_for_encode
;
2453 coding
->consumed_char
= 0;
2454 coding
->fake_multibyte
= 0;
2455 while (src
< src_end
&& (dst_bytes
2456 ? (dst
< adjusted_dst_end
)
2459 /* SRC_BASE remembers the start position in source in each loop.
2460 The loop will be exited when there's not enough source text
2461 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2462 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2464 unsigned char *src_base
= src
;
2465 unsigned char c1
= *src
++, c2
, c3
, c4
;
2467 if (coding
->composing
)
2474 else if (c1
>= 0xA0)
2477 coding
->composing
= 0;
2480 switch (emacs_code_class
[c1
])
2482 case EMACS_ascii_code
:
2483 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
2486 case EMACS_control_code
:
2488 coding
->consumed_char
++;
2491 case EMACS_carriage_return_code
:
2492 if (! (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
2495 coding
->consumed_char
++;
2498 /* fall down to treat '\r' as '\n' ... */
2500 case EMACS_linefeed_code
:
2501 if (coding
->eol_type
== CODING_EOL_LF
2502 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2504 else if (coding
->eol_type
== CODING_EOL_CRLF
)
2505 *dst
++ = '\r', *dst
++ = '\n';
2508 coding
->consumed_char
++;
2511 case EMACS_leading_code_2
:
2513 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, /* dummy */ c3
);
2516 case EMACS_leading_code_3
:
2517 TWO_MORE_BYTES (c2
, c3
);
2518 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, c3
);
2521 case EMACS_leading_code_4
:
2522 THREE_MORE_BYTES (c2
, c3
, c4
);
2523 ENCODE_SJIS_BIG5_CHARACTER (c2
, c3
, c4
);
2526 case EMACS_leading_code_composition
:
2527 coding
->composing
= 1;
2530 default: /* i.e. case EMACS_invalid_code: */
2532 coding
->consumed_char
++;
2537 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2542 if (result
== CODING_FINISH_NORMAL
2544 result
= CODING_FINISH_INSUFFICIENT_DST
;
2545 coding
->consumed
= src
- source
;
2546 coding
->produced
= coding
->produced_char
= dst
- destination
;
2551 /*** 5. CCL handlers ***/
2553 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2554 Check if a text is encoded in a coding system of which
2555 encoder/decoder are written in CCL program. If it is, return
2556 CODING_CATEGORY_MASK_CCL, else return 0. */
2559 detect_coding_ccl (src
, src_end
)
2560 unsigned char *src
, *src_end
;
2562 unsigned char *valid
;
2564 /* No coding system is assigned to coding-category-ccl. */
2565 if (!coding_system_table
[CODING_CATEGORY_IDX_CCL
])
2568 valid
= coding_system_table
[CODING_CATEGORY_IDX_CCL
]->spec
.ccl
.valid_codes
;
2569 while (src
< src_end
)
2571 if (! valid
[*src
]) return 0;
2574 return CODING_CATEGORY_MASK_CCL
;
2578 /*** 6. End-of-line handlers ***/
2580 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2581 This function is called only when `coding->eol_type' is
2582 CODING_EOL_CRLF or CODING_EOL_CR. */
2585 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2586 struct coding_system
*coding
;
2587 unsigned char *source
, *destination
;
2588 int src_bytes
, dst_bytes
;
2590 unsigned char *src
= source
;
2591 unsigned char *src_end
= source
+ src_bytes
;
2592 unsigned char *dst
= destination
;
2593 unsigned char *dst_end
= destination
+ dst_bytes
;
2595 int result
= CODING_FINISH_NORMAL
;
2597 coding
->fake_multibyte
= 0;
2601 coding
->produced
= coding
->produced_char
= 0;
2602 coding
->consumed
= coding
->consumed_char
= 0;
2606 switch (coding
->eol_type
)
2608 case CODING_EOL_CRLF
:
2610 /* Since the maximum bytes produced by each loop is 2, we
2611 subtract 1 from DST_END to assure overflow checking is
2612 necessary only at the head of loop. */
2613 unsigned char *adjusted_dst_end
= dst_end
- 1;
2615 while (src
< src_end
&& (dst_bytes
2616 ? (dst
< adjusted_dst_end
)
2619 unsigned char *src_base
= src
;
2629 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2631 result
= CODING_FINISH_INCONSISTENT_EOL
;
2632 goto label_end_of_loop_2
;
2636 if (BASE_LEADING_CODE_P (c
))
2637 coding
->fake_multibyte
= 1;
2641 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
))
2643 result
= CODING_FINISH_INCONSISTENT_EOL
;
2644 goto label_end_of_loop_2
;
2649 if (BASE_LEADING_CODE_P (c
))
2650 coding
->fake_multibyte
= 1;
2655 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2656 label_end_of_loop_2
:
2662 if (result
== CODING_FINISH_NORMAL
)
2663 result
= CODING_FINISH_INSUFFICIENT_DST
;
2664 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
2665 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
2667 /* This is the last block of the text to be decoded.
2668 We flush out all remaining codes. */
2669 src_bytes
= src_end
- src
;
2670 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
2671 src_bytes
= dst_end
- dst
;
2672 bcopy (src
, dst
, src_bytes
);
2681 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2683 while (src
< src_end
)
2685 if ((c
= *src
++) == '\n')
2687 if (BASE_LEADING_CODE_P (c
))
2688 coding
->fake_multibyte
= 1;
2692 src_bytes
= src
- source
;
2693 result
= CODING_FINISH_INCONSISTENT_EOL
;
2696 if (dst_bytes
&& src_bytes
> dst_bytes
)
2698 result
= CODING_FINISH_INSUFFICIENT_DST
;
2699 src_bytes
= dst_bytes
;
2702 bcopy (source
, destination
, src_bytes
);
2704 safe_bcopy (source
, destination
, src_bytes
);
2705 src
= source
+ src_bytes
;
2706 while (src_bytes
--) if (*dst
++ == '\r') dst
[-1] = '\n';
2709 default: /* i.e. case: CODING_EOL_LF */
2710 if (dst_bytes
&& src_bytes
> dst_bytes
)
2712 result
= CODING_FINISH_INSUFFICIENT_DST
;
2713 src_bytes
= dst_bytes
;
2716 bcopy (source
, destination
, src_bytes
);
2718 safe_bcopy (source
, destination
, src_bytes
);
2721 coding
->fake_multibyte
= 1;
2725 coding
->consumed
= coding
->consumed_char
= src
- source
;
2726 coding
->produced
= coding
->produced_char
= dst
- destination
;
2730 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2731 format of end-of-line according to `coding->eol_type'. If
2732 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2733 '\r' in source text also means end-of-line. */
2736 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2737 struct coding_system
*coding
;
2738 unsigned char *source
, *destination
;
2739 int src_bytes
, dst_bytes
;
2741 unsigned char *src
= source
;
2742 unsigned char *dst
= destination
;
2743 int result
= CODING_FINISH_NORMAL
;
2745 coding
->fake_multibyte
= 0;
2747 if (coding
->eol_type
== CODING_EOL_CRLF
)
2750 unsigned char *src_end
= source
+ src_bytes
;
2751 unsigned char *dst_end
= destination
+ dst_bytes
;
2752 /* Since the maximum bytes produced by each loop is 2, we
2753 subtract 1 from DST_END to assure overflow checking is
2754 necessary only at the head of loop. */
2755 unsigned char *adjusted_dst_end
= dst_end
- 1;
2757 while (src
< src_end
&& (dst_bytes
2758 ? (dst
< adjusted_dst_end
)
2763 || (c
== '\r' && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)))
2764 *dst
++ = '\r', *dst
++ = '\n';
2768 if (BASE_LEADING_CODE_P (c
))
2769 coding
->fake_multibyte
= 1;
2773 result
= CODING_FINISH_INSUFFICIENT_DST
;
2779 if (dst_bytes
&& src_bytes
> dst_bytes
)
2781 src_bytes
= dst_bytes
;
2782 result
= CODING_FINISH_INSUFFICIENT_DST
;
2785 bcopy (source
, destination
, src_bytes
);
2787 safe_bcopy (source
, destination
, src_bytes
);
2788 dst_bytes
= src_bytes
;
2789 if (coding
->eol_type
== CODING_EOL_CR
)
2793 if ((c
= *dst
++) == '\n')
2795 else if (BASE_LEADING_CODE_P (c
))
2796 coding
->fake_multibyte
= 1;
2801 if (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)
2804 if (*dst
++ == '\r') dst
[-1] = '\n';
2806 coding
->fake_multibyte
= 1;
2808 src
= source
+ dst_bytes
;
2809 dst
= destination
+ dst_bytes
;
2812 coding
->consumed
= coding
->consumed_char
= src
- source
;
2813 coding
->produced
= coding
->produced_char
= dst
- destination
;
2818 /*** 7. C library functions ***/
2820 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2821 has a property `coding-system'. The value of this property is a
2822 vector of length 5 (called as coding-vector). Among elements of
2823 this vector, the first (element[0]) and the fifth (element[4])
2824 carry important information for decoding/encoding. Before
2825 decoding/encoding, this information should be set in fields of a
2826 structure of type `coding_system'.
2828 A value of property `coding-system' can be a symbol of another
2829 subsidiary coding-system. In that case, Emacs gets coding-vector
2832 `element[0]' contains information to be set in `coding->type'. The
2833 value and its meaning is as follows:
2835 0 -- coding_type_emacs_mule
2836 1 -- coding_type_sjis
2837 2 -- coding_type_iso2022
2838 3 -- coding_type_big5
2839 4 -- coding_type_ccl encoder/decoder written in CCL
2840 nil -- coding_type_no_conversion
2841 t -- coding_type_undecided (automatic conversion on decoding,
2842 no-conversion on encoding)
2844 `element[4]' contains information to be set in `coding->flags' and
2845 `coding->spec'. The meaning varies by `coding->type'.
2847 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2848 of length 32 (of which the first 17 sub-elements are used now).
2849 Meanings of these sub-elements are:
2851 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2852 If the value is an integer of valid charset, the charset is
2853 assumed to be designated to graphic register N initially.
2855 If the value is minus, it is a minus value of charset which
2856 reserves graphic register N, which means that the charset is
2857 not designated initially but should be designated to graphic
2858 register N just before encoding a character in that charset.
2860 If the value is nil, graphic register N is never used on
2863 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2864 Each value takes t or nil. See the section ISO2022 of
2865 `coding.h' for more information.
2867 If `coding->type' is `coding_type_big5', element[4] is t to denote
2868 BIG5-ETen or nil to denote BIG5-HKU.
2870 If `coding->type' takes the other value, element[4] is ignored.
2872 Emacs Lisp's coding system also carries information about format of
2873 end-of-line in a value of property `eol-type'. If the value is
2874 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2875 means CODING_EOL_CR. If it is not integer, it should be a vector
2876 of subsidiary coding systems of which property `eol-type' has one
2881 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2882 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2883 is setup so that no conversion is necessary and return -1, else
2887 setup_coding_system (coding_system
, coding
)
2888 Lisp_Object coding_system
;
2889 struct coding_system
*coding
;
2891 Lisp_Object coding_spec
, coding_type
, eol_type
, plist
;
2895 /* Initialize some fields required for all kinds of coding systems. */
2896 coding
->symbol
= coding_system
;
2897 coding
->common_flags
= 0;
2899 coding
->heading_ascii
= -1;
2900 coding
->post_read_conversion
= coding
->pre_write_conversion
= Qnil
;
2902 if (NILP (coding_system
))
2903 goto label_invalid_coding_system
;
2905 coding_spec
= Fget (coding_system
, Qcoding_system
);
2907 if (!VECTORP (coding_spec
)
2908 || XVECTOR (coding_spec
)->size
!= 5
2909 || !CONSP (XVECTOR (coding_spec
)->contents
[3]))
2910 goto label_invalid_coding_system
;
2912 eol_type
= inhibit_eol_conversion
? Qnil
: Fget (coding_system
, Qeol_type
);
2913 if (VECTORP (eol_type
))
2915 coding
->eol_type
= CODING_EOL_UNDECIDED
;
2916 coding
->common_flags
= CODING_REQUIRE_DETECTION_MASK
;
2918 else if (XFASTINT (eol_type
) == 1)
2920 coding
->eol_type
= CODING_EOL_CRLF
;
2921 coding
->common_flags
2922 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2924 else if (XFASTINT (eol_type
) == 2)
2926 coding
->eol_type
= CODING_EOL_CR
;
2927 coding
->common_flags
2928 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2931 coding
->eol_type
= CODING_EOL_LF
;
2933 coding_type
= XVECTOR (coding_spec
)->contents
[0];
2934 /* Try short cut. */
2935 if (SYMBOLP (coding_type
))
2937 if (EQ (coding_type
, Qt
))
2939 coding
->type
= coding_type_undecided
;
2940 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
2943 coding
->type
= coding_type_no_conversion
;
2947 /* Initialize remaining fields. */
2948 coding
->composing
= 0;
2949 coding
->composed_chars
= 0;
2951 /* Get values of coding system properties:
2952 `post-read-conversion', `pre-write-conversion',
2953 `translation-table-for-decode', `translation-table-for-encode'. */
2954 plist
= XVECTOR (coding_spec
)->contents
[3];
2955 /* Pre & post conversion functions should be disabled if
2956 inhibit_eol_conversion is nozero. This is the case that a code
2957 conversion function is called while those functions are running. */
2958 if (! inhibit_pre_post_conversion
)
2960 coding
->post_read_conversion
= Fplist_get (plist
, Qpost_read_conversion
);
2961 coding
->pre_write_conversion
= Fplist_get (plist
, Qpre_write_conversion
);
2963 val
= Fplist_get (plist
, Qtranslation_table_for_decode
);
2965 val
= Fget (val
, Qtranslation_table_for_decode
);
2966 coding
->translation_table_for_decode
= CHAR_TABLE_P (val
) ? val
: Qnil
;
2967 val
= Fplist_get (plist
, Qtranslation_table_for_encode
);
2969 val
= Fget (val
, Qtranslation_table_for_encode
);
2970 coding
->translation_table_for_encode
= CHAR_TABLE_P (val
) ? val
: Qnil
;
2971 val
= Fplist_get (plist
, Qcoding_category
);
2974 val
= Fget (val
, Qcoding_category_index
);
2976 coding
->category_idx
= XINT (val
);
2978 goto label_invalid_coding_system
;
2981 goto label_invalid_coding_system
;
2983 val
= Fplist_get (plist
, Qsafe_charsets
);
2986 for (i
= 0; i
<= MAX_CHARSET
; i
++)
2987 coding
->safe_charsets
[i
] = 1;
2991 bzero (coding
->safe_charsets
, MAX_CHARSET
+ 1);
2994 if ((i
= get_charset_id (XCAR (val
))) >= 0)
2995 coding
->safe_charsets
[i
] = 1;
3000 switch (XFASTINT (coding_type
))
3003 coding
->type
= coding_type_emacs_mule
;
3004 if (!NILP (coding
->post_read_conversion
))
3005 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
3006 if (!NILP (coding
->pre_write_conversion
))
3007 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
3011 coding
->type
= coding_type_sjis
;
3012 coding
->common_flags
3013 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3017 coding
->type
= coding_type_iso2022
;
3018 coding
->common_flags
3019 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3021 Lisp_Object val
, temp
;
3023 int i
, charset
, reg_bits
= 0;
3025 val
= XVECTOR (coding_spec
)->contents
[4];
3027 if (!VECTORP (val
) || XVECTOR (val
)->size
!= 32)
3028 goto label_invalid_coding_system
;
3030 flags
= XVECTOR (val
)->contents
;
3032 = ((NILP (flags
[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM
)
3033 | (NILP (flags
[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL
)
3034 | (NILP (flags
[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL
)
3035 | (NILP (flags
[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS
)
3036 | (NILP (flags
[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT
)
3037 | (NILP (flags
[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT
)
3038 | (NILP (flags
[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN
)
3039 | (NILP (flags
[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS
)
3040 | (NILP (flags
[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION
)
3041 | (NILP (flags
[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL
)
3042 | (NILP (flags
[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
3043 | (NILP (flags
[15]) ? 0 : CODING_FLAG_ISO_SAFE
)
3044 | (NILP (flags
[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA
)
3047 /* Invoke graphic register 0 to plane 0. */
3048 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
3049 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3050 CODING_SPEC_ISO_INVOCATION (coding
, 1)
3051 = (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
? -1 : 1);
3052 /* Not single shifting at first. */
3053 CODING_SPEC_ISO_SINGLE_SHIFTING (coding
) = 0;
3054 /* Beginning of buffer should also be regarded as bol. */
3055 CODING_SPEC_ISO_BOL (coding
) = 1;
3057 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3058 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = 255;
3059 val
= Vcharset_revision_alist
;
3062 charset
= get_charset_id (Fcar_safe (XCAR (val
)));
3064 && (temp
= Fcdr_safe (XCAR (val
)), INTEGERP (temp
))
3065 && (i
= XINT (temp
), (i
>= 0 && (i
+ '@') < 128)))
3066 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = i
;
3070 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3071 FLAGS[REG] can be one of below:
3072 integer CHARSET: CHARSET occupies register I,
3073 t: designate nothing to REG initially, but can be used
3075 list of integer, nil, or t: designate the first
3076 element (if integer) to REG initially, the remaining
3077 elements (if integer) is designated to REG on request,
3078 if an element is t, REG can be used by any charsets,
3079 nil: REG is never used. */
3080 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3081 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3082 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
;
3083 for (i
= 0; i
< 4; i
++)
3085 if (INTEGERP (flags
[i
])
3086 && (charset
= XINT (flags
[i
]), CHARSET_VALID_P (charset
))
3087 || (charset
= get_charset_id (flags
[i
])) >= 0)
3089 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
3090 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) = i
;
3092 else if (EQ (flags
[i
], Qt
))
3094 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3096 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
3098 else if (CONSP (flags
[i
]))
3103 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
3104 if (INTEGERP (XCAR (tail
))
3105 && (charset
= XINT (XCAR (tail
)),
3106 CHARSET_VALID_P (charset
))
3107 || (charset
= get_charset_id (XCAR (tail
))) >= 0)
3109 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
3110 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) =i
;
3113 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3115 while (CONSP (tail
))
3117 if (INTEGERP (XCAR (tail
))
3118 && (charset
= XINT (XCAR (tail
)),
3119 CHARSET_VALID_P (charset
))
3120 || (charset
= get_charset_id (XCAR (tail
))) >= 0)
3121 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3123 else if (EQ (XCAR (tail
), Qt
))
3129 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3131 CODING_SPEC_ISO_DESIGNATION (coding
, i
)
3132 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
);
3135 if (reg_bits
&& ! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
3137 /* REG 1 can be used only by locking shift in 7-bit env. */
3138 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
3140 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
3141 /* Without any shifting, only REG 0 and 1 can be used. */
3146 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3148 if (CHARSET_VALID_P (charset
))
3150 /* There exist some default graphic registers to be
3153 /* We had better avoid designating a charset of
3154 CHARS96 to REG 0 as far as possible. */
3155 if (CHARSET_CHARS (charset
) == 96)
3156 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3158 ? 1 : (reg_bits
& 4 ? 2 : (reg_bits
& 8 ? 3 : 0)));
3160 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3162 ? 0 : (reg_bits
& 2 ? 1 : (reg_bits
& 4 ? 2 : 3)));
3166 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3167 coding
->spec
.iso2022
.last_invalid_designation_register
= -1;
3171 coding
->type
= coding_type_big5
;
3172 coding
->common_flags
3173 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3175 = (NILP (XVECTOR (coding_spec
)->contents
[4])
3176 ? CODING_FLAG_BIG5_HKU
3177 : CODING_FLAG_BIG5_ETEN
);
3181 coding
->type
= coding_type_ccl
;
3182 coding
->common_flags
3183 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3185 val
= XVECTOR (coding_spec
)->contents
[4];
3187 || setup_ccl_program (&(coding
->spec
.ccl
.decoder
),
3189 || setup_ccl_program (&(coding
->spec
.ccl
.encoder
),
3191 goto label_invalid_coding_system
;
3193 bzero (coding
->spec
.ccl
.valid_codes
, 256);
3194 val
= Fplist_get (plist
, Qvalid_codes
);
3199 for (; CONSP (val
); val
= XCDR (val
))
3203 && XINT (this) >= 0 && XINT (this) < 256)
3204 coding
->spec
.ccl
.valid_codes
[XINT (this)] = 1;
3205 else if (CONSP (this)
3206 && INTEGERP (XCAR (this))
3207 && INTEGERP (XCDR (this)))
3209 int start
= XINT (XCAR (this));
3210 int end
= XINT (XCDR (this));
3212 if (start
>= 0 && start
<= end
&& end
< 256)
3213 while (start
<= end
)
3214 coding
->spec
.ccl
.valid_codes
[start
++] = 1;
3219 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3223 coding
->type
= coding_type_raw_text
;
3227 goto label_invalid_coding_system
;
3231 label_invalid_coding_system
:
3232 coding
->type
= coding_type_no_conversion
;
3233 coding
->category_idx
= CODING_CATEGORY_IDX_BINARY
;
3234 coding
->common_flags
= 0;
3235 coding
->eol_type
= CODING_EOL_LF
;
3236 coding
->pre_write_conversion
= coding
->post_read_conversion
= Qnil
;
3240 /* Setup raw-text or one of its subsidiaries in the structure
3241 coding_system CODING according to the already setup value eol_type
3242 in CODING. CODING should be setup for some coding system in
3246 setup_raw_text_coding_system (coding
)
3247 struct coding_system
*coding
;
3249 if (coding
->type
!= coding_type_raw_text
)
3251 coding
->symbol
= Qraw_text
;
3252 coding
->type
= coding_type_raw_text
;
3253 if (coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3255 Lisp_Object subsidiaries
;
3256 subsidiaries
= Fget (Qraw_text
, Qeol_type
);
3258 if (VECTORP (subsidiaries
)
3259 && XVECTOR (subsidiaries
)->size
== 3)
3261 = XVECTOR (subsidiaries
)->contents
[coding
->eol_type
];
3263 setup_coding_system (coding
->symbol
, coding
);
3268 /* Emacs has a mechanism to automatically detect a coding system if it
3269 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3270 it's impossible to distinguish some coding systems accurately
3271 because they use the same range of codes. So, at first, coding
3272 systems are categorized into the following categories:
3274 o coding-category-emacs-mule
3276 The category for a coding system which has the same code range
3277 as Emacs' internal format. Assigned the coding-system (Lisp
3278 symbol) `emacs-mule' by default.
3280 o coding-category-sjis
3282 The category for a coding system which has the same code range
3283 as SJIS. Assigned the coding-system (Lisp
3284 symbol) `japanese-shift-jis' by default.
3286 o coding-category-iso-7
3288 The category for a coding system which has the same code range
3289 as ISO2022 of 7-bit environment. This doesn't use any locking
3290 shift and single shift functions. This can encode/decode all
3291 charsets. Assigned the coding-system (Lisp symbol)
3292 `iso-2022-7bit' by default.
3294 o coding-category-iso-7-tight
3296 Same as coding-category-iso-7 except that this can
3297 encode/decode only the specified charsets.
3299 o coding-category-iso-8-1
3301 The category for a coding system which has the same code range
3302 as ISO2022 of 8-bit environment and graphic plane 1 used only
3303 for DIMENSION1 charset. This doesn't use any locking shift
3304 and single shift functions. Assigned the coding-system (Lisp
3305 symbol) `iso-latin-1' by default.
3307 o coding-category-iso-8-2
3309 The category for a coding system which has the same code range
3310 as ISO2022 of 8-bit environment and graphic plane 1 used only
3311 for DIMENSION2 charset. This doesn't use any locking shift
3312 and single shift functions. Assigned the coding-system (Lisp
3313 symbol) `japanese-iso-8bit' by default.
3315 o coding-category-iso-7-else
3317 The category for a coding system which has the same code range
3318 as ISO2022 of 7-bit environment but uses locking shift or
3319 single shift functions. Assigned the coding-system (Lisp
3320 symbol) `iso-2022-7bit-lock' by default.
3322 o coding-category-iso-8-else
3324 The category for a coding system which has the same code range
3325 as ISO2022 of 8-bit environment but uses locking shift or
3326 single shift functions. Assigned the coding-system (Lisp
3327 symbol) `iso-2022-8bit-ss2' by default.
3329 o coding-category-big5
3331 The category for a coding system which has the same code range
3332 as BIG5. Assigned the coding-system (Lisp symbol)
3333 `cn-big5' by default.
3335 o coding-category-ccl
3337 The category for a coding system of which encoder/decoder is
3338 written in CCL programs. The default value is nil, i.e., no
3339 coding system is assigned.
3341 o coding-category-binary
3343 The category for a coding system not categorized in any of the
3344 above. Assigned the coding-system (Lisp symbol)
3345 `no-conversion' by default.
3347 Each of them is a Lisp symbol and the value is an actual
3348 `coding-system's (this is also a Lisp symbol) assigned by a user.
3349 What Emacs does actually is to detect a category of coding system.
3350 Then, it uses a `coding-system' assigned to it. If Emacs can't
3351 decide only one possible category, it selects a category of the
3352 highest priority. Priorities of categories are also specified by a
3353 user in a Lisp variable `coding-category-list'.
/* Table indexed by a byte value.  detect_coding_mask skips leading
   bytes whose entry is nonzero, and clears or sets the entries for
   ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC while it runs.  */
int ascii_skip_code[256];
3360 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3361 If it detects possible coding systems, return an integer in which
3362 appropriate flag bits are set. Flag bits are defined by macros
3363 CODING_CATEGORY_MASK_XXX in `coding.h'.
3365 How many ASCII characters are at the head is returned as *SKIP. */
3368 detect_coding_mask (source
, src_bytes
, priorities
, skip
)
3369 unsigned char *source
;
3370 int src_bytes
, *priorities
, *skip
;
3372 register unsigned char c
;
3373 unsigned char *src
= source
, *src_end
= source
+ src_bytes
;
3377 /* At first, skip all ASCII characters and control characters except
3378 for three ISO2022 specific control characters. */
3379 ascii_skip_code
[ISO_CODE_SO
] = 0;
3380 ascii_skip_code
[ISO_CODE_SI
] = 0;
3381 ascii_skip_code
[ISO_CODE_ESC
] = 0;
3383 label_loop_detect_coding
:
3384 while (src
< src_end
&& ascii_skip_code
[*src
]) src
++;
3385 *skip
= src
- source
;
3388 /* We found nothing other than ASCII. There's nothing to do. */
3392 /* The text seems to be encoded in some multilingual coding system.
3393 Now, try to find in which coding system the text is encoded. */
3396 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3397 /* C is an ISO2022 specific control code of C0. */
3398 mask
= detect_coding_iso2022 (src
, src_end
);
3401 /* No valid ISO2022 code follows C. Try again. */
3403 if (c
== ISO_CODE_ESC
)
3404 ascii_skip_code
[ISO_CODE_ESC
] = 1;
3406 ascii_skip_code
[ISO_CODE_SO
] = ascii_skip_code
[ISO_CODE_SI
] = 1;
3407 goto label_loop_detect_coding
;
3410 goto label_return_highest_only
;
3418 /* C is the first byte of SJIS character code,
3419 or a leading-code of Emacs' internal format (emacs-mule). */
3420 try = CODING_CATEGORY_MASK_SJIS
| CODING_CATEGORY_MASK_EMACS_MULE
;
3422 /* Or, if C is a special latin extra code,
3423 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3424 or is an ISO2022 control-sequence-introducer (CSI),
3425 we should also consider the possibility of ISO2022 codings. */
3426 if ((VECTORP (Vlatin_extra_code_table
)
3427 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
3428 || (c
== ISO_CODE_SS2
|| c
== ISO_CODE_SS3
)
3429 || (c
== ISO_CODE_CSI
3432 || ((*src
== '0' || *src
== '1' || *src
== '2')
3433 && src
+ 1 < src_end
3434 && src
[1] == ']')))))
3435 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3436 | CODING_CATEGORY_MASK_ISO_8BIT
);
3439 /* C is a character of ISO2022 in graphic plane right,
3440 or a SJIS's 1-byte character code (i.e. JISX0201),
3441 or the first byte of BIG5's 2-byte code. */
3442 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3443 | CODING_CATEGORY_MASK_ISO_8BIT
3444 | CODING_CATEGORY_MASK_SJIS
3445 | CODING_CATEGORY_MASK_BIG5
);
3447 /* Or, we may have to consider the possibility of CCL. */
3448 if (coding_system_table
[CODING_CATEGORY_IDX_CCL
]
3449 && (coding_system_table
[CODING_CATEGORY_IDX_CCL
]
3450 ->spec
.ccl
.valid_codes
)[c
])
3451 try |= CODING_CATEGORY_MASK_CCL
;
3456 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3458 if (priorities
[i
] & try & CODING_CATEGORY_MASK_ISO
)
3459 mask
= detect_coding_iso2022 (src
, src_end
);
3460 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_SJIS
)
3461 mask
= detect_coding_sjis (src
, src_end
);
3462 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_BIG5
)
3463 mask
= detect_coding_big5 (src
, src_end
);
3464 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_EMACS_MULE
)
3465 mask
= detect_coding_emacs_mule (src
, src_end
);
3466 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_CCL
)
3467 mask
= detect_coding_ccl (src
, src_end
);
3468 else if (priorities
[i
] & CODING_CATEGORY_MASK_RAW_TEXT
)
3469 mask
= CODING_CATEGORY_MASK_RAW_TEXT
;
3470 else if (priorities
[i
] & CODING_CATEGORY_MASK_BINARY
)
3471 mask
= CODING_CATEGORY_MASK_BINARY
;
3473 goto label_return_highest_only
;
3475 return CODING_CATEGORY_MASK_RAW_TEXT
;
3477 if (try & CODING_CATEGORY_MASK_ISO
)
3478 mask
|= detect_coding_iso2022 (src
, src_end
);
3479 if (try & CODING_CATEGORY_MASK_SJIS
)
3480 mask
|= detect_coding_sjis (src
, src_end
);
3481 if (try & CODING_CATEGORY_MASK_BIG5
)
3482 mask
|= detect_coding_big5 (src
, src_end
);
3483 if (try & CODING_CATEGORY_MASK_EMACS_MULE
)
3484 mask
|= detect_coding_emacs_mule (src
, src_end
);
3485 if (try & CODING_CATEGORY_MASK_CCL
)
3486 mask
|= detect_coding_ccl (src
, src_end
);
3488 return (mask
| CODING_CATEGORY_MASK_RAW_TEXT
| CODING_CATEGORY_MASK_BINARY
);
3490 label_return_highest_only
:
3491 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3493 if (mask
& priorities
[i
])
3494 return priorities
[i
];
3496 return CODING_CATEGORY_MASK_RAW_TEXT
;
3499 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3500 The information of the detected coding system is set in CODING. */
3503 detect_coding (coding
, src
, src_bytes
)
3504 struct coding_system
*coding
;
3512 val
= Vcoding_category_list
;
3513 mask
= detect_coding_mask (src
, src_bytes
, coding_priorities
, &skip
);
3514 coding
->heading_ascii
= skip
;
3518 /* We found a single coding system of the highest priority in MASK. */
3520 while (mask
&& ! (mask
& 1)) mask
>>= 1, idx
++;
3522 idx
= CODING_CATEGORY_IDX_RAW_TEXT
;
3524 val
= XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[idx
])->value
;
3526 if (coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3530 tmp
= Fget (val
, Qeol_type
);
3532 val
= XVECTOR (tmp
)->contents
[coding
->eol_type
];
3534 setup_coding_system (val
, coding
);
3535 /* Set this again because setup_coding_system reset this member. */
3536 coding
->heading_ascii
= skip
;
3539 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3540 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3541 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3543 How many non-eol characters are at the head is returned as *SKIP. */
3545 #define MAX_EOL_CHECK_COUNT 3
3548 detect_eol_type (source
, src_bytes
, skip
)
3549 unsigned char *source
;
3550 int src_bytes
, *skip
;
3552 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
3554 int total
= 0; /* How many end-of-lines are found so far. */
3555 int eol_type
= CODING_EOL_UNDECIDED
;
3560 while (src
< src_end
&& total
< MAX_EOL_CHECK_COUNT
)
3563 if (c
== '\n' || c
== '\r')
3566 *skip
= src
- 1 - source
;
3569 this_eol_type
= CODING_EOL_LF
;
3570 else if (src
>= src_end
|| *src
!= '\n')
3571 this_eol_type
= CODING_EOL_CR
;
3573 this_eol_type
= CODING_EOL_CRLF
, src
++;
3575 if (eol_type
== CODING_EOL_UNDECIDED
)
3576 /* This is the first end-of-line. */
3577 eol_type
= this_eol_type
;
3578 else if (eol_type
!= this_eol_type
)
3580 /* The found type is different from what found before. */
3581 eol_type
= CODING_EOL_INCONSISTENT
;
3588 *skip
= src_end
- source
;
3592 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3593 is encoded. If it detects an appropriate format of end-of-line, it
3594 sets the information in *CODING. */
3597 detect_eol (coding
, src
, src_bytes
)
3598 struct coding_system
*coding
;
3604 int eol_type
= detect_eol_type (src
, src_bytes
, &skip
);
3606 if (coding
->heading_ascii
> skip
)
3607 coding
->heading_ascii
= skip
;
3609 skip
= coding
->heading_ascii
;
3611 if (eol_type
== CODING_EOL_UNDECIDED
)
3613 if (eol_type
== CODING_EOL_INCONSISTENT
)
3616 /* This code is suppressed until we find a better way to
3617 distinguish raw text file and binary file. */
3619 /* If we have already detected that the coding is raw-text, the
3620 coding should actually be no-conversion. */
3621 if (coding
->type
== coding_type_raw_text
)
3623 setup_coding_system (Qno_conversion
, coding
);
3626 /* Else, let's decode only text code anyway. */
3628 eol_type
= CODING_EOL_LF
;
3631 val
= Fget (coding
->symbol
, Qeol_type
);
3632 if (VECTORP (val
) && XVECTOR (val
)->size
== 3)
3634 setup_coding_system (XVECTOR (val
)->contents
[eol_type
], coding
);
3635 coding
->heading_ascii
= skip
;
3639 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3641 #define DECODING_BUFFER_MAG(coding) \
3642 (coding->type == coding_type_iso2022 \
3644 : ((coding->type == coding_type_sjis || coding->type == coding_type_big5) \
3646 : (coding->type == coding_type_raw_text \
3648 : (coding->type == coding_type_ccl \
3649 ? coding->spec.ccl.decoder.buf_magnification \
3652 /* Return maximum size (bytes) of a buffer enough for decoding
3653 SRC_BYTES of text encoded in CODING. */
3656 decoding_buffer_size (coding
, src_bytes
)
3657 struct coding_system
*coding
;
3660 return (src_bytes
* DECODING_BUFFER_MAG (coding
)
3661 + CONVERSION_BUFFER_EXTRA_ROOM
);
3664 /* Return maximum size (bytes) of a buffer enough for encoding
3665 SRC_BYTES of text to CODING. */
3668 encoding_buffer_size (coding
, src_bytes
)
3669 struct coding_system
*coding
;
3674 if (coding
->type
== coding_type_ccl
)
3675 magnification
= coding
->spec
.ccl
.encoder
.buf_magnification
;
3679 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
3682 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3683 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3686 char *conversion_buffer
;
3687 int conversion_buffer_size
;
3689 /* Return a pointer to a buffer of at least SIZE bytes to be used for
3690 encoding or decoding. Sufficient memory is allocated automatically.
3691 If we run out of memory, return NULL. */
3694 get_conversion_buffer (size
)
3697 if (size
> conversion_buffer_size
)
3700 int real_size
= conversion_buffer_size
* 2;
3702 while (real_size
< size
) real_size
*= 2;
3703 buf
= (char *) xmalloc (real_size
);
3704 xfree (conversion_buffer
);
3705 conversion_buffer
= buf
;
3706 conversion_buffer_size
= real_size
;
3708 return conversion_buffer
;
3712 ccl_coding_driver (coding
, source
, destination
, src_bytes
, dst_bytes
, encodep
)
3713 struct coding_system
*coding
;
3714 unsigned char *source
, *destination
;
3715 int src_bytes
, dst_bytes
, encodep
;
3717 struct ccl_program
*ccl
3718 = encodep
? &coding
->spec
.ccl
.encoder
: &coding
->spec
.ccl
.decoder
;
3721 ccl
->last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
3723 coding
->produced
= ccl_driver (ccl
, source
, destination
,
3724 src_bytes
, dst_bytes
, &(coding
->consumed
));
3725 coding
->produced_char
3728 : multibyte_chars_in_text (destination
, coding
->produced
));
3729 coding
->consumed_char
3730 = multibyte_chars_in_text (source
, coding
->consumed
);
3732 switch (ccl
->status
)
3734 case CCL_STAT_SUSPEND_BY_SRC
:
3735 result
= CODING_FINISH_INSUFFICIENT_SRC
;
3737 case CCL_STAT_SUSPEND_BY_DST
:
3738 result
= CODING_FINISH_INSUFFICIENT_DST
;
3741 case CCL_STAT_INVALID_CMD
:
3742 result
= CODING_FINISH_INTERRUPT
;
3745 result
= CODING_FINISH_NORMAL
;
3751 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3752 decoding, it may detect coding system and format of end-of-line if
3753 those are not yet decided.
3755 This function does not make full use of DESTINATION buffer. For
3756 instance, if coding->type is coding_type_iso2022, it uses only
3757 (DST_BYTES - 7) bytes of DESTINATION buffer. In the case that
3758 DST_BYTES is decided by the function decoding_buffer_size, it
3759 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3760 So, this function can decode the full SOURCE. But, in the other
3761 case, if you want to avoid carry over, you must supply at least 7
3762 bytes more area in DESTINATION buffer than expected maximum bytes
3763 that will be produced by this function. */
3766 decode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
3767 struct coding_system
*coding
;
3768 unsigned char *source
, *destination
;
3769 int src_bytes
, dst_bytes
;
3774 && coding
->type
!= coding_type_ccl
3775 && ! (coding
->mode
& CODING_MODE_LAST_BLOCK
3776 && CODING_REQUIRE_FLUSHING (coding
)))
3778 coding
->produced
= coding
->produced_char
= 0;
3779 coding
->consumed
= coding
->consumed_char
= 0;
3780 coding
->fake_multibyte
= 0;
3781 return CODING_FINISH_NORMAL
;
3784 if (coding
->type
== coding_type_undecided
)
3785 detect_coding (coding
, source
, src_bytes
);
3787 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
3788 detect_eol (coding
, source
, src_bytes
);
3790 switch (coding
->type
)
3792 case coding_type_emacs_mule
:
3793 case coding_type_undecided
:
3794 case coding_type_raw_text
:
3795 if (coding
->eol_type
== CODING_EOL_LF
3796 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
3797 goto label_no_conversion
;
3798 result
= decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
3801 case coding_type_sjis
:
3802 result
= decode_coding_sjis_big5 (coding
, source
, destination
,
3803 src_bytes
, dst_bytes
, 1);
3806 case coding_type_iso2022
:
3807 result
= decode_coding_iso2022 (coding
, source
, destination
,
3808 src_bytes
, dst_bytes
);
3811 case coding_type_big5
:
3812 result
= decode_coding_sjis_big5 (coding
, source
, destination
,
3813 src_bytes
, dst_bytes
, 0);
3816 case coding_type_ccl
:
3817 result
= ccl_coding_driver (coding
, source
, destination
,
3818 src_bytes
, dst_bytes
, 0);
3821 default: /* i.e. case coding_type_no_conversion: */
3822 label_no_conversion
:
3823 if (dst_bytes
&& src_bytes
> dst_bytes
)
3825 coding
->produced
= dst_bytes
;
3826 result
= CODING_FINISH_INSUFFICIENT_DST
;
3830 coding
->produced
= src_bytes
;
3831 result
= CODING_FINISH_NORMAL
;
3834 bcopy (source
, destination
, coding
->produced
);
3836 safe_bcopy (source
, destination
, coding
->produced
);
3837 coding
->fake_multibyte
= 1;
3839 = coding
->consumed_char
= coding
->produced_char
= coding
->produced
;
3846 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".
3848 This function does not make full use of DESTINATION buffer. For
3849 instance, if coding->type is coding_type_iso2022, it uses only
3850 (DST_BYTES - 20) bytes of DESTINATION buffer. In the case that
3851 DST_BYTES is decided by the function encoding_buffer_size, it
3852 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3853 So, this function can encode the full SOURCE. But, in the other
3854 case, if you want to avoid carry over, you must supply at least 20
3855 bytes more area in DESTINATION buffer than expected maximum bytes
3856 that will be produced by this function. */
3859 encode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
3860 struct coding_system
*coding
;
3861 unsigned char *source
, *destination
;
3862 int src_bytes
, dst_bytes
;
3867 && ! (coding
->mode
& CODING_MODE_LAST_BLOCK
3868 && CODING_REQUIRE_FLUSHING (coding
)))
3870 coding
->produced
= coding
->produced_char
= 0;
3871 coding
->consumed
= coding
->consumed_char
= 0;
3872 coding
->fake_multibyte
= 0;
3873 return CODING_FINISH_NORMAL
;
3876 switch (coding
->type
)
3878 case coding_type_emacs_mule
:
3879 case coding_type_undecided
:
3880 case coding_type_raw_text
:
3881 if (coding
->eol_type
== CODING_EOL_LF
3882 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
3883 goto label_no_conversion
;
3884 result
= encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
3887 case coding_type_sjis
:
3888 result
= encode_coding_sjis_big5 (coding
, source
, destination
,
3889 src_bytes
, dst_bytes
, 1);
3892 case coding_type_iso2022
:
3893 result
= encode_coding_iso2022 (coding
, source
, destination
,
3894 src_bytes
, dst_bytes
);
3897 case coding_type_big5
:
3898 result
= encode_coding_sjis_big5 (coding
, source
, destination
,
3899 src_bytes
, dst_bytes
, 0);
3902 case coding_type_ccl
:
3903 result
= ccl_coding_driver (coding
, source
, destination
,
3904 src_bytes
, dst_bytes
, 1);
3907 default: /* i.e. case coding_type_no_conversion: */
3908 label_no_conversion
:
3909 if (dst_bytes
&& src_bytes
> dst_bytes
)
3911 coding
->produced
= dst_bytes
;
3912 result
= CODING_FINISH_INSUFFICIENT_DST
;
3916 coding
->produced
= src_bytes
;
3917 result
= CODING_FINISH_NORMAL
;
3920 bcopy (source
, destination
, coding
->produced
);
3922 safe_bcopy (source
, destination
, coding
->produced
);
3923 if (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)
3925 unsigned char *p
= destination
, *pend
= p
+ coding
->produced
;
3927 if (*p
++ == '\015') p
[-1] = '\n';
3929 coding
->fake_multibyte
= 1;
3931 = coding
->consumed_char
= coding
->produced_char
= coding
->produced
;
3938 /* Scan text in the region between *BEG and *END (byte positions),
3939 skip characters which we don't have to decode by coding system
3940 CODING at the head and tail, then set *BEG and *END to the region
3941 of the text we actually have to convert. The caller should move
3942 the gap out of the region in advance.
3944 If STR is not NULL, *BEG and *END are indices into STR. */
3947 shrink_decoding_region (beg
, end
, coding
, str
)
3949 struct coding_system
*coding
;
3952 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
, c
;
3954 Lisp_Object translation_table
;
3956 if (coding
->type
== coding_type_ccl
3957 || coding
->type
== coding_type_undecided
3958 || !NILP (coding
->post_read_conversion
))
3960 /* We can't skip any data. */
3963 else if (coding
->type
== coding_type_no_conversion
)
3965 /* We need no conversion, but don't have to skip any data here.
3966 Decoding routine handles them effectively anyway. */
3970 translation_table
= coding
->translation_table_for_decode
;
3971 if (NILP (translation_table
) && !NILP (Venable_character_translation
))
3972 translation_table
= Vstandard_translation_table_for_decode
;
3973 if (CHAR_TABLE_P (translation_table
))
3976 for (i
= 0; i
< 128; i
++)
3977 if (!NILP (CHAR_TABLE_REF (translation_table
, i
)))
3980 /* Some ASCII character should be translated. We give up
3985 eol_conversion
= (coding
->eol_type
!= CODING_EOL_LF
);
3987 if ((! eol_conversion
) && (coding
->heading_ascii
>= 0))
3988 /* Detection routine has already found how much we can skip at the
3990 *beg
+= coding
->heading_ascii
;
3994 begp_orig
= begp
= str
+ *beg
;
3995 endp_orig
= endp
= str
+ *end
;
3999 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
4000 endp_orig
= endp
= begp
+ *end
- *beg
;
4003 switch (coding
->type
)
4005 case coding_type_emacs_mule
:
4006 case coding_type_raw_text
:
4009 if (coding
->heading_ascii
< 0)
4010 while (begp
< endp
&& *begp
!= '\r' && *begp
< 0x80) begp
++;
4011 while (begp
< endp
&& endp
[-1] != '\r' && endp
[-1] < 0x80)
4013 /* Do not consider LF as ascii if preceded by CR, since that
4014 confuses eol decoding. */
4015 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
4022 case coding_type_sjis
:
4023 case coding_type_big5
:
4024 /* We can skip all ASCII characters at the head. */
4025 if (coding
->heading_ascii
< 0)
4028 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\r') begp
++;
4030 while (begp
< endp
&& *begp
< 0x80) begp
++;
4032 /* We can skip all ASCII characters at the tail except for the
4033 second byte of SJIS or BIG5 code. */
4035 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\r') endp
--;
4037 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
4038 /* Do not consider LF as ascii if preceded by CR, since that
4039 confuses eol decoding. */
4040 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
4042 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] >= 0x80)
4046 default: /* i.e. case coding_type_iso2022: */
4047 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, 0) != CHARSET_ASCII
)
4048 /* We can't skip any data. */
4050 if (coding
->heading_ascii
< 0)
4052 /* We can skip all ASCII characters at the head except for a
4053 few control codes. */
4054 while (begp
< endp
&& (c
= *begp
) < 0x80
4055 && c
!= ISO_CODE_CR
&& c
!= ISO_CODE_SO
4056 && c
!= ISO_CODE_SI
&& c
!= ISO_CODE_ESC
4057 && (!eol_conversion
|| c
!= ISO_CODE_LF
))
4060 switch (coding
->category_idx
)
4062 case CODING_CATEGORY_IDX_ISO_8_1
:
4063 case CODING_CATEGORY_IDX_ISO_8_2
:
4064 /* We can skip all ASCII characters at the tail. */
4066 while (begp
< endp
&& (c
= endp
[-1]) < 0x80 && c
!= '\r') endp
--;
4068 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
4069 /* Do not consider LF as ascii if preceded by CR, since that
4070 confuses eol decoding. */
4071 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
4075 case CODING_CATEGORY_IDX_ISO_7
:
4076 case CODING_CATEGORY_IDX_ISO_7_TIGHT
:
4078 /* We can skip all characters at the tail except for 8-bit
4079 codes, and ESC and the following 2 bytes at the tail. */
4080 unsigned char *eight_bit
= NULL
;
4084 && (c
= endp
[-1]) != ISO_CODE_ESC
&& c
!= '\r')
4086 if (!eight_bit
&& c
& 0x80) eight_bit
= endp
;
4091 && (c
= endp
[-1]) != ISO_CODE_ESC
)
4093 if (!eight_bit
&& c
& 0x80) eight_bit
= endp
;
4096 /* Do not consider LF as ascii if preceded by CR, since that
4097 confuses eol decoding. */
4098 if (begp
< endp
&& endp
< endp_orig
4099 && endp
[-1] == '\r' && endp
[0] == '\n')
4101 if (begp
< endp
&& endp
[-1] == ISO_CODE_ESC
)
4103 if (endp + 1 < endp_orig
    /* Inspect the two bytes that follow the ESC through ENDP (ENDP[-1]
       is the ESC itself).  END is the int * byte-position argument, so
       indexing it compared a buffer position, and the int stored after
       it, against '(' and 'B' instead of looking at the text.  */
    && endp[0] == '(' && endp[1] == 'B')
4104 /* This is an ASCII designation sequence. We can
4105 surely skip the tail. But, if we have
4106 encountered an 8-bit code, skip only the codes
4108 endp
= eight_bit
? eight_bit
: endp
+ 2;
4110 /* Hmmm, we can't skip the tail. */
4118 *beg
+= begp
- begp_orig
;
4119 *end
+= endp
- endp_orig
;
4123 /* Like shrink_decoding_region but for encoding. */
4126 shrink_encoding_region (beg
, end
, coding
, str
)
4128 struct coding_system
*coding
;
4131 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
;
4133 Lisp_Object translation_table
;
4135 if (coding
->type
== coding_type_ccl
)
4136 /* We can't skip any data. */
4138 else if (coding
->type
== coding_type_no_conversion
)
4140 /* We need no conversion. */
4145 translation_table
= coding
->translation_table_for_encode
;
4146 if (NILP (translation_table
) && !NILP (Venable_character_translation
))
4147 translation_table
= Vstandard_translation_table_for_encode
;
4148 if (CHAR_TABLE_P (translation_table
))
4151 for (i
= 0; i
< 128; i
++)
4152 if (!NILP (CHAR_TABLE_REF (translation_table
, i
)))
4155 /* Some ASCII character should be translated. We give up
4162 begp_orig
= begp
= str
+ *beg
;
4163 endp_orig
= endp
= str
+ *end
;
4167 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
4168 endp_orig
= endp
= begp
+ *end
- *beg
;
4171 eol_conversion
= (coding
->eol_type
== CODING_EOL_CR
4172 || coding
->eol_type
== CODING_EOL_CRLF
);
4174 /* Here, we don't have to check coding->pre_write_conversion because
4175 the caller is expected to have handled it already. */
4176 switch (coding
->type
)
4178 case coding_type_undecided
:
4179 case coding_type_emacs_mule
:
4180 case coding_type_raw_text
:
4183 while (begp
< endp
&& *begp
!= '\n') begp
++;
4184 while (begp
< endp
&& endp
[-1] != '\n') endp
--;
4190 case coding_type_iso2022
:
4191 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, 0) != CHARSET_ASCII
)
4192 /* We can't skip any data. */
4194 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
4196 unsigned char *bol
= begp
;
4197 while (begp
< endp
&& *begp
< 0x80)
4200 if (begp
[-1] == '\n')
4204 goto label_skip_tail
;
4209 /* We can skip all ASCII characters at the head and tail. */
4211 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\n') begp
++;
4213 while (begp
< endp
&& *begp
< 0x80) begp
++;
4216 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\n') endp
--;
4218 while (begp
< endp
&& *(endp
- 1) < 0x80) endp
--;
4222 *beg
+= begp
- begp_orig
;
4223 *end
+= endp
- endp_orig
;
4227 /* As shrinking conversion region requires some overhead, we don't try
4228 shrinking if the length of conversion region is less than this
4230 static int shrink_conversion_region_threshhold
= 1024;
4232 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep) \
4234 if (*(end) - *(beg) > shrink_conversion_region_threshhold) \
4236 if (encodep) shrink_encoding_region (beg, end, coding, str); \
4237 else shrink_decoding_region (beg, end, coding, str); \
4242 code_convert_region_unwind (dummy
)
4245 inhibit_pre_post_conversion
= 0;
4249 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4250 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4251 coding system CODING, and return the status code of code conversion
4252 (currently, this value has no meaning).
4254 How many characters (and bytes) are converted to how many
4255 characters (and bytes) are recorded in members of the structure
4258 If REPLACE is nonzero, we do various things as if the original text
4259 is deleted and a new text is inserted. See the comments in
4260 replace_range (insdel.c) to know what we are doing. */
4263 code_convert_region (from
, from_byte
, to
, to_byte
, coding
, encodep
, replace
)
4264 int from
, from_byte
, to
, to_byte
, encodep
, replace
;
4265 struct coding_system
*coding
;
4267 int len
= to
- from
, len_byte
= to_byte
- from_byte
;
4268 int require
, inserted
, inserted_byte
;
4269 int head_skip
, tail_skip
, total_skip
;
4270 Lisp_Object saved_coding_symbol
;
4271 int multibyte
= !NILP (current_buffer
->enable_multibyte_characters
);
4273 int fake_multibyte
= 0;
4274 unsigned char *src
, *dst
;
4275 Lisp_Object deletion
;
4276 int orig_point
= PT
, orig_len
= len
;
4280 saved_coding_symbol
= Qnil
;
4282 if (from
< PT
&& PT
< to
)
4284 TEMP_SET_PT_BOTH (from
, from_byte
);
4290 int saved_from
= from
;
4292 prepare_to_modify_buffer (from
, to
, &from
);
4293 if (saved_from
!= from
)
4297 from_byte
= CHAR_TO_BYTE (from
), to_byte
= CHAR_TO_BYTE (to
);
4299 from_byte
= from
, to_byte
= to
;
4300 len_byte
= to_byte
- from_byte
;
4304 if (! encodep
&& CODING_REQUIRE_DETECTION (coding
))
4306 /* We must detect encoding of text and eol format. */
4308 if (from
< GPT
&& to
> GPT
)
4309 move_gap_both (from
, from_byte
);
4310 if (coding
->type
== coding_type_undecided
)
4312 detect_coding (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
4313 if (coding
->type
== coding_type_undecided
)
4314 /* It seems that the text contains only ASCII, but we
4315 should not leave it undecided, because otherwise the deeper
4316 decoding routine (decode_coding) would try to detect the
4317 encoding again in vain. */
4318 coding
->type
= coding_type_emacs_mule
;
4320 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4322 saved_coding_symbol
= coding
->symbol
;
4323 detect_eol (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
4324 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4325 coding
->eol_type
= CODING_EOL_LF
;
4326 /* We had better recover the original eol format if we
4327 encounter an inconsistent eol format while decoding. */
4328 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
4332 coding
->consumed_char
= len
, coding
->consumed
= len_byte
;
4335 ? ! CODING_REQUIRE_ENCODING (coding
)
4336 : ! CODING_REQUIRE_DECODING (coding
))
4338 coding
->produced
= len_byte
;
4341 /* See the comment of the member heading_ascii in coding.h. */
4342 && coding
->heading_ascii
< len_byte
)
4344 /* We still may have to combine byte at the head and the
4345 tail of the text in the region. */
4346 if (from
< GPT
&& GPT
< to
)
4347 move_gap_both (to
, to_byte
);
4348 len
= multibyte_chars_in_text (BYTE_POS_ADDR (from_byte
), len_byte
);
4349 adjust_after_insert (from
, from_byte
, to
, to_byte
, len
);
4350 coding
->produced_char
= len
;
4355 adjust_after_insert (from
, from_byte
, to
, to_byte
, len_byte
);
4356 coding
->produced_char
= len_byte
;
4361 /* Now we convert the text. */
4363 /* For encoding, we must process pre-write-conversion in advance. */
4365 && ! NILP (coding
->pre_write_conversion
)
4366 && SYMBOLP (coding
->pre_write_conversion
)
4367 && ! NILP (Ffboundp (coding
->pre_write_conversion
)))
4369 /* The function in pre-write-conversion may put a new text in a
4371 struct buffer
*prev
= current_buffer
;
4373 int count
= specpdl_ptr
- specpdl
;
4375 record_unwind_protect (code_convert_region_unwind
, Qnil
);
4376 /* We should not call any more pre-write/post-read-conversion
4377 functions while this pre-write-conversion is running. */
4378 inhibit_pre_post_conversion
= 1;
4379 call2 (coding
->pre_write_conversion
,
4380 make_number (from
), make_number (to
));
4381 inhibit_pre_post_conversion
= 0;
4382 /* Discard the unwind protect. */
4385 if (current_buffer
!= prev
)
4388 new = Fcurrent_buffer ();
4389 set_buffer_internal_1 (prev
);
4390 del_range_2 (from
, from_byte
, to
, to_byte
);
4391 TEMP_SET_PT_BOTH (from
, from_byte
);
4392 insert_from_buffer (XBUFFER (new), 1, len
, 0);
4394 if (orig_point
>= to
)
4395 orig_point
+= len
- orig_len
;
4396 else if (orig_point
> from
)
4400 from_byte
= multibyte
? CHAR_TO_BYTE (from
) : from_byte
;
4401 to_byte
= multibyte
? CHAR_TO_BYTE (to
) : to
;
4402 len_byte
= to_byte
- from_byte
;
4403 TEMP_SET_PT_BOTH (from
, from_byte
);
4408 deletion
= make_buffer_string_both (from
, from_byte
, to
, to_byte
, 1);
4410 /* Try to skip the heading and tailing ASCIIs. */
4412 int from_byte_orig
= from_byte
, to_byte_orig
= to_byte
;
4414 if (from
< GPT
&& GPT
< to
)
4415 move_gap_both (from
, from_byte
);
4416 SHRINK_CONVERSION_REGION (&from_byte
, &to_byte
, coding
, NULL
, encodep
);
4417 if (from_byte
== to_byte
4418 && coding
->type
!= coding_type_ccl
4419 && ! (coding
->mode
& CODING_MODE_LAST_BLOCK
4420 && CODING_REQUIRE_FLUSHING (coding
)))
4422 coding
->produced
= len_byte
;
4423 coding
->produced_char
= multibyte
? len
: len_byte
;
4425 /* We must record and adjust for this new text now. */
4426 adjust_after_insert (from
, from_byte_orig
, to
, to_byte_orig
, len
);
4430 head_skip
= from_byte
- from_byte_orig
;
4431 tail_skip
= to_byte_orig
- to_byte
;
4432 total_skip
= head_skip
+ tail_skip
;
4435 len
-= total_skip
; len_byte
-= total_skip
;
4438 /* The code conversion routine can not preserve text properties for
4439 now. So, we must remove all text properties in the region.
4440 Here, we must suppress all modification hooks. */
4443 int saved_inhibit_modification_hooks
= inhibit_modification_hooks
;
4444 inhibit_modification_hooks
= 1;
4445 Fset_text_properties (make_number (from
), make_number (to
), Qnil
, Qnil
);
4446 inhibit_modification_hooks
= saved_inhibit_modification_hooks
;
4449 /* For conversion, we must put the gap before the text in addition to
4450 making the gap larger for efficient decoding. The required gap
4451 size starts from 2000, which is the magic number used in make_gap.
4452 But, after one batch of conversion, it will be incremented if we
4453 find that it is not enough. */
4456 if (GAP_SIZE
< require
)
4457 make_gap (require
- GAP_SIZE
);
4458 move_gap_both (from
, from_byte
);
4460 inserted
= inserted_byte
= 0;
4461 src
= GAP_END_ADDR
, dst
= GPT_ADDR
;
4463 GAP_SIZE
+= len_byte
;
4466 ZV_BYTE
-= len_byte
;
4469 if (GPT
- BEG
< BEG_UNCHANGED
)
4470 BEG_UNCHANGED
= GPT
- BEG
;
4471 if (Z
- GPT
< END_UNCHANGED
)
4472 END_UNCHANGED
= Z
- GPT
;
4478 /* The buffer memory is changed from:
4479 +--------+converted-text+---------+-------original-text------+---+
4480 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4481 |<------------------- GAP_SIZE -------------------->| */
4483 result
= encode_coding (coding
, src
, dst
, len_byte
, 0);
4485 result
= decode_coding (coding
, src
, dst
, len_byte
, 0);
4487 +--------+-------converted-text--------+--+---original-text--+---+
4488 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4489 |<------------------- GAP_SIZE -------------------->| */
4490 if (coding
->fake_multibyte
)
4493 if (!encodep
&& !multibyte
)
4494 coding
->produced_char
= coding
->produced
;
4495 inserted
+= coding
->produced_char
;
4496 inserted_byte
+= coding
->produced
;
4497 len_byte
-= coding
->consumed
;
4498 src
+= coding
->consumed
;
4499 dst
+= coding
->produced
;
4501 if (result
== CODING_FINISH_NORMAL
)
4506 if (! encodep
&& result
== CODING_FINISH_INCONSISTENT_EOL
)
4508 unsigned char *pend
= dst
, *p
= pend
- inserted_byte
;
4509 Lisp_Object eol_type
;
4511 /* Encode LFs back to the original eol format (CR or CRLF). */
4512 if (coding
->eol_type
== CODING_EOL_CR
)
4514 while (p
< pend
) if (*p
++ == '\n') p
[-1] = '\r';
4520 while (p
< pend
) if (*p
++ == '\n') count
++;
4521 if (src
- dst
< count
)
4523 /* We don't have sufficient room for encoding LFs
4524 back to CRLF. We must record converted and
4525 not-yet-converted text back to the buffer
4526 content, enlarge the gap, then record them out of
4527 the buffer contents again. */
4528 int add
= len_byte
+ inserted_byte
;
4531 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
4532 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4533 make_gap (count
- GAP_SIZE
);
4535 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
4536 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4537 /* Don't forget to update SRC, DST, and PEND. */
4538 src
= GAP_END_ADDR
- len_byte
;
4539 dst
= GPT_ADDR
+ inserted_byte
;
4543 inserted_byte
+= count
;
4544 coding
->produced
+= count
;
4545 p
= dst
= pend
+ count
;
4549 if (*p
== '\n') count
--, *--p
= '\r';
4553 /* Suppress eol-format conversion in the further conversion. */
4554 coding
->eol_type
= CODING_EOL_LF
;
4556 /* Set the coding system symbol to that for Unix-like EOL. */
4557 eol_type
= Fget (saved_coding_symbol
, Qeol_type
);
4558 if (VECTORP (eol_type
)
4559 && XVECTOR (eol_type
)->size
== 3
4560 && SYMBOLP (XVECTOR (eol_type
)->contents
[CODING_EOL_LF
]))
4561 coding
->symbol
= XVECTOR (eol_type
)->contents
[CODING_EOL_LF
];
4563 coding
->symbol
= saved_coding_symbol
;
4569 if (coding
->type
!= coding_type_ccl
4570 || coding
->mode
& CODING_MODE_LAST_BLOCK
)
4572 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
4575 if (result
== CODING_FINISH_INSUFFICIENT_SRC
)
4577 /* The source text ends in invalid codes. Let's just
4578 make them valid buffer contents, and finish conversion. */
4579 inserted
+= len_byte
;
4580 inserted_byte
+= len_byte
;
4586 if (result
== CODING_FINISH_INTERRUPT
)
4588 /* The conversion procedure was interrupted by a user. */
4592 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4593 if (coding
->consumed
< 1)
4595 /* It's quite strange to require more memory without
4596 consuming any bytes. Perhaps CCL program bug. */
4602 /* We have just done the first batch of conversion which was
4603 stopped because of insufficient gap. Let's reconsider the
4604 required gap size (i.e. SRC - DST) now.
4606 We have converted ORIG bytes (== coding->consumed) into
4607 NEW bytes (coding->produced). To convert the remaining
4608 LEN bytes, we may need REQUIRE bytes of gap, where:
4609 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4610 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4611 Here, we are sure that NEW >= ORIG. */
4612 float ratio
= coding
->produced
- coding
->consumed
;
4613 ratio
/= coding
->consumed
;
4614 require
= len_byte
* ratio
;
4617 if ((src
- dst
) < (require
+ 2000))
4619 /* See the comment above the previous call of make_gap. */
4620 int add
= len_byte
+ inserted_byte
;
4623 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
4624 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4625 make_gap (require
+ 2000);
4627 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
4628 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4629 /* Don't forget to update SRC, DST. */
4630 src
= GAP_END_ADDR
- len_byte
;
4631 dst
= GPT_ADDR
+ inserted_byte
;
4634 if (src
- dst
> 0) *dst
= 0; /* Put an anchor. */
4639 || (to
- from
) != (to_byte
- from_byte
)))
4640 inserted
= multibyte_chars_in_text (GPT_ADDR
, inserted_byte
);
4642 /* If we have shrunk the conversion area, adjust it now. */
4646 safe_bcopy (GAP_END_ADDR
, GPT_ADDR
+ inserted_byte
, tail_skip
);
4647 inserted
+= total_skip
; inserted_byte
+= total_skip
;
4648 GAP_SIZE
+= total_skip
;
4649 GPT
-= head_skip
; GPT_BYTE
-= head_skip
;
4650 ZV
-= total_skip
; ZV_BYTE
-= total_skip
;
4651 Z
-= total_skip
; Z_BYTE
-= total_skip
;
4652 from
-= head_skip
; from_byte
-= head_skip
;
4653 to
+= tail_skip
; to_byte
+= tail_skip
;
4657 adjust_after_replace (from
, from_byte
, deletion
, inserted
, inserted_byte
);
4658 inserted
= Z
- prev_Z
;
4660 if (! encodep
&& ! NILP (coding
->post_read_conversion
))
4663 int count
= specpdl_ptr
- specpdl
;
4666 TEMP_SET_PT_BOTH (from
, from_byte
);
4668 record_unwind_protect (code_convert_region_unwind
, Qnil
);
4669 /* We should not call any more pre-write/post-read-conversion
4670 functions while this post-read-conversion is running. */
4671 inhibit_pre_post_conversion
= 1;
4672 val
= call1 (coding
->post_read_conversion
, make_number (inserted
));
4673 inhibit_pre_post_conversion
= 0;
4674 /* Discard the unwind protect. */
4676 CHECK_NUMBER (val
, 0);
4677 inserted
+= Z
- prev_Z
;
4680 if (orig_point
>= from
)
4682 if (orig_point
>= from
+ orig_len
)
4683 orig_point
+= inserted
- orig_len
;
4686 TEMP_SET_PT (orig_point
);
4689 signal_after_change (from
, to
- from
, inserted
);
4692 coding
->consumed
= to_byte
- from_byte
;
4693 coding
->consumed_char
= to
- from
;
4694 coding
->produced
= inserted_byte
;
4695 coding
->produced_char
= inserted
;
4702 code_convert_string (str
, coding
, encodep
, nocopy
)
4704 struct coding_system
*coding
;
4705 int encodep
, nocopy
;
4709 int from
= 0, to
= XSTRING (str
)->size
;
4710 int to_byte
= STRING_BYTES (XSTRING (str
));
4711 struct gcpro gcpro1
;
4712 Lisp_Object saved_coding_symbol
;
4715 saved_coding_symbol
= Qnil
;
4716 if ((encodep
&& !NILP (coding
->pre_write_conversion
)
4717 || !encodep
&& !NILP (coding
->post_read_conversion
)))
4719 /* Since we have to call Lisp functions which assume target text
4720 is in a buffer, after setting a temporary buffer, call
4721 code_convert_region. */
4722 int count
= specpdl_ptr
- specpdl
;
4723 struct buffer
*prev
= current_buffer
;
4724 int multibyte
= STRING_MULTIBYTE (str
);
4726 record_unwind_protect (Fset_buffer
, Fcurrent_buffer ());
4727 record_unwind_protect (code_convert_region_unwind
, Qnil
);
4728 inhibit_pre_post_conversion
= 1;
4730 temp_output_buffer_setup (" *code-converting-work*");
4731 set_buffer_internal (XBUFFER (Vstandard_output
));
4732 /* We must insert the contents of STR as is without
4733 unibyte<->multibyte conversion. For that, we adjust the
4734 multibyteness of the working buffer to that of STR. */
4735 Ferase_buffer (); /* for safety */
4736 current_buffer
->enable_multibyte_characters
= multibyte
? Qt
: Qnil
;
4737 insert_from_string (str
, 0, 0, to
, to_byte
, 0);
4739 code_convert_region (BEGV
, BEGV_BYTE
, ZV
, ZV_BYTE
, coding
, encodep
, 1);
4740 /* Make a unibyte string if we are encoding, otherwise make a
4741 multibyte string. */
4742 Fset_buffer_multibyte (encodep
? Qnil
: Qt
);
4743 str
= make_buffer_string (BEGV
, ZV
, 0);
4744 return unbind_to (count
, str
);
4747 if (! encodep
&& CODING_REQUIRE_DETECTION (coding
))
4749 /* See the comments in code_convert_region. */
4750 if (coding
->type
== coding_type_undecided
)
4752 detect_coding (coding
, XSTRING (str
)->data
, to_byte
);
4753 if (coding
->type
== coding_type_undecided
)
4754 coding
->type
= coding_type_emacs_mule
;
4756 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4758 saved_coding_symbol
= coding
->symbol
;
4759 detect_eol (coding
, XSTRING (str
)->data
, to_byte
);
4760 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4761 coding
->eol_type
= CODING_EOL_LF
;
4762 /* We had better recover the original eol format if we
4763 encounter an inconsitent eol format while decoding. */
4764 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
4769 ? ! CODING_REQUIRE_ENCODING (coding
)
4770 : ! CODING_REQUIRE_DECODING (coding
))
4774 /* Try to skip the heading and tailing ASCIIs. */
4775 SHRINK_CONVERSION_REGION (&from
, &to_byte
, coding
, XSTRING (str
)->data
,
4779 && coding
->type
!= coding_type_ccl
)
4780 return (nocopy
? str
: Fcopy_sequence (str
));
4783 len
= encoding_buffer_size (coding
, to_byte
- from
);
4785 len
= decoding_buffer_size (coding
, to_byte
- from
);
4786 len
+= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
4788 buf
= get_conversion_buffer (len
);
4792 bcopy (XSTRING (str
)->data
, buf
, from
);
4794 ? encode_coding (coding
, XSTRING (str
)->data
+ from
,
4795 buf
+ from
, to_byte
- from
, len
)
4796 : decode_coding (coding
, XSTRING (str
)->data
+ from
,
4797 buf
+ from
, to_byte
- from
, len
));
4798 if (! encodep
&& result
== CODING_FINISH_INCONSISTENT_EOL
)
4800 /* We simple try to decode the whole string again but without
4801 eol-conversion this time. */
4802 coding
->eol_type
= CODING_EOL_LF
;
4803 coding
->symbol
= saved_coding_symbol
;
4804 return code_convert_string (str
, coding
, encodep
, nocopy
);
4807 bcopy (XSTRING (str
)->data
+ to_byte
, buf
+ from
+ coding
->produced
,
4808 STRING_BYTES (XSTRING (str
)) - to_byte
);
4810 len
= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
4812 str
= make_unibyte_string (buf
, len
+ coding
->produced
);
4815 int chars
= (coding
->fake_multibyte
4816 ? multibyte_chars_in_text (buf
+ from
, coding
->produced
)
4817 : coding
->produced_char
);
4818 str
= make_multibyte_string (buf
, len
+ chars
, len
+ coding
->produced
);
4826 /*** 8. Emacs Lisp library functions ***/
4828 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
4829 "Return t if OBJECT is nil or a coding-system.\n\
4830 See the documentation of `make-coding-system' for information\n\
4831 about coding-system objects.")
4839 /* Get coding-spec vector for OBJ. */
4840 obj
= Fget (obj
, Qcoding_system
);
4841 return ((VECTORP (obj
) && XVECTOR (obj
)->size
== 5)
4845 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
4846 Sread_non_nil_coding_system
, 1, 1, 0,
4847 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4854 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
4855 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
4857 while (XSTRING (val
)->size
== 0);
4858 return (Fintern (val
, Qnil
));
4861 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
4862 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4863 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4864 (prompt
, default_coding_system
)
4865 Lisp_Object prompt
, default_coding_system
;
4868 if (SYMBOLP (default_coding_system
))
4869 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
4870 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
4871 Qt
, Qnil
, Qcoding_system_history
,
4872 default_coding_system
, Qnil
);
4873 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
4876 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
4878 "Check validity of CODING-SYSTEM.\n\
4879 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4880 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4881 The value of property should be a vector of length 5.")
4883 Lisp_Object coding_system
;
4885 CHECK_SYMBOL (coding_system
, 0);
4886 if (!NILP (Fcoding_system_p (coding_system
)))
4887 return coding_system
;
4889 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
4893 detect_coding_system (src
, src_bytes
, highest
)
4895 int src_bytes
, highest
;
4897 int coding_mask
, eol_type
;
4898 Lisp_Object val
, tmp
;
4901 coding_mask
= detect_coding_mask (src
, src_bytes
, NULL
, &dummy
);
4902 eol_type
= detect_eol_type (src
, src_bytes
, &dummy
);
4903 if (eol_type
== CODING_EOL_INCONSISTENT
)
4904 eol_type
= CODING_EOL_UNDECIDED
;
4909 if (eol_type
!= CODING_EOL_UNDECIDED
)
4912 val2
= Fget (Qundecided
, Qeol_type
);
4914 val
= XVECTOR (val2
)->contents
[eol_type
];
4916 return (highest
? val
: Fcons (val
, Qnil
));
4919 /* At first, gather possible coding systems in VAL. */
4921 for (tmp
= Vcoding_category_list
; !NILP (tmp
); tmp
= XCDR (tmp
))
4924 = XFASTINT (Fget (XCAR (tmp
), Qcoding_category_index
));
4925 if (coding_mask
& (1 << idx
))
4927 val
= Fcons (Fsymbol_value (XCAR (tmp
)), val
);
4933 val
= Fnreverse (val
);
4935 /* Then, replace the elements with subsidiary coding systems. */
4936 for (tmp
= val
; !NILP (tmp
); tmp
= XCDR (tmp
))
4938 if (eol_type
!= CODING_EOL_UNDECIDED
4939 && eol_type
!= CODING_EOL_INCONSISTENT
)
4942 eol
= Fget (XCAR (tmp
), Qeol_type
);
4944 XCAR (tmp
) = XVECTOR (eol
)->contents
[eol_type
];
4947 return (highest
? XCAR (val
) : val
);
4950 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
4952 "Detect coding system of the text in the region between START and END.\n\
4953 Return a list of possible coding systems ordered by priority.\n\
4955 If only ASCII characters are found, it returns a list of single element\n\
4956 `undecided' or its subsidiary coding system according to a detected\n\
4957 end-of-line format.\n\
4959 If optional argument HIGHEST is non-nil, return the coding system of\n\
4961 (start
, end
, highest
)
4962 Lisp_Object start
, end
, highest
;
4965 int from_byte
, to_byte
;
4967 CHECK_NUMBER_COERCE_MARKER (start
, 0);
4968 CHECK_NUMBER_COERCE_MARKER (end
, 1);
4970 validate_region (&start
, &end
);
4971 from
= XINT (start
), to
= XINT (end
);
4972 from_byte
= CHAR_TO_BYTE (from
);
4973 to_byte
= CHAR_TO_BYTE (to
);
4975 if (from
< GPT
&& to
>= GPT
)
4976 move_gap_both (to
, to_byte
);
4978 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
4979 to_byte
- from_byte
,
4983 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
4985 "Detect coding system of the text in STRING.\n\
4986 Return a list of possible coding systems ordered by priority.\n\
4988 If only ASCII characters are found, it returns a list of single element\n\
4989 `undecided' or its subsidiary coding system according to a detected\n\
4990 end-of-line format.\n\
4992 If optional argument HIGHEST is non-nil, return the coding system of\n\
4995 Lisp_Object string
, highest
;
4997 CHECK_STRING (string
, 0);
4999 return detect_coding_system (XSTRING (string
)->data
,
5000 STRING_BYTES (XSTRING (string
)),
5005 code_convert_region1 (start
, end
, coding_system
, encodep
)
5006 Lisp_Object start
, end
, coding_system
;
5009 struct coding_system coding
;
5012 CHECK_NUMBER_COERCE_MARKER (start
, 0);
5013 CHECK_NUMBER_COERCE_MARKER (end
, 1);
5014 CHECK_SYMBOL (coding_system
, 2);
5016 validate_region (&start
, &end
);
5017 from
= XFASTINT (start
);
5018 to
= XFASTINT (end
);
5020 if (NILP (coding_system
))
5021 return make_number (to
- from
);
5023 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
5024 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
5026 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
5027 code_convert_region (from
, CHAR_TO_BYTE (from
), to
, CHAR_TO_BYTE (to
),
5028 &coding
, encodep
, 1);
5029 Vlast_coding_system_used
= coding
.symbol
;
5030 return make_number (coding
.produced_char
);
5033 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
5034 3, 3, "r\nzCoding system: ",
5035 "Decode the current region by specified coding system.\n\
5036 When called from a program, takes three arguments:\n\
5037 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5038 This function sets `last-coding-system-used' to the precise coding system\n\
5039 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5040 not fully specified.)\n\
5041 It returns the length of the decoded text.")
5042 (start
, end
, coding_system
)
5043 Lisp_Object start
, end
, coding_system
;
5045 return code_convert_region1 (start
, end
, coding_system
, 0);
5048 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
5049 3, 3, "r\nzCoding system: ",
5050 "Encode the current region by specified coding system.\n\
5051 When called from a program, takes three arguments:\n\
5052 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5053 This function sets `last-coding-system-used' to the precise coding system\n\
5054 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5055 not fully specified.)\n\
5056 It returns the length of the encoded text.")
5057 (start
, end
, coding_system
)
5058 Lisp_Object start
, end
, coding_system
;
5060 return code_convert_region1 (start
, end
, coding_system
, 1);
5064 code_convert_string1 (string
, coding_system
, nocopy
, encodep
)
5065 Lisp_Object string
, coding_system
, nocopy
;
5068 struct coding_system coding
;
5070 CHECK_STRING (string
, 0);
5071 CHECK_SYMBOL (coding_system
, 1);
5073 if (NILP (coding_system
))
5074 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
5076 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
5077 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
5079 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
5080 Vlast_coding_system_used
= coding
.symbol
;
5081 return code_convert_string (string
, &coding
, encodep
, !NILP (nocopy
));
5084 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
5086 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5087 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5088 if the decoding operation is trivial.\n\
5089 This function sets `last-coding-system-used' to the precise coding system\n\
5090 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5091 not fully specified.)")
5092 (string
, coding_system
, nocopy
)
5093 Lisp_Object string
, coding_system
, nocopy
;
5095 return code_convert_string1 (string
, coding_system
, nocopy
, 0);
5098 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
5100 "Encode STRING to CODING-SYSTEM, and return the result.\n\
5101 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5102 if the encoding operation is trivial.\n\
5103 This function sets `last-coding-system-used' to the precise coding system\n\
5104 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5105 not fully specified.)")
5106 (string
, coding_system
, nocopy
)
5107 Lisp_Object string
, coding_system
, nocopy
;
5109 return code_convert_string1 (string
, coding_system
, nocopy
, 1);
5112 /* Encode or decode STRING according to CODING_SYSTEM.
5113 Do not set Vlast_coding_system_used. */
5116 code_convert_string_norecord (string
, coding_system
, encodep
)
5117 Lisp_Object string
, coding_system
;
5120 struct coding_system coding
;
5122 CHECK_STRING (string
, 0);
5123 CHECK_SYMBOL (coding_system
, 1);
5125 if (NILP (coding_system
))
5128 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
5129 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
5131 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
5132 return code_convert_string (string
, &coding
, encodep
, Qt
);
5135 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
5136 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5137 Return the corresponding character.")
5141 unsigned char c1
, c2
, s1
, s2
;
5144 CHECK_NUMBER (code
, 0);
5145 s1
= (XFASTINT (code
)) >> 8, s2
= (XFASTINT (code
)) & 0xFF;
5149 XSETFASTINT (val
, s2
);
5150 else if (s2
>= 0xA0 || s2
<= 0xDF)
5152 MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201
, s2
, 0));
5154 error ("Invalid Shift JIS code: %x", XFASTINT (code
));
5158 if ((s1
< 0x80 || s1
> 0x9F && s1
< 0xE0 || s1
> 0xEF)
5159 || (s2
< 0x40 || s2
== 0x7F || s2
> 0xFC))
5160 error ("Invalid Shift JIS code: %x", XFASTINT (code
));
5161 DECODE_SJIS (s1
, s2
, c1
, c2
);
5162 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset_jisx0208
, c1
, c2
));
5167 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
5168 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5169 Return the corresponding code in SJIS.")
5173 int charset
, c1
, c2
, s1
, s2
;
5176 CHECK_NUMBER (ch
, 0);
5177 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
5178 if (charset
== CHARSET_ASCII
)
5182 else if (charset
== charset_jisx0208
5183 && c1
> 0x20 && c1
< 0x7F && c2
> 0x20 && c2
< 0x7F)
5185 ENCODE_SJIS (c1
, c2
, s1
, s2
);
5186 XSETFASTINT (val
, (s1
<< 8) | s2
);
5188 else if (charset
== charset_katakana_jisx0201
5189 && c1
> 0x20 && c2
< 0xE0)
5191 XSETFASTINT (val
, c1
| 0x80);
5194 error ("Can't encode to shift_jis: %d", XFASTINT (ch
));
5198 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
5199 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5200 Return the corresponding character.")
5205 unsigned char b1
, b2
, c1
, c2
;
5208 CHECK_NUMBER (code
, 0);
5209 b1
= (XFASTINT (code
)) >> 8, b2
= (XFASTINT (code
)) & 0xFF;
5213 error ("Invalid BIG5 code: %x", XFASTINT (code
));
5218 if ((b1
< 0xA1 || b1
> 0xFE)
5219 || (b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE))
5220 error ("Invalid BIG5 code: %x", XFASTINT (code
));
5221 DECODE_BIG5 (b1
, b2
, charset
, c1
, c2
);
5222 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset
, c1
, c2
));
5227 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
5228 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5229 Return the corresponding character code in Big5.")
5233 int charset
, c1
, c2
, b1
, b2
;
5236 CHECK_NUMBER (ch
, 0);
5237 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
5238 if (charset
== CHARSET_ASCII
)
5242 else if ((charset
== charset_big5_1
5243 && (XFASTINT (ch
) >= 0x250a1 && XFASTINT (ch
) <= 0x271ec))
5244 || (charset
== charset_big5_2
5245 && XFASTINT (ch
) >= 0x290a1 && XFASTINT (ch
) <= 0x2bdb2))
5247 ENCODE_BIG5 (charset
, c1
, c2
, b1
, b2
);
5248 XSETFASTINT (val
, (b1
<< 8) | b2
);
5251 error ("Can't encode to Big5: %d", XFASTINT (ch
));
5255 DEFUN ("set-terminal-coding-system-internal",
5256 Fset_terminal_coding_system_internal
,
5257 Sset_terminal_coding_system_internal
, 1, 1, 0, "")
5259 Lisp_Object coding_system
;
5261 CHECK_SYMBOL (coding_system
, 0);
5262 setup_coding_system (Fcheck_coding_system (coding_system
), &terminal_coding
);
5263 /* We had better not send unsafe characters to terminal. */
5264 terminal_coding
.flags
|= CODING_FLAG_ISO_SAFE
;
5269 DEFUN ("set-safe-terminal-coding-system-internal",
5270 Fset_safe_terminal_coding_system_internal
,
5271 Sset_safe_terminal_coding_system_internal
, 1, 1, 0, "")
5273 Lisp_Object coding_system
;
5275 CHECK_SYMBOL (coding_system
, 0);
5276 setup_coding_system (Fcheck_coding_system (coding_system
),
5277 &safe_terminal_coding
);
5281 DEFUN ("terminal-coding-system",
5282 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
5283 "Return coding system specified for terminal output.")
5286 return terminal_coding
.symbol
;
5289 DEFUN ("set-keyboard-coding-system-internal",
5290 Fset_keyboard_coding_system_internal
,
5291 Sset_keyboard_coding_system_internal
, 1, 1, 0, "")
5293 Lisp_Object coding_system
;
5295 CHECK_SYMBOL (coding_system
, 0);
5296 setup_coding_system (Fcheck_coding_system (coding_system
), &keyboard_coding
);
5300 DEFUN ("keyboard-coding-system",
5301 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
5302 "Return coding system specified for decoding keyboard input.")
5305 return keyboard_coding
.symbol
;
5309 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
5310 Sfind_operation_coding_system
, 1, MANY
, 0,
5311 "Choose a coding system for an operation based on the target name.\n\
5312 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5313 DECODING-SYSTEM is the coding system to use for decoding\n\
5314 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5315 for encoding (in case OPERATION does encoding).\n\
5317 The first argument OPERATION specifies an I/O primitive:\n\
5318 For file I/O, `insert-file-contents' or `write-region'.\n\
5319 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5320 For network I/O, `open-network-stream'.\n\
5322 The remaining arguments should be the same arguments that were passed\n\
5323 to the primitive. Depending on which primitive, one of those arguments\n\
5324 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5325 whichever argument specifies the file name is TARGET.\n\
5327 TARGET has a meaning which depends on OPERATION:\n\
5328 For file I/O, TARGET is a file name.\n\
5329 For process I/O, TARGET is a process name.\n\
5330 For network I/O, TARGET is a service name or a port number\n\
5332 This function looks up what specified for TARGET in,\n\
5333 `file-coding-system-alist', `process-coding-system-alist',\n\
5334 or `network-coding-system-alist' depending on OPERATION.\n\
5335 They may specify a coding system, a cons of coding systems,\n\
5336 or a function symbol to call.\n\
5337 In the last case, we call the function with one argument,\n\
5338 which is a list of all the arguments given to this function.")
5343 Lisp_Object operation
, target_idx
, target
, val
;
5344 register Lisp_Object chain
;
5347 error ("Too few arguments");
5348 operation
= args
[0];
5349 if (!SYMBOLP (operation
)
5350 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
5351 error ("Invalid first arguement");
5352 if (nargs
< 1 + XINT (target_idx
))
5353 error ("Too few arguments for operation: %s",
5354 XSYMBOL (operation
)->name
->data
);
5355 target
= args
[XINT (target_idx
) + 1];
5356 if (!(STRINGP (target
)
5357 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
5358 error ("Invalid %dth argument", XINT (target_idx
) + 1);
5360 chain
= ((EQ (operation
, Qinsert_file_contents
)
5361 || EQ (operation
, Qwrite_region
))
5362 ? Vfile_coding_system_alist
5363 : (EQ (operation
, Qopen_network_stream
)
5364 ? Vnetwork_coding_system_alist
5365 : Vprocess_coding_system_alist
));
5369 for (; CONSP (chain
); chain
= XCDR (chain
))
5375 && ((STRINGP (target
)
5376 && STRINGP (XCAR (elt
))
5377 && fast_string_match (XCAR (elt
), target
) >= 0)
5378 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
5381 /* Here, if VAL is both a valid coding system and a valid
5382 function symbol, we return VAL as a coding system. */
5385 if (! SYMBOLP (val
))
5387 if (! NILP (Fcoding_system_p (val
)))
5388 return Fcons (val
, val
);
5389 if (! NILP (Ffboundp (val
)))
5391 val
= call1 (val
, Flist (nargs
, args
));
5394 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
5395 return Fcons (val
, val
);
5403 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal
,
5404 Supdate_coding_systems_internal
, 0, 0, 0,
5405 "Update internal database for ISO2022 and CCL based coding systems.\n\
5406 When values of the following coding categories are changed, you must\n\
5407 call this function:\n\
5408 coding-category-iso-7, coding-category-iso-7-tight,\n\
5409 coding-category-iso-8-1, coding-category-iso-8-2,\n\
5410 coding-category-iso-7-else, coding-category-iso-8-else,\n\
5411 coding-category-ccl")
5416 for (i
= CODING_CATEGORY_IDX_ISO_7
; i
<= CODING_CATEGORY_IDX_CCL
; i
++)
5420 val
= XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[i
])->value
;
5423 if (! coding_system_table
[i
])
5424 coding_system_table
[i
] = ((struct coding_system
*)
5425 xmalloc (sizeof (struct coding_system
)));
5426 setup_coding_system (val
, coding_system_table
[i
]);
5428 else if (coding_system_table
[i
])
5430 xfree (coding_system_table
[i
]);
5431 coding_system_table
[i
] = NULL
;
5438 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal
,
5439 Sset_coding_priority_internal
, 0, 0, 0,
5440 "Update internal database for the current value of `coding-category-list'.\n\
5441 This function is internal use only.")
5447 val
= Vcoding_category_list
;
5449 while (CONSP (val
) && i
< CODING_CATEGORY_IDX_MAX
)
5451 if (! SYMBOLP (XCAR (val
)))
5453 idx
= XFASTINT (Fget (XCAR (val
), Qcoding_category_index
));
5454 if (idx
>= CODING_CATEGORY_IDX_MAX
)
5456 coding_priorities
[i
++] = (1 << idx
);
5459 /* If coding-category-list is valid and contains all coding
5460 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5461 the following code saves Emacs from craching. */
5462 while (i
< CODING_CATEGORY_IDX_MAX
)
5463 coding_priorities
[i
++] = CODING_CATEGORY_MASK_RAW_TEXT
;
5471 /*** 9. Post-amble ***/
5476 conversion_buffer
= (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE
);
5484 /* Emacs' internal format specific initialize routine. */
5485 for (i
= 0; i
<= 0x20; i
++)
5486 emacs_code_class
[i
] = EMACS_control_code
;
5487 emacs_code_class
[0x0A] = EMACS_linefeed_code
;
5488 emacs_code_class
[0x0D] = EMACS_carriage_return_code
;
5489 for (i
= 0x21 ; i
< 0x7F; i
++)
5490 emacs_code_class
[i
] = EMACS_ascii_code
;
5491 emacs_code_class
[0x7F] = EMACS_control_code
;
5492 emacs_code_class
[0x80] = EMACS_leading_code_composition
;
5493 for (i
= 0x81; i
< 0xFF; i
++)
5494 emacs_code_class
[i
] = EMACS_invalid_code
;
5495 emacs_code_class
[LEADING_CODE_PRIVATE_11
] = EMACS_leading_code_3
;
5496 emacs_code_class
[LEADING_CODE_PRIVATE_12
] = EMACS_leading_code_3
;
5497 emacs_code_class
[LEADING_CODE_PRIVATE_21
] = EMACS_leading_code_4
;
5498 emacs_code_class
[LEADING_CODE_PRIVATE_22
] = EMACS_leading_code_4
;
5500 /* ISO2022 specific initialize routine. */
5501 for (i
= 0; i
< 0x20; i
++)
5502 iso_code_class
[i
] = ISO_control_code
;
5503 for (i
= 0x21; i
< 0x7F; i
++)
5504 iso_code_class
[i
] = ISO_graphic_plane_0
;
5505 for (i
= 0x80; i
< 0xA0; i
++)
5506 iso_code_class
[i
] = ISO_control_code
;
5507 for (i
= 0xA1; i
< 0xFF; i
++)
5508 iso_code_class
[i
] = ISO_graphic_plane_1
;
5509 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
5510 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
5511 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
5512 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
5513 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
5514 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
5515 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
5516 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
5517 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
5518 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
5520 conversion_buffer_size
= MINIMUM_CONVERSION_BUFFER_SIZE
;
5522 setup_coding_system (Qnil
, &keyboard_coding
);
5523 setup_coding_system (Qnil
, &terminal_coding
);
5524 setup_coding_system (Qnil
, &safe_terminal_coding
);
5525 setup_coding_system (Qnil
, &default_buffer_file_coding
);
5527 bzero (coding_system_table
, sizeof coding_system_table
);
5529 bzero (ascii_skip_code
, sizeof ascii_skip_code
);
5530 for (i
= 0; i
< 128; i
++)
5531 ascii_skip_code
[i
] = 1;
5533 #if defined (MSDOS) || defined (WINDOWSNT)
5534 system_eol_type
= CODING_EOL_CRLF
;
5536 system_eol_type
= CODING_EOL_LF
;
5539 inhibit_pre_post_conversion
= 0;
5547 Qtarget_idx
= intern ("target-idx");
5548 staticpro (&Qtarget_idx
);
5550 Qcoding_system_history
= intern ("coding-system-history");
5551 staticpro (&Qcoding_system_history
);
5552 Fset (Qcoding_system_history
, Qnil
);
5554 /* Target FILENAME is the first argument. */
5555 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
5556 /* Target FILENAME is the third argument. */
5557 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
5559 Qcall_process
= intern ("call-process");
5560 staticpro (&Qcall_process
);
5561 /* Target PROGRAM is the first argument. */
5562 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
5564 Qcall_process_region
= intern ("call-process-region");
5565 staticpro (&Qcall_process_region
);
5566 /* Target PROGRAM is the third argument. */
5567 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
5569 Qstart_process
= intern ("start-process");
5570 staticpro (&Qstart_process
);
5571 /* Target PROGRAM is the third argument. */
5572 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
5574 Qopen_network_stream
= intern ("open-network-stream");
5575 staticpro (&Qopen_network_stream
);
5576 /* Target SERVICE is the fourth argument. */
5577 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
5579 Qcoding_system
= intern ("coding-system");
5580 staticpro (&Qcoding_system
);
5582 Qeol_type
= intern ("eol-type");
5583 staticpro (&Qeol_type
);
5585 Qbuffer_file_coding_system
= intern ("buffer-file-coding-system");
5586 staticpro (&Qbuffer_file_coding_system
);
5588 Qpost_read_conversion
= intern ("post-read-conversion");
5589 staticpro (&Qpost_read_conversion
);
5591 Qpre_write_conversion
= intern ("pre-write-conversion");
5592 staticpro (&Qpre_write_conversion
);
5594 Qno_conversion
= intern ("no-conversion");
5595 staticpro (&Qno_conversion
);
5597 Qundecided
= intern ("undecided");
5598 staticpro (&Qundecided
);
5600 Qcoding_system_p
= intern ("coding-system-p");
5601 staticpro (&Qcoding_system_p
);
5603 Qcoding_system_error
= intern ("coding-system-error");
5604 staticpro (&Qcoding_system_error
);
5606 Fput (Qcoding_system_error
, Qerror_conditions
,
5607 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
5608 Fput (Qcoding_system_error
, Qerror_message
,
5609 build_string ("Invalid coding system"));
5611 Qcoding_category
= intern ("coding-category");
5612 staticpro (&Qcoding_category
);
5613 Qcoding_category_index
= intern ("coding-category-index");
5614 staticpro (&Qcoding_category_index
);
5616 Vcoding_category_table
5617 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX
), Qnil
);
5618 staticpro (&Vcoding_category_table
);
5621 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
5623 XVECTOR (Vcoding_category_table
)->contents
[i
]
5624 = intern (coding_category_name
[i
]);
5625 Fput (XVECTOR (Vcoding_category_table
)->contents
[i
],
5626 Qcoding_category_index
, make_number (i
));
5630 Qtranslation_table
= intern ("translation-table");
5631 staticpro (&Qtranslation_table
);
5632 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
5634 Qtranslation_table_id
= intern ("translation-table-id");
5635 staticpro (&Qtranslation_table_id
);
5637 Qtranslation_table_for_decode
= intern ("translation-table-for-decode");
5638 staticpro (&Qtranslation_table_for_decode
);
5640 Qtranslation_table_for_encode
= intern ("translation-table-for-encode");
5641 staticpro (&Qtranslation_table_for_encode
);
5643 Qsafe_charsets
= intern ("safe-charsets");
5644 staticpro (&Qsafe_charsets
);
5646 Qvalid_codes
= intern ("valid-codes");
5647 staticpro (&Qvalid_codes
);
5649 Qemacs_mule
= intern ("emacs-mule");
5650 staticpro (&Qemacs_mule
);
5652 Qraw_text
= intern ("raw-text");
5653 staticpro (&Qraw_text
);
5655 defsubr (&Scoding_system_p
);
5656 defsubr (&Sread_coding_system
);
5657 defsubr (&Sread_non_nil_coding_system
);
5658 defsubr (&Scheck_coding_system
);
5659 defsubr (&Sdetect_coding_region
);
5660 defsubr (&Sdetect_coding_string
);
5661 defsubr (&Sdecode_coding_region
);
5662 defsubr (&Sencode_coding_region
);
5663 defsubr (&Sdecode_coding_string
);
5664 defsubr (&Sencode_coding_string
);
5665 defsubr (&Sdecode_sjis_char
);
5666 defsubr (&Sencode_sjis_char
);
5667 defsubr (&Sdecode_big5_char
);
5668 defsubr (&Sencode_big5_char
);
5669 defsubr (&Sset_terminal_coding_system_internal
);
5670 defsubr (&Sset_safe_terminal_coding_system_internal
);
5671 defsubr (&Sterminal_coding_system
);
5672 defsubr (&Sset_keyboard_coding_system_internal
);
5673 defsubr (&Skeyboard_coding_system
);
5674 defsubr (&Sfind_operation_coding_system
);
5675 defsubr (&Supdate_coding_systems_internal
);
5676 defsubr (&Sset_coding_priority_internal
);
5678 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
5679 "List of coding systems.\n\
5681 Do not alter the value of this variable manually. This variable should be\n\
5682 updated by the functions `make-coding-system' and\n\
5683 `define-coding-system-alias'.");
5684 Vcoding_system_list
= Qnil
;
5686 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
5687 "Alist of coding system names.\n\
5688 Each element is one element list of coding system name.\n\
5689 This variable is given to `completing-read' as TABLE argument.\n\
5691 Do not alter the value of this variable manually. This variable should be\n\
5692 updated by the functions `make-coding-system' and\n\
5693 `define-coding-system-alias'.");
5694 Vcoding_system_alist
= Qnil
;
5696 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
5697 "List of coding-categories (symbols) ordered by priority.");
5701 Vcoding_category_list
= Qnil
;
5702 for (i
= CODING_CATEGORY_IDX_MAX
- 1; i
>= 0; i
--)
5703 Vcoding_category_list
5704 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
5705 Vcoding_category_list
);
5708 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
5709 "Specify the coding system for read operations.\n\
5710 It is useful to bind this variable with `let', but do not set it globally.\n\
5711 If the value is a coding system, it is used for decoding on read operation.\n\
5712 If not, an appropriate element is used from one of the coding system alists:\n\
5713 There are three such tables, `file-coding-system-alist',\n\
5714 `process-coding-system-alist', and `network-coding-system-alist'.");
5715 Vcoding_system_for_read
= Qnil
;
5717 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
5718 "Specify the coding system for write operations.\n\
5719 Programs bind this variable with `let', but you should not set it globally.\n\
5720 If the value is a coding system, it is used for encoding of output,\n\
5721 when writing it to a file and when sending it to a file or subprocess.\n\
5723 If this does not specify a coding system, an appropriate element\n\
5724 is used from one of the coding system alists:\n\
5725 There are three such tables, `file-coding-system-alist',\n\
5726 `process-coding-system-alist', and `network-coding-system-alist'.\n\
5727 For output to files, if the above procedure does not specify a coding system,\n\
5728 the value of `buffer-file-coding-system' is used.");
5729 Vcoding_system_for_write
= Qnil
;
5731 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
5732 "Coding system used in the latest file or process I/O.");
5733 Vlast_coding_system_used
= Qnil
;
5735 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
5736 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
5737 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
5739 inhibit_eol_conversion
= 0;
5741 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
5742 "Non-nil means process buffer inherits coding system of process output.\n\
5743 Bind it to t if the process output is to be treated as if it were a file\n\
5744 read from some filesystem.");
5745 inherit_process_coding_system
= 0;
5747 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
5748 "Alist to decide a coding system to use for a file I/O operation.\n\
5749 The format is ((PATTERN . VAL) ...),\n\
5750 where PATTERN is a regular expression matching a file name,\n\
5751 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5752 If VAL is a coding system, it is used for both decoding and encoding\n\
5753 the file contents.\n\
5754 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5755 and the cdr part is used for encoding.\n\
5756 If VAL is a function symbol, the function must return a coding system\n\
5757 or a cons of coding systems which are used as above.\n\
5759 See also the function `find-operation-coding-system'\n\
5760 and the variable `auto-coding-alist'.");
5761 Vfile_coding_system_alist
= Qnil
;
5763 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
5764 "Alist to decide a coding system to use for a process I/O operation.\n\
5765 The format is ((PATTERN . VAL) ...),\n\
5766 where PATTERN is a regular expression matching a program name,\n\
5767 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5768 If VAL is a coding system, it is used for both decoding what received\n\
5769 from the program and encoding what sent to the program.\n\
5770 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5771 and the cdr part is used for encoding.\n\
5772 If VAL is a function symbol, the function must return a coding system\n\
5773 or a cons of coding systems which are used as above.\n\
5775 See also the function `find-operation-coding-system'.");
5776 Vprocess_coding_system_alist
= Qnil
;
5778 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
5779 "Alist to decide a coding system to use for a network I/O operation.\n\
5780 The format is ((PATTERN . VAL) ...),\n\
5781 where PATTERN is a regular expression matching a network service name\n\
5782 or is a port number to connect to,\n\
5783 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5784 If VAL is a coding system, it is used for both decoding what received\n\
5785 from the network stream and encoding what sent to the network stream.\n\
5786 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5787 and the cdr part is used for encoding.\n\
5788 If VAL is a function symbol, the function must return a coding system\n\
5789 or a cons of coding systems which are used as above.\n\
5791 See also the function `find-operation-coding-system'.");
5792 Vnetwork_coding_system_alist
= Qnil
;
5794 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
5795 "Coding system to use with system messages.");
5796 Vlocale_coding_system
= Qnil
;
5798 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
5799 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
5800 eol_mnemonic_unix
= build_string (":");
5802 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
5803 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
5804 eol_mnemonic_dos
= build_string ("\\");
5806 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
5807 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
5808 eol_mnemonic_mac
= build_string ("/");
5810 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
5811 "*String displayed in mode line when end-of-line format is not yet determined.");
5812 eol_mnemonic_undecided
= build_string (":");
5814 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
5815 "*Non-nil enables character translation while encoding and decoding.");
5816 Venable_character_translation
= Qt
;
5818 DEFVAR_LISP ("standard-translation-table-for-decode",
5819 &Vstandard_translation_table_for_decode
,
5820 "Table for translating characters while decoding.");
5821 Vstandard_translation_table_for_decode
= Qnil
;
5823 DEFVAR_LISP ("standard-translation-table-for-encode",
5824 &Vstandard_translation_table_for_encode
,
5825 "Table for translating characters while encoding.");
5826 Vstandard_translation_table_for_encode
= Qnil
;
5828 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist
,
5829 "Alist of charsets vs revision numbers.\n\
5830 While encoding, if a charset (car part of an element) is found,\n\
5831 designate it with the escape sequence identifying revision (cdr part of the element).");
5832 Vcharset_revision_alist
= Qnil
;
5834 DEFVAR_LISP ("default-process-coding-system",
5835 &Vdefault_process_coding_system
,
5836 "Cons of coding systems used for process I/O by default.\n\
5837 The car part is used for decoding a process output,\n\
5838 the cdr part is used for encoding a text to be sent to a process.");
5839 Vdefault_process_coding_system
= Qnil
;
5841 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
5842 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
5843 This is a vector of length 256.\n\
5844 If Nth element is non-nil, the existence of code N in a file\n\
5845 \(or output of subprocess) doesn't prevent it from being detected as\n\
5846 a coding system of ISO 2022 variant which has a flag\n\
5847 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
5848 or reading output of a subprocess.\n\
5849 Only 128th through 159th elements have a meaning.");
5850 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
5852 DEFVAR_LISP ("select-safe-coding-system-function",
5853 &Vselect_safe_coding_system_function
,
5854 "Function to call to select safe coding system for encoding a text.\n\
5856 If set, this function is called to force a user to select a proper\n\
5857 coding system which can encode the text in the case that a default\n\
5858 coding system used in each operation can't encode the text.\n\
5860 The default value is `select-safe-coding-system' (which see).");
5861 Vselect_safe_coding_system_function
= Qnil
;
5866 emacs_strerror (error_number
)
5871 synchronize_system_messages_locale ();
5872 str
= strerror (error_number
);
5874 if (! NILP (Vlocale_coding_system
))
5876 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
5877 Vlocale_coding_system
,
5879 str
= (char *) XSTRING (dec
)->data
;