1 /* Coding system handler (conversion, detection, etc.).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
25 2. Emacs' internal format (emacs-mule) handlers
27 4. Shift-JIS and BIG5 handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
36 /*** GENERAL NOTE on CODING SYSTEM ***
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
41 Emacs' internal format (emacs-mule), and when we say "encode",
42 it means converting the coding system emacs-mule to some other
45 0. Emacs' internal format (emacs-mule)
47 Emacs itself holds a multi-lingual character in a buffer and a string
48 in a special format. Details are described in section 2.
52 The most famous coding system for multiple character sets. X's
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
78 If a user wants to read/write a text encoded in a coding system not
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
85 information about it is set in a structure of type `struct
86 coding_system' for rapid processing. See section 6 for more details.
90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
94 whereas DOS's format is two-byte sequence of `carriage-return' and
95 `line-feed' codes. MacOS's format is usually one byte of
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
100 any format of end-of-line. So, Emacs has information of format of
101 end-of-line in each coding-system. See section 6 for more details.
105 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
107 These functions check if a text between SRC and SRC_END is encoded
108 in the coding system category XXX. Each returns an integer value in
109 which appropriate flag bits for the category XXX is set. The flag
110 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
111 template of these functions. */
114 detect_coding_emacs_mule (src
, src_end
)
115 unsigned char *src
, *src_end
;
121 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
123 These functions decode SRC_BYTES length text at SOURCE encoded in
124 CODING to Emacs' internal format (emacs-mule). The resulting text
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
133 DST_BYTES zero means that source area and destination area are
134 overlapped, which means that we can produce a decoded text until it
135 reaches the head of the not-yet-decoded source text.
137 Below is a template of these functions. */
139 decode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
140 struct coding_system
*coding
;
141 unsigned char *source
, *destination
;
142 int src_bytes
, dst_bytes
;
148 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
152 a place pointed to by DESTINATION, the length of which should not
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
160 DST_BYTES zero means that source area and destination area are
161 overlapped, which means that we can produce an encoded text until it
162 reaches the head of the not-yet-encoded source text.
164 Below is a template of these functions. */
166 encode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
167 struct coding_system
*coding
;
168 unsigned char *source
, *destination
;
169 int src_bytes
, dst_bytes
;
175 /*** COMMONLY USED MACROS ***/
177 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
178 THREE_MORE_BYTES safely get one, two, and three bytes from the
179 source text respectively. If there are not enough bytes in the
180 source, they jump to `label_end_of_loop'. The caller should set
181 variables `src' and `src_end' to appropriate areas in advance. */
183 #define ONE_MORE_BYTE(c1) \
188 goto label_end_of_loop; \
191 #define TWO_MORE_BYTES(c1, c2) \
193 if (src + 1 < src_end) \
194 c1 = *src++, c2 = *src++; \
196 goto label_end_of_loop; \
199 #define THREE_MORE_BYTES(c1, c2, c3) \
201 if (src + 2 < src_end) \
202 c1 = *src++, c2 = *src++, c3 = *src++; \
204 goto label_end_of_loop; \
207 /* The following three macros DECODE_CHARACTER_ASCII,
208 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
209 the multi-byte form of a character of each class at the place
210 pointed by `dst'. The caller should set the variable `dst' to
211 point to an appropriate area and the variable `coding' to point to
212 the coding-system of the currently decoding text in advance. */
214 /* Decode one ASCII character C. */
216 #define DECODE_CHARACTER_ASCII(c) \
218 if (COMPOSING_P (coding->composing)) \
220 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
221 coding->composed_chars++; \
222 if (((c) | 0x80) < 0xA0) \
223 coding->fake_multibyte = 1; \
228 coding->produced_char++; \
230 coding->fake_multibyte = 1; \
234 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
235 position-code is C. */
237 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
239 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
240 if (COMPOSING_P (coding->composing)) \
242 *dst++ = leading_code + 0x20; \
243 coding->composed_chars++; \
247 *dst++ = leading_code; \
248 coding->produced_char++; \
250 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
251 *dst++ = leading_code; \
252 *dst++ = (c) | 0x80; \
253 if (((c) | 0x80) < 0xA0) \
254 coding->fake_multibyte = 1; \
257 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
258 position-codes are C1 and C2. */
260 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
262 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
263 *dst++ = (c2) | 0x80; \
264 if (((c2) | 0x80) < 0xA0) \
265 coding->fake_multibyte = 1; \
269 /*** 1. Preamble ***/
283 #else /* not emacs */
287 #endif /* not emacs */
/* Lisp objects used by the coding-system code.  NOTE(review): the `Q'
   prefix suggests each one holds the Lisp symbol spelled like the rest
   of its name (e.g. Qeol_type for `eol-type'); their initialization
   is not visible in this part of the file, so confirm it there.  */
289 Lisp_Object Qcoding_system
, Qeol_type
;
290 Lisp_Object Qbuffer_file_coding_system
;
291 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
292 Lisp_Object Qno_conversion
, Qundecided
;
293 Lisp_Object Qcoding_system_history
;
294 Lisp_Object Qsafe_charsets
;
295 Lisp_Object Qvalid_codes
;
/* These two are only declared here (`extern'); they are defined in
   another file.  */
297 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
298 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
299 Lisp_Object Qstart_process
, Qopen_network_stream
;
300 Lisp_Object Qtarget_idx
;
/* NOTE(review): presumably a Lisp variable holding a function that
   selects a safe coding system -- confirm against its definition.  */
302 Lisp_Object Vselect_safe_coding_system_function
;
304 /* Mnemonic character of each format of end-of-line. */
305 int eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
306 /* Mnemonic character to indicate format of end-of-line is not yet
308 int eol_mnemonic_undecided
;
310 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
311 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
316 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
318 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
320 /* Coding system emacs-mule and raw-text are for converting only
321 end-of-line format. */
322 Lisp_Object Qemacs_mule
, Qraw_text
;
324 /* Coding-systems are handed between Emacs Lisp programs and C internal
325 routines by the following three variables. */
326 /* Coding-system for reading files and receiving data from process. */
327 Lisp_Object Vcoding_system_for_read
;
328 /* Coding-system for writing files and sending data to process. */
329 Lisp_Object Vcoding_system_for_write
;
330 /* Coding-system actually used in the latest I/O. */
331 Lisp_Object Vlast_coding_system_used
;
333 /* A vector of length 256 which contains information about special
334 Latin codes (especially for dealing with Microsoft codes). */
335 Lisp_Object Vlatin_extra_code_table
;
337 /* Flag to inhibit code conversion of end-of-line format. */
338 int inhibit_eol_conversion
;
340 /* Flag to make buffer-file-coding-system inherit from process-coding. */
341 int inherit_process_coding_system
;
343 /* Coding system to be used to encode text for terminal display. */
344 struct coding_system terminal_coding
;
346 /* Coding system to be used to encode text for terminal display when
347 terminal coding system is nil. */
348 struct coding_system safe_terminal_coding
;
350 /* Coding system of what is sent from terminal keyboard. */
351 struct coding_system keyboard_coding
;
353 /* Default coding system to be used to write a file. */
354 struct coding_system default_buffer_file_coding
;
/* Three Lisp objects, one each for files, processes and network
   streams.  NOTE(review): judging by the names these are alists that
   choose a coding system for the respective kind of I/O; their
   element format is not visible here -- confirm against the place
   where they are defined as Lisp variables.  */
356 Lisp_Object Vfile_coding_system_alist
;
357 Lisp_Object Vprocess_coding_system_alist
;
358 Lisp_Object Vnetwork_coding_system_alist
;
362 Lisp_Object Qcoding_category
, Qcoding_category_index
;
364 /* List of symbols `coding-category-xxx' ordered by priority. */
365 Lisp_Object Vcoding_category_list
;
367 /* Table of coding categories (Lisp symbols). */
368 Lisp_Object Vcoding_category_table
;
370 /* Table of names of symbol for each coding-category. */
371 char *coding_category_name
[CODING_CATEGORY_IDX_MAX
] = {
372 "coding-category-emacs-mule",
373 "coding-category-sjis",
374 "coding-category-iso-7",
375 "coding-category-iso-7-tight",
376 "coding-category-iso-8-1",
377 "coding-category-iso-8-2",
378 "coding-category-iso-7-else",
379 "coding-category-iso-8-else",
380 "coding-category-ccl",
381 "coding-category-big5",
382 "coding-category-raw-text",
383 "coding-category-binary"
386 /* Table of pointers to coding systems corresponding to each coding
388 struct coding_system
*coding_system_table
[CODING_CATEGORY_IDX_MAX
];
390 /* Table of coding category masks. Nth element is a mask for a coding
391 category of which priority is Nth. */
393 int coding_priorities
[CODING_CATEGORY_IDX_MAX
];
395 /* Flag to tell if we look up translation table on character code
397 Lisp_Object Venable_character_translation
;
398 /* Standard translation table to look up on decoding (reading). */
399 Lisp_Object Vstandard_translation_table_for_decode
;
400 /* Standard translation table to look up on encoding (writing). */
401 Lisp_Object Vstandard_translation_table_for_encode
;
403 Lisp_Object Qtranslation_table
;
404 Lisp_Object Qtranslation_table_id
;
405 Lisp_Object Qtranslation_table_for_decode
;
406 Lisp_Object Qtranslation_table_for_encode
;
408 /* Alist of charsets vs revision number. */
409 Lisp_Object Vcharset_revision_alist
;
411 /* Default coding systems used for process I/O. */
412 Lisp_Object Vdefault_process_coding_system
;
415 /*** 2. Emacs internal format (emacs-mule) handlers ***/
417 /* Emacs' internal format for encoding multiple character sets is a
418 kind of multi-byte encoding, i.e. characters are encoded by
419 variable-length sequences of one-byte codes. ASCII characters
420 and control characters (e.g. `tab', `newline') are represented by
421 one-byte sequences which are their ASCII codes, in the range 0x00
422 through 0x7F. The other characters are represented by a sequence
423 of `base leading-code', optional `extended leading-code', and one
424 or two `position-code's. The length of the sequence is determined
425 by the base leading-code. Leading-code takes the range 0x80
426 through 0x9F, whereas extended leading-code and position-code take
427 the range 0xA0 through 0xFF. See `charset.h' for more details
428 about leading-code and position-code.
430 There's one exception to this rule. Special leading-code
431 `leading-code-composition' denotes that the following several
432 characters should be composed into one character. Leading-codes of
433 components (except for ASCII) are added 0x20. An ASCII character
434 component is represented by a 2-byte sequence of `0xA0' and
435 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
436 details of composite character. Hence, we can summarize the code
439 --- CODE RANGE of Emacs' internal format ---
440 (character set) (range)
442 ELSE (1st byte) 0x80 .. 0x9F
443 (rest bytes) 0xA0 .. 0xFF
444 ---------------------------------------------
448 enum emacs_code_class_type emacs_code_class
[256];
450 /* Go to the next statement only if *SRC is accessible and the code is
451 greater than or equal to 0xA0. */
452 #define CHECK_CODE_RANGE_A0_FF \
454 if (src >= src_end) \
455 goto label_end_of_switch; \
456 else if (*src++ < 0xA0) \
460 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
461 Check if a text is encoded in Emacs' internal format. If it is,
462 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
465 detect_coding_emacs_mule (src
, src_end
)
466 unsigned char *src
, *src_end
;
471 while (src
< src_end
)
483 switch (emacs_code_class
[c
])
485 case EMACS_ascii_code
:
486 case EMACS_linefeed_code
:
489 case EMACS_control_code
:
490 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
494 case EMACS_invalid_code
:
497 case EMACS_leading_code_composition
: /* c == 0x80 */
499 CHECK_CODE_RANGE_A0_FF
;
504 case EMACS_leading_code_4
:
505 CHECK_CODE_RANGE_A0_FF
;
506 /* fall down to check it two more times ... */
508 case EMACS_leading_code_3
:
509 CHECK_CODE_RANGE_A0_FF
;
510 /* fall down to check it one more time ... */
512 case EMACS_leading_code_2
:
513 CHECK_CODE_RANGE_A0_FF
;
521 return CODING_CATEGORY_MASK_EMACS_MULE
;
525 /*** 3. ISO2022 handlers ***/
527 /* The following note describes the coding system ISO2022 briefly.
528 Since the intention of this note is to help in understanding of
529 the programs in this file, some parts are NOT ACCURATE or OVERLY
530 SIMPLIFIED. For the thorough understanding, please refer to the
531 original document of ISO2022.
533 ISO2022 provides many mechanisms to encode several character sets
534 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
535 all text is encoded by codes of less than 128. This may make the
536 encoded text a little bit longer, but the text gets more stability
537 to pass through several gateways (some of them strip off the MSB).
539 There are two kinds of character set: control character set and
540 graphic character set. The former contains control characters such
541 as `newline' and `escape' to provide control functions (control
542 functions are provided also by escape sequences). The latter
543 contains graphic characters such as 'A' and '-'. Emacs recognizes
544 two control character sets and many graphic character sets.
546 Graphic character sets are classified into one of the following
547 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
548 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
549 bytes (DIMENSION) and the number of characters in one dimension
550 (CHARS) of the set. In addition, each character set is assigned an
551 identification tag (called "final character" and denoted as <F>
552 hereafter) which is unique in each class. <F> of each character
553 set is decided by ECMA(*) when it is registered in ISO. Code range
554 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
556 Note (*): ECMA = European Computer Manufacturers Association
558 Here are examples of graphic character set [NAME(<F>)]:
559 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
560 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
561 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
562 o DIMENSION2_CHARS96 -- none for the moment
564 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
565 C0 [0x00..0x1F] -- control character plane 0
566 GL [0x20..0x7F] -- graphic character plane 0
567 C1 [0x80..0x9F] -- control character plane 1
568 GR [0xA0..0xFF] -- graphic character plane 1
570 A control character set is directly designated and invoked to C0 or
571 C1 by an escape sequence. The most common case is that ISO646's
572 control character set is designated/invoked to C0 and ISO6429's
573 control character set is designated/invoked to C1, and usually
574 these designations/invocations are omitted in a coded text. With
575 7-bit environment, only C0 can be used, and a control character for
576 C1 is encoded by an appropriate escape sequence to fit in the
577 environment. All control characters for C1 have
578 corresponding escape sequences defined.
580 A graphic character set is at first designated to one of four
581 graphic registers (G0 through G3), then these graphic registers are
582 invoked to GL or GR. These designations and invocations can be
583 done independently. The most common case is that G0 is invoked to
584 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
585 these invocations and designations are omitted in a coded text.
586 With 7-bit environment, only GL can be used.
588 When a graphic character set of CHARS94 is invoked to GL, code 0x20
589 and 0x7F of GL area work as control characters SPACE and DEL
590 respectively, and code 0xA0 and 0xFF of GR area should not be used.
592 There are two ways of invocation: locking-shift and single-shift.
593 With locking-shift, the invocation lasts until the next different
594 invocation, whereas with single-shift, the invocation works only
595 for the following character and doesn't affect locking-shift.
596 Invocations are done by the following control characters or escape
599 ----------------------------------------------------------------------
600 function control char escape sequence description
601 ----------------------------------------------------------------------
602 SI (shift-in) 0x0F none invoke G0 to GL
603 SO (shift-out) 0x0E none invoke G1 to GL
604 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
605 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
606 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
607 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
608 ----------------------------------------------------------------------
609 The first four are for locking-shift. Control characters for these
610 functions are defined by macros ISO_CODE_XXX in `coding.h'.
612 Designations are done by the following escape sequences.
613 ----------------------------------------------------------------------
614 escape sequence description
615 ----------------------------------------------------------------------
616 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
617 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
618 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
619 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
620 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
621 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
622 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
623 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
624 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
625 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
626 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
627 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
628 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
629 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
630 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
631 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
632 ----------------------------------------------------------------------
634 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
635 of dimension 1, chars 94, and final character <F>, etc.
637 Note (*): Although these designations are not allowed in ISO2022,
638 Emacs accepts them on decoding, and produces them on encoding
639 CHARS96 character set in a coding system which is characterized as
640 7-bit environment, non-locking-shift, and non-single-shift.
642 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
643 '(' can be omitted. We call this "short-form" hereafter.
645 Now you may notice that there are a lot of ways for encoding the
646 same multilingual text in ISO2022. Actually, there exist many
647 coding systems such as Compound Text (used in X's inter client
648 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
649 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
650 localized platforms), and all of these are variants of ISO2022.
652 In addition to the above, Emacs handles two more kinds of escape
653 sequences: ISO6429's direction specification and Emacs' private
654 sequence for specifying character composition.
656 ISO6429's direction specification takes the following format:
657 o CSI ']' -- end of the current direction
658 o CSI '0' ']' -- end of the current direction
659 o CSI '1' ']' -- start of left-to-right text
660 o CSI '2' ']' -- start of right-to-left text
661 The control character CSI (0x9B: control sequence introducer) is
662 abbreviated to the escape sequence ESC '[' in 7-bit environment.
664 Character composition specification takes the following format:
665 o ESC '0' -- start character composition
666 o ESC '1' -- end character composition
667 Since these are not standard escape sequences of any ISO, the use
668 of them for these meaning is restricted to Emacs only. */
/* Table giving the ISO2022 code class of a byte.  It has 256 entries
   and is indexed by the byte value itself (or by the byte with its
   high bit masked off when classifying a second position code).  */
670 enum iso_code_class_type iso_code_class
[256];
/* Non-zero if a coding system is registered at coding_system_table[IDX]
   and it accepts CHARSET, i.e. CHARSET is marked in its safe_charsets
   table or it has a requested designation for CHARSET.  Both IDX and
   CHARSET are expanded more than once, so pass side-effect-free
   arguments.  */
672 #define CHARSET_OK(idx, charset) \
673 (coding_system_table[idx] \
674 && (coding_system_table[idx]->safe_charsets[charset] \
675 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION \
676 (coding_system_table[idx], charset) \
677 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
/* Non-zero if the initial designation with index 1 of the coding
   system at coding_system_table[IDX] is not negative (presumably
   index 1 is graphic register G1, the one invoked by shift-out --
   confirm in coding.h).
   NOTE(review): unlike CHARSET_OK, the table entry is not tested for
   null before it is handed to CODING_SPEC_ISO_INITIAL_DESIGNATION;
   confirm that callers guarantee the entry exists.  */
679 #define SHIFT_OUT_OK(idx) \
680 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
682 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
683 Check if a text is encoded in ISO2022. If it is, returns an
684 integer in which appropriate flag bits any of:
685 CODING_CATEGORY_MASK_ISO_7
686 CODING_CATEGORY_MASK_ISO_7_TIGHT
687 CODING_CATEGORY_MASK_ISO_8_1
688 CODING_CATEGORY_MASK_ISO_8_2
689 CODING_CATEGORY_MASK_ISO_7_ELSE
690 CODING_CATEGORY_MASK_ISO_8_ELSE
691 are set. If a code which should never appear in ISO2022 is found,
695 detect_coding_iso2022 (src
, src_end
)
696 unsigned char *src
, *src_end
;
698 int mask
= CODING_CATEGORY_MASK_ISO
;
700 int reg
[4], shift_out
= 0, single_shifting
= 0;
701 int c
, c1
, i
, charset
;
703 reg
[0] = CHARSET_ASCII
, reg
[1] = reg
[2] = reg
[3] = -1;
704 while (mask
&& src
< src_end
)
714 if (c
>= '(' && c
<= '/')
716 /* Designation sequence for a charset of dimension 1. */
720 if (c1
< ' ' || c1
>= 0x80
721 || (charset
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
722 /* Invalid designation sequence. Just ignore. */
724 reg
[(c
- '(') % 4] = charset
;
728 /* Designation sequence for a charset of dimension 2. */
732 if (c
>= '@' && c
<= 'B')
733 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
734 reg
[0] = charset
= iso_charset_table
[1][0][c
];
735 else if (c
>= '(' && c
<= '/')
740 if (c1
< ' ' || c1
>= 0x80
741 || (charset
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
742 /* Invalid designation sequence. Just ignore. */
744 reg
[(c
- '(') % 4] = charset
;
747 /* Invalid designation sequence. Just ignore. */
750 else if (c
== 'N' || c
== 'O')
752 /* ESC <Fe> for SS2 or SS3. */
753 mask
&= CODING_CATEGORY_MASK_ISO_7_ELSE
;
756 else if (c
== '0' || c
== '1' || c
== '2')
757 /* ESC <Fp> for start/end composition. Just ignore. */
760 /* Invalid escape sequence. Just ignore. */
763 /* We found a valid designation sequence for CHARSET. */
764 mask
&= ~CODING_CATEGORY_MASK_ISO_8BIT
;
765 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7
, charset
))
766 mask_found
|= CODING_CATEGORY_MASK_ISO_7
;
768 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
769 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT
, charset
))
770 mask_found
|= CODING_CATEGORY_MASK_ISO_7_TIGHT
;
772 mask
&= ~CODING_CATEGORY_MASK_ISO_7_TIGHT
;
773 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
, charset
))
774 mask_found
|= CODING_CATEGORY_MASK_ISO_7_ELSE
;
776 mask
&= ~CODING_CATEGORY_MASK_ISO_7_ELSE
;
777 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
, charset
))
778 mask_found
|= CODING_CATEGORY_MASK_ISO_8_ELSE
;
780 mask
&= ~CODING_CATEGORY_MASK_ISO_8_ELSE
;
787 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
)
788 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
)))
790 /* Locking shift out. */
791 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
792 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
800 /* Locking shift in. */
801 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
802 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
811 int newmask
= CODING_CATEGORY_MASK_ISO_8_ELSE
;
813 if (c
!= ISO_CODE_CSI
)
815 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
816 & CODING_FLAG_ISO_SINGLE_SHIFT
)
817 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
818 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
819 & CODING_FLAG_ISO_SINGLE_SHIFT
)
820 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
823 if (VECTORP (Vlatin_extra_code_table
)
824 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
826 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
827 & CODING_FLAG_ISO_LATIN_EXTRA
)
828 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
829 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
830 & CODING_FLAG_ISO_LATIN_EXTRA
)
831 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
834 mask_found
|= newmask
;
847 if (VECTORP (Vlatin_extra_code_table
)
848 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
852 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
853 & CODING_FLAG_ISO_LATIN_EXTRA
)
854 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
855 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
856 & CODING_FLAG_ISO_LATIN_EXTRA
)
857 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
859 mask_found
|= newmask
;
866 unsigned char *src_begin
= src
;
868 mask
&= ~(CODING_CATEGORY_MASK_ISO_7BIT
869 | CODING_CATEGORY_MASK_ISO_7_ELSE
);
870 mask_found
|= CODING_CATEGORY_MASK_ISO_8_1
;
871 /* Check the length of succeeding codes of the range
872 0xA0..0xFF. If the byte length is odd, we exclude
873 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
874 when we are not single shifting. */
875 if (!single_shifting
)
877 while (src
< src_end
&& *src
>= 0xA0)
879 if ((src
- src_begin
- 1) & 1 && src
< src_end
)
880 mask
&= ~CODING_CATEGORY_MASK_ISO_8_2
;
882 mask_found
|= CODING_CATEGORY_MASK_ISO_8_2
;
889 return (mask
& mask_found
);
892 /* Decode a character of which charset is CHARSET and the 1st position
893 code is C1. If dimension of CHARSET is 2, the 2nd position code is
894 fetched from SRC and set to C2. If CHARSET is negative, it means
895 that we are decoding ill formed text, and what we can do is just to
898 #define DECODE_ISO_CHARACTER(charset, c1) \
900 int c_alt, charset_alt = (charset); \
901 if (COMPOSING_HEAD_P (coding->composing)) \
903 *dst++ = LEADING_CODE_COMPOSITION; \
904 if (COMPOSING_WITH_RULE_P (coding->composing)) \
905 /* To tell composition rules are embedded. */ \
907 coding->composing += 2; \
909 if (charset_alt >= 0) \
911 if (CHARSET_DIMENSION (charset_alt) == 2) \
913 ONE_MORE_BYTE (c2); \
914 if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F \
915 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0) \
918 charset_alt = CHARSET_ASCII; \
921 if (!NILP (translation_table) \
922 && ((c_alt = translate_char (translation_table, \
923 -1, charset_alt, c1, c2)) >= 0)) \
924 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
926 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
927 DECODE_CHARACTER_ASCII (c1); \
928 else if (CHARSET_DIMENSION (charset_alt) == 1) \
929 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
931 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
932 if (COMPOSING_WITH_RULE_P (coding->composing)) \
933 /* To tell a composition rule follows. */ \
934 coding->composing = COMPOSING_WITH_RULE_RULE; \
937 /* Set designation state into CODING. */
938 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
942 if (final_char < '0' || final_char >= 128) \
943 goto label_invalid_code; \
944 charset = ISO_CHARSET_TABLE (make_number (dimension), \
945 make_number (chars), \
946 make_number (final_char)); \
948 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
949 || coding->safe_charsets[charset])) \
951 if (coding->spec.iso2022.last_invalid_designation_register == 0 \
953 && charset == CHARSET_ASCII) \
955 /* We should insert this designation sequence as is so \
956 that it is surely written back to a file. */ \
957 coding->spec.iso2022.last_invalid_designation_register = -1; \
958 goto label_invalid_code; \
960 coding->spec.iso2022.last_invalid_designation_register = -1; \
961 if ((coding->mode & CODING_MODE_DIRECTION) \
962 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
963 charset = CHARSET_REVERSE_CHARSET (charset); \
964 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
968 coding->spec.iso2022.last_invalid_designation_register = reg; \
969 goto label_invalid_code; \
973 /* Return 0 if there's a valid composing sequence starting at SRC and
974 ending before SRC_END, else return -1.

   CODING supplies the flags, the safe_charsets table and the requested
   designations against which each designation sequence found on the
   way is checked.  The sequence "ESC 1" marks the end of the
   composition.  */
977 check_composing_code (coding
, src
, src_end
)
978 struct coding_system
*coding
;
979 unsigned char *src
, *src_end
;
981 int charset
, c
, c1
, dim
;
983 while (src
< src_end
)
988 if (c
!= ISO_CODE_ESC
|| src
>= src_end
)
991 if (c
== '1') /* end of composition */
993 if (src
+ 2 >= src_end
/* The parentheses are required: unary `!' binds tighter than binary
   `&', so `!coding->flags & FLAG' would compute `(!coding->flags) & FLAG'
   instead of testing whether the designation flag is clear.  */
994 || !(coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
999 c
= (*src
>= '@' && *src
<= 'B') ? '(' : *src
++;
1000 if (c
>= '(' && c
<= '/')
/* Reject a final byte outside 0x20..0x7F, an unknown charset, a
   charset that is not marked safe for CODING, or one for which CODING
   has no requested designation.  */
1003 if ((c1
< ' ' || c1
>= 0x80)
1004 || (charset
= iso_charset_table
[dim
][c
>= ','][c1
]) < 0
1005 || ! coding
->safe_charsets
[charset
]
1006 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
1007 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
))
1014 /* We have not found the sequence "ESC 1". */
1018 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1021 decode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
1022 struct coding_system
*coding
;
1023 unsigned char *source
, *destination
;
1024 int src_bytes
, dst_bytes
;
1026 unsigned char *src
= source
;
1027 unsigned char *src_end
= source
+ src_bytes
;
1028 unsigned char *dst
= destination
;
1029 unsigned char *dst_end
= destination
+ dst_bytes
;
1030 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1031 from DST_END to assure that overflow checking is necessary only
1032 at the head of loop. */
1033 unsigned char *adjusted_dst_end
= dst_end
- 6;
1035 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1036 int charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1037 int charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1038 Lisp_Object translation_table
1039 = coding
->translation_table_for_decode
;
1040 int result
= CODING_FINISH_NORMAL
;
1042 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
1043 translation_table
= Vstandard_translation_table_for_decode
;
1045 coding
->produced_char
= 0;
1046 coding
->composed_chars
= 0;
1047 coding
->fake_multibyte
= 0;
1048 while (src
< src_end
&& (dst_bytes
1049 ? (dst
< adjusted_dst_end
)
1052 /* SRC_BASE remembers the start position in source in each loop.
1053 The loop will be exited when there's not enough source text
1054 to analyze long escape sequence or 2-byte code (within macros
1055 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1056 to SRC_BASE before exiting. */
1057 unsigned char *src_base
= src
;
1058 int c1
= *src
++, c2
;
1060 switch (iso_code_class
[c1
])
1062 case ISO_0x20_or_0x7F
:
1063 if (!coding
->composing
1064 && (charset0
< 0 || CHARSET_CHARS (charset0
) == 94))
1066 /* This is SPACE or DEL. */
1068 coding
->produced_char
++;
1071 /* This is a graphic character, we fall down ... */
1073 case ISO_graphic_plane_0
:
1074 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1076 /* This is a composition rule. */
1078 coding
->composing
= COMPOSING_WITH_RULE_TAIL
;
1081 DECODE_ISO_CHARACTER (charset0
, c1
);
1084 case ISO_0xA0_or_0xFF
:
1085 if (charset1
< 0 || CHARSET_CHARS (charset1
) == 94
1086 || coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
1087 goto label_invalid_code
;
1088 /* This is a graphic character, we fall down ... */
1090 case ISO_graphic_plane_1
:
1091 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
1092 goto label_invalid_code
;
1094 DECODE_ISO_CHARACTER (charset1
, c1
);
1097 case ISO_control_code
:
1098 /* All ISO2022 control characters in this class have the
1099 same representation in Emacs internal format. */
1101 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1102 && (coding
->eol_type
== CODING_EOL_CR
1103 || coding
->eol_type
== CODING_EOL_CRLF
))
1105 result
= CODING_FINISH_INCONSISTENT_EOL
;
1106 goto label_end_of_loop_2
;
1109 coding
->produced_char
++;
1111 coding
->fake_multibyte
= 1;
1114 case ISO_carriage_return
:
1115 if (coding
->eol_type
== CODING_EOL_CR
)
1117 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1120 if (c1
== ISO_CODE_LF
)
1124 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1126 result
= CODING_FINISH_INCONSISTENT_EOL
;
1127 goto label_end_of_loop_2
;
1135 coding
->produced_char
++;
1139 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1140 || CODING_SPEC_ISO_DESIGNATION (coding
, 1) < 0)
1141 goto label_invalid_code
;
1142 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 1;
1143 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1147 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
1148 goto label_invalid_code
;
1149 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
1150 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1153 case ISO_single_shift_2_7
:
1154 case ISO_single_shift_2
:
1155 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1156 goto label_invalid_code
;
1157 /* SS2 is handled as an escape sequence of ESC 'N' */
1159 goto label_escape_sequence
;
1161 case ISO_single_shift_3
:
1162 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1163 goto label_invalid_code
;
1164 /* SS3 is handled as an escape sequence of ESC 'O' */
1166 goto label_escape_sequence
;
1168 case ISO_control_sequence_introducer
:
1169 /* CSI is handled as an escape sequence of ESC '[' ... */
1171 goto label_escape_sequence
;
1175 label_escape_sequence
:
1176 /* Escape sequences handled by Emacs are invocation,
1177 designation, direction specification, and character
1178 composition specification. */
1181 case '&': /* revision of following character set */
1183 if (!(c1
>= '@' && c1
<= '~'))
1184 goto label_invalid_code
;
1186 if (c1
!= ISO_CODE_ESC
)
1187 goto label_invalid_code
;
1189 goto label_escape_sequence
;
1191 case '$': /* designation of 2-byte character set */
1192 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1193 goto label_invalid_code
;
1195 if (c1
>= '@' && c1
<= 'B')
1196 { /* designation of JISX0208.1978, GB2312.1980,
1198 DECODE_DESIGNATION (0, 2, 94, c1
);
1200 else if (c1
>= 0x28 && c1
<= 0x2B)
1201 { /* designation of DIMENSION2_CHARS94 character set */
1203 DECODE_DESIGNATION (c1
- 0x28, 2, 94, c2
);
1205 else if (c1
>= 0x2C && c1
<= 0x2F)
1206 { /* designation of DIMENSION2_CHARS96 character set */
1208 DECODE_DESIGNATION (c1
- 0x2C, 2, 96, c2
);
1211 goto label_invalid_code
;
1214 case 'n': /* invocation of locking-shift-2 */
1215 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1216 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1217 goto label_invalid_code
;
1218 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 2;
1219 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1222 case 'o': /* invocation of locking-shift-3 */
1223 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1224 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1225 goto label_invalid_code
;
1226 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 3;
1227 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1230 case 'N': /* invocation of single-shift-2 */
1231 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1232 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1233 goto label_invalid_code
;
1235 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 2);
1236 DECODE_ISO_CHARACTER (charset
, c1
);
1239 case 'O': /* invocation of single-shift-3 */
1240 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1241 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1242 goto label_invalid_code
;
1244 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 3);
1245 DECODE_ISO_CHARACTER (charset
, c1
);
1248 case '0': case '2': /* start composing */
1249 /* Before processing composing, we must be sure that all
1250 characters being composed are supported by CODING.
1251 If not, we must give up composing. */
1252 if (check_composing_code (coding
, src
, src_end
) == 0)
1254 /* We are looking at a valid composition sequence. */
1255 coding
->composing
= (c1
== '0'
1256 ? COMPOSING_NO_RULE_HEAD
1257 : COMPOSING_WITH_RULE_HEAD
);
1258 coding
->composed_chars
= 0;
1262 *dst
++ = ISO_CODE_ESC
;
1264 coding
->produced_char
+= 2;
1268 case '1': /* end composing */
1269 if (!coding
->composing
)
1271 *dst
++ = ISO_CODE_ESC
;
1273 coding
->produced_char
+= 2;
1277 if (coding
->composed_chars
> 0)
1279 if (coding
->composed_chars
== 1)
1281 unsigned char *this_char_start
= dst
;
1284 /* Only one character is in the composing
1285 sequence. Make it a normal character. */
1286 while (*--this_char_start
!= LEADING_CODE_COMPOSITION
);
1287 dst
= (this_char_start
1288 + (coding
->composing
== COMPOSING_NO_RULE_TAIL
1293 this_bytes
= BYTES_BY_CHAR_HEAD (*dst
);
1294 while (this_bytes
--) *this_char_start
++ = *dst
++;
1295 dst
= this_char_start
;
1297 coding
->produced_char
++;
1299 coding
->composing
= COMPOSING_NO
;
1302 case '[': /* specification of direction */
1303 if (coding
->flags
& CODING_FLAG_ISO_NO_DIRECTION
)
1304 goto label_invalid_code
;
1305 /* For the moment, nested direction is not supported.
1306 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1307 left-to-right, and nonzero means right-to-left. */
1311 case ']': /* end of the current direction */
1312 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1314 case '0': /* end of the current direction */
1315 case '1': /* start of left-to-right direction */
1318 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1320 goto label_invalid_code
;
1323 case '2': /* start of right-to-left direction */
1326 coding
->mode
|= CODING_MODE_DIRECTION
;
1328 goto label_invalid_code
;
1332 goto label_invalid_code
;
1337 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1338 goto label_invalid_code
;
1339 if (c1
>= 0x28 && c1
<= 0x2B)
1340 { /* designation of DIMENSION1_CHARS94 character set */
1342 DECODE_DESIGNATION (c1
- 0x28, 1, 94, c2
);
1344 else if (c1
>= 0x2C && c1
<= 0x2F)
1345 { /* designation of DIMENSION1_CHARS96 character set */
1347 DECODE_DESIGNATION (c1
- 0x2C, 1, 96, c2
);
1351 goto label_invalid_code
;
1354 /* We must update these variables now. */
1355 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1356 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1360 while (src_base
< src
)
1361 *dst
++ = *src_base
++;
1362 coding
->fake_multibyte
= 1;
1367 result
= CODING_FINISH_INSUFFICIENT_SRC
;
1368 label_end_of_loop_2
:
1375 if (result
== CODING_FINISH_NORMAL
)
1376 result
= CODING_FINISH_INSUFFICIENT_DST
;
1377 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
1378 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
1380 /* This is the last block of the text to be decoded. We had
1381 better just flush out all remaining codes in the text
1382 although they are not valid characters. */
1383 src_bytes
= src_end
- src
;
1384 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
1385 src_bytes
= dst_end
- dst
;
1386 bcopy (src
, dst
, src_bytes
);
1389 coding
->fake_multibyte
= 1;
1393 coding
->consumed
= coding
->consumed_char
= src
- source
;
1394 coding
->produced
= dst
- destination
;
1398 /* ISO2022 encoding stuff. */
1401 /* It is not enough to say just "ISO2022" on encoding; we have to
1402 specify more details. In Emacs, each coding system of the ISO2022
1403 variants has the following specifications:
1404 1. Initial designation to G0 thru G3.
1405 2. Allows short-form designation?
1406 3. ASCII should be designated to G0 before control characters?
1407 4. ASCII should be designated to G0 at end of line?
1408 5. 7-bit environment or 8-bit environment?
1409 6. Use locking-shift?
1410 7. Use single-shift?
1411 And the following two are only for Japanese:
1412 8. Use ASCII in place of JIS0201-1976-Roman?
1413 9. Use JISX0208-1983 in place of JISX0208-1978?
1414 These specifications are encoded in `coding->flags' as flag bits
1415 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
details. */
1419 /* Produce codes (escape sequence) for designating CHARSET to graphic
1420 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1421 the coding system CODING allows (CODING_FLAG_ISO_SHORT_FORM), produce
the short-form designation sequence. */
1424 #define ENCODE_DESIGNATION(charset, reg, coding) \
1426 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1427 char *intermediate_char_94 = "()*+"; \
1428 char *intermediate_char_96 = ",-./"; \
1429 int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
1430 if (revision < 255) \
1432 *dst++ = ISO_CODE_ESC; \
1434 *dst++ = '@' + revision; \
1436 *dst++ = ISO_CODE_ESC; \
1437 if (CHARSET_DIMENSION (charset) == 1) \
1439 if (CHARSET_CHARS (charset) == 94) \
1440 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1442 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1447 if (CHARSET_CHARS (charset) == 94) \
1449 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1451 || final_char < '@' || final_char > 'B') \
1452 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1455 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1457 *dst++ = final_char; \
1458 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1461 /* The following two macros produce codes (control character or escape
1462 sequence) for ISO2022 single-shift functions (single-shift-2 and
single-shift-3). */
1465 #define ENCODE_SINGLE_SHIFT_2 \
1467 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1468 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1471 *dst++ = ISO_CODE_SS2; \
1472 coding->fake_multibyte = 1; \
1474 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1477 #define ENCODE_SINGLE_SHIFT_3 \
1479 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1480 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1483 *dst++ = ISO_CODE_SS3; \
1484 coding->fake_multibyte = 1; \
1486 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1489 /* The following four macros produce codes (control character or
1490 escape sequence) for ISO2022 locking-shift functions (shift-in,
1491 shift-out, locking-shift-2, and locking-shift-3). */
1493 #define ENCODE_SHIFT_IN \
1495 *dst++ = ISO_CODE_SI; \
1496 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1499 #define ENCODE_SHIFT_OUT \
1501 *dst++ = ISO_CODE_SO; \
1502 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1505 #define ENCODE_LOCKING_SHIFT_2 \
1507 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1508 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1511 #define ENCODE_LOCKING_SHIFT_3 \
1513 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1514 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1517 /* Produce codes for a DIMENSION1 character whose character set is
1518 CHARSET and whose position-code is C1. Designation and invocation
1519 sequences are also produced in advance if necessary. */
1522 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1524 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1526 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1527 *dst++ = c1 & 0x7F; \
1529 *dst++ = c1 | 0x80; \
1530 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1533 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1535 *dst++ = c1 & 0x7F; \
1538 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1540 *dst++ = c1 | 0x80; \
1543 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1544 && !coding->safe_charsets[charset]) \
1546 /* We should not encode this character, instead produce one or \
1548 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1549 if (CHARSET_WIDTH (charset) == 2) \
1550 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1554 /* Since CHARSET is not yet invoked to any graphic planes, we \
1555 must invoke it, or, at first, designate it to some graphic \
1556 register. Then repeat the loop to actually produce the \
1558 dst = encode_invocation_designation (charset, coding, dst); \
1561 /* Produce codes for a DIMENSION2 character whose character set is
1562 CHARSET and whose position-codes are C1 and C2. Designation and
1563 invocation codes are also produced in advance if necessary. */
1565 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1567 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1569 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1570 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1572 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1573 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1576 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1578 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1581 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1583 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1586 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1587 && !coding->safe_charsets[charset]) \
1589 /* We should not encode this character, instead produce one or \
1591 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1592 if (CHARSET_WIDTH (charset) == 2) \
1593 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1597 /* Since CHARSET is not yet invoked to any graphic planes, we \
1598 must invoke it, or, at first, designate it to some graphic \
1599 register. Then repeat the loop to actually produce the \
1601 dst = encode_invocation_designation (charset, coding, dst); \
1604 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1606 int c_alt, charset_alt; \
1607 if (!NILP (translation_table) \
1608 && ((c_alt = translate_char (translation_table, -1, \
1611 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1613 charset_alt = charset; \
1614 if (CHARSET_DIMENSION (charset_alt) == 1) \
1616 if (charset == CHARSET_ASCII \
1617 && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
1618 charset_alt = charset_latin_jisx0201; \
1619 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1623 if (charset == charset_jisx0208 \
1624 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
1625 charset_alt = charset_jisx0208_1978; \
1626 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1628 if (! COMPOSING_P (coding->composing)) \
1629 coding->consumed_char++; \
1632 /* Produce designation and invocation codes at a place pointed by DST
1633 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
Callers store the return value back into their DST. */
1637 encode_invocation_designation (charset
, coding
, dst
)
1639 struct coding_system
*coding
;
1642 int reg
; /* graphic register number */
1644 /* At first, check designations. */
1645 for (reg
= 0; reg
< 4; reg
++)
1646 if (charset
== CODING_SPEC_ISO_DESIGNATION (coding
, reg
))
1651 /* CHARSET is not yet designated to any graphic registers. */
1652 /* At first check the requested designation. */
1653 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1654 if (reg
== CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1655 /* Since CHARSET requests no special designation, designate it
1656 to graphic register 0. */
1659 ENCODE_DESIGNATION (charset
, reg
, coding
);
1662 if (CODING_SPEC_ISO_INVOCATION (coding
, 0) != reg
1663 && CODING_SPEC_ISO_INVOCATION (coding
, 1) != reg
)
1665 /* Since the graphic register REG is not invoked to any graphic
1666 planes, invoke it to graphic plane 0. */
1669 case 0: /* graphic register 0 */
1673 case 1: /* graphic register 1 */
1677 case 2: /* graphic register 2 */
1678 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1679 ENCODE_SINGLE_SHIFT_2
;
1681 ENCODE_LOCKING_SHIFT_2
;
1684 case 3: /* graphic register 3 */
1685 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1686 ENCODE_SINGLE_SHIFT_3
;
1688 ENCODE_LOCKING_SHIFT_3
;
1695 /* The following three macros produce codes for indicating composition:
   ESC '0' starts a composition without rules, ESC '2' starts a
   composition with rules, and ESC '1' ends the current composition.
   Each expands to a comma expression that stores two bytes through
   `dst' and advances it, so a byte pointer named `dst' must be in
   scope at the point of use. */
1696 #define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1697 #define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1698 #define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1700 /* The following three macros produce codes for indicating direction
1702 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1704 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1705 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1707 *dst++ = ISO_CODE_CSI; \
/* Store the control sequence that starts right-to-left text: the
   control sequence introducer followed by '2' and ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is a statement macro (its body
   is an `if' statement), so it cannot be an operand of the comma
   operator; the sequence is therefore built as a single
   `do ... while (0)' statement.  A byte pointer `dst', plus whatever
   the introducer macro refers to, must be in scope at the point of
   use.  */
#define ENCODE_DIRECTION_R2L \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '2'; \
    *dst++ = ']'; \
  } while (0)

/* Likewise, but store the introducer followed by '0' and ']', which
   the decoder in this file treats as the end of the current
   direction.  */
#define ENCODE_DIRECTION_L2R \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '0'; \
    *dst++ = ']'; \
  } while (0)
1716 /* Produce codes for designation and invocation to reset the graphic
1717 planes and registers to initial state. */
1718 #define ENCODE_RESET_PLANE_AND_REGISTER \
1721 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1723 for (reg = 0; reg < 4; reg++) \
1724 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1725 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1726 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1727 ENCODE_DESIGNATION \
1728 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1731 /* Produce designation sequences of charsets in the line started from
1732 SRC to a place pointed by *DSTP, and update DSTP.
1734 If the current block ends before any end-of-line, we may fail to
1735 find all the necessary designations. */
1738 encode_designation_at_bol (coding
, table
, src
, src_end
, dstp
)
1739 struct coding_system
*coding
;
1741 unsigned char *src
, *src_end
, **dstp
;
1743 int charset
, c
, found
= 0, reg
;
1744 /* Table of charsets to be designated to each graphic register. */
1746 unsigned char *dst
= *dstp
;
1748 for (reg
= 0; reg
< 4; reg
++)
1751 while (src
< src_end
&& *src
!= '\n' && found
< 4)
1753 int bytes
= BYTES_BY_CHAR_HEAD (*src
);
1756 charset
= CHARSET_AT (src
);
1760 unsigned char c1
, c2
;
1762 SPLIT_STRING(src
, bytes
, charset
, c1
, c2
);
1763 if ((c_alt
= translate_char (table
, -1, charset
, c1
, c2
)) >= 0)
1764 charset
= CHAR_CHARSET (c_alt
);
1767 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1768 if (reg
!= CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
&& r
[reg
] < 0)
1779 for (reg
= 0; reg
< 4; reg
++)
1781 && CODING_SPEC_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
1782 ENCODE_DESIGNATION (r
[reg
], reg
, coding
);
1787 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1790 encode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
1791 struct coding_system
*coding
;
1792 unsigned char *source
, *destination
;
1793 int src_bytes
, dst_bytes
;
1795 unsigned char *src
= source
;
1796 unsigned char *src_end
= source
+ src_bytes
;
1797 unsigned char *dst
= destination
;
1798 unsigned char *dst_end
= destination
+ dst_bytes
;
1799 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1800 from DST_END to assure overflow checking is necessary only at the
head of loop. */
1802 unsigned char *adjusted_dst_end
= dst_end
- 19;
1803 Lisp_Object translation_table
1804 = coding
->translation_table_for_encode
;
1805 int result
= CODING_FINISH_NORMAL
;
1807 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
1808 translation_table
= Vstandard_translation_table_for_encode
;
1810 coding
->consumed_char
= 0;
1811 coding
->fake_multibyte
= 0;
1812 while (src
< src_end
&& (dst_bytes
1813 ? (dst
< adjusted_dst_end
)
1814 : (dst
< src
- 19)))
1816 /* SRC_BASE remembers the start position in source in each loop.
1817 The loop will be exited when there's not enough source text
1818 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1819 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1820 reset to SRC_BASE before exiting. */
1821 unsigned char *src_base
= src
;
1822 int charset
, c1
, c2
, c3
, c4
;
1824 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
1825 && CODING_SPEC_ISO_BOL (coding
))
1827 /* We have to produce designation sequences if any now. */
1828 encode_designation_at_bol (coding
, translation_table
,
1829 src
, src_end
, &dst
);
1830 CODING_SPEC_ISO_BOL (coding
) = 0;
1834 /* If we are seeing a component of a composite character, we are
1835 seeing a leading-code encoded irregularly for composition, or
1836 a composition rule if composing with rule. We must set C1 to
1837 a normal leading-code or an ASCII code. If we are not seeing
1838 a composite character, we must reset composition,
1839 designation, and invocation states. */
1840 if (COMPOSING_P (coding
->composing
))
1844 /* We are not in a composite character any longer. */
1845 coding
->composing
= COMPOSING_NO
;
1846 ENCODE_RESET_PLANE_AND_REGISTER
;
1847 ENCODE_COMPOSITION_END
;
1851 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1854 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1857 else if (coding
->composing
== COMPOSING_WITH_RULE_HEAD
)
1858 coding
->composing
= COMPOSING_WITH_RULE_RULE
;
1861 /* This is an ASCII component. */
1866 /* This is a leading-code of non ASCII component. */
1871 /* Now encode one character. C1 is a control character, an
1872 ASCII character, or a leading-code of multi-byte character. */
1873 switch (emacs_code_class
[c1
])
1875 case EMACS_ascii_code
:
1876 ENCODE_ISO_CHARACTER (CHARSET_ASCII
, c1
, /* dummy */ c2
);
1879 case EMACS_control_code
:
1880 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1881 ENCODE_RESET_PLANE_AND_REGISTER
;
1883 coding
->consumed_char
++;
1886 case EMACS_carriage_return_code
:
1887 if (! (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
1889 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1890 ENCODE_RESET_PLANE_AND_REGISTER
;
1892 coding
->consumed_char
++;
1895 /* fall down to treat '\r' as '\n' ... */
1897 case EMACS_linefeed_code
:
1898 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_EOL
)
1899 ENCODE_RESET_PLANE_AND_REGISTER
;
1900 if (coding
->flags
& CODING_FLAG_ISO_INIT_AT_BOL
)
1901 bcopy (coding
->spec
.iso2022
.initial_designation
,
1902 coding
->spec
.iso2022
.current_designation
,
1903 sizeof coding
->spec
.iso2022
.initial_designation
);
1904 if (coding
->eol_type
== CODING_EOL_LF
1905 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
1906 *dst
++ = ISO_CODE_LF
;
1907 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1908 *dst
++ = ISO_CODE_CR
, *dst
++ = ISO_CODE_LF
;
1910 *dst
++ = ISO_CODE_CR
;
1911 CODING_SPEC_ISO_BOL (coding
) = 1;
1912 coding
->consumed_char
++;
1915 case EMACS_leading_code_2
:
1919 /* invalid sequence */
1922 coding
->consumed_char
++;
1925 ENCODE_ISO_CHARACTER (c1
, c2
, /* dummy */ c3
);
1928 case EMACS_leading_code_3
:
1929 TWO_MORE_BYTES (c2
, c3
);
1930 if (c2
< 0xA0 || c3
< 0xA0)
1932 /* invalid sequence */
1935 coding
->consumed_char
++;
1937 else if (c1
< LEADING_CODE_PRIVATE_11
)
1938 ENCODE_ISO_CHARACTER (c1
, c2
, c3
);
1940 ENCODE_ISO_CHARACTER (c2
, c3
, /* dummy */ c4
);
1943 case EMACS_leading_code_4
:
1944 THREE_MORE_BYTES (c2
, c3
, c4
);
1945 if (c2
< 0xA0 || c3
< 0xA0 || c4
< 0xA0)
1947 /* invalid sequence */
1950 coding
->consumed_char
++;
1953 ENCODE_ISO_CHARACTER (c2
, c3
, c4
);
1956 case EMACS_leading_code_composition
:
1960 /* invalid sequence */
1963 coding
->consumed_char
++;
1965 else if (c2
== 0xFF)
1967 ENCODE_RESET_PLANE_AND_REGISTER
;
1968 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1969 ENCODE_COMPOSITION_WITH_RULE_START
;
1970 coding
->consumed_char
++;
1974 ENCODE_RESET_PLANE_AND_REGISTER
;
1975 /* Rewind one byte because it is a character code of
1976 composition elements. */
1978 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
1979 ENCODE_COMPOSITION_NO_RULE_START
;
1980 coding
->consumed_char
++;
1984 case EMACS_invalid_code
:
1986 coding
->consumed_char
++;
1991 result
= CODING_FINISH_INSUFFICIENT_SRC
;
1996 if (src
< src_end
&& result
== CODING_FINISH_NORMAL
)
1997 result
= CODING_FINISH_INSUFFICIENT_DST
;
1999 /* If this is the last block of the text to be encoded, we must
2000 reset graphic planes and registers to the initial state, and
2001 flush out the carryover if any. */
2002 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
2004 ENCODE_RESET_PLANE_AND_REGISTER
;
2005 if (COMPOSING_P (coding
->composing
))
2006 ENCODE_COMPOSITION_END
;
2007 if (result
== CODING_FINISH_INSUFFICIENT_SRC
)
2009 while (src
< src_end
&& dst
< dst_end
)
2013 coding
->consumed
= src
- source
;
2014 coding
->produced
= coding
->produced_char
= dst
- destination
;
2019 /*** 4. SJIS and BIG5 handlers ***/
2021 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2022 quite widely. So, for the moment, Emacs supports them in the bare
2023 C code. But, in the future, they may be supported only by CCL. */
2025 /* SJIS is a coding system encoding three character sets: ASCII, right
2026 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2027 as is. A character of charset katakana-jisx0201 is encoded by
2028 "position-code + 0x80". A character of charset japanese-jisx0208
2029 is encoded in two bytes, but its two position-codes are divided and
2030 shifted so that they fit in the ranges below.
2032 --- CODE RANGE of SJIS ---
2033 (character set)       (range)
ASCII                 0x00 .. 0x7F
2035 KATAKANA-JISX0201     0xA0 .. 0xDF
2036 JISX0208 (1st byte)   0x80 .. 0x9F and 0xE0 .. 0xEF
2037          (2nd byte)   0x40 .. 0x7E and 0x80 .. 0xFC
2038 -------------------------------
*/
2042 /* BIG5 is a coding system encoding two character sets: ASCII and
2043 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2044 character set and is encoded in two bytes.
2046 --- CODE RANGE of BIG5 ---
2047 (character set)   (range)
ASCII             0x00 .. 0x7F
2049 Big5 (1st byte)   0xA1 .. 0xFE
2050      (2nd byte)   0x40 .. 0x7E and 0xA1 .. 0xFE
2051 --------------------------
2053 Since the number of characters in Big5 is larger than the maximum
2054 number of characters in an Emacs charset (96x96), it can't be handled as one
2055 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2056 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2057 contains frequently used characters and the latter contains less
2058 frequently used characters. */
2060 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2061 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2062 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2063 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2065 /* Number of Big5 characters which have the same code in 1st byte.
   The 2nd byte takes (0x7F - 0x40) = 63 values in 0x40 .. 0x7E plus
   (0xFF - 0xA1) = 94 values in 0xA1 .. 0xFE, i.e. 157 in total. */
2066 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2068 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
2071 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
2073 charset = charset_big5_1; \
2076 charset = charset_big5_2; \
2077 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
2079 c1 = temp / (0xFF - 0xA1) + 0x21; \
2080 c2 = temp % (0xFF - 0xA1) + 0x21; \
2083 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \
2085 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
2086 if (charset == charset_big5_2) \
2087 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2088 b1 = temp / BIG5_SAME_ROW + 0xA1; \
2089 b2 = temp % BIG5_SAME_ROW; \
2090 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2093 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
2095 int c_alt, charset_alt = (charset); \
2096 if (!NILP (translation_table) \
2097 && ((c_alt = translate_char (translation_table, \
2098 -1, (charset), c1, c2)) >= 0)) \
2099 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
2100 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
2101 DECODE_CHARACTER_ASCII (c1); \
2102 else if (CHARSET_DIMENSION (charset_alt) == 1) \
2103 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
2105 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
2108 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
2110 int c_alt, charset_alt; \
2111 if (!NILP (translation_table) \
2112 && ((c_alt = translate_char (translation_table, -1, \
2115 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
2117 charset_alt = charset; \
2118 if (charset_alt == charset_ascii) \
2120 else if (CHARSET_DIMENSION (charset_alt) == 1) \
2122 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
2126 *dst++ = charset_alt, *dst++ = c1; \
2127 coding->fake_multibyte = 1; \
2132 c1 &= 0x7F, c2 &= 0x7F; \
2133 if (sjis_p && charset_alt == charset_jisx0208) \
2135 unsigned char s1, s2; \
2137 ENCODE_SJIS (c1, c2, s1, s2); \
2138 *dst++ = s1, *dst++ = s2; \
2139 coding->fake_multibyte = 1; \
2142 && (charset_alt == charset_big5_1 \
2143 || charset_alt == charset_big5_2)) \
2145 unsigned char b1, b2; \
2147 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
2148 *dst++ = b1, *dst++ = b2; \
2152 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
2153 coding->fake_multibyte = 1; \
2156 coding->consumed_char++; \
2159 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2160 Check if a text is encoded in SJIS. If it is, return
2161 CODING_CATEGORY_MASK_SJIS, else return 0. */
2164 detect_coding_sjis (src
, src_end
)
2165 unsigned char *src
, *src_end
;
2169 while (src
< src_end
)
2172 if ((c
>= 0x80 && c
< 0xA0) || c
>= 0xE0)
2174 if (src
< src_end
&& *src
++ < 0x40)
2178 return CODING_CATEGORY_MASK_SJIS
;
2181 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2182 Check if a text is encoded in BIG5. If it is, return
2183 CODING_CATEGORY_MASK_BIG5, else return 0. */
2186 detect_coding_big5 (src
, src_end
)
2187 unsigned char *src
, *src_end
;
2191 while (src
< src_end
)
2199 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
2203 return CODING_CATEGORY_MASK_BIG5
;
2206 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2207 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
2210 decode_coding_sjis_big5 (coding
, source
, destination
,
2211 src_bytes
, dst_bytes
, sjis_p
)
2212 struct coding_system
*coding
;
2213 unsigned char *source
, *destination
;
2214 int src_bytes
, dst_bytes
;
2217 unsigned char *src
= source
;
2218 unsigned char *src_end
= source
+ src_bytes
;
2219 unsigned char *dst
= destination
;
2220 unsigned char *dst_end
= destination
+ dst_bytes
;
2221 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2222 from DST_END to assure overflow checking is necessary only at the
2224 unsigned char *adjusted_dst_end
= dst_end
- 3;
2225 Lisp_Object translation_table
2226 = coding
->translation_table_for_decode
;
2227 int result
= CODING_FINISH_NORMAL
;
2229 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
2230 translation_table
= Vstandard_translation_table_for_decode
;
2232 coding
->produced_char
= 0;
2233 coding
->fake_multibyte
= 0;
2234 while (src
< src_end
&& (dst_bytes
2235 ? (dst
< adjusted_dst_end
)
2238 /* SRC_BASE remembers the start position in source in each loop.
2239 The loop will be exited when there's not enough source text
2240 to analyze two-byte character (within macro ONE_MORE_BYTE).
2241 In that case, SRC is reset to SRC_BASE before exiting. */
2242 unsigned char *src_base
= src
;
2243 unsigned char c1
= *src
++, c2
, c3
, c4
;
2249 if (coding
->eol_type
== CODING_EOL_CRLF
)
2254 else if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2256 result
= CODING_FINISH_INCONSISTENT_EOL
;
2257 goto label_end_of_loop_2
;
2260 /* To process C2 again, SRC is subtracted by 1. */
2263 else if (coding
->eol_type
== CODING_EOL_CR
)
2269 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2270 && (coding
->eol_type
== CODING_EOL_CR
2271 || coding
->eol_type
== CODING_EOL_CRLF
))
2273 result
= CODING_FINISH_INCONSISTENT_EOL
;
2274 goto label_end_of_loop_2
;
2278 coding
->produced_char
++;
2281 DECODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
2286 if (c1
< 0xA0 || (c1
>= 0xE0 && c1
< 0xF0))
2288 /* SJIS -> JISX0208 */
2290 if (c2
>= 0x40 && c2
!= 0x7F && c2
<= 0xFC)
2292 DECODE_SJIS (c1
, c2
, c3
, c4
);
2293 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208
, c3
, c4
);
2296 goto label_invalid_code_2
;
2299 /* SJIS -> JISX0201-Kana */
2300 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201
, c1
,
2303 goto label_invalid_code_1
;
2308 if (c1
>= 0xA1 && c1
<= 0xFE)
2311 if ((c2
>= 0x40 && c2
<= 0x7E) || (c2
>= 0xA1 && c2
<= 0xFE))
2315 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
2316 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
2319 goto label_invalid_code_2
;
2322 goto label_invalid_code_1
;
2327 label_invalid_code_1
:
2329 coding
->produced_char
++;
2330 coding
->fake_multibyte
= 1;
2333 label_invalid_code_2
:
2334 *dst
++ = c1
; *dst
++= c2
;
2335 coding
->produced_char
+= 2;
2336 coding
->fake_multibyte
= 1;
2340 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2341 label_end_of_loop_2
:
2348 if (result
== CODING_FINISH_NORMAL
)
2349 result
= CODING_FINISH_INSUFFICIENT_DST
;
2350 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
2351 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
2353 src_bytes
= src_end
- src
;
2354 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
2355 src_bytes
= dst_end
- dst
;
2356 /* Last block: flush the undecoded tail of the source into the
destination as is.  bcopy takes (from, to, length), so SRC comes
first, matching the same flush in decode_eol.  */
bcopy (src
, dst
, src_bytes
);
2359 coding
->fake_multibyte
= 1;
2363 coding
->consumed
= coding
->consumed_char
= src
- source
;
2364 coding
->produced
= dst
- destination
;
2368 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2369 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2370 `charset_jisx0208', `charset_big5_1', and `charset_big5_2'. We are
2371 sure that all these charsets are registered as official charset
2372 (i.e. do not have extended leading-codes). Characters of other
2373 charsets are produced without any encoding. If SJIS_P is 1, encode
2374 SJIS text, else encode BIG5 text. */
2377 encode_coding_sjis_big5 (coding
, source
, destination
,
2378 src_bytes
, dst_bytes
, sjis_p
)
2379 struct coding_system
*coding
;
2380 unsigned char *source
, *destination
;
2381 int src_bytes
, dst_bytes
;
2384 unsigned char *src
= source
;
2385 unsigned char *src_end
= source
+ src_bytes
;
2386 unsigned char *dst
= destination
;
2387 unsigned char *dst_end
= destination
+ dst_bytes
;
2388 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2389 from DST_END to assure overflow checking is necessary only at the
2391 unsigned char *adjusted_dst_end
= dst_end
- 1;
2392 Lisp_Object translation_table
2393 = coding
->translation_table_for_encode
;
2394 int result
= CODING_FINISH_NORMAL
;
2396 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
2397 translation_table
= Vstandard_translation_table_for_encode
;
2399 coding
->consumed_char
= 0;
2400 coding
->fake_multibyte
= 0;
2401 while (src
< src_end
&& (dst_bytes
2402 ? (dst
< adjusted_dst_end
)
2405 /* SRC_BASE remembers the start position in source in each loop.
2406 The loop will be exited when there's not enough source text
2407 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2408 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2410 unsigned char *src_base
= src
;
2411 unsigned char c1
= *src
++, c2
, c3
, c4
;
2413 if (coding
->composing
)
2420 else if (c1
>= 0xA0)
2423 coding
->composing
= 0;
2426 switch (emacs_code_class
[c1
])
2428 case EMACS_ascii_code
:
2429 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
2432 case EMACS_control_code
:
2434 coding
->consumed_char
++;
2437 case EMACS_carriage_return_code
:
2438 if (! (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
2441 coding
->consumed_char
++;
2444 /* fall through to treat '\r' as '\n' ... */
2446 case EMACS_linefeed_code
:
2447 if (coding
->eol_type
== CODING_EOL_LF
2448 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2450 else if (coding
->eol_type
== CODING_EOL_CRLF
)
2451 *dst
++ = '\r', *dst
++ = '\n';
2454 coding
->consumed_char
++;
2457 case EMACS_leading_code_2
:
2459 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, /* dummy */ c3
);
2462 case EMACS_leading_code_3
:
2463 TWO_MORE_BYTES (c2
, c3
);
2464 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, c3
);
2467 case EMACS_leading_code_4
:
2468 THREE_MORE_BYTES (c2
, c3
, c4
);
2469 ENCODE_SJIS_BIG5_CHARACTER (c2
, c3
, c4
);
2472 case EMACS_leading_code_composition
:
2473 coding
->composing
= 1;
2476 default: /* i.e. case EMACS_invalid_code: */
2478 coding
->consumed_char
++;
2483 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2488 if (result
== CODING_FINISH_NORMAL
2490 result
= CODING_FINISH_INSUFFICIENT_DST
;
2491 coding
->consumed
= src
- source
;
2492 coding
->produced
= coding
->produced_char
= dst
- destination
;
2497 /*** 5. CCL handlers ***/
2499 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2500 Check if a text is encoded in a coding system of which
2501 encoder/decoder are written in CCL program. If it is, return
2502 CODING_CATEGORY_MASK_CCL, else return 0. */
2505 detect_coding_ccl (src
, src_end
)
2506 unsigned char *src
, *src_end
;
2508 unsigned char *valid
;
2510 /* No coding system is assigned to coding-category-ccl. */
2511 if (!coding_system_table
[CODING_CATEGORY_IDX_CCL
])
2514 valid
= coding_system_table
[CODING_CATEGORY_IDX_CCL
]->spec
.ccl
.valid_codes
;
2515 while (src
< src_end
)
2517 if (! valid
[*src
]) return 0;
2520 return CODING_CATEGORY_MASK_CCL
;
2524 /*** 6. End-of-line handlers ***/
2526 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2527 This function is called only when `coding->eol_type' is
2528 CODING_EOL_CRLF or CODING_EOL_CR. */
2531 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2532 struct coding_system
*coding
;
2533 unsigned char *source
, *destination
;
2534 int src_bytes
, dst_bytes
;
2536 unsigned char *src
= source
;
2537 unsigned char *src_end
= source
+ src_bytes
;
2538 unsigned char *dst
= destination
;
2539 unsigned char *dst_end
= destination
+ dst_bytes
;
2541 int result
= CODING_FINISH_NORMAL
;
2543 coding
->fake_multibyte
= 0;
2548 switch (coding
->eol_type
)
2550 case CODING_EOL_CRLF
:
2552 /* Since the maximum bytes produced by each loop is 2, we
2553 subtract 1 from DST_END to assure overflow checking is
2554 necessary only at the head of loop. */
2555 unsigned char *adjusted_dst_end
= dst_end
- 1;
2557 while (src
< src_end
&& (dst_bytes
2558 ? (dst
< adjusted_dst_end
)
2561 unsigned char *src_base
= src
;
2571 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2573 result
= CODING_FINISH_INCONSISTENT_EOL
;
2574 goto label_end_of_loop_2
;
2578 if (BASE_LEADING_CODE_P (c
))
2579 coding
->fake_multibyte
= 1;
2583 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
))
2585 result
= CODING_FINISH_INCONSISTENT_EOL
;
2586 goto label_end_of_loop_2
;
2591 if (BASE_LEADING_CODE_P (c
))
2592 coding
->fake_multibyte
= 1;
2597 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2598 label_end_of_loop_2
:
2604 if (result
== CODING_FINISH_NORMAL
)
2605 result
= CODING_FINISH_INSUFFICIENT_DST
;
2606 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
2607 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
2609 /* This is the last block of the text to be decoded.
2610 We flush out all remaining codes. */
2611 src_bytes
= src_end
- src
;
2612 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
2613 src_bytes
= dst_end
- dst
;
2614 bcopy (src
, dst
, src_bytes
);
2623 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2625 while (src
< src_end
)
2627 if ((c
= *src
++) == '\n')
2629 if (BASE_LEADING_CODE_P (c
))
2630 coding
->fake_multibyte
= 1;
2634 src_bytes
= src
- source
;
2635 result
= CODING_FINISH_INCONSISTENT_EOL
;
2638 if (dst_bytes
&& src_bytes
> dst_bytes
)
2640 result
= CODING_FINISH_INSUFFICIENT_DST
;
2641 src_bytes
= dst_bytes
;
2644 bcopy (source
, destination
, src_bytes
);
2646 safe_bcopy (source
, destination
, src_bytes
);
2647 src
= source
+ src_bytes
;
2648 while (src_bytes
--) if (*dst
++ == '\r') dst
[-1] = '\n';
2651 default: /* i.e. case: CODING_EOL_LF */
2652 if (dst_bytes
&& src_bytes
> dst_bytes
)
2654 result
= CODING_FINISH_INSUFFICIENT_DST
;
2655 src_bytes
= dst_bytes
;
2658 bcopy (source
, destination
, src_bytes
);
2660 safe_bcopy (source
, destination
, src_bytes
);
2663 coding
->fake_multibyte
= 1;
2667 coding
->consumed
= coding
->consumed_char
= src
- source
;
2668 coding
->produced
= coding
->produced_char
= dst
- destination
;
2672 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2673 format of end-of-line according to `coding->eol_type'. If
2674 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2675 '\r' in source text also means end-of-line. */
2678 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2679 struct coding_system
*coding
;
2680 unsigned char *source
, *destination
;
2681 int src_bytes
, dst_bytes
;
2683 unsigned char *src
= source
;
2684 unsigned char *dst
= destination
;
2685 int result
= CODING_FINISH_NORMAL
;
2687 coding
->fake_multibyte
= 0;
2689 if (coding
->eol_type
== CODING_EOL_CRLF
)
2692 unsigned char *src_end
= source
+ src_bytes
;
2693 unsigned char *dst_end
= destination
+ dst_bytes
;
2694 /* Since the maximum bytes produced by each loop is 2, we
2695 subtract 1 from DST_END to assure overflow checking is
2696 necessary only at the head of loop. */
2697 unsigned char *adjusted_dst_end
= dst_end
- 1;
2699 while (src
< src_end
&& (dst_bytes
2700 ? (dst
< adjusted_dst_end
)
2705 || (c
== '\r' && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)))
2706 *dst
++ = '\r', *dst
++ = '\n';
2710 if (BASE_LEADING_CODE_P (c
))
2711 coding
->fake_multibyte
= 1;
2715 result
= CODING_FINISH_INSUFFICIENT_DST
;
2721 if (dst_bytes
&& src_bytes
> dst_bytes
)
2723 src_bytes
= dst_bytes
;
2724 result
= CODING_FINISH_INSUFFICIENT_DST
;
2727 bcopy (source
, destination
, src_bytes
);
2729 safe_bcopy (source
, destination
, src_bytes
);
2730 dst_bytes
= src_bytes
;
2731 if (coding
->eol_type
== CODING_EOL_CR
)
2735 if ((c
= *dst
++) == '\n')
2737 else if (BASE_LEADING_CODE_P (c
))
2738 coding
->fake_multibyte
= 1;
2743 if (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)
2746 if (*dst
++ == '\r') dst
[-1] = '\n';
2748 coding
->fake_multibyte
= 1;
2750 src
= source
+ dst_bytes
;
2751 dst
= destination
+ dst_bytes
;
2754 coding
->consumed
= coding
->consumed_char
= src
- source
;
2755 coding
->produced
= coding
->produced_char
= dst
- destination
;
2760 /*** 7. C library functions ***/
2762 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2763 has a property `coding-system'. The value of this property is a
2764 vector of length 5 (called as coding-vector). Among elements of
2765 this vector, the first (element[0]) and the fifth (element[4])
2766 carry important information for decoding/encoding. Before
2767 decoding/encoding, this information should be set in fields of a
2768 structure of type `coding_system'.
2770 A value of property `coding-system' can be a symbol of another
2771 subsidiary coding-system. In that case, Emacs gets coding-vector
2774 `element[0]' contains information to be set in `coding->type'. The
2775 value and its meaning is as follows:
2777 0 -- coding_type_emacs_mule
2778 1 -- coding_type_sjis
2779 2 -- coding_type_iso2022
2780 3 -- coding_type_big5
2781 4 -- coding_type_ccl encoder/decoder written in CCL
2782 nil -- coding_type_no_conversion
2783 t -- coding_type_undecided (automatic conversion on decoding,
2784 no-conversion on encoding)
2786 `element[4]' contains information to be set in `coding->flags' and
2787 `coding->spec'. The meaning varies by `coding->type'.
2789 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2790 of length 32 (of which the first 17 sub-elements are used now).
2791 Meanings of these sub-elements are:
2793 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2794 If the value is an integer of valid charset, the charset is
2795 assumed to be designated to graphic register N initially.
2797 If the value is minus, it is a minus value of charset which
2798 reserves graphic register N, which means that the charset is
2799 not designated initially but should be designated to graphic
2800 register N just before encoding a character in that charset.
2802 If the value is nil, graphic register N is never used on
2805 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2806 Each value takes t or nil. See the section ISO2022 of
2807 `coding.h' for more information.
2809 If `coding->type' is `coding_type_big5', element[4] is t to denote
2810 BIG5-ETen or nil to denote BIG5-HKU.
2812 If `coding->type' takes the other value, element[4] is ignored.
2814 Emacs Lisp's coding system also carries information about format of
2815 end-of-line in a value of property `eol-type'. If the value is
2816 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2817 means CODING_EOL_CR. If it is not integer, it should be a vector
2818 of subsidiary coding systems of which property `eol-type' has one
2823 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2824 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2825 is setup so that no conversion is necessary and return -1, else
2829 setup_coding_system (coding_system
, coding
)
2830 Lisp_Object coding_system
;
2831 struct coding_system
*coding
;
2833 Lisp_Object coding_spec
, coding_type
, eol_type
, plist
;
2837 /* Initialize some fields required for all kinds of coding systems. */
2838 coding
->symbol
= coding_system
;
2839 coding
->common_flags
= 0;
2841 coding
->heading_ascii
= -1;
2842 coding
->post_read_conversion
= coding
->pre_write_conversion
= Qnil
;
2843 coding_spec
= Fget (coding_system
, Qcoding_system
);
2844 if (!VECTORP (coding_spec
)
2845 || XVECTOR (coding_spec
)->size
!= 5
2846 || !CONSP (XVECTOR (coding_spec
)->contents
[3]))
2847 goto label_invalid_coding_system
;
2849 eol_type
= inhibit_eol_conversion
? Qnil
: Fget (coding_system
, Qeol_type
);
2850 if (VECTORP (eol_type
))
2852 coding
->eol_type
= CODING_EOL_UNDECIDED
;
2853 coding
->common_flags
= CODING_REQUIRE_DETECTION_MASK
;
2855 else if (XFASTINT (eol_type
) == 1)
2857 coding
->eol_type
= CODING_EOL_CRLF
;
2858 coding
->common_flags
2859 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2861 else if (XFASTINT (eol_type
) == 2)
2863 coding
->eol_type
= CODING_EOL_CR
;
2864 coding
->common_flags
2865 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2868 coding
->eol_type
= CODING_EOL_LF
;
2870 coding_type
= XVECTOR (coding_spec
)->contents
[0];
2871 /* Try short cut. */
2872 if (SYMBOLP (coding_type
))
2874 if (EQ (coding_type
, Qt
))
2876 coding
->type
= coding_type_undecided
;
2877 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
2880 coding
->type
= coding_type_no_conversion
;
2884 /* Initialize remaining fields. */
2885 coding
->composing
= 0;
2887 /* Get values of coding system properties:
2888 `post-read-conversion', `pre-write-conversion',
2889 `translation-table-for-decode', `translation-table-for-encode'. */
2890 plist
= XVECTOR (coding_spec
)->contents
[3];
2891 coding
->post_read_conversion
= Fplist_get (plist
, Qpost_read_conversion
);
2892 coding
->pre_write_conversion
= Fplist_get (plist
, Qpre_write_conversion
);
2893 val
= Fplist_get (plist
, Qtranslation_table_for_decode
);
2895 val
= Fget (val
, Qtranslation_table_for_decode
);
2896 coding
->translation_table_for_decode
= CHAR_TABLE_P (val
) ? val
: Qnil
;
2897 val
= Fplist_get (plist
, Qtranslation_table_for_encode
);
2899 val
= Fget (val
, Qtranslation_table_for_encode
);
2900 coding
->translation_table_for_encode
= CHAR_TABLE_P (val
) ? val
: Qnil
;
2901 val
= Fplist_get (plist
, Qcoding_category
);
2904 val
= Fget (val
, Qcoding_category_index
);
2906 coding
->category_idx
= XINT (val
);
2908 goto label_invalid_coding_system
;
2911 goto label_invalid_coding_system
;
2913 val
= Fplist_get (plist
, Qsafe_charsets
);
2916 for (i
= 0; i
<= MAX_CHARSET
; i
++)
2917 coding
->safe_charsets
[i
] = 1;
2921 bzero (coding
->safe_charsets
, MAX_CHARSET
+ 1);
2924 if ((i
= get_charset_id (XCONS (val
)->car
)) >= 0)
2925 coding
->safe_charsets
[i
] = 1;
2926 val
= XCONS (val
)->cdr
;
2930 switch (XFASTINT (coding_type
))
2933 coding
->type
= coding_type_emacs_mule
;
2934 if (!NILP (coding
->post_read_conversion
))
2935 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
2936 if (!NILP (coding
->pre_write_conversion
))
2937 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
2941 coding
->type
= coding_type_sjis
;
2942 coding
->common_flags
2943 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2947 coding
->type
= coding_type_iso2022
;
2948 coding
->common_flags
2949 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2951 Lisp_Object val
, temp
;
2953 int i
, charset
, reg_bits
= 0;
2955 val
= XVECTOR (coding_spec
)->contents
[4];
2957 if (!VECTORP (val
) || XVECTOR (val
)->size
!= 32)
2958 goto label_invalid_coding_system
;
2960 flags
= XVECTOR (val
)->contents
;
2962 = ((NILP (flags
[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM
)
2963 | (NILP (flags
[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL
)
2964 | (NILP (flags
[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL
)
2965 | (NILP (flags
[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS
)
2966 | (NILP (flags
[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT
)
2967 | (NILP (flags
[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT
)
2968 | (NILP (flags
[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN
)
2969 | (NILP (flags
[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS
)
2970 | (NILP (flags
[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION
)
2971 | (NILP (flags
[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL
)
2972 | (NILP (flags
[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
2973 | (NILP (flags
[15]) ? 0 : CODING_FLAG_ISO_SAFE
)
2974 | (NILP (flags
[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA
)
2977 /* Invoke graphic register 0 to plane 0. */
2978 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
2979 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2980 CODING_SPEC_ISO_INVOCATION (coding
, 1)
2981 = (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
? -1 : 1);
2982 /* Not single shifting at first. */
2983 CODING_SPEC_ISO_SINGLE_SHIFTING (coding
) = 0;
2984 /* Beginning of buffer should also be regarded as bol. */
2985 CODING_SPEC_ISO_BOL (coding
) = 1;
2987 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2988 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = 255;
2989 val
= Vcharset_revision_alist
;
2992 charset
= get_charset_id (Fcar_safe (XCONS (val
)->car
));
2994 && (temp
= Fcdr_safe (XCONS (val
)->car
), INTEGERP (temp
))
2995 && (i
= XINT (temp
), (i
>= 0 && (i
+ '@') < 128)))
2996 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = i
;
2997 val
= XCONS (val
)->cdr
;
3000 /* Check FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations.
3001 FLAGS[REG] can be one of below:
3002 integer CHARSET: CHARSET occupies register I,
3003 t: designate nothing to REG initially, but can be used
3005 list of integer, nil, or t: designate the first
3006 element (if integer) to REG initially, the remaining
3007 elements (if integer) is designated to REG on request,
3008 if an element is t, REG can be used by any charsets,
3009 nil: REG is never used. */
3010 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3011 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3012 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
;
3013 for (i
= 0; i
< 4; i
++)
3015 if (INTEGERP (flags
[i
])
3016 && (charset
= XINT (flags
[i
]), CHARSET_VALID_P (charset
))
3017 || (charset
= get_charset_id (flags
[i
])) >= 0)
3019 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
3020 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) = i
;
3022 else if (EQ (flags
[i
], Qt
))
3024 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3026 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
3028 else if (CONSP (flags
[i
]))
3033 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
3034 if (INTEGERP (XCONS (tail
)->car
)
3035 && (charset
= XINT (XCONS (tail
)->car
),
3036 CHARSET_VALID_P (charset
))
3037 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
3039 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
3040 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) =i
;
3043 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3044 tail
= XCONS (tail
)->cdr
;
3045 while (CONSP (tail
))
3047 if (INTEGERP (XCONS (tail
)->car
)
3048 && (charset
= XINT (XCONS (tail
)->car
),
3049 CHARSET_VALID_P (charset
))
3050 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
3051 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3053 else if (EQ (XCONS (tail
)->car
, Qt
))
3055 tail
= XCONS (tail
)->cdr
;
3059 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3061 CODING_SPEC_ISO_DESIGNATION (coding
, i
)
3062 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
);
3065 if (reg_bits
&& ! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
3067 /* REG 1 can be used only by locking shift in 7-bit env. */
3068 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
3070 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
3071 /* Without any shifting, only REG 0 and 1 can be used. */
3076 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3078 if (CHARSET_VALID_P (charset
))
3080 /* There exist some default graphic registers to be
3083 /* We had better avoid designating a charset of
3084 CHARS96 to REG 0 as far as possible. */
3085 if (CHARSET_CHARS (charset
) == 96)
3086 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3088 ? 1 : (reg_bits
& 4 ? 2 : (reg_bits
& 8 ? 3 : 0)));
3090 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3092 ? 0 : (reg_bits
& 2 ? 1 : (reg_bits
& 4 ? 2 : 3)));
3096 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3097 coding
->spec
.iso2022
.last_invalid_designation_register
= -1;
3101 coding
->type
= coding_type_big5
;
3102 coding
->common_flags
3103 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3105 = (NILP (XVECTOR (coding_spec
)->contents
[4])
3106 ? CODING_FLAG_BIG5_HKU
3107 : CODING_FLAG_BIG5_ETEN
);
3111 coding
->type
= coding_type_ccl
;
3112 coding
->common_flags
3113 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3116 Lisp_Object decoder
, encoder
;
3118 val
= XVECTOR (coding_spec
)->contents
[4];
3120 && SYMBOLP (XCONS (val
)->car
)
3121 && !NILP (decoder
= Fget (XCONS (val
)->car
, Qccl_program_idx
))
3122 && !NILP (decoder
= Fcdr (Faref (Vccl_program_table
, decoder
)))
3123 && SYMBOLP (XCONS (val
)->cdr
)
3124 && !NILP (encoder
= Fget (XCONS (val
)->cdr
, Qccl_program_idx
))
3125 && !NILP (encoder
= Fcdr (Faref (Vccl_program_table
, encoder
))))
3127 setup_ccl_program (&(coding
->spec
.ccl
.decoder
), decoder
);
3128 setup_ccl_program (&(coding
->spec
.ccl
.encoder
), encoder
);
3131 goto label_invalid_coding_system
;
3133 bzero (coding
->spec
.ccl
.valid_codes
, 256);
3134 val
= Fplist_get (plist
, Qvalid_codes
);
3139 for (; CONSP (val
); val
= XCONS (val
)->cdr
)
3141 this = XCONS (val
)->car
;
3143 && XINT (this) >= 0 && XINT (this) < 256)
3144 coding
->spec
.ccl
.valid_codes
[XINT (this)] = 1;
3145 else if (CONSP (this)
3146 && INTEGERP (XCONS (this)->car
)
3147 && INTEGERP (XCONS (this)->cdr
))
3149 int start
= XINT (XCONS (this)->car
);
3150 int end
= XINT (XCONS (this)->cdr
);
3152 if (start
>= 0 && start
<= end
&& end
< 256)
3153 while (start
<= end
)
3154 coding
->spec
.ccl
.valid_codes
[start
++] = 1;
3159 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3163 coding
->type
= coding_type_raw_text
;
3167 goto label_invalid_coding_system
;
3171 label_invalid_coding_system
:
3172 coding
->type
= coding_type_no_conversion
;
3173 coding
->category_idx
= CODING_CATEGORY_IDX_BINARY
;
3174 coding
->common_flags
= 0;
3175 coding
->eol_type
= CODING_EOL_LF
;
3176 coding
->pre_write_conversion
= coding
->post_read_conversion
= Qnil
;
3180 /* Setup raw-text or one of its subsidiaries in the structure
3181 coding_system CODING according to the already setup value eol_type
3182 in CODING. CODING should be setup for some coding system in
3186 setup_raw_text_coding_system (coding
)
3187 struct coding_system
*coding
;
3189 if (coding
->type
!= coding_type_raw_text
)
3191 coding
->symbol
= Qraw_text
;
3192 coding
->type
= coding_type_raw_text
;
3193 if (coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3195 Lisp_Object subsidiaries
;
3196 subsidiaries
= Fget (Qraw_text
, Qeol_type
);
3198 if (VECTORP (subsidiaries
)
3199 && XVECTOR (subsidiaries
)->size
== 3)
3201 = XVECTOR (subsidiaries
)->contents
[coding
->eol_type
];
3207 /* Emacs has a mechanism to automatically detect a coding system if it
3208 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3209 it's impossible to distinguish some coding systems accurately
3210 because they use the same range of codes. So, at first, coding
3211 systems are categorized into the following categories:
3213 o coding-category-emacs-mule
3215 The category for a coding system which has the same code range
3216 as Emacs' internal format. Assigned the coding-system (Lisp
3217 symbol) `emacs-mule' by default.
3219 o coding-category-sjis
3221 The category for a coding system which has the same code range
3222 as SJIS. Assigned the coding-system (Lisp
3223 symbol) `japanese-shift-jis' by default.
3225 o coding-category-iso-7
3227 The category for a coding system which has the same code range
3228 as ISO2022 of 7-bit environment. This doesn't use any locking
3229 shift and single shift functions. This can encode/decode all
3230 charsets. Assigned the coding-system (Lisp symbol)
3231 `iso-2022-7bit' by default.
3233 o coding-category-iso-7-tight
3235 Same as coding-category-iso-7 except that this can
3236 encode/decode only the specified charsets.
3238 o coding-category-iso-8-1
3240 The category for a coding system which has the same code range
3241 as ISO2022 of 8-bit environment and graphic plane 1 used only
3242 for DIMENSION1 charset. This doesn't use any locking shift
3243 and single shift functions. Assigned the coding-system (Lisp
3244 symbol) `iso-latin-1' by default.
3246 o coding-category-iso-8-2
3248 The category for a coding system which has the same code range
3249 as ISO2022 of 8-bit environment and graphic plane 1 used only
3250 for DIMENSION2 charset. This doesn't use any locking shift
3251 and single shift functions. Assigned the coding-system (Lisp
3252 symbol) `japanese-iso-8bit' by default.
3254 o coding-category-iso-7-else
3256 The category for a coding system which has the same code range
3257 as ISO2022 of 7-bit environment but uses locking shift or
3258 single shift functions. Assigned the coding-system (Lisp
3259 symbol) `iso-2022-7bit-lock' by default.
3261 o coding-category-iso-8-else
3263 The category for a coding system which has the same code range
3264 as ISO2022 of 8-bit environment but uses locking shift or
3265 single shift functions. Assigned the coding-system (Lisp
3266 symbol) `iso-2022-8bit-ss2' by default.
3268 o coding-category-big5
3270 The category for a coding system which has the same code range
3271 as BIG5. Assigned the coding-system (Lisp symbol)
3272 `cn-big5' by default.
3274 o coding-category-ccl
3276 The category for a coding system of which encoder/decoder is
3277 written in CCL programs. The default value is nil, i.e., no
3278 coding system is assigned.
3280 o coding-category-binary
3282 The category for a coding system not categorized in any of the
3283 above. Assigned the coding-system (Lisp symbol)
3284 `no-conversion' by default.
3286 Each of them is a Lisp symbol and the value is an actual
3287 `coding-system's (this is also a Lisp symbol) assigned by a user.
3288 What Emacs does actually is to detect a category of coding system.
3289 Then, it uses a `coding-system' assigned to it. If Emacs can't
3290 decide only one possible category, it selects a category of the
3291 highest priority. Priorities of categories are also specified by a
3292 user in a Lisp variable `coding-category-list'.
3297 int ascii_skip_code
[256];
3299 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3300 If it detects possible coding systems, return an integer in which
3301 appropriate flag bits are set. Flag bits are defined by macros
3302 CODING_CATEGORY_MASK_XXX in `coding.h'.
3304 How many ASCII characters are at the head is returned as *SKIP. */
/* Two result shapes are visible below.  The general return ORs the
   detected mask with CODING_CATEGORY_MASK_RAW_TEXT and
   CODING_CATEGORY_MASK_BINARY.  The label_return_highest_only path
   returns the first PRIORITIES[i] that intersects the mask, or
   CODING_CATEGORY_MASK_RAW_TEXT when none does.
   NOTE(review): the tests that choose between these paths are not
   part of this listing -- presumably they depend on PRIORITIES being
   non-NULL; confirm against the full source.  */
3307 detect_coding_mask (source
, src_bytes
, priorities
, skip
)
3308 unsigned char *source
;
3309 int src_bytes
, *priorities
, *skip
;
3311 register unsigned char c
;
3312 unsigned char *src
= source
, *src_end
= source
+ src_bytes
;
3316 /* At first, skip all ASCII characters and control characters except
3317 for three ISO2022 specific control characters. */
/* ascii_skip_code is a file-scope array, so these stores (and the
   stores of 1 further down) remain in effect after this call.  */
3318 ascii_skip_code
[ISO_CODE_SO
] = 0;
3319 ascii_skip_code
[ISO_CODE_SI
] = 0;
3320 ascii_skip_code
[ISO_CODE_ESC
] = 0;
3322 label_loop_detect_coding
:
3323 while (src
< src_end
&& ascii_skip_code
[*src
]) src
++;
3324 *skip
= src
- source
;
3327 /* We found nothing other than ASCII. There's nothing to do. */
3331 /* The text seems to be encoded in some multilingual coding system.
3332 Now, try to find in which coding system the text is encoded. */
3335 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3336 /* C is an ISO2022 specific control code of C0. */
3337 mask
= detect_coding_iso2022 (src
, src_end
);
3340 /* No valid ISO2022 code follows C. Try again. */
/* The control code that led nowhere is marked skippable again before
   the scan is restarted from label_loop_detect_coding.  */
3342 if (c
== ISO_CODE_ESC
)
3343 ascii_skip_code
[ISO_CODE_ESC
] = 1;
3345 ascii_skip_code
[ISO_CODE_SO
] = ascii_skip_code
[ISO_CODE_SI
] = 1;
3346 goto label_loop_detect_coding
;
3349 goto label_return_highest_only
;
3357 /* C is the first byte of SJIS character code,
3358 or a leading-code of Emacs' internal format (emacs-mule). */
3359 try = CODING_CATEGORY_MASK_SJIS
| CODING_CATEGORY_MASK_EMACS_MULE
;
3361 /* Or, if C is a special latin extra code,
3362 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3363 or is an ISO2022 control-sequence-introducer (CSI),
3364 we should also consider the possibility of ISO2022 codings. */
3365 if ((VECTORP (Vlatin_extra_code_table
)
3366 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
3367 || (c
== ISO_CODE_SS2
|| c
== ISO_CODE_SS3
)
3368 || (c
== ISO_CODE_CSI
3371 || ((*src
== '0' || *src
== '1' || *src
== '2')
3372 && src
+ 1 < src_end
3373 && src
[1] == ']')))))
3374 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3375 | CODING_CATEGORY_MASK_ISO_8BIT
);
3378 /* C is a character of ISO2022 in graphic plane right,
3379 or a SJIS's 1-byte character code (i.e. JISX0201),
3380 or the first byte of BIG5's 2-byte code. */
3381 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3382 | CODING_CATEGORY_MASK_ISO_8BIT
3383 | CODING_CATEGORY_MASK_SJIS
3384 | CODING_CATEGORY_MASK_BIG5
);
3386 /* Or, we may have to consider the possibility of CCL. */
3387 if (coding_system_table
[CODING_CATEGORY_IDX_CCL
]
3388 && (coding_system_table
[CODING_CATEGORY_IDX_CCL
]
3389 ->spec
.ccl
.valid_codes
)[c
])
3390 try |= CODING_CATEGORY_MASK_CCL
;
/* Walk PRIORITIES in order.  Each branch assigns MASK from a single
   detector (plain `=', not `|='); the last two branches do not
   consult TRY at all.  */
3395 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3397 if (priorities
[i
] & try & CODING_CATEGORY_MASK_ISO
)
3398 mask
= detect_coding_iso2022 (src
, src_end
);
3399 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_SJIS
)
3400 mask
= detect_coding_sjis (src
, src_end
);
3401 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_BIG5
)
3402 mask
= detect_coding_big5 (src
, src_end
);
3403 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_EMACS_MULE
)
3404 mask
= detect_coding_emacs_mule (src
, src_end
);
3405 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_CCL
)
3406 mask
= detect_coding_ccl (src
, src_end
);
3407 else if (priorities
[i
] & CODING_CATEGORY_MASK_RAW_TEXT
)
3408 mask
= CODING_CATEGORY_MASK_RAW_TEXT
;
3409 else if (priorities
[i
] & CODING_CATEGORY_MASK_BINARY
)
3410 mask
= CODING_CATEGORY_MASK_BINARY
;
3412 goto label_return_highest_only
;
3414 return CODING_CATEGORY_MASK_RAW_TEXT
;
/* Here every detector whose category bit is set in TRY is run and
   the results are OR-ed into MASK.  */
3416 if (try & CODING_CATEGORY_MASK_ISO
)
3417 mask
|= detect_coding_iso2022 (src
, src_end
);
3418 if (try & CODING_CATEGORY_MASK_SJIS
)
3419 mask
|= detect_coding_sjis (src
, src_end
);
3420 if (try & CODING_CATEGORY_MASK_BIG5
)
3421 mask
|= detect_coding_big5 (src
, src_end
);
3422 if (try & CODING_CATEGORY_MASK_EMACS_MULE
)
3423 mask
|= detect_coding_emacs_mule (src
, src_end
);
3424 if (try & CODING_CATEGORY_MASK_CCL
)
3425 mask
|= detect_coding_ccl (src
, src_end
);
3427 return (mask
| CODING_CATEGORY_MASK_RAW_TEXT
| CODING_CATEGORY_MASK_BINARY
);
3429 label_return_highest_only
:
3430 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3432 if (mask
& priorities
[i
])
3433 return priorities
[i
];
3435 return CODING_CATEGORY_MASK_RAW_TEXT
;
3438 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3439 The information of the detected coding system is set in CODING. */
/* Calls detect_coding_mask with the coding_priorities table and
   stores the returned skip count in CODING->heading_ascii.  A
   category index is derived from the returned mask and the value
   cell of the matching Vcoding_category_table symbol is passed to
   setup_coding_system.  When CODING->eol_type is already decided,
   the Qeol_type property of that value is fetched and the element
   indexed by CODING->eol_type is used instead.  */
3442 detect_coding (coding
, src
, src_bytes
)
3443 struct coding_system
*coding
;
3451 val
= Vcoding_category_list
;
3452 mask
= detect_coding_mask (src
, src_bytes
, coding_priorities
, &skip
);
3453 coding
->heading_ascii
= skip
;
3457 /* We found a single coding system of the highest priority in MASK. */
/* Shift MASK right until bit 0 is set, counting the shifts in IDX;
   the loop also stops if MASK becomes zero.  */
3459 while (mask
&& ! (mask
& 1)) mask
>>= 1, idx
++;
3461 idx
= CODING_CATEGORY_IDX_RAW_TEXT
;
3463 val
= XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[idx
])->value
;
3465 if (coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3469 tmp
= Fget (val
, Qeol_type
);
3471 val
= XVECTOR (tmp
)->contents
[coding
->eol_type
];
3473 setup_coding_system (val
, coding
);
3474 /* Set this again because setup_coding_system reset this member. */
3475 coding
->heading_ascii
= skip
;
3478 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3479 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3480 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3482 How many non-eol characters are at the head is returned as *SKIP. */
/* The scan loop ends at SRC_END or once TOTAL reaches
   MAX_EOL_CHECK_COUNT.  A '\r' immediately followed by '\n' is
   classified as CODING_EOL_CRLF and both bytes are consumed; a '\r'
   at the very end of the text or before any other byte is
   CODING_EOL_CR.  A type that differs from the first one found sets
   the result to CODING_EOL_INCONSISTENT, a value the header comment
   above does not list.
   NOTE(review): the conditions guarding the two stores to *SKIP are
   not part of this listing; confirm against the full source.  */
3484 #define MAX_EOL_CHECK_COUNT 3
3487 detect_eol_type (source
, src_bytes
, skip
)
3488 unsigned char *source
;
3489 int src_bytes
, *skip
;
3491 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
3493 int total
= 0; /* How many end-of-lines are found so far. */
3494 int eol_type
= CODING_EOL_UNDECIDED
;
3499 while (src
< src_end
&& total
< MAX_EOL_CHECK_COUNT
)
3502 if (c
== '\n' || c
== '\r')
/* Offset of the end-of-line byte just examined (SRC has already
   moved one past it).  */
3505 *skip
= src
- 1 - source
;
3508 this_eol_type
= CODING_EOL_LF
;
3509 else if (src
>= src_end
|| *src
!= '\n')
3510 this_eol_type
= CODING_EOL_CR
;
3512 this_eol_type
= CODING_EOL_CRLF
, src
++;
3514 if (eol_type
== CODING_EOL_UNDECIDED
)
3515 /* This is the first end-of-line. */
3516 eol_type
= this_eol_type
;
3517 else if (eol_type
!= this_eol_type
)
3519 /* The found type is different from what found before. */
3520 eol_type
= CODING_EOL_INCONSISTENT
;
3527 *skip
= src_end
- source
;
3531 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3532 is encoded. If it detects an appropriate format of end-of-line, it
3533 sets the information in *CODING. */
/* Runs detect_eol_type, then reconciles its skip count with
   CODING->heading_ascii: after the two statements below both hold
   the same value, and CODING->heading_ascii has only been lowered,
   never raised.  The eol-specific coding system is looked up through
   the Qeol_type property of CODING->symbol and is applied only when
   that property is a vector of exactly 3 elements.  heading_ascii is
   stored again after setup_coding_system.  */
3536 detect_eol (coding
, src
, src_bytes
)
3537 struct coding_system
*coding
;
3543 int eol_type
= detect_eol_type (src
, src_bytes
, &skip
);
3545 if (coding
->heading_ascii
> skip
)
3546 coding
->heading_ascii
= skip
;
3548 skip
= coding
->heading_ascii
;
3550 if (eol_type
== CODING_EOL_UNDECIDED
)
3552 if (eol_type
== CODING_EOL_INCONSISTENT
)
3555 /* This code is suppressed until we find a better way to
3556 distinguish raw text file and binary file. */
3558 /* If we have already detected that the coding is raw-text, the
3559 coding should actually be no-conversion. */
3560 if (coding
->type
== coding_type_raw_text
)
3562 setup_coding_system (Qno_conversion
, coding
);
3565 /* Else, let's decode only text code anyway. */
3567 eol_type
= CODING_EOL_LF
;
3570 val
= Fget (coding
->symbol
, Qeol_type
);
3571 if (VECTORP (val
) && XVECTOR (val
)->size
== 3)
/* EOL_TYPE is used directly as the index into the 3-element
   vector.  */
3573 setup_coding_system (XVECTOR (val
)->contents
[eol_type
], coding
);
3574 coding
->heading_ascii
= skip
;
/* CONVERSION_BUFFER_EXTRA_ROOM is the constant slack added to both
   decoding_buffer_size and encoding_buffer_size results.
   DECODING_BUFFER_MAG picks a per-type multiplier by testing
   CODING->type in a chain of conditionals; for coding_type_ccl it
   is CODING->spec.ccl.decoder.buf_magnification.
   NOTE(review): the multipliers of the other branches are not part
   of this listing.  The macro argument is used unparenthesized, so
   it should only be given a plain pointer name.  */
3578 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3580 #define DECODING_BUFFER_MAG(coding) \
3581 (coding->type == coding_type_iso2022 \
3583 : ((coding->type == coding_type_sjis || coding->type == coding_type_big5) \
3585 : (coding->type == coding_type_raw_text \
3587 : (coding->type == coding_type_ccl \
3588 ? coding->spec.ccl.decoder.buf_magnification \
3591 /* Return maximum size (bytes) of a buffer enough for decoding
3592 SRC_BYTES of text encoded in CODING. */
/* The result is SRC_BYTES * DECODING_BUFFER_MAG (CODING)
   + CONVERSION_BUFFER_EXTRA_ROOM; no overflow check is visible.  */
3595 decoding_buffer_size (coding
, src_bytes
)
3596 struct coding_system
*coding
;
3599 return (src_bytes
* DECODING_BUFFER_MAG (coding
)
3600 + CONVERSION_BUFFER_EXTRA_ROOM
);
3603 /* Return maximum size (bytes) of a buffer enough for encoding
3604 SRC_BYTES of text to CODING. */
/* The result is SRC_BYTES * MAGNIFICATION
   + CONVERSION_BUFFER_EXTRA_ROOM.  For coding_type_ccl the
   magnification comes from CODING->spec.ccl.encoder.buf_magnification.
   NOTE(review): the magnification used for every other type is not
   part of this listing; no overflow check is visible.  */
3607 encoding_buffer_size (coding
, src_bytes
)
3608 struct coding_system
*coding
;
3613 if (coding
->type
== coding_type_ccl
)
3614 magnification
= coding
->spec
.ccl
.encoder
.buf_magnification
;
3618 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
/* Shared scratch buffer for code conversion: CONVERSION_BUFFER is
   the current allocation and CONVERSION_BUFFER_SIZE its size in
   bytes.  Both are file-scope and replaced by get_conversion_buffer
   when a larger buffer is requested.  */
3621 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3622 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3625 char *conversion_buffer
;
3626 int conversion_buffer_size
;
3628 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
3629 or decoding. Sufficient memory is allocated automatically. If we
3630 run out of memory, return NULL. */
/* When SIZE exceeds the current size, the new size is the current
   size doubled repeatedly until it is at least SIZE.  The new block
   is obtained from xmalloc and the old one released with xfree
   without copying, so earlier buffer contents are not preserved
   across a growth.
   NOTE(review): no NULL test of the xmalloc result is visible, so
   the "return NULL" wording above looks unsupported -- confirm what
   xmalloc does on exhaustion.  The doubling loop cannot make
   progress if conversion_buffer_size is 0 (0 * 2 stays 0);
   presumably it is initialised elsewhere, e.g. to
   MINIMUM_CONVERSION_BUFFER_SIZE -- verify.  */
3633 get_conversion_buffer (size
)
3636 if (size
> conversion_buffer_size
)
3639 int real_size
= conversion_buffer_size
* 2;
3641 while (real_size
< size
) real_size
*= 2;
3642 buf
= (char *) xmalloc (real_size
);
3643 xfree (conversion_buffer
);
3644 conversion_buffer
= buf
;
3645 conversion_buffer_size
= real_size
;
3647 return conversion_buffer
;
/* Run the CCL program attached to CODING over SOURCE, writing to
   DESTINATION.  The encoder program is used when ENCODEP is nonzero,
   the decoder program otherwise.  ccl_driver's return value becomes
   CODING->produced and it fills CODING->consumed; the character
   counts are then computed with multibyte_chars_in_text.  The CCL
   status is mapped to a CODING_FINISH_* result as shown in the
   switch below.
   NOTE(review): this listing omits some lines of the switch, so
   there may be further case labels; confirm against the full
   source.  */
3651 ccl_coding_driver (coding
, source
, destination
, src_bytes
, dst_bytes
, encodep
)
3652 struct coding_system
*coding
;
3653 unsigned char *source
, *destination
;
3654 int src_bytes
, dst_bytes
, encodep
;
3656 struct ccl_program
*ccl
3657 = encodep
? &coding
->spec
.ccl
.encoder
: &coding
->spec
.ccl
.decoder
;
/* Tell the CCL program whether this is the final block of input.  */
3660 ccl
->last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
3662 coding
->produced
= ccl_driver (ccl
, source
, destination
,
3663 src_bytes
, dst_bytes
, &(coding
->consumed
));
3664 coding
->produced_char
3665 = multibyte_chars_in_text (destination
, coding
->produced
);
3666 coding
->consumed_char
3667 = multibyte_chars_in_text (source
, coding
->consumed
);
3669 switch (ccl
->status
)
3671 case CCL_STAT_SUSPEND_BY_SRC
:
3672 result
= CODING_FINISH_INSUFFICIENT_SRC
;
3674 case CCL_STAT_SUSPEND_BY_DST
:
3675 result
= CODING_FINISH_INSUFFICIENT_DST
;
3678 case CCL_STAT_INVALID_CMD
:
3679 result
= CODING_FINISH_INTERRUPT
;
3682 result
= CODING_FINISH_NORMAL
;
3688 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3689 decoding, it may detect coding system and format of end-of-line if
3690 those are not yet decided. */
/* Visible flow: an early-exit branch zeroes the produced/consumed
   counters and returns CODING_FINISH_NORMAL; an undecided type or
   eol format triggers detect_coding / detect_eol; then CODING->type
   selects the decoder.  emacs-mule, undecided and raw-text share one
   case: with CODING_EOL_LF or CODING_EOL_UNDECIDED they jump to
   label_no_conversion, otherwise decode_eol is called.
   On the label_no_conversion path bytes are copied unchanged:
   CODING->produced is DST_BYTES (result
   CODING_FINISH_INSUFFICIENT_DST) when DST_BYTES is nonzero and
   smaller than SRC_BYTES, otherwise SRC_BYTES (result
   CODING_FINISH_NORMAL).
   NOTE(review): the first half of the early-exit condition and the
   test choosing between bcopy and safe_bcopy are not part of this
   listing.  */
3693 decode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
3694 struct coding_system
*coding
;
3695 unsigned char *source
, *destination
;
3696 int src_bytes
, dst_bytes
;
3701 && coding
->type
!= coding_type_ccl
3702 && ! (coding
->mode
& CODING_MODE_LAST_BLOCK
3703 && CODING_REQUIRE_FLUSHING (coding
)))
3705 coding
->produced
= coding
->produced_char
= 0;
3706 coding
->consumed
= coding
->consumed_char
= 0;
3707 coding
->fake_multibyte
= 0;
3708 return CODING_FINISH_NORMAL
;
3711 if (coding
->type
== coding_type_undecided
)
3712 detect_coding (coding
, source
, src_bytes
);
3714 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
3715 detect_eol (coding
, source
, src_bytes
);
3717 switch (coding
->type
)
3719 case coding_type_emacs_mule
:
3720 case coding_type_undecided
:
3721 case coding_type_raw_text
:
3722 if (coding
->eol_type
== CODING_EOL_LF
3723 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
3724 goto label_no_conversion
;
3725 result
= decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
/* The trailing 1 / 0 argument distinguishes the SJIS call from the
   BIG5 call to the shared decoder.  */
3728 case coding_type_sjis
:
3729 result
= decode_coding_sjis_big5 (coding
, source
, destination
,
3730 src_bytes
, dst_bytes
, 1);
3733 case coding_type_iso2022
:
3734 result
= decode_coding_iso2022 (coding
, source
, destination
,
3735 src_bytes
, dst_bytes
);
3738 case coding_type_big5
:
3739 result
= decode_coding_sjis_big5 (coding
, source
, destination
,
3740 src_bytes
, dst_bytes
, 0);
3743 case coding_type_ccl
:
3744 result
= ccl_coding_driver (coding
, source
, destination
,
3745 src_bytes
, dst_bytes
, 0);
3748 default: /* i.e. case coding_type_no_conversion: */
3749 label_no_conversion
:
3750 if (dst_bytes
&& src_bytes
> dst_bytes
)
3752 coding
->produced
= dst_bytes
;
3753 result
= CODING_FINISH_INSUFFICIENT_DST
;
3757 coding
->produced
= src_bytes
;
3758 result
= CODING_FINISH_NORMAL
;
3761 bcopy (source
, destination
, coding
->produced
);
3763 safe_bcopy (source
, destination
, coding
->produced
);
3764 coding
->fake_multibyte
= 1;
/* For a plain copy the char counts and the consumed count all equal
   CODING->produced.  */
3766 = coding
->consumed_char
= coding
->produced_char
= coding
->produced
;
3773 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
/* Visible flow: an early-exit branch zeroes the produced/consumed
   counters and returns CODING_FINISH_NORMAL; then CODING->type
   selects the encoder.  emacs-mule, undecided and raw-text share one
   case: with CODING_EOL_LF or CODING_EOL_UNDECIDED they jump to
   label_no_conversion, otherwise encode_eol is called.
   On the label_no_conversion path bytes are copied unchanged:
   CODING->produced is DST_BYTES (result
   CODING_FINISH_INSUFFICIENT_DST) when DST_BYTES is nonzero and
   smaller than SRC_BYTES, otherwise SRC_BYTES (result
   CODING_FINISH_NORMAL).  With CODING_MODE_SELECTIVE_DISPLAY set,
   every '\015' in the copied bytes is rewritten to '\n'.
   NOTE(review): the first half of the early-exit condition and the
   test choosing between bcopy and safe_bcopy are not part of this
   listing.  */
3776 encode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
3777 struct coding_system
*coding
;
3778 unsigned char *source
, *destination
;
3779 int src_bytes
, dst_bytes
;
3784 && ! (coding
->mode
& CODING_MODE_LAST_BLOCK
3785 && CODING_REQUIRE_FLUSHING (coding
)))
3787 coding
->produced
= coding
->produced_char
= 0;
3788 coding
->consumed
= coding
->consumed_char
= 0;
3789 coding
->fake_multibyte
= 0;
3790 return CODING_FINISH_NORMAL
;
3793 switch (coding
->type
)
3795 case coding_type_emacs_mule
:
3796 case coding_type_undecided
:
3797 case coding_type_raw_text
:
3798 if (coding
->eol_type
== CODING_EOL_LF
3799 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
3800 goto label_no_conversion
;
3801 result
= encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
/* The trailing 1 / 0 argument distinguishes the SJIS call from the
   BIG5 call to the shared encoder.  */
3804 case coding_type_sjis
:
3805 result
= encode_coding_sjis_big5 (coding
, source
, destination
,
3806 src_bytes
, dst_bytes
, 1);
3809 case coding_type_iso2022
:
3810 result
= encode_coding_iso2022 (coding
, source
, destination
,
3811 src_bytes
, dst_bytes
);
3814 case coding_type_big5
:
3815 result
= encode_coding_sjis_big5 (coding
, source
, destination
,
3816 src_bytes
, dst_bytes
, 0);
3819 case coding_type_ccl
:
3820 result
= ccl_coding_driver (coding
, source
, destination
,
3821 src_bytes
, dst_bytes
, 1);
3824 default: /* i.e. case coding_type_no_conversion: */
3825 label_no_conversion
:
3826 if (dst_bytes
&& src_bytes
> dst_bytes
)
3828 coding
->produced
= dst_bytes
;
3829 result
= CODING_FINISH_INSUFFICIENT_DST
;
3833 coding
->produced
= src_bytes
;
3834 result
= CODING_FINISH_NORMAL
;
3837 bcopy (source
, destination
, coding
->produced
);
3839 safe_bcopy (source
, destination
, coding
->produced
);
3840 if (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)
3842 unsigned char *p
= destination
, *pend
= p
+ coding
->produced
;
/* P has already been advanced when the test succeeds, hence the
   store through p[-1].  */
3844 if (*p
++ == '\015') p
[-1] = '\n';
3846 coding
->fake_multibyte
= 1;
3848 = coding
->consumed_char
= coding
->produced_char
= coding
->produced
;
3855 /* Scan text in the region between *BEG and *END (byte positions),
3856 skip characters which we don't have to decode by coding system
3857 CODING at the head and tail, then set *BEG and *END to the region
3858 of the text we actually have to convert. The caller should move
3859 the gap out of the region in advance.
3861 If STR is not NULL, *BEG and *END are indices into STR. */
/* *BEG and *END are updated in place at the end of the function by
   the number of bytes skipped at the head and at the tail.  Nothing
   is skipped for coding_type_ccl, coding_type_undecided, a non-nil
   post_read_conversion, or coding_type_no_conversion.  When STR is
   NULL the region is addressed through BYTE_POS_ADDR (*BEG).

   BEGP/ENDP are the working byte pointers into the text; BEG/END
   are the caller's integer positions and must never be indexed as
   if they were text.  */
3864 shrink_decoding_region (beg
, end
, coding
, str
)
3866 struct coding_system
*coding
;
3869 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
, c
;
3871 Lisp_Object translation_table
;
3873 if (coding
->type
== coding_type_ccl
3874 || coding
->type
== coding_type_undecided
3875 || !NILP (coding
->post_read_conversion
))
3877 /* We can't skip any data. */
3880 else if (coding
->type
== coding_type_no_conversion
)
3882 /* We need no conversion, but don't have to skip any data here.
3883 Decoding routine handles them effectively anyway. */
3887 translation_table
= coding
->translation_table_for_decode
;
3888 if (NILP (translation_table
) && !NILP (Venable_character_translation
))
3889 translation_table
= Vstandard_translation_table_for_decode
;
3890 if (CHAR_TABLE_P (translation_table
))
3893 for (i
= 0; i
< 128; i
++)
3894 if (!NILP (CHAR_TABLE_REF (translation_table
, i
)))
3897 /* Some ASCII character should be translated. We give up
3902 eol_conversion
= (coding
->eol_type
!= CODING_EOL_LF
);
3904 if ((! eol_conversion
) && (coding
->heading_ascii
>= 0))
3905 /* Detection routine has already found how much we can skip at the
3907 *beg
+= coding
->heading_ascii
;
3911 begp_orig
= begp
= str
+ *beg
;
3912 endp_orig
= endp
= str
+ *end
;
3916 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
3917 endp_orig
= endp
= begp
+ *end
- *beg
;
3920 switch (coding
->type
)
3922 case coding_type_emacs_mule
:
3923 case coding_type_raw_text
:
3926 if (coding
->heading_ascii
< 0)
3927 while (begp
< endp
&& *begp
!= '\r' && *begp
< 0x80) begp
++;
3928 while (begp
< endp
&& endp
[-1] != '\r' && endp
[-1] < 0x80)
3930 /* Do not consider LF as ascii if preceded by CR, since that
3931 confuses eol decoding. */
3932 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
3939 case coding_type_sjis
:
3940 case coding_type_big5
:
3941 /* We can skip all ASCII characters at the head. */
3942 if (coding
->heading_ascii
< 0)
3945 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\r') begp
++;
3947 while (begp
< endp
&& *begp
< 0x80) begp
++;
3949 /* We can skip all ASCII characters at the tail except for the
3950 second byte of SJIS or BIG5 code. */
3952 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\r') endp
--;
3954 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
3955 /* Do not consider LF as ascii if preceded by CR, since that
3956 confuses eol decoding. */
3957 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
3959 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] >= 0x80)
3963 default: /* i.e. case coding_type_iso2022: */
3964 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, 0) != CHARSET_ASCII
)
3965 /* We can't skip any data. */
3967 if (coding
->heading_ascii
< 0)
3969 /* We can skip all ASCII characters at the head except for a
3970 few control codes. */
3971 while (begp
< endp
&& (c
= *begp
) < 0x80
3972 && c
!= ISO_CODE_CR
&& c
!= ISO_CODE_SO
3973 && c
!= ISO_CODE_SI
&& c
!= ISO_CODE_ESC
3974 && (!eol_conversion
|| c
!= ISO_CODE_LF
))
3977 switch (coding
->category_idx
)
3979 case CODING_CATEGORY_IDX_ISO_8_1
:
3980 case CODING_CATEGORY_IDX_ISO_8_2
:
3981 /* We can skip all ASCII characters at the tail. */
3983 while (begp
< endp
&& (c
= endp
[-1]) < 0x80 && c
!= '\r') endp
--;
3985 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
3986 /* Do not consider LF as ascii if preceded by CR, since that
3987 confuses eol decoding. */
3988 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
3992 case CODING_CATEGORY_IDX_ISO_7
:
3993 case CODING_CATEGORY_IDX_ISO_7_TIGHT
:
3995 /* We can skip all characters at the tail except for 8-bit
3996 codes and ESC and the following 2-byte at the tail. */
3997 unsigned char *eight_bit
= NULL
;
4001 && (c
= endp
[-1]) != ISO_CODE_ESC
&& c
!= '\r')
4003 if (!eight_bit
&& c
& 0x80) eight_bit
= endp
;
4008 && (c
= endp
[-1]) != ISO_CODE_ESC
)
4010 if (!eight_bit
&& c
& 0x80) eight_bit
= endp
;
4013 /* Do not consider LF as ascii if preceded by CR, since that
4014 confuses eol decoding. */
4015 if (begp
< endp
&& endp
< endp_orig
4016 && endp
[-1] == '\r' && endp
[0] == '\n')
4018 if (begp
< endp
&& endp
[-1] == ISO_CODE_ESC
)
/* The two bytes following the ESC must be read through ENDP, the
   byte pointer.  END is the caller's integer position: indexing it
   compared a buffer position with '(' and read past that int.  */
4020 if (endp
+ 1 < endp_orig
&& endp
[0] == '(' && endp
[1] == 'B')
4021 /* This is an ASCII designation sequence. We can
4022 surely skip the tail. But, if we have
4023 encountered an 8-bit code, skip only the codes
4025 endp
= eight_bit
? eight_bit
: endp
+ 2;
4027 /* Hmmm, we can't skip the tail. */
/* Report the shrunken region back to the caller as byte deltas.  */
4035 *beg
+= begp
- begp_orig
;
4036 *end
+= endp
- endp_orig
;
4040 /* Like shrink_decoding_region but for encoding. */
/* *BEG and *END are updated in place by the two final assignments,
   using the distance the working pointers BEGP/ENDP moved.  When
   STR is NULL the region is addressed through BYTE_POS_ADDR (*BEG).
   EOL conversion is considered active only for CODING_EOL_CR and
   CODING_EOL_CRLF.  With CODING_FLAG_ISO_DESIGNATE_AT_BOL the head
   scan tracks a beginning-of-line pointer BOL.
   NOTE(review): several statements (returns, loop bodies, branch
   conditions) are not part of this listing; confirm against the
   full source.  */
4043 shrink_encoding_region (beg
, end
, coding
, str
)
4045 struct coding_system
*coding
;
4048 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
;
4050 Lisp_Object translation_table
;
4052 if (coding
->type
== coding_type_ccl
)
4053 /* We can't skip any data. */
4055 else if (coding
->type
== coding_type_no_conversion
)
4057 /* We need no conversion. */
4062 translation_table
= coding
->translation_table_for_encode
;
4063 if (NILP (translation_table
) && !NILP (Venable_character_translation
))
4064 translation_table
= Vstandard_translation_table_for_encode
;
4065 if (CHAR_TABLE_P (translation_table
))
4068 for (i
= 0; i
< 128; i
++)
4069 if (!NILP (CHAR_TABLE_REF (translation_table
, i
)))
4072 /* Some ASCII character should be translated. We give up
4079 begp_orig
= begp
= str
+ *beg
;
4080 endp_orig
= endp
= str
+ *end
;
4084 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
4085 endp_orig
= endp
= begp
+ *end
- *beg
;
4088 eol_conversion
= (coding
->eol_type
== CODING_EOL_CR
4089 || coding
->eol_type
== CODING_EOL_CRLF
);
4091 /* Here, we don't have to check coding->pre_write_conversion because
4092 the caller is expected to have handled it already. */
4093 switch (coding
->type
)
4095 case coding_type_undecided
:
4096 case coding_type_emacs_mule
:
4097 case coding_type_raw_text
:
4100 while (begp
< endp
&& *begp
!= '\n') begp
++;
4101 while (begp
< endp
&& endp
[-1] != '\n') endp
--;
4107 case coding_type_iso2022
:
4108 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, 0) != CHARSET_ASCII
)
4109 /* We can't skip any data. */
4111 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
4113 unsigned char *bol
= begp
;
4114 while (begp
< endp
&& *begp
< 0x80)
4117 if (begp
[-1] == '\n')
4121 goto label_skip_tail
;
4126 /* We can skip all ASCII characters at the head and tail. */
4128 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\n') begp
++;
4130 while (begp
< endp
&& *begp
< 0x80) begp
++;
4133 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\n') endp
--;
4135 while (begp
< endp
&& *(endp
- 1) < 0x80) endp
--;
4139 *beg
+= begp
- begp_orig
;
4140 *end
+= endp
- endp_orig
;
/* SHRINK_CONVERSION_REGION (defined below) calls
   shrink_encoding_region when ENCODEP is nonzero and
   shrink_decoding_region otherwise, but only when *(END) - *(BEG)
   exceeds shrink_conversion_region_threshhold.  BEG and END are
   dereferenced by the macro, so they must be pointers.  */
4144 /* As shrinking conversion region requires some overhead, we don't try
4145 shrinking if the length of conversion region is less than this
4147 static int shrink_conversion_region_threshhold
= 1024;
4149 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep) \
4151 if (*(end) - *(beg) > shrink_conversion_region_threshhold) \
4153 if (encodep) shrink_encoding_region (beg, end, coding, str); \
4154 else shrink_decoding_region (beg, end, coding, str); \
4158 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4159 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4160 coding system CODING, and return the status code of code conversion
4161 (currently, this value has no meaning).
4163 How many characters (and bytes) are converted to how many
4164 characters (and bytes) are recorded in members of the structure
4167 If REPLACE is nonzero, we do various things as if the original text
4168 is deleted and a new text is inserted. See the comments in
4169 replace_range (insdel.c) to know what we are doing. */
4172 code_convert_region (from
, from_byte
, to
, to_byte
, coding
, encodep
, replace
)
4173 int from
, from_byte
, to
, to_byte
, encodep
, replace
;
4174 struct coding_system
*coding
;
4176 int len
= to
- from
, len_byte
= to_byte
- from_byte
;
4177 int require
, inserted
, inserted_byte
;
4178 int head_skip
, tail_skip
, total_skip
;
4179 Lisp_Object saved_coding_symbol
;
4180 int multibyte
= !NILP (current_buffer
->enable_multibyte_characters
);
4182 int fake_multibyte
= 0;
4183 unsigned char *src
, *dst
;
4184 Lisp_Object deletion
;
4185 int orig_point
= PT
, orig_len
= len
;
4189 saved_coding_symbol
= Qnil
;
4191 if (from
< PT
&& PT
< to
)
4193 TEMP_SET_PT_BOTH (from
, from_byte
);
4199 int saved_from
= from
;
4201 prepare_to_modify_buffer (from
, to
, &from
);
4202 if (saved_from
!= from
)
4206 from_byte
= CHAR_TO_BYTE (from
), to_byte
= CHAR_TO_BYTE (to
);
4208 from_byte
= from
, to_byte
= to
;
4209 len_byte
= to_byte
- from_byte
;
4213 if (! encodep
&& CODING_REQUIRE_DETECTION (coding
))
4215 /* We must detect encoding of text and eol format. */
4217 if (from
< GPT
&& to
> GPT
)
4218 move_gap_both (from
, from_byte
);
4219 if (coding
->type
== coding_type_undecided
)
4221 detect_coding (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
4222 if (coding
->type
== coding_type_undecided
)
4223 /* It seems that the text contains only ASCII, but we
4224 should not leave it undecided because the deeper
4225 decoding routine (decode_coding) tries to detect the
4226 encodings again in vain. */
4227 coding
->type
= coding_type_emacs_mule
;
4229 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4231 saved_coding_symbol
= coding
->symbol
;
4232 detect_eol (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
4233 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4234 coding
->eol_type
= CODING_EOL_LF
;
4235 /* We had better recover the original eol format if we
4236 encounter an inconsistent eol format while decoding. */
4237 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
4241 coding
->consumed_char
= len
, coding
->consumed
= len_byte
;
4244 ? ! CODING_REQUIRE_ENCODING (coding
)
4245 : ! CODING_REQUIRE_DECODING (coding
))
4247 coding
->produced
= len_byte
;
4250 /* See the comment of the member heading_ascii in coding.h. */
4251 && coding
->heading_ascii
< len_byte
)
4253 /* We still may have to combine byte at the head and the
4254 tail of the text in the region. */
4255 if (from
< GPT
&& GPT
< to
)
4256 move_gap_both (to
, to_byte
);
4257 len
= multibyte_chars_in_text (BYTE_POS_ADDR (from_byte
), len_byte
);
4258 adjust_after_insert (from
, from_byte
, to
, to_byte
, len
);
4259 coding
->produced_char
= len
;
4264 adjust_after_insert (from
, from_byte
, to
, to_byte
, len_byte
);
4265 coding
->produced_char
= len_byte
;
4270 /* Now we convert the text. */
4272 /* For encoding, we must process pre-write-conversion in advance. */
4274 && ! NILP (coding
->pre_write_conversion
)
4275 && SYMBOLP (coding
->pre_write_conversion
)
4276 && ! NILP (Ffboundp (coding
->pre_write_conversion
)))
4278 /* The function in pre-write-conversion may put a new text in a
4280 struct buffer
*prev
= current_buffer
;
4283 call2 (coding
->pre_write_conversion
,
4284 make_number (from
), make_number (to
));
4285 if (current_buffer
!= prev
)
4288 new = Fcurrent_buffer ();
4289 set_buffer_internal_1 (prev
);
4290 del_range_2 (from
, from_byte
, to
, to_byte
);
4291 TEMP_SET_PT_BOTH (from
, from_byte
);
4292 insert_from_buffer (XBUFFER (new), 1, len
, 0);
4294 if (orig_point
>= to
)
4295 orig_point
+= len
- orig_len
;
4296 else if (orig_point
> from
)
4300 from_byte
= multibyte
? CHAR_TO_BYTE (from
) : from_byte
;
4301 to_byte
= multibyte
? CHAR_TO_BYTE (to
) : to
;
4302 len_byte
= to_byte
- from_byte
;
4303 TEMP_SET_PT_BOTH (from
, from_byte
);
4308 deletion
= make_buffer_string_both (from
, from_byte
, to
, to_byte
, 1);
4310 /* Try to skip the heading and trailing ASCIIs. */
4312 int from_byte_orig
= from_byte
, to_byte_orig
= to_byte
;
4314 if (from
< GPT
&& GPT
< to
)
4315 move_gap_both (from
, from_byte
);
4316 SHRINK_CONVERSION_REGION (&from_byte
, &to_byte
, coding
, NULL
, encodep
);
4317 if (from_byte
== to_byte
4318 && coding
->type
!= coding_type_ccl
4319 && ! (coding
->mode
& CODING_MODE_LAST_BLOCK
4320 && CODING_REQUIRE_FLUSHING (coding
)))
4322 coding
->produced
= len_byte
;
4323 coding
->produced_char
= multibyte
? len
: len_byte
;
4325 /* We must record and adjust for this new text now. */
4326 adjust_after_insert (from
, from_byte_orig
, to
, to_byte_orig
, len
);
4330 head_skip
= from_byte
- from_byte_orig
;
4331 tail_skip
= to_byte_orig
- to_byte
;
4332 total_skip
= head_skip
+ tail_skip
;
4335 len
-= total_skip
; len_byte
-= total_skip
;
4338 /* The code conversion routine can not preserve text properties for
4339 now. So, we must remove all text properties in the region.
4340 Here, we must suppress all modification hooks. */
4343 int saved_inhibit_modification_hooks
= inhibit_modification_hooks
;
4344 inhibit_modification_hooks
= 1;
4345 Fset_text_properties (make_number (from
), make_number (to
), Qnil
, Qnil
);
4346 inhibit_modification_hooks
= saved_inhibit_modification_hooks
;
4349 /* For conversion, we must put the gap before the text in addition to
4350 making the gap larger for efficient decoding. The required gap
4351 size starts from 2000 which is the magic number used in make_gap.
4352 But, after one batch of conversion, it will be incremented if we
4353 find that it is not enough. */
4356 if (GAP_SIZE
< require
)
4357 make_gap (require
- GAP_SIZE
);
4358 move_gap_both (from
, from_byte
);
4360 inserted
= inserted_byte
= 0;
4361 src
= GAP_END_ADDR
, dst
= GPT_ADDR
;
4363 GAP_SIZE
+= len_byte
;
4366 ZV_BYTE
-= len_byte
;
4369 if (GPT
- BEG
< beg_unchanged
)
4370 beg_unchanged
= GPT
- BEG
;
4371 if (Z
- GPT
< end_unchanged
)
4372 end_unchanged
= Z
- GPT
;
4378 /* The buffer memory is changed from:
4379 +--------+converted-text+---------+-------original-text------+---+
4380 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4381 |<------------------- GAP_SIZE -------------------->| */
4383 result
= encode_coding (coding
, src
, dst
, len_byte
, 0);
4385 result
= decode_coding (coding
, src
, dst
, len_byte
, 0);
4387 +--------+-------converted-text--------+--+---original-text--+---+
4388 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4389 |<------------------- GAP_SIZE -------------------->| */
4390 if (coding
->fake_multibyte
)
4393 if (!encodep
&& !multibyte
)
4394 coding
->produced_char
= coding
->produced
;
4395 inserted
+= coding
->produced_char
;
4396 inserted_byte
+= coding
->produced
;
4397 len_byte
-= coding
->consumed
;
4398 src
+= coding
->consumed
;
4399 dst
+= inserted_byte
;
4401 if (result
== CODING_FINISH_NORMAL
)
4406 if (! encodep
&& result
== CODING_FINISH_INCONSISTENT_EOL
)
4408 unsigned char *pend
= dst
, *p
= pend
- inserted_byte
;
4410 /* Encode LFs back to the original eol format (CR or CRLF). */
4411 if (coding
->eol_type
== CODING_EOL_CR
)
4413 while (p
< pend
) if (*p
++ == '\n') p
[-1] = '\r';
4419 while (p
< pend
) if (*p
++ == '\n') count
++;
4420 if (src
- dst
< count
)
4422 /* We don't have sufficient room for putting LFs
4423 back to CRLF. We must record converted and
4424 not-yet-converted text back to the buffer
4425 content, enlarge the gap, then record them out of
4426 the buffer contents again. */
4427 int add
= len_byte
+ inserted_byte
;
4430 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
4431 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4432 make_gap (count
- GAP_SIZE
);
4434 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
4435 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4436 /* Don't forget to update SRC, DST, and PEND. */
4437 src
= GAP_END_ADDR
- len_byte
;
4438 dst
= GPT_ADDR
+ inserted_byte
;
4442 inserted_byte
+= count
;
4443 coding
->produced
+= count
;
4444 p
= dst
= pend
+ count
;
4448 if (*p
== '\n') count
--, *--p
= '\r';
4452 /* Suppress eol-format conversion in the further conversion. */
4453 coding
->eol_type
= CODING_EOL_LF
;
4455 /* Restore the original symbol. */
4456 coding
->symbol
= saved_coding_symbol
;
4462 if (coding
->type
!= coding_type_ccl
4463 || coding
->mode
& CODING_MODE_LAST_BLOCK
)
4465 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
4468 if (result
== CODING_FINISH_INSUFFICIENT_SRC
)
4470 /* The source text ends in invalid codes. Let's just
4471 make them valid buffer contents, and finish conversion. */
4472 inserted
+= len_byte
;
4473 inserted_byte
+= len_byte
;
4479 if (result
== CODING_FINISH_INTERRUPT
)
4481 /* The conversion procedure was interrupted by a user. */
4485 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4486 if (coding
->consumed
< 1)
4488 /* It's quite strange to require more memory without
4489 consuming any bytes. Perhaps CCL program bug. */
4495 /* We have just done the first batch of conversion which was
4496 stopped because of insufficient gap. Let's reconsider the
4497 required gap size (i.e. SRC - DST) now.
4499 We have converted ORIG bytes (== coding->consumed) into
4500 NEW bytes (coding->produced). To convert the remaining
4501 LEN bytes, we may need REQUIRE bytes of gap, where:
4502 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4503 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4504 Here, we are sure that NEW >= ORIG. */
4505 float ratio
= coding
->produced
- coding
->consumed
;
4506 ratio
/= coding
->consumed
;
4507 require
= len_byte
* ratio
;
4510 if ((src
- dst
) < (require
+ 2000))
4512 /* See the comment above the previous call of make_gap. */
4513 int add
= len_byte
+ inserted_byte
;
4516 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
4517 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4518 make_gap (require
+ 2000);
4520 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
4521 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4522 /* Don't forget to update SRC, DST. */
4523 src
= GAP_END_ADDR
- len_byte
;
4524 dst
= GPT_ADDR
+ inserted_byte
;
4527 if (src
- dst
> 0) *dst
= 0; /* Put an anchor. */
4532 || (to
- from
) != (to_byte
- from_byte
)))
4533 inserted
= multibyte_chars_in_text (GPT_ADDR
, inserted_byte
);
4535 /* If we have shrunk the conversion area, adjust it now. */
4539 safe_bcopy (GAP_END_ADDR
, GPT_ADDR
+ inserted_byte
, tail_skip
);
4540 inserted
+= total_skip
; inserted_byte
+= total_skip
;
4541 GAP_SIZE
+= total_skip
;
4542 GPT
-= head_skip
; GPT_BYTE
-= head_skip
;
4543 ZV
-= total_skip
; ZV_BYTE
-= total_skip
;
4544 Z
-= total_skip
; Z_BYTE
-= total_skip
;
4545 from
-= head_skip
; from_byte
-= head_skip
;
4546 to
+= tail_skip
; to_byte
+= tail_skip
;
4550 adjust_after_replace (from
, from_byte
, deletion
, inserted
, inserted_byte
);
4551 inserted
= Z
- prev_Z
;
4553 if (! encodep
&& ! NILP (coding
->post_read_conversion
))
4558 TEMP_SET_PT_BOTH (from
, from_byte
);
4560 val
= call1 (coding
->post_read_conversion
, make_number (inserted
));
4561 CHECK_NUMBER (val
, 0);
4562 inserted
+= Z
- prev_Z
;
4565 if (orig_point
>= from
)
4567 if (orig_point
>= from
+ orig_len
)
4568 orig_point
+= inserted
- orig_len
;
4571 TEMP_SET_PT (orig_point
);
4574 signal_after_change (from
, to
- from
, inserted
);
4577 coding
->consumed
= to_byte
- from_byte
;
4578 coding
->consumed_char
= to
- from
;
4579 coding
->produced
= inserted_byte
;
4580 coding
->produced_char
= inserted
;
4587 code_convert_string (str
, coding
, encodep
, nocopy
)
4589 struct coding_system
*coding
;
4590 int encodep
, nocopy
;
4594 int from
= 0, to
= XSTRING (str
)->size
;
4595 int to_byte
= STRING_BYTES (XSTRING (str
));
4596 struct gcpro gcpro1
;
4597 Lisp_Object saved_coding_symbol
;
4600 saved_coding_symbol
= Qnil
;
4601 if (encodep
&& !NILP (coding
->pre_write_conversion
)
4602 || !encodep
&& !NILP (coding
->post_read_conversion
))
4604 /* Since we have to call Lisp functions which assume target text
4605 is in a buffer, after setting a temporary buffer, call
4606 code_convert_region. */
4607 int count
= specpdl_ptr
- specpdl
;
4608 struct buffer
*prev
= current_buffer
;
4610 record_unwind_protect (Fset_buffer
, Fcurrent_buffer ());
4611 temp_output_buffer_setup (" *code-converting-work*");
4612 set_buffer_internal (XBUFFER (Vstandard_output
));
4614 insert_from_string (str
, 0, 0, to
, to_byte
, 0);
4617 /* We must insert the contents of STR as is without
4618 unibyte<->multibyte conversion. */
4619 current_buffer
->enable_multibyte_characters
= Qnil
;
4620 insert_from_string (str
, 0, 0, to_byte
, to_byte
, 0);
4621 current_buffer
->enable_multibyte_characters
= Qt
;
4623 code_convert_region (BEGV
, BEGV_BYTE
, ZV
, ZV_BYTE
, coding
, encodep
, 1);
4625 /* We must return the buffer contents as unibyte string. */
4626 current_buffer
->enable_multibyte_characters
= Qnil
;
4627 str
= make_buffer_string (BEGV
, ZV
, 0);
4628 set_buffer_internal (prev
);
4629 return unbind_to (count
, str
);
4632 if (! encodep
&& CODING_REQUIRE_DETECTION (coding
))
4634 /* See the comments in code_convert_region. */
4635 if (coding
->type
== coding_type_undecided
)
4637 detect_coding (coding
, XSTRING (str
)->data
, to_byte
);
4638 if (coding
->type
== coding_type_undecided
)
4639 coding
->type
= coding_type_emacs_mule
;
4641 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4643 saved_coding_symbol
= coding
->symbol
;
4644 detect_eol (coding
, XSTRING (str
)->data
, to_byte
);
4645 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4646 coding
->eol_type
= CODING_EOL_LF
;
4647 /* We had better recover the original eol format if we
4648 encounter an inconsistent eol format while decoding. */
4649 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
4654 ? ! CODING_REQUIRE_ENCODING (coding
)
4655 : ! CODING_REQUIRE_DECODING (coding
))
4659 /* Try to skip the leading and trailing ASCII characters. */
4660 SHRINK_CONVERSION_REGION (&from
, &to_byte
, coding
, XSTRING (str
)->data
,
4664 && coding
->type
!= coding_type_ccl
)
4665 return (nocopy
? str
: Fcopy_sequence (str
));
4668 len
= encoding_buffer_size (coding
, to_byte
- from
);
4670 len
= decoding_buffer_size (coding
, to_byte
- from
);
4671 len
+= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
4673 buf
= get_conversion_buffer (len
);
4677 bcopy (XSTRING (str
)->data
, buf
, from
);
4679 ? encode_coding (coding
, XSTRING (str
)->data
+ from
,
4680 buf
+ from
, to_byte
- from
, len
)
4681 : decode_coding (coding
, XSTRING (str
)->data
+ from
,
4682 buf
+ from
, to_byte
- from
, len
));
4683 if (! encodep
&& result
== CODING_FINISH_INCONSISTENT_EOL
)
4685 /* We simply try to decode the whole string again, but without
4686 eol-conversion this time. */
4687 coding
->eol_type
= CODING_EOL_LF
;
4688 coding
->symbol
= saved_coding_symbol
;
4689 return code_convert_string (str
, coding
, encodep
, nocopy
);
4692 bcopy (XSTRING (str
)->data
+ to_byte
, buf
+ from
+ coding
->produced
,
4693 STRING_BYTES (XSTRING (str
)) - to_byte
);
4695 len
= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
4697 str
= make_unibyte_string (buf
, len
+ coding
->produced
);
4700 int chars
= (coding
->fake_multibyte
4701 ? multibyte_chars_in_text (buf
+ from
, coding
->produced
)
4702 : coding
->produced_char
);
4703 str
= make_multibyte_string (buf
, len
+ chars
, len
+ coding
->produced
);
4711 /*** 8. Emacs Lisp library functions ***/
4713 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
4714 "Return t if OBJECT is nil or a coding-system.\n\
4715 See the documentation of `make-coding-system' for information\n\
4716 about coding-system objects.")
4724 /* Get coding-spec vector for OBJ. */
4725 obj
= Fget (obj
, Qcoding_system
);
4726 return ((VECTORP (obj
) && XVECTOR (obj
)->size
== 5)
4730 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
4731 Sread_non_nil_coding_system
, 1, 1, 0,
4732 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4739 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
4740 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
4742 while (XSTRING (val
)->size
== 0);
4743 return (Fintern (val
, Qnil
));
4746 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
4747 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4748 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4749 (prompt
, default_coding_system
)
4750 Lisp_Object prompt
, default_coding_system
;
4753 if (SYMBOLP (default_coding_system
))
4754 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
4755 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
4756 Qt
, Qnil
, Qcoding_system_history
,
4757 default_coding_system
, Qnil
);
4758 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
4761 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
4763 "Check validity of CODING-SYSTEM.\n\
4764 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4765 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4766 The value of property should be a vector of length 5.")
4768 Lisp_Object coding_system
;
4770 CHECK_SYMBOL (coding_system
, 0);
4771 if (!NILP (Fcoding_system_p (coding_system
)))
4772 return coding_system
;
4774 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
4778 detect_coding_system (src
, src_bytes
, highest
)
4780 int src_bytes
, highest
;
4782 int coding_mask
, eol_type
;
4783 Lisp_Object val
, tmp
;
4786 coding_mask
= detect_coding_mask (src
, src_bytes
, NULL
, &dummy
);
4787 eol_type
= detect_eol_type (src
, src_bytes
, &dummy
);
4788 if (eol_type
== CODING_EOL_INCONSISTENT
)
4789 eol_type
= CODING_EOL_UNDECIDED
;
4794 if (eol_type
!= CODING_EOL_UNDECIDED
)
4797 val2
= Fget (Qundecided
, Qeol_type
);
4799 val
= XVECTOR (val2
)->contents
[eol_type
];
4801 return (highest
? val
: Fcons (val
, Qnil
));
4804 /* At first, gather possible coding systems in VAL. */
4806 for (tmp
= Vcoding_category_list
; !NILP (tmp
); tmp
= XCONS (tmp
)->cdr
)
4809 = XFASTINT (Fget (XCONS (tmp
)->car
, Qcoding_category_index
));
4810 if (coding_mask
& (1 << idx
))
4812 val
= Fcons (Fsymbol_value (XCONS (tmp
)->car
), val
);
4818 val
= Fnreverse (val
);
4820 /* Then, replace the elements with subsidiary coding systems. */
4821 for (tmp
= val
; !NILP (tmp
); tmp
= XCONS (tmp
)->cdr
)
4823 if (eol_type
!= CODING_EOL_UNDECIDED
4824 && eol_type
!= CODING_EOL_INCONSISTENT
)
4827 eol
= Fget (XCONS (tmp
)->car
, Qeol_type
);
4829 XCONS (tmp
)->car
= XVECTOR (eol
)->contents
[eol_type
];
4832 return (highest
? XCONS (val
)->car
: val
);
4835 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
4837 "Detect coding system of the text in the region between START and END.\n\
4838 Return a list of possible coding systems ordered by priority.\n\
4840 If only ASCII characters are found, it returns a list of single element\n\
4841 `undecided' or its subsidiary coding system according to a detected\n\
4842 end-of-line format.\n\
4844 If optional argument HIGHEST is non-nil, return the coding system of\n\
4846 (start
, end
, highest
)
4847 Lisp_Object start
, end
, highest
;
4850 int from_byte
, to_byte
;
4852 CHECK_NUMBER_COERCE_MARKER (start
, 0);
4853 CHECK_NUMBER_COERCE_MARKER (end
, 1);
4855 validate_region (&start
, &end
);
4856 from
= XINT (start
), to
= XINT (end
);
4857 from_byte
= CHAR_TO_BYTE (from
);
4858 to_byte
= CHAR_TO_BYTE (to
);
4860 if (from
< GPT
&& to
>= GPT
)
4861 move_gap_both (to
, to_byte
);
4863 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
4864 to_byte
- from_byte
,
4868 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
4870 "Detect coding system of the text in STRING.\n\
4871 Return a list of possible coding systems ordered by priority.\n\
4873 If only ASCII characters are found, it returns a list of single element\n\
4874 `undecided' or its subsidiary coding system according to a detected\n\
4875 end-of-line format.\n\
4877 If optional argument HIGHEST is non-nil, return the coding system of\n\
4880 Lisp_Object string
, highest
;
4882 CHECK_STRING (string
, 0);
4884 return detect_coding_system (XSTRING (string
)->data
,
4885 STRING_BYTES (XSTRING (string
)),
4890 code_convert_region1 (start
, end
, coding_system
, encodep
)
4891 Lisp_Object start
, end
, coding_system
;
4894 struct coding_system coding
;
4897 CHECK_NUMBER_COERCE_MARKER (start
, 0);
4898 CHECK_NUMBER_COERCE_MARKER (end
, 1);
4899 CHECK_SYMBOL (coding_system
, 2);
4901 validate_region (&start
, &end
);
4902 from
= XFASTINT (start
);
4903 to
= XFASTINT (end
);
4905 if (NILP (coding_system
))
4906 return make_number (to
- from
);
4908 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
4909 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
4911 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
4912 code_convert_region (from
, CHAR_TO_BYTE (from
), to
, CHAR_TO_BYTE (to
),
4913 &coding
, encodep
, 1);
4914 Vlast_coding_system_used
= coding
.symbol
;
4915 return make_number (coding
.produced_char
);
4918 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
4919 3, 3, "r\nzCoding system: ",
4920 "Decode the current region by specified coding system.\n\
4921 When called from a program, takes three arguments:\n\
4922 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4923 This function sets `last-coding-system-used' to the precise coding system\n\
4924 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4925 not fully specified.)\n\
4926 It returns the length of the decoded text.")
4927 (start
, end
, coding_system
)
4928 Lisp_Object start
, end
, coding_system
;
4930 return code_convert_region1 (start
, end
, coding_system
, 0);
4933 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
4934 3, 3, "r\nzCoding system: ",
4935 "Encode the current region by specified coding system.\n\
4936 When called from a program, takes three arguments:\n\
4937 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4938 This function sets `last-coding-system-used' to the precise coding system\n\
4939 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4940 not fully specified.)\n\
4941 It returns the length of the encoded text.")
4942 (start
, end
, coding_system
)
4943 Lisp_Object start
, end
, coding_system
;
4945 return code_convert_region1 (start
, end
, coding_system
, 1);
4949 code_convert_string1 (string
, coding_system
, nocopy
, encodep
)
4950 Lisp_Object string
, coding_system
, nocopy
;
4953 struct coding_system coding
;
4955 CHECK_STRING (string
, 0);
4956 CHECK_SYMBOL (coding_system
, 1);
4958 if (NILP (coding_system
))
4959 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
4961 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
4962 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
4964 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
4965 Vlast_coding_system_used
= coding
.symbol
;
4966 return code_convert_string (string
, &coding
, encodep
, !NILP (nocopy
));
4969 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
4971 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
4972 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4973 if the decoding operation is trivial.\n\
4974 This function sets `last-coding-system-used' to the precise coding system\n\
4975 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4976 not fully specified.)")
4977 (string
, coding_system
, nocopy
)
4978 Lisp_Object string
, coding_system
, nocopy
;
4980 return code_convert_string1 (string
, coding_system
, nocopy
, 0);
4983 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
4985 "Encode STRING to CODING-SYSTEM, and return the result.\n\
4986 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4987 if the encoding operation is trivial.\n\
4988 This function sets `last-coding-system-used' to the precise coding system\n\
4989 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4990 not fully specified.)")
4991 (string
, coding_system
, nocopy
)
4992 Lisp_Object string
, coding_system
, nocopy
;
4994 return code_convert_string1 (string
, coding_system
, nocopy
, 1);
4997 /* Encode or decode STRING according to CODING_SYSTEM.
4998 Do not set Vlast_coding_system_used. */
5001 code_convert_string_norecord (string
, coding_system
, encodep
)
5002 Lisp_Object string
, coding_system
;
5005 struct coding_system coding
;
5007 CHECK_STRING (string
, 0);
5008 CHECK_SYMBOL (coding_system
, 1);
5010 if (NILP (coding_system
))
5013 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
5014 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
5016 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
5017 return code_convert_string (string
, &coding
, encodep
, Qt
);
5020 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
5021 "Decode a JISX0208 character of shift-jis encoding.\n\
5022 CODE is the character code in SJIS.\n\
5023 Return the corresponding character.")
5027 unsigned char c1
, c2
, s1
, s2
;
5030 CHECK_NUMBER (code
, 0);
5031 s1
= (XFASTINT (code
)) >> 8, s2
= (XFASTINT (code
)) & 0xFF;
5032 DECODE_SJIS (s1
, s2
, c1
, c2
);
5033 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset_jisx0208
, c1
, c2
));
5037 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
5038 "Encode a JISX0208 character CHAR to SJIS coding system.\n\
5039 Return the corresponding character code in SJIS.")
5043 int charset
, c1
, c2
, s1
, s2
;
5046 CHECK_NUMBER (ch
, 0);
5047 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
5048 if (charset
== charset_jisx0208
)
5050 ENCODE_SJIS (c1
, c2
, s1
, s2
);
5051 XSETFASTINT (val
, (s1
<< 8) | s2
);
5054 XSETFASTINT (val
, 0);
5058 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
5059 "Decode a Big5 character CODE of BIG5 coding system.\n\
5060 CODE is the character code in BIG5.\n\
5061 Return the corresponding character.")
5066 unsigned char b1
, b2
, c1
, c2
;
5069 CHECK_NUMBER (code
, 0);
5070 b1
= (XFASTINT (code
)) >> 8, b2
= (XFASTINT (code
)) & 0xFF;
5071 DECODE_BIG5 (b1
, b2
, charset
, c1
, c2
);
5072 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset
, c1
, c2
));
5076 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
5077 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5078 Return the corresponding character code in Big5.")
5082 int charset
, c1
, c2
, b1
, b2
;
5085 CHECK_NUMBER (ch
, 0);
5086 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
5087 if (charset
== charset_big5_1
|| charset
== charset_big5_2
)
5089 ENCODE_BIG5 (charset
, c1
, c2
, b1
, b2
);
5090 XSETFASTINT (val
, (b1
<< 8) | b2
);
5093 XSETFASTINT (val
, 0);
5097 DEFUN ("set-terminal-coding-system-internal",
5098 Fset_terminal_coding_system_internal
,
5099 Sset_terminal_coding_system_internal
, 1, 1, 0, "")
5101 Lisp_Object coding_system
;
5103 CHECK_SYMBOL (coding_system
, 0);
5104 setup_coding_system (Fcheck_coding_system (coding_system
), &terminal_coding
);
5105 /* We had better not send unsafe characters to terminal. */
5106 terminal_coding
.flags
|= CODING_FLAG_ISO_SAFE
;
5111 DEFUN ("set-safe-terminal-coding-system-internal",
5112 Fset_safe_terminal_coding_system_internal
,
5113 Sset_safe_terminal_coding_system_internal
, 1, 1, 0, "")
5115 Lisp_Object coding_system
;
5117 CHECK_SYMBOL (coding_system
, 0);
5118 setup_coding_system (Fcheck_coding_system (coding_system
),
5119 &safe_terminal_coding
);
5123 DEFUN ("terminal-coding-system",
5124 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
5125 "Return coding system specified for terminal output.")
5128 return terminal_coding
.symbol
;
5131 DEFUN ("set-keyboard-coding-system-internal",
5132 Fset_keyboard_coding_system_internal
,
5133 Sset_keyboard_coding_system_internal
, 1, 1, 0, "")
5135 Lisp_Object coding_system
;
5137 CHECK_SYMBOL (coding_system
, 0);
5138 setup_coding_system (Fcheck_coding_system (coding_system
), &keyboard_coding
);
5142 DEFUN ("keyboard-coding-system",
5143 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
5144 "Return coding system specified for decoding keyboard input.")
5147 return keyboard_coding
.symbol
;
5151 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
5152 Sfind_operation_coding_system
, 1, MANY
, 0,
5153 "Choose a coding system for an operation based on the target name.\n\
5154 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5155 DECODING-SYSTEM is the coding system to use for decoding\n\
5156 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5157 for encoding (in case OPERATION does encoding).\n\
5159 The first argument OPERATION specifies an I/O primitive:\n\
5160 For file I/O, `insert-file-contents' or `write-region'.\n\
5161 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5162 For network I/O, `open-network-stream'.\n\
5164 The remaining arguments should be the same arguments that were passed\n\
5165 to the primitive. Depending on which primitive, one of those arguments\n\
5166 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5167 whichever argument specifies the file name is TARGET.\n\
5169 TARGET has a meaning which depends on OPERATION:\n\
5170 For file I/O, TARGET is a file name.\n\
5171 For process I/O, TARGET is a process name.\n\
5172 For network I/O, TARGET is a service name or a port number\n\
5174 This function looks up what specified for TARGET in,\n\
5175 `file-coding-system-alist', `process-coding-system-alist',\n\
5176 or `network-coding-system-alist' depending on OPERATION.\n\
5177 They may specify a coding system, a cons of coding systems,\n\
5178 or a function symbol to call.\n\
5179 In the last case, we call the function with one argument,\n\
5180 which is a list of all the arguments given to this function.")
5185 Lisp_Object operation
, target_idx
, target
, val
;
5186 register Lisp_Object chain
;
5189 error ("Too few arguments");
5190 operation
= args
[0];
5191 if (!SYMBOLP (operation
)
5192 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
5193 error ("Invalid first arguement");
5194 if (nargs
< 1 + XINT (target_idx
))
5195 error ("Too few arguments for operation: %s",
5196 XSYMBOL (operation
)->name
->data
);
5197 target
= args
[XINT (target_idx
) + 1];
5198 if (!(STRINGP (target
)
5199 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
5200 error ("Invalid %dth argument", XINT (target_idx
) + 1);
5202 chain
= ((EQ (operation
, Qinsert_file_contents
)
5203 || EQ (operation
, Qwrite_region
))
5204 ? Vfile_coding_system_alist
5205 : (EQ (operation
, Qopen_network_stream
)
5206 ? Vnetwork_coding_system_alist
5207 : Vprocess_coding_system_alist
));
5211 for (; CONSP (chain
); chain
= XCONS (chain
)->cdr
)
5214 elt
= XCONS (chain
)->car
;
5217 && ((STRINGP (target
)
5218 && STRINGP (XCONS (elt
)->car
)
5219 && fast_string_match (XCONS (elt
)->car
, target
) >= 0)
5220 || (INTEGERP (target
) && EQ (target
, XCONS (elt
)->car
))))
5222 val
= XCONS (elt
)->cdr
;
5223 /* Here, if VAL is both a valid coding system and a valid
5224 function symbol, we return VAL as a coding system. */
5227 if (! SYMBOLP (val
))
5229 if (! NILP (Fcoding_system_p (val
)))
5230 return Fcons (val
, val
);
5231 if (! NILP (Ffboundp (val
)))
5233 val
= call1 (val
, Flist (nargs
, args
));
5236 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
5237 return Fcons (val
, val
);
5245 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal
,
5246 Supdate_coding_systems_internal
, 0, 0, 0,
5247 "Update internal database for ISO2022 and CCL based coding systems.\n\
5248 When values of the following coding categories are changed, you must\n\
5249 call this function:\n\
5250 coding-category-iso-7, coding-category-iso-7-tight,\n\
5251 coding-category-iso-8-1, coding-category-iso-8-2,\n\
5252 coding-category-iso-7-else, coding-category-iso-8-else,\n\
5253 coding-category-ccl")
5258 for (i
= CODING_CATEGORY_IDX_ISO_7
; i
<= CODING_CATEGORY_IDX_CCL
; i
++)
5262 val
= XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[i
])->value
;
5265 if (! coding_system_table
[i
])
5266 coding_system_table
[i
] = ((struct coding_system
*)
5267 xmalloc (sizeof (struct coding_system
)));
5268 setup_coding_system (val
, coding_system_table
[i
]);
5270 else if (coding_system_table
[i
])
5272 xfree (coding_system_table
[i
]);
5273 coding_system_table
[i
] = NULL
;
5280 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal
,
5281 Sset_coding_priority_internal
, 0, 0, 0,
5282 "Update internal database for the current value of `coding-category-list'.\n\
5283 This function is internal use only.")
5289 val
= Vcoding_category_list
;
5291 while (CONSP (val
) && i
< CODING_CATEGORY_IDX_MAX
)
5293 if (! SYMBOLP (XCONS (val
)->car
))
5295 idx
= XFASTINT (Fget (XCONS (val
)->car
, Qcoding_category_index
));
5296 if (idx
>= CODING_CATEGORY_IDX_MAX
)
5298 coding_priorities
[i
++] = (1 << idx
);
5299 val
= XCONS (val
)->cdr
;
5301 /* If coding-category-list is valid and contains all coding
5302 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5303 the following code saves Emacs from crashing. */
5304 while (i
< CODING_CATEGORY_IDX_MAX
)
5305 coding_priorities
[i
++] = CODING_CATEGORY_MASK_RAW_TEXT
;
5313 /*** 9. Post-amble ***/
5318 conversion_buffer
= (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE
);
5326 /* Emacs' internal format specific initialize routine. */
5327 for (i
= 0; i
<= 0x20; i
++)
5328 emacs_code_class
[i
] = EMACS_control_code
;
5329 emacs_code_class
[0x0A] = EMACS_linefeed_code
;
5330 emacs_code_class
[0x0D] = EMACS_carriage_return_code
;
5331 for (i
= 0x21 ; i
< 0x7F; i
++)
5332 emacs_code_class
[i
] = EMACS_ascii_code
;
5333 emacs_code_class
[0x7F] = EMACS_control_code
;
5334 emacs_code_class
[0x80] = EMACS_leading_code_composition
;
5335 for (i
= 0x81; i
< 0xFF; i
++)
5336 emacs_code_class
[i
] = EMACS_invalid_code
;
5337 emacs_code_class
[LEADING_CODE_PRIVATE_11
] = EMACS_leading_code_3
;
5338 emacs_code_class
[LEADING_CODE_PRIVATE_12
] = EMACS_leading_code_3
;
5339 emacs_code_class
[LEADING_CODE_PRIVATE_21
] = EMACS_leading_code_4
;
5340 emacs_code_class
[LEADING_CODE_PRIVATE_22
] = EMACS_leading_code_4
;
5342 /* ISO2022 specific initialize routine. */
5343 for (i
= 0; i
< 0x20; i
++)
5344 iso_code_class
[i
] = ISO_control_code
;
5345 for (i
= 0x21; i
< 0x7F; i
++)
5346 iso_code_class
[i
] = ISO_graphic_plane_0
;
5347 for (i
= 0x80; i
< 0xA0; i
++)
5348 iso_code_class
[i
] = ISO_control_code
;
5349 for (i
= 0xA1; i
< 0xFF; i
++)
5350 iso_code_class
[i
] = ISO_graphic_plane_1
;
5351 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
5352 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
5353 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
5354 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
5355 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
5356 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
5357 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
5358 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
5359 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
5360 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
5362 conversion_buffer_size
= MINIMUM_CONVERSION_BUFFER_SIZE
;
5364 setup_coding_system (Qnil
, &keyboard_coding
);
5365 setup_coding_system (Qnil
, &terminal_coding
);
5366 setup_coding_system (Qnil
, &safe_terminal_coding
);
5367 setup_coding_system (Qnil
, &default_buffer_file_coding
);
5369 bzero (coding_system_table
, sizeof coding_system_table
);
5371 bzero (ascii_skip_code
, sizeof ascii_skip_code
);
5372 for (i
= 0; i
< 128; i
++)
5373 ascii_skip_code
[i
] = 1;
5375 #if defined (MSDOS) || defined (WINDOWSNT)
5376 system_eol_type
= CODING_EOL_CRLF
;
5378 system_eol_type
= CODING_EOL_LF
;
5387 Qtarget_idx
= intern ("target-idx");
5388 staticpro (&Qtarget_idx
);
5390 Qcoding_system_history
= intern ("coding-system-history");
5391 staticpro (&Qcoding_system_history
);
5392 Fset (Qcoding_system_history
, Qnil
);
5394 /* Target FILENAME is the first argument. */
5395 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
5396 /* Target FILENAME is the third argument. */
5397 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
5399 Qcall_process
= intern ("call-process");
5400 staticpro (&Qcall_process
);
5401 /* Target PROGRAM is the first argument. */
5402 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
5404 Qcall_process_region
= intern ("call-process-region");
5405 staticpro (&Qcall_process_region
);
5406 /* Target PROGRAM is the third argument. */
5407 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
5409 Qstart_process
= intern ("start-process");
5410 staticpro (&Qstart_process
);
5411 /* Target PROGRAM is the third argument. */
5412 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
5414 Qopen_network_stream
= intern ("open-network-stream");
5415 staticpro (&Qopen_network_stream
);
5416 /* Target SERVICE is the fourth argument. */
5417 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
5419 Qcoding_system
= intern ("coding-system");
5420 staticpro (&Qcoding_system
);
5422 Qeol_type
= intern ("eol-type");
5423 staticpro (&Qeol_type
);
5425 Qbuffer_file_coding_system
= intern ("buffer-file-coding-system");
5426 staticpro (&Qbuffer_file_coding_system
);
5428 Qpost_read_conversion
= intern ("post-read-conversion");
5429 staticpro (&Qpost_read_conversion
);
5431 Qpre_write_conversion
= intern ("pre-write-conversion");
5432 staticpro (&Qpre_write_conversion
);
5434 Qno_conversion
= intern ("no-conversion");
5435 staticpro (&Qno_conversion
);
5437 Qundecided
= intern ("undecided");
5438 staticpro (&Qundecided
);
5440 Qcoding_system_p
= intern ("coding-system-p");
5441 staticpro (&Qcoding_system_p
);
5443 Qcoding_system_error
= intern ("coding-system-error");
5444 staticpro (&Qcoding_system_error
);
5446 Fput (Qcoding_system_error
, Qerror_conditions
,
5447 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
5448 Fput (Qcoding_system_error
, Qerror_message
,
5449 build_string ("Invalid coding system"));
5451 Qcoding_category
= intern ("coding-category");
5452 staticpro (&Qcoding_category
);
5453 Qcoding_category_index
= intern ("coding-category-index");
5454 staticpro (&Qcoding_category_index
);
5456 Vcoding_category_table
5457 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX
), Qnil
);
5458 staticpro (&Vcoding_category_table
);
5461 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
5463 XVECTOR (Vcoding_category_table
)->contents
[i
]
5464 = intern (coding_category_name
[i
]);
5465 Fput (XVECTOR (Vcoding_category_table
)->contents
[i
],
5466 Qcoding_category_index
, make_number (i
));
5470 Qtranslation_table
= intern ("translation-table");
5471 staticpro (&Qtranslation_table
);
5472 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
5474 Qtranslation_table_id
= intern ("translation-table-id");
5475 staticpro (&Qtranslation_table_id
);
5477 Qtranslation_table_for_decode
= intern ("translation-table-for-decode");
5478 staticpro (&Qtranslation_table_for_decode
);
5480 Qtranslation_table_for_encode
= intern ("translation-table-for-encode");
5481 staticpro (&Qtranslation_table_for_encode
);
5483 Qsafe_charsets
= intern ("safe-charsets");
5484 staticpro (&Qsafe_charsets
);
5486 Qvalid_codes
= intern ("valid-codes");
5487 staticpro (&Qvalid_codes
);
5489 Qemacs_mule
= intern ("emacs-mule");
5490 staticpro (&Qemacs_mule
);
5492 Qraw_text
= intern ("raw-text");
5493 staticpro (&Qraw_text
);
5495 defsubr (&Scoding_system_p
);
5496 defsubr (&Sread_coding_system
);
5497 defsubr (&Sread_non_nil_coding_system
);
5498 defsubr (&Scheck_coding_system
);
5499 defsubr (&Sdetect_coding_region
);
5500 defsubr (&Sdetect_coding_string
);
5501 defsubr (&Sdecode_coding_region
);
5502 defsubr (&Sencode_coding_region
);
5503 defsubr (&Sdecode_coding_string
);
5504 defsubr (&Sencode_coding_string
);
5505 defsubr (&Sdecode_sjis_char
);
5506 defsubr (&Sencode_sjis_char
);
5507 defsubr (&Sdecode_big5_char
);
5508 defsubr (&Sencode_big5_char
);
5509 defsubr (&Sset_terminal_coding_system_internal
);
5510 defsubr (&Sset_safe_terminal_coding_system_internal
);
5511 defsubr (&Sterminal_coding_system
);
5512 defsubr (&Sset_keyboard_coding_system_internal
);
5513 defsubr (&Skeyboard_coding_system
);
5514 defsubr (&Sfind_operation_coding_system
);
5515 defsubr (&Supdate_coding_systems_internal
);
5516 defsubr (&Sset_coding_priority_internal
);
5518 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
5519 "List of coding systems.\n\
5521 Do not alter the value of this variable manually. This variable should be\n\
5522 updated by the functions `make-coding-system' and\n\
5523 `define-coding-system-alias'.");
5524 Vcoding_system_list
= Qnil
;
5526 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
5527 "Alist of coding system names.\n\
5528 Each element is one element list of coding system name.\n\
5529 This variable is given to `completing-read' as TABLE argument.\n\
5531 Do not alter the value of this variable manually. This variable should be\n\
5532 updated by the functions `make-coding-system' and\n\
5533 `define-coding-system-alias'.");
5534 Vcoding_system_alist
= Qnil
;
5536 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
5537 "List of coding-categories (symbols) ordered by priority.");
5541 Vcoding_category_list
= Qnil
;
5542 for (i
= CODING_CATEGORY_IDX_MAX
- 1; i
>= 0; i
--)
5543 Vcoding_category_list
5544 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
5545 Vcoding_category_list
);
5548 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
5549 "Specify the coding system for read operations.\n\
5550 It is useful to bind this variable with `let', but do not set it globally.\n\
5551 If the value is a coding system, it is used for decoding on read operation.\n\
5552 If not, an appropriate element is used from one of the coding system alists:\n\
5553 There are three such tables, `file-coding-system-alist',\n\
5554 `process-coding-system-alist', and `network-coding-system-alist'.");
5555 Vcoding_system_for_read
= Qnil
;
5557 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
5558 "Specify the coding system for write operations.\n\
5559 It is useful to bind this variable with `let', but do not set it globally.\n\
5560 If the value is a coding system, it is used for encoding on write operation.\n\
5561 If not, an appropriate element is used from one of the coding system alists.\n\
5562 There are three such tables: `file-coding-system-alist',\n\
5563 `process-coding-system-alist', and `network-coding-system-alist'.");
5564 Vcoding_system_for_write
= Qnil
;
5566 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
5567 "Coding system used in the latest file or process I/O.");
5568 Vlast_coding_system_used
= Qnil
;
5570 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
5571 "*Non-nil means inhibit code conversion of end-of-line format in all cases.");
5572 inhibit_eol_conversion
= 0;
5574 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
5575 "Non-nil means process buffer inherits coding system of process output.\n\
5576 Bind it to t if the process output is to be treated as if it were a file\n\
5577 read from some filesystem.");
5578 inherit_process_coding_system
= 0;
5580 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
5581 "Alist to decide a coding system to use for a file I/O operation.\n\
5582 The format is ((PATTERN . VAL) ...),\n\
5583 where PATTERN is a regular expression matching a file name,\n\
5584 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5585 If VAL is a coding system, it is used for both decoding and encoding\n\
5586 the file contents.\n\
5587 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5588 and the cdr part is used for encoding.\n\
5589 If VAL is a function symbol, the function must return a coding system\n\
5590 or a cons of coding systems which are used as above.\n\
5592 See also the function `find-operation-coding-system'\n\
5593 and the variable `auto-coding-alist'.");
5594 Vfile_coding_system_alist
= Qnil
;
5596 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
5597 "Alist to decide a coding system to use for a process I/O operation.\n\
5598 The format is ((PATTERN . VAL) ...),\n\
5599 where PATTERN is a regular expression matching a program name,\n\
5600 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5601 If VAL is a coding system, it is used for both decoding what is received\n\
5602 from the program and encoding what is sent to the program.\n\
5603 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5604 and the cdr part is used for encoding.\n\
5605 If VAL is a function symbol, the function must return a coding system\n\
5606 or a cons of coding systems which are used as above.\n\
5608 See also the function `find-operation-coding-system'.");
5609 Vprocess_coding_system_alist
= Qnil
;
5611 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
5612 "Alist to decide a coding system to use for a network I/O operation.\n\
5613 The format is ((PATTERN . VAL) ...),\n\
5614 where PATTERN is a regular expression matching a network service name\n\
5615 or is a port number to connect to,\n\
5616 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5617 If VAL is a coding system, it is used for both decoding what is received\n\
5618 from the network stream and encoding what is sent to the network stream.\n\
5619 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5620 and the cdr part is used for encoding.\n\
5621 If VAL is a function symbol, the function must return a coding system\n\
5622 or a cons of coding systems which are used as above.\n\
5624 See also the function `find-operation-coding-system'.");
5625 Vnetwork_coding_system_alist
= Qnil
;
5627 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix
,
5628 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF).");
5629 eol_mnemonic_unix
= ':';
5631 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos
,
5632 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
5633 eol_mnemonic_dos
= '\\';
5635 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac
,
5636 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
5637 eol_mnemonic_mac
= '/';
5639 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
5640 "Mnemonic character indicating end-of-line format is not yet decided.");
5641 eol_mnemonic_undecided
= ':';
5643 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
5644 "*Non-nil enables character translation while encoding and decoding.");
5645 Venable_character_translation
= Qt
;
5647 DEFVAR_LISP ("standard-translation-table-for-decode",
5648 &Vstandard_translation_table_for_decode
,
5649 "Table for translating characters while decoding.");
5650 Vstandard_translation_table_for_decode
= Qnil
;
5652 DEFVAR_LISP ("standard-translation-table-for-encode",
5653 &Vstandard_translation_table_for_encode
,
5654 "Table for translating characters while encoding.");
5655 Vstandard_translation_table_for_encode
= Qnil
;
5657 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist
,
5658 "Alist of charsets vs revision numbers.\n\
5659 While encoding, if a charset (car part of an element) is found,\n\
5660 designate it with the escape sequence identifying revision (cdr part of the element).");
5661 Vcharset_revision_alist
= Qnil
;
5663 DEFVAR_LISP ("default-process-coding-system",
5664 &Vdefault_process_coding_system
,
5665 "Cons of coding systems used for process I/O by default.\n\
5666 The car part is used for decoding a process output,\n\
5667 the cdr part is used for encoding a text to be sent to a process.");
5668 Vdefault_process_coding_system
= Qnil
;
5670 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
5671 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
5672 This is a vector of length 256.\n\
5673 If Nth element is non-nil, the existence of code N in a file\n\
5674 \(or output of subprocess) doesn't prevent it from being detected as\n\
5675 a coding system of ISO 2022 variant which has a flag\n\
5676 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
5677 or reading output of a subprocess.\n\
5678 Only 128th through 159th elements have a meaning.");
5679 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
5681 DEFVAR_LISP ("select-safe-coding-system-function",
5682 &Vselect_safe_coding_system_function
,
5683 "Function to call to select safe coding system for encoding a text.\n\
5685 If set, this function is called to force a user to select a proper\n\
5686 coding system which can encode the text in the case that a default\n\
5687 coding system used in each operation can't encode the text.\n\
5689 The default value is `select-safe-coding-system' (which see).");
5690 Vselect_safe_coding_system_function
= Qnil
;