1 /* Coding system handler (conversion, detection, and etc).
3 Copyright (C) 1995 Free Software Foundation, Inc.
4 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
6 This file is part of GNU Emacs.
8 GNU Emacs is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
13 GNU Emacs is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GNU Emacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /*** TABLE OF CONTENTS ***
26 2. Emacs' internal format handlers
28 4. Shift-JIS and BIG5 handlers
29 5. End-of-line handlers
30 6. C library functions
31 7. Emacs Lisp library functions
36 /*** GENERAL NOTE on CODING SYSTEM ***
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
41 Emacs' internal format, and when we say "encode", it means
42 converting Emacs' internal format to some other coding system.
44 0. Emacs' internal format
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in the section 2.
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and such coding
53 systems used in Internet communication as ISO-2022-JP are all
54 variants of ISO2022. Details are described in the section 3.
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in the section 4. In this file, when written as "BIG5"
67 (all uppercase), it means the coding system, and when written as
68 "Big5" (capitalized), it means the character set.
72 If a user wants to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
77 Emacs represents a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See the section 6 for more
85 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
87 How the end-of-line of a text is encoded depends on the system. For
88 instance, Unix's format is just one byte of `line-feed' code,
89 whereas DOS's format is two bytes sequence of `carriage-return' and
90 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
92 Since how characters in a text are encoded and how end-of-line is
93 encoded are independent, any coding system described above can take
94 any format of end-of-line. So, Emacs has information of format of
95 end-of-line in each coding-system. See the section 6 for more
100 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
102 These functions check if a text between SRC and SRC_END is encoded
103 in the coding system category XXX. Each returns an integer value in
104 which appropriate flag bits for the category XXX are set. The flag
105 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
106 template of these functions. */
109 detect_coding_internal (src
, src_end
)
110 unsigned char *src
, *src_end
;
116 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
118 These functions decode SRC_BYTES length text at SOURCE encoded in
119 CODING to Emacs' internal format. The resulting text goes to a
120 place pointed by DESTINATION, the length of which should not exceed
121 DST_BYTES. The number of bytes actually processed is returned as *CONSUMED.
122 The return value is the length of the decoded text. Below is a
123 template of these functions. */
125 decode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
126 struct coding_system
*coding
;
127 unsigned char *source
, *destination
;
128 int src_bytes
, dst_bytes
;
135 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
137 These functions encode SRC_BYTES length text at SOURCE of Emacs
138 internal format to CODING. The resulting text goes to a place
139 pointed by DESTINATION, the length of which should not exceed
140 DST_BYTES. The number of bytes actually processed is returned as *CONSUMED.
141 The return value is the length of the encoded text. Below is a
142 template of these functions. */
144 encode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
145 struct coding_system
*coding
;
146 unsigned char *source
, *destination
;
147 int src_bytes
, dst_bytes
;
154 /*** COMMONLY USED MACROS ***/
156 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
157 THREE_MORE_BYTES safely get one, two, and three bytes from the
158 source text respectively. If there are not enough bytes in the
159 source, they jump to `label_end_of_loop'. The caller should set
160 variables `src' and `src_end' to appropriate areas in advance. */
162 #define ONE_MORE_BYTE(c1) \
167 goto label_end_of_loop; \
170 #define TWO_MORE_BYTES(c1, c2) \
172 if (src + 1 < src_end) \
173 c1 = *src++, c2 = *src++; \
175 goto label_end_of_loop; \
178 #define THREE_MORE_BYTES(c1, c2, c3) \
180 if (src + 2 < src_end) \
181 c1 = *src++, c2 = *src++, c3 = *src++; \
183 goto label_end_of_loop; \
186 /* The following three macros DECODE_CHARACTER_ASCII,
187 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
188 the multi-byte form of a character of each class at the place
189 pointed by `dst'. The caller should set the variable `dst' to
190 point to an appropriate area and the variable `coding' to point to
191 the coding-system of the currently decoding text in advance. */
193 /* Decode one ASCII character C. */
195 #define DECODE_CHARACTER_ASCII(c) \
197 if (COMPOSING_P (coding->composing)) \
198 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
203 /* Decode one DIMENSION1 character of which charset is CHARSET and
204 position-code is C. */
206 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
208 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
209 if (COMPOSING_P (coding->composing)) \
210 *dst++ = leading_code + 0x20; \
212 *dst++ = leading_code; \
213 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
214 *dst++ = leading_code; \
215 *dst++ = (c) | 0x80; \
218 /* Decode one DIMENSION2 character of which charset is CHARSET and
219 position-codes are C1 and C2. */
221 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
223 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
224 *dst++ = (c2) | 0x80; \
228 /*** 1. Preamble ***/
242 #else /* not emacs */
246 #endif /* not emacs */
248 Lisp_Object Qcoding_system
, Qeol_type
;
249 Lisp_Object Qbuffer_file_coding_system
;
250 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
252 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
253 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
254 Lisp_Object Qstart_process
, Qopen_network_stream
;
255 Lisp_Object Qtarget_idx
;
257 /* Mnemonic character of each format of end-of-line. */
258 int eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
259 /* Mnemonic character to indicate format of end-of-line is not yet
261 int eol_mnemonic_undecided
;
265 Lisp_Object Qcoding_system_vector
, Qcoding_system_p
, Qcoding_system_error
;
267 /* Coding-systems are handed between Emacs Lisp programs and C internal
268 routines by the following three variables. */
269 /* Coding-system for reading files and receiving data from process. */
270 Lisp_Object Vcoding_system_for_read
;
271 /* Coding-system for writing files and sending data to process. */
272 Lisp_Object Vcoding_system_for_write
;
273 /* Coding-system actually used in the latest I/O. */
274 Lisp_Object Vlast_coding_system_used
;
276 /* Coding-system that the terminal accepts for display. */
277 struct coding_system terminal_coding
;
279 /* Coding-system of what is sent from terminal keyboard. */
280 struct coding_system keyboard_coding
;
282 Lisp_Object Vcoding_system_alist
;
286 Lisp_Object Qcoding_category_index
;
288 /* List of symbols `coding-category-xxx' ordered by priority. */
289 Lisp_Object Vcoding_category_list
;
291 /* Table of coding-systems currently assigned to each coding-category. */
292 Lisp_Object coding_category_table
[CODING_CATEGORY_IDX_MAX
];
294 /* Table of names of symbol for each coding-category. */
295 char *coding_category_name
[CODING_CATEGORY_IDX_MAX
] = {
296 "coding-category-internal",
297 "coding-category-sjis",
298 "coding-category-iso-7",
299 "coding-category-iso-8-1",
300 "coding-category-iso-8-2",
301 "coding-category-iso-else",
302 "coding-category-big5",
303 "coding-category-binary"
306 /* Alist of charsets vs the alternate charsets. */
307 Lisp_Object Valternate_charset_table
;
309 /* Alist of charsets vs revision number. */
310 Lisp_Object Vcharset_revision_alist
;
313 /*** 2. Emacs internal format handlers ***/
315 /* Emacs' internal format for encoding multiple character sets is a
316 kind of multi-byte encoding, i.e. encoding a character by a sequence
317 of one-byte codes of variable length. ASCII characters and control
318 characters (e.g. `tab', `newline') are represented by one-byte as
319 is. It takes the range 0x00 through 0x7F. The other characters
320 are represented by a sequence of `base leading-code', optional
321 `extended leading-code', and one or two `position-code's. Length
322 of the sequence is decided by the base leading-code. Leading-code
323 takes the range 0x80 through 0x9F, whereas extended leading-code
324 and position-code take the range 0xA0 through 0xFF. See the
325 document of `charset.h' for more detail about leading-code and
328 There's one exception to this rule. Special leading-code
329 `leading-code-composition' denotes that the following several
330 characters should be composed into one character. Leading-codes of
331 components (except for ASCII) are added 0x20. An ASCII character
332 component is represented by a 2-byte sequence of `0xA0' and
333 `ASCII-code + 0x80'. See also the document in `charset.h' for the
334 detail of composite character. Hence, we can summarize the code
337 --- CODE RANGE of Emacs' internal format ---
338 (character set) (range)
340 ELSE (1st byte) 0x80 .. 0x9F
341 (rest bytes) 0xA0 .. 0xFF
342 ---------------------------------------------
346 enum emacs_code_class_type emacs_code_class
[256];
348 /* Go to the next statement only if *SRC is accessible and the code is
349 0xA0 or greater. */
350 #define CHECK_CODE_RANGE_A0_FF \
352 if (src >= src_end) \
353 goto label_end_of_switch; \
354 else if (*src++ < 0xA0) \
358 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
359 Check if a text is encoded in Emacs' internal format. If it is,
360 return CODING_CATEGORY_MASK_INTERNAL, else return 0. */
363 detect_coding_internal (src
, src_end
)
364 unsigned char *src
, *src_end
;
369 while (src
< src_end
)
381 switch (emacs_code_class
[c
])
383 case EMACS_ascii_code
:
384 case EMACS_linefeed_code
:
387 case EMACS_control_code
:
388 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
392 case EMACS_invalid_code
:
395 case EMACS_leading_code_composition
: /* c == 0x80 */
397 CHECK_CODE_RANGE_A0_FF
;
402 case EMACS_leading_code_4
:
403 CHECK_CODE_RANGE_A0_FF
;
404 /* fall down to check it two more times ... */
406 case EMACS_leading_code_3
:
407 CHECK_CODE_RANGE_A0_FF
;
408 /* fall down to check it one more time ... */
410 case EMACS_leading_code_2
:
411 CHECK_CODE_RANGE_A0_FF
;
419 return CODING_CATEGORY_MASK_INTERNAL
;
423 /*** 3. ISO2022 handlers ***/
425 /* The following note describes the coding system ISO2022 briefly.
426 Since the intention of this note is to help understanding of the
427 programs in this file, some parts are NOT ACCURATE or OVERLY
428 SIMPLIFIED. For the thorough understanding, please refer to the
429 original document of ISO2022.
431 ISO2022 provides many mechanisms to encode several character sets
432 in 7-bit and 8-bit environment. If one chooses a 7-bit environment,
433 all text is encoded by codes of less than 128. This may make the
434 encoded text a little bit longer, but the text gets more stability
435 to pass through several gateways (some of them split MSB off).
437 There are two kinds of character set: control character set and
438 graphic character set. The former contains control characters such
439 as `newline' and `escape' to provide control functions (control
440 functions are provided also by escape sequence). The latter
441 contains graphic characters such as 'A' and '-'. Emacs recognizes
442 two control character sets and many graphic character sets.
444 Graphic character sets are classified into one of the following
445 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
446 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
447 bytes (DIMENSION) and the number of characters in one dimension
448 (CHARS) of the set. In addition, each character set is assigned an
449 identification tag (called "final character" and denoted as <F>
450 hereafter) which is unique in each class. <F> of each character
451 set is decided by ECMA(*) when it is registered in ISO. Code range
452 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
454 Note (*): ECMA = European Computer Manufacturers Association
456 Here are examples of graphic character set [NAME(<F>)]:
457 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
458 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
459 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
460 o DIMENSION2_CHARS96 -- none for the moment
462 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
463 C0 [0x00..0x1F] -- control character plane 0
464 GL [0x20..0x7F] -- graphic character plane 0
465 C1 [0x80..0x9F] -- control character plane 1
466 GR [0xA0..0xFF] -- graphic character plane 1
468 A control character set is directly designated and invoked to C0 or
469 C1 by an escape sequence. The most common case is that ISO646's
470 control character set is designated/invoked to C0 and ISO6429's
471 control character set is designated/invoked to C1, and usually
472 these designations/invocations are omitted in a coded text. With
473 7-bit environment, only C0 can be used, and a control character for
474 C1 is encoded by an appropriate escape sequence to fit in the
475 environment. Corresponding escape sequences are defined for all
476 control characters for C1.
478 A graphic character set is at first designated to one of four
479 graphic registers (G0 through G3), then these graphic registers are
480 invoked to GL or GR. These designations and invocations can be
481 done independently. The most common case is that G0 is invoked to
482 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
483 these invocations and designations are omitted in a coded text.
484 With 7-bit environment, only GL can be used.
486 When a graphic character set of CHARS94 is invoked to GL, code 0x20
487 and 0x7F of GL area work as control characters SPACE and DEL
488 respectively, and code 0xA0 and 0xFF of GR area should not be used.
490 There are two ways of invocation: locking-shift and single-shift.
491 With locking-shift, the invocation lasts until the next different
492 invocation, whereas with single-shift, the invocation works only
493 for the following character and doesn't affect locking-shift.
494 Invocations are done by the following control characters or escape
497 ----------------------------------------------------------------------
498 function control char escape sequence description
499 ----------------------------------------------------------------------
500 SI (shift-in) 0x0F none invoke G0 to GL
501 SO (shift-out) 0x0E none invoke G1 to GL
502 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
503 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
504 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
505 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
506 ----------------------------------------------------------------------
507 The first four are for locking-shift. Control characters for these
508 functions are defined by macros ISO_CODE_XXX in `coding.h'.
510 Designations are done by the following escape sequences.
511 ----------------------------------------------------------------------
512 escape sequence description
513 ----------------------------------------------------------------------
514 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
515 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
516 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
517 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
518 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
519 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
520 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
521 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
522 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
523 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
524 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
525 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
526 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
527 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
528 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
529 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
530 ----------------------------------------------------------------------
532 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
533 of dimension 1, chars 94, and final character <F>, and so on.
535 Note (*): Although these designations are not allowed in ISO2022,
536 Emacs accepts them on decoding, and produces them on encoding
537 CHARS96 character set in a coding system which is characterized as
538 7-bit environment, non-locking-shift, and non-single-shift.
540 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
541 '(' can be omitted. We call this "short-form" hereafter.
543 Now you may notice that there are a lot of ways for encoding the
544 same multilingual text in ISO2022. Actually, there exist many
545 coding systems such as Compound Text (used in X's inter client
546 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
547 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
548 localized platforms), and all of these are variants of ISO2022.
550 In addition to the above, Emacs handles two more kinds of escape
551 sequences: ISO6429's direction specification and Emacs' private
552 sequence for specifying character composition.
554 ISO6429's direction specification takes the following format:
555 o CSI ']' -- end of the current direction
556 o CSI '0' ']' -- end of the current direction
557 o CSI '1' ']' -- start of left-to-right text
558 o CSI '2' ']' -- start of right-to-left text
559 The control character CSI (0x9B: control sequence introducer) is
560 abbreviated to the escape sequence ESC '[' in 7-bit environment.
562 Character composition specification takes the following format:
563 o ESC '0' -- start character composition
564 o ESC '1' -- end character composition
565 Since these are not standard escape sequences of any ISO, the use
566 of them for these meaning is restricted to Emacs only. */
568 enum iso_code_class_type iso_code_class
[256];
570 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
571 Check if a text is encoded in ISO2022. If it is, return an
572 integer in which any of the following flag bits:
573 CODING_CATEGORY_MASK_ISO_7
574 CODING_CATEGORY_MASK_ISO_8_1
575 CODING_CATEGORY_MASK_ISO_8_2
576 CODING_CATEGORY_MASK_ISO_ELSE
577 are set. If a code which should never appear in ISO2022 is found,
581 detect_coding_iso2022 (src
, src_end
)
582 unsigned char *src
, *src_end
;
584 unsigned char c
, g1
= 0;
585 int mask
= (CODING_CATEGORY_MASK_ISO_7
586 | CODING_CATEGORY_MASK_ISO_8_1
587 | CODING_CATEGORY_MASK_ISO_8_2
);
588 /* We may look ahead at most 4 bytes. */
589 unsigned char *adjusted_src_end
= src_end
- 4;
592 while (src
< src_end
)
601 if (src
+ 2 >= src_end
602 && ((c
>= '(' && c
<= '/')
603 || c
== '$' && ((*src
>= '(' && *src
<= '/')
604 || (*src
>= '@' && *src
<= 'B'))))
606 /* Valid designation sequence. */
607 if (c
== ')' || (c
== '$' && *src
== ')'))
612 else if (c
== 'N' || c
== 'O' || c
== 'n' || c
== 'o')
613 return CODING_CATEGORY_MASK_ISO_ELSE
;
618 return CODING_CATEGORY_MASK_ISO_ELSE
;
624 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
636 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
637 while (src
< src_end
&& *src
>= 0xA0)
639 if (count
& 1 && src
< src_end
)
640 mask
&= ~CODING_CATEGORY_MASK_ISO_8_2
;
649 /* Decode a character of which charset is CHARSET and the 1st position
650 code is C1. If dimension of CHARSET 2, the 2nd position code is
651 fetched from SRC and set to C2. If CHARSET is negative, it means
652 that we are decoding ill formed text, and what we can do is just to
655 #define DECODE_ISO_CHARACTER(charset, c1) \
657 if ((charset) >= 0 && CHARSET_DIMENSION (charset) == 2) \
658 ONE_MORE_BYTE (c2); \
659 if (COMPOSING_HEAD_P (coding->composing)) \
661 *dst++ = LEADING_CODE_COMPOSITION; \
662 if (COMPOSING_WITH_RULE_P (coding->composing)) \
663 /* To tell composition rules are embeded. */ \
665 coding->composing += 2; \
669 else if ((charset) == CHARSET_ASCII) \
670 DECODE_CHARACTER_ASCII (c1); \
671 else if (CHARSET_DIMENSION (charset) == 1) \
672 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
674 DECODE_CHARACTER_DIMENSION2 (charset, c1, c2); \
675 if (COMPOSING_WITH_RULE_P (coding->composing)) \
676 /* To tell a composition rule follows. */ \
677 coding->composing = COMPOSING_WITH_RULE_RULE; \
680 /* Set designation state into CODING. */
681 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
683 int charset = ISO_CHARSET_TABLE (dimension, chars, final_char); \
685 = Fassq (CHARSET_SYMBOL (charset), Valternate_charset_table); \
687 charset = get_charset_id (XCONS (temp)->cdr); \
690 if (coding->direction == 1 \
691 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
692 charset = CHARSET_REVERSE_CHARSET (charset); \
693 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
697 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
700 decode_coding_iso2022 (coding
, source
, destination
,
701 src_bytes
, dst_bytes
, consumed
)
702 struct coding_system
*coding
;
703 unsigned char *source
, *destination
;
704 int src_bytes
, dst_bytes
;
707 unsigned char *src
= source
;
708 unsigned char *src_end
= source
+ src_bytes
;
709 unsigned char *dst
= destination
;
710 unsigned char *dst_end
= destination
+ dst_bytes
;
711 /* Since the maximum bytes produced by each loop is 7, we subtract 6
712 from DST_END to assure that overflow checking is necessary only
713 at the head of loop. */
714 unsigned char *adjusted_dst_end
= dst_end
- 6;
716 /* Charsets invoked to graphic plane 0 and 1 respectively. */
717 int charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
718 int charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
720 while (src
< src_end
&& dst
< adjusted_dst_end
)
722 /* SRC_BASE remembers the start position in source in each loop.
723 The loop will be exited when there's not enough source text
724 to analyze long escape sequence or 2-byte code (within macros
725 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
726 to SRC_BASE before exiting. */
727 unsigned char *src_base
= src
;
728 unsigned char c1
= *src
++, c2
, cmprule
;
730 switch (iso_code_class
[c1
])
732 case ISO_0x20_or_0x7F
:
733 if (!coding
->composing
734 && (charset0
< 0 || CHARSET_CHARS (charset0
) == 94))
736 /* This is SPACE or DEL. */
740 /* This is a graphic character, we fall down ... */
742 case ISO_graphic_plane_0
:
743 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
745 /* This is a composition rule. */
747 coding
->composing
= COMPOSING_WITH_RULE_TAIL
;
750 DECODE_ISO_CHARACTER (charset0
, c1
);
753 case ISO_0xA0_or_0xFF
:
754 if (charset1
< 0 || CHARSET_CHARS (charset1
) == 94)
760 /* This is a graphic character, we fall down ... */
762 case ISO_graphic_plane_1
:
763 DECODE_ISO_CHARACTER (charset1
, c1
);
766 case ISO_control_code
:
767 /* All ISO2022 control characters in this class have the
768 same representation in Emacs internal format. */
772 case ISO_carriage_return
:
773 if (coding
->eol_type
== CODING_EOL_CR
)
777 else if (coding
->eol_type
== CODING_EOL_CRLF
)
780 if (c1
== ISO_CODE_LF
)
795 if (CODING_SPEC_ISO_DESIGNATION (coding
, 1) < 0)
796 goto label_invalid_escape_sequence
;
797 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 1;
798 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
802 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
803 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
806 case ISO_single_shift_2_7
:
807 case ISO_single_shift_2
:
808 /* SS2 is handled as an escape sequence of ESC 'N' */
810 goto label_escape_sequence
;
812 case ISO_single_shift_3
:
813 /* SS3 is handled as an escape sequence of ESC 'O' */
815 goto label_escape_sequence
;
817 case ISO_control_sequence_introducer
:
818 /* CSI is handled as an escape sequence of ESC '[' ... */
820 goto label_escape_sequence
;
824 label_escape_sequence
:
825 /* Escape sequences handled by Emacs are invocation,
826 designation, direction specification, and character
827 composition specification. */
830 case '&': /* revision of following character set */
832 if (!(c1
>= '@' && c1
<= '~'))
833 goto label_invalid_escape_sequence
;
835 if (c1
!= ISO_CODE_ESC
)
836 goto label_invalid_escape_sequence
;
838 goto label_escape_sequence
;
840 case '$': /* designation of 2-byte character set */
842 if (c1
>= '@' && c1
<= 'B')
843 { /* designation of JISX0208.1978, GB2312.1980,
845 DECODE_DESIGNATION (0, 2, 94, c1
);
847 else if (c1
>= 0x28 && c1
<= 0x2B)
848 { /* designation of DIMENSION2_CHARS94 character set */
850 DECODE_DESIGNATION (c1
- 0x28, 2, 94, c2
);
852 else if (c1
>= 0x2C && c1
<= 0x2F)
853 { /* designation of DIMENSION2_CHARS96 character set */
855 DECODE_DESIGNATION (c1
- 0x2C, 2, 96, c2
);
858 goto label_invalid_escape_sequence
;
861 case 'n': /* invocation of locking-shift-2 */
862 if (CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
863 goto label_invalid_escape_sequence
;
864 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 2;
865 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
868 case 'o': /* invocation of locking-shift-3 */
869 if (CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
870 goto label_invalid_escape_sequence
;
871 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 3;
872 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
875 case 'N': /* invocation of single-shift-2 */
876 if (CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
877 goto label_invalid_escape_sequence
;
879 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 2);
880 DECODE_ISO_CHARACTER (charset
, c1
);
883 case 'O': /* invocation of single-shift-3 */
884 if (CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
885 goto label_invalid_escape_sequence
;
887 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 3);
888 DECODE_ISO_CHARACTER (charset
, c1
);
891 case '0': /* start composing without embeded rules */
892 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
895 case '1': /* end composing */
896 coding
->composing
= COMPOSING_NO
;
899 case '2': /* start composing with embeded rules */
900 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
903 case '[': /* specification of direction */
904 /* For the moment, nested direction is not supported.
905 So, the value of `coding->direction' is 0 or 1: 0
906 means left-to-right, 1 means right-to-left. */
910 case ']': /* end of the current direction */
911 coding
->direction
= 0;
913 case '0': /* end of the current direction */
914 case '1': /* start of left-to-right direction */
917 coding
->direction
= 0;
919 goto label_invalid_escape_sequence
;
922 case '2': /* start of right-to-left direction */
925 coding
->direction
= 1;
927 goto label_invalid_escape_sequence
;
931 goto label_invalid_escape_sequence
;
936 if (c1
>= 0x28 && c1
<= 0x2B)
937 { /* designation of DIMENSION1_CHARS94 character set */
939 DECODE_DESIGNATION (c1
- 0x28, 1, 94, c2
);
941 else if (c1
>= 0x2C && c1
<= 0x2F)
942 { /* designation of DIMENSION1_CHARS96 character set */
944 DECODE_DESIGNATION (c1
- 0x2C, 1, 96, c2
);
948 goto label_invalid_escape_sequence
;
951 /* We must update these variables now. */
952 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
953 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
956 label_invalid_escape_sequence
:
958 int length
= src
- src_base
;
960 bcopy (src_base
, dst
, length
);
967 coding
->carryover_size
= src
- src_base
;
968 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
973 /* If this is the last block of the text to be decoded, we had
974 better just flush out all remaining codes in the text although
975 they are not valid characters. */
976 if (coding
->last_block
)
978 bcopy (src
, dst
, src_end
- src
);
979 dst
+= (src_end
- src
);
982 *consumed
= src
- source
;
983 return dst
- destination
;
986 /* ISO2022 encoding stuff. */
989 It is not enough to say just "ISO2022" on encoding, but we have to
990 specify more details. In Emacs, each coding-system of ISO2022
991 variant has the following specifications:
992 1. Initial designation to G0 thru G3.
993 2. Allows short-form designation?
994 3. ASCII should be designated to G0 before control characters?
995 4. ASCII should be designated to G0 at end of line?
996 5. 7-bit environment or 8-bit environment?
997 6. Use locking-shift?
999 And the following two are only for Japanese:
1000 8. Use ASCII in place of JIS0201-1976-Roman?
1001 9. Use JISX0208-1983 in place of JISX0208-1978?
1002 These specifications are encoded in `coding->flags' as flag bits
1003 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1007 /* Produce codes (escape sequence) for designating CHARSET to graphic
1008 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1009 the coding system CODING allows, produce designation sequence of
1012 #define ENCODE_DESIGNATION(charset, reg, coding) \
1014 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1015 char *intermediate_char_94 = "()*+"; \
1016 char *intermediate_char_96 = ",-./"; \
1018 = Fassq (make_number (charset), Vcharset_revision_alist); \
1019 if (! NILP (temp)) \
1021 *dst++ = ISO_CODE_ESC; \
1023 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1025 *dst++ = ISO_CODE_ESC; \
1026 if (CHARSET_DIMENSION (charset) == 1) \
1028 if (CHARSET_CHARS (charset) == 94) \
1029 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1031 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1036 if (CHARSET_CHARS (charset) == 94) \
1038 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1040 || final_char < '@' || final_char > 'B') \
1041 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1044 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1046 *dst++ = final_char; \
1047 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1050 /* The following two macros produce codes (control character or escape
1051 sequence) for ISO2022 single-shift functions (single-shift-2 and
1054 #define ENCODE_SINGLE_SHIFT_2 \
1056 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1057 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1059 *dst++ = ISO_CODE_SS2; \
1060 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1063 #define ENCODE_SINGLE_SHIFT_3 \
1065 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1066 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1068 *dst++ = ISO_CODE_SS3; \
1069 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1072 /* The following four macros produce codes (control character or
1073 escape sequence) for ISO2022 locking-shift functions (shift-in,
1074 shift-out, locking-shift-2, and locking-shift-3). */
1076 #define ENCODE_SHIFT_IN \
1078 *dst++ = ISO_CODE_SI; \
1079 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1082 #define ENCODE_SHIFT_OUT \
1084 *dst++ = ISO_CODE_SO; \
1085 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1088 #define ENCODE_LOCKING_SHIFT_2 \
1090 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1091 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1094 #define ENCODE_LOCKING_SHIFT_3 \
1096 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1097 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1100 /* Produce codes for a DIMENSION1 character of which character set is
1101 CHARSET and position-code is C1. Designation and invocation
1102 sequences are also produced in advance if necessary. */
1105 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1107 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1109 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1110 *dst++ = c1 & 0x7F; \
1112 *dst++ = c1 | 0x80; \
1113 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1116 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1118 *dst++ = c1 & 0x7F; \
1121 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1123 *dst++ = c1 | 0x80; \
1127 /* Since CHARSET is not yet invoked to any graphic planes, we \
1128 must invoke it, or, at first, designate it to some graphic \
1129 register. Then repeat the loop to actually produce the \
1131 dst = encode_invocation_designation (charset, coding, dst); \
1134 /* Produce codes for a DIMENSION2 character of which character set is
1135 CHARSET and position-codes are C1 and C2. Designation and
1136 invocation codes are also produced in advance if necessary. */
1138 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1140 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1142 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1143 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1145 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1146 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1149 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1151 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1154 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1156 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1160 /* Since CHARSET is not yet invoked to any graphic planes, we \
1161 must invoke it, or, at first, designate it to some graphic \
1162 register. Then repeat the loop to actually produce the \
1164 dst = encode_invocation_designation (charset, coding, dst); \
1167 /* Produce designation and invocation codes at a place pointed by DST
1168 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1172 encode_invocation_designation (charset
, coding
, dst
)
1174 struct coding_system
*coding
;
1177 int reg
; /* graphic register number */
1179 /* At first, check designations. */
1180 for (reg
= 0; reg
< 4; reg
++)
1181 if (charset
== CODING_SPEC_ISO_DESIGNATION (coding
, reg
))
1186 /* CHARSET is not yet designated to any graphic registers. */
1187 /* At first check the requested designation. */
1188 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1190 /* Since CHARSET requests no special designation, designate to
1191 graphic register 0. */
1194 ENCODE_DESIGNATION (charset
, reg
, coding
);
1197 if (CODING_SPEC_ISO_INVOCATION (coding
, 0) != reg
1198 && CODING_SPEC_ISO_INVOCATION (coding
, 1) != reg
)
1200 /* Since the graphic register REG is not invoked to any graphic
1201 planes, invoke it to graphic plane 0. */
1204 case 0: /* graphic register 0 */
1208 case 1: /* graphic register 1 */
1212 case 2: /* graphic register 2 */
1213 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1214 ENCODE_SINGLE_SHIFT_2
;
1216 ENCODE_LOCKING_SHIFT_2
;
1219 case 3: /* graphic register 3 */
1220 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1221 ENCODE_SINGLE_SHIFT_3
;
1223 ENCODE_LOCKING_SHIFT_3
;
1230 /* The following three macros produce codes for indicating composition. */
1231 #define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1232 #define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1233 #define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1235 /* The following three macros produce codes for indicating direction
1237 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1239 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) /* bit test: other flag bits may be set as well */ \
1240 *dst++ = ISO_CODE_ESC, *dst++ = '['; /* 7-bit form of CSI */ \
1242 *dst++ = ISO_CODE_CSI; /* 8-bit form of CSI */ \
1245 #define ENCODE_DIRECTION_R2L \
1246 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1248 #define ENCODE_DIRECTION_L2R \
1249 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1251 /* Produce codes for designation and invocation to reset the graphic
1252 planes and registers to initial state. */
1253 #define ENCODE_RESET_PLANE_AND_REGISTER \
1256 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1258 for (reg = 0; reg < 4; reg++) \
1259 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1260 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1261 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1262 ENCODE_DESIGNATION \
1263 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1267 encode_designation_at_bol (coding
, src
, src_end
, dstp
)
1268 struct coding_system
*coding
;
1269 unsigned char *src
, *src_end
, **dstp
;
1271 int charset
, reg
, r
[4];
1272 unsigned char *dst
= *dstp
, c
;
1273 for (reg
= 0; reg
< 4; reg
++) r
[reg
] = -1;
1274 while (src
< src_end
&& (c
= *src
++) != '\n')
1276 switch (emacs_code_class
[c
])
1278 case EMACS_ascii_code
:
1279 charset
= CHARSET_ASCII
;
1281 case EMACS_leading_code_2
:
1282 if (++src
>= src_end
) continue;
1285 case EMACS_leading_code_3
:
1286 if ((src
+= 2) >= src_end
) continue;
1287 charset
= (c
< LEADING_CODE_PRIVATE_11
? c
: *(src
- 2));
1289 case EMACS_leading_code_4
:
1290 if ((src
+= 3) >= src_end
) continue;
1291 charset
= *(src
- 3);
1296 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1298 && CODING_SPEC_ISO_DESIGNATION (coding
, reg
) != charset
)
1301 if (c
!= '\n' && !coding
->last_block
)
1303 for (reg
= 0; reg
< 4; reg
++)
1305 ENCODE_DESIGNATION (r
[reg
], reg
, coding
);
1310 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1313 encode_coding_iso2022 (coding
, source
, destination
,
1314 src_bytes
, dst_bytes
, consumed
)
1315 struct coding_system
*coding
;
1316 unsigned char *source
, *destination
;
1317 int src_bytes
, dst_bytes
;
1320 unsigned char *src
= source
;
1321 unsigned char *src_end
= source
+ src_bytes
;
1322 unsigned char *dst
= destination
;
1323 unsigned char *dst_end
= destination
+ dst_bytes
;
1324 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1325 from DST_END to assure overflow checking is necessary only at the
1327 unsigned char *adjusted_dst_end
= dst_end
- 19;
1329 while (src
< src_end
&& dst
< adjusted_dst_end
)
1331 /* SRC_BASE remembers the start position in source in each loop.
1332 The loop will be exited when there's not enough source text
1333 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1334 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1335 reset to SRC_BASE before exiting. */
1336 unsigned char *src_base
= src
;
1337 unsigned char c1
, c2
, c3
, c4
;
1340 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
1341 && CODING_SPEC_ISO_BOL (coding
))
1343 /* We have to produce designation sequences now. */
1344 if (encode_designation_at_bol (coding
, src
, src_end
, &dst
) < 0)
1345 /* We can't find end of line in the current block. Let's
1346 repeat encoding starting from the current position
1349 CODING_SPEC_ISO_BOL (coding
) = 0;
1353 /* If we are seeing a component of a composite character, we are
1354 seeing a leading-code specially encoded for composition, or a
1355 composition rule if composing with rule. We must set C1
1356 to a normal leading-code or an ASCII code. If we are not at
1357 a composed character, we must reset the composition state. */
1358 if (COMPOSING_P (coding
->composing
))
1362 /* We are not in a composite character any longer. */
1363 coding
->composing
= COMPOSING_NO
;
1364 ENCODE_COMPOSITION_END
;
1368 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1371 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1374 else if (coding
->composing
== COMPOSING_WITH_RULE_HEAD
)
1375 coding
->composing
= COMPOSING_WITH_RULE_RULE
;
1378 /* This is an ASCII component. */
1383 /* This is a leading-code of non ASCII component. */
1388 /* Now encode one character. C1 is a control character, an
1389 ASCII character, or a leading-code of multi-byte character. */
1390 switch (emacs_code_class
[c1
])
1392 case EMACS_ascii_code
:
1393 ENCODE_ISO_CHARACTER_DIMENSION1 (CHARSET_ASCII
, c1
);
1396 case EMACS_control_code
:
1397 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1398 ENCODE_RESET_PLANE_AND_REGISTER
;
1402 case EMACS_carriage_return_code
:
1403 if (!coding
->selective
)
1405 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1406 ENCODE_RESET_PLANE_AND_REGISTER
;
1410 /* fall through to treat '\r' as '\n' ... */
1412 case EMACS_linefeed_code
:
1413 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_EOL
)
1414 ENCODE_RESET_PLANE_AND_REGISTER
;
1415 if (coding
->flags
& CODING_FLAG_ISO_INIT_AT_BOL
)
1416 bcopy (coding
->spec
.iso2022
.initial_designation
,
1417 coding
->spec
.iso2022
.current_designation
,
1418 sizeof coding
->spec
.iso2022
.initial_designation
);
1419 if (coding
->eol_type
== CODING_EOL_LF
1420 || coding
->eol_type
== CODING_EOL_AUTOMATIC
)
1421 *dst
++ = ISO_CODE_LF
;
1422 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1423 *dst
++ = ISO_CODE_CR
, *dst
++ = ISO_CODE_LF
;
1425 *dst
++ = ISO_CODE_CR
;
1426 CODING_SPEC_ISO_BOL (coding
) = 1;
1429 case EMACS_leading_code_2
:
1431 ENCODE_ISO_CHARACTER_DIMENSION1 (c1
, c2
);
1434 case EMACS_leading_code_3
:
1435 TWO_MORE_BYTES (c2
, c3
);
1436 if (c1
< LEADING_CODE_PRIVATE_11
)
1437 ENCODE_ISO_CHARACTER_DIMENSION2 (c1
, c2
, c3
);
1439 ENCODE_ISO_CHARACTER_DIMENSION1 (c2
, c3
);
1442 case EMACS_leading_code_4
:
1443 THREE_MORE_BYTES (c2
, c3
, c4
);
1444 ENCODE_ISO_CHARACTER_DIMENSION2 (c2
, c3
, c4
);
1447 case EMACS_leading_code_composition
:
1451 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1452 ENCODE_COMPOSITION_WITH_RULE_START
;
1456 /* Rewind one byte because it is a character code of
1457 composition elements. */
1459 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
1460 ENCODE_COMPOSITION_NO_RULE_START
;
1464 case EMACS_invalid_code
:
1470 coding
->carryover_size
= src
- src_base
;
1471 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1476 /* If this is the last block of the text to be encoded, we must
1477 reset the state of graphic planes and registers to initial one.
1478 In addition, we had better just flush out all remaining codes in
1479 the text although they are not valid characters. */
1480 if (coding
->last_block
)
1482 ENCODE_RESET_PLANE_AND_REGISTER
;
1483 bcopy(src
, dst
, src_end
- src
);
1484 dst
+= (src_end
- src
);
1487 *consumed
= src
- source
;
1488 return dst
- destination
;
1492 /*** 4. SJIS and BIG5 handlers ***/
1494 /* Although SJIS and BIG5 are not ISO coding systems, they are used
1495 quite widely. So, for the moment, Emacs supports them in the bare
1496 C code. But, in the future, they may be supported only by CCL. */
1498 /* SJIS is a coding system encoding three character sets: ASCII, right
1499 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1500 as is. A character of charset katakana-jisx0201 is encoded by
1501 "position-code + 0x80". A character of charset japanese-jisx0208
1502 is encoded in 2-byte but two position-codes are divided and shifted
1503 so that they fit in the range below.
1505 --- CODE RANGE of SJIS ---
1506 (character set) (range)
1508 KATAKANA-JISX0201 0xA0 .. 0xDF
1509 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1510 (2nd byte) 0x40 .. 0xFF
1511 -------------------------------
1515 /* BIG5 is a coding system encoding two character sets: ASCII and
1516 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1517 character set and is encoded in two-byte.
1519 --- CODE RANGE of BIG5 ---
1520 (character set) (range)
1522 Big5 (1st byte) 0xA1 .. 0xFE
1523 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1524 --------------------------
1526 Since the number of characters in Big5 is larger than maximum
1527 characters in Emacs' charset (96x96), it can't be handled as one
1528 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1529 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1530 contains frequently used characters and the latter contains less
1531 frequently used characters. */
1533 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1534 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1535 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
1536 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1538 /* Number of Big5 characters which have the same code in 1st byte. */
1539 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1541 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
1544 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1546 charset = charset_big5_1; \
1549 charset = charset_big5_2; \
1550 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1552 c1 = temp / (0xFF - 0xA1) + 0x21; \
1553 c2 = temp % (0xFF - 0xA1) + 0x21; \
1556 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1558 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1559 if (charset == charset_big5_2) \
1560 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1561 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1562 b2 = temp % BIG5_SAME_ROW; \
1563 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1566 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1567 Check if a text is encoded in SJIS. If it is, return
1568 CODING_CATEGORY_MASK_SJIS, else return 0. */
1571 detect_coding_sjis (src
, src_end
)
1572 unsigned char *src
, *src_end
;
1576 while (src
< src_end
)
1579 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
1581 if ((c
>= 0x80 && c
< 0xA0) || c
>= 0xE0)
1583 if (src
< src_end
&& *src
++ < 0x40)
1587 return CODING_CATEGORY_MASK_SJIS
;
1590 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1591 Check if a text is encoded in BIG5. If it is, return
1592 CODING_CATEGORY_MASK_BIG5, else return 0. */
1595 detect_coding_big5 (src
, src_end
)
1596 unsigned char *src
, *src_end
;
1600 while (src
< src_end
)
1603 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
1610 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
1614 return CODING_CATEGORY_MASK_BIG5
;
1617 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1618 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
1621 decode_coding_sjis_big5 (coding
, source
, destination
,
1622 src_bytes
, dst_bytes
, consumed
, sjis_p
)
1623 struct coding_system
*coding
;
1624 unsigned char *source
, *destination
;
1625 int src_bytes
, dst_bytes
;
1629 unsigned char *src
= source
;
1630 unsigned char *src_end
= source
+ src_bytes
;
1631 unsigned char *dst
= destination
;
1632 unsigned char *dst_end
= destination
+ dst_bytes
;
1633 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1634 from DST_END to assure overflow checking is necessary only at the
1636 unsigned char *adjusted_dst_end
= dst_end
- 3;
1638 while (src
< src_end
&& dst
< adjusted_dst_end
)
1640 /* SRC_BASE remembers the start position in source in each loop.
1641 The loop will be exited when there's not enough source text
1642 to analyze two-byte character (within macro ONE_MORE_BYTE).
1643 In that case, SRC is reset to SRC_BASE before exiting. */
1644 unsigned char *src_base
= src
;
1645 unsigned char c1
= *src
++, c2
, c3
, c4
;
1649 if (coding
->eol_type
== CODING_EOL_CRLF
)
1655 /* To process C2 again, SRC is subtracted by 1. */
1663 else if (c1
< 0xA0 || c1
>= 0xE0)
1665 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1669 DECODE_SJIS (c1
, c2
, c3
, c4
);
1670 DECODE_CHARACTER_DIMENSION2 (charset_jisx0208
, c3
, c4
);
1672 else if (c1
>= 0xE0 && c1
< 0xFF)
1677 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
1678 DECODE_CHARACTER_DIMENSION2 (charset
, c3
, c4
);
1680 else /* Invalid code */
1685 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1687 DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201
, c1
);
1693 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
1694 DECODE_CHARACTER_DIMENSION2 (charset
, c3
, c4
);
1700 coding
->carryover_size
= src
- src_base
;
1701 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1706 *consumed
= src
- source
;
1707 return dst
- destination
;
1710 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1711 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1712 `charset_jisx0208', `charset_big5_1', and `charset_big5_2'. We are
1713 sure that all these charsets are registered as official charset
1714 (i.e. do not have extended leading-codes). Characters of other
1715 charsets are produced without any encoding. If SJIS_P is 1, encode
1716 SJIS text, else encode BIG5 text. */
1719 encode_coding_sjis_big5 (coding
, source
, destination
,
1720 src_bytes
, dst_bytes
, consumed
, sjis_p
)
1721 struct coding_system
*coding
;
1722 unsigned char *source
, *destination
;
1723 int src_bytes
, dst_bytes
;
1727 unsigned char *src
= source
;
1728 unsigned char *src_end
= source
+ src_bytes
;
1729 unsigned char *dst
= destination
;
1730 unsigned char *dst_end
= destination
+ dst_bytes
;
1731 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1732 from DST_END to assure overflow checking is necessary only at the
1734 unsigned char *adjusted_dst_end
= dst_end
- 1;
1736 while (src
< src_end
&& dst
< adjusted_dst_end
)
1738 /* SRC_BASE remembers the start position in source in each loop.
1739 The loop will be exited when there's not enough source text
1740 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1741 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1743 unsigned char *src_base
= src
;
1744 unsigned char c1
= *src
++, c2
, c3
, c4
;
1746 if (coding
->composing
)
1753 else if (c1
>= 0xA0)
1756 coding
->composing
= 0;
1759 switch (emacs_code_class
[c1
])
1761 case EMACS_ascii_code
:
1762 case EMACS_control_code
:
1766 case EMACS_carriage_return_code
:
1767 if (!coding
->selective
)
1772 /* fall through to treat '\r' as '\n' ... */
1774 case EMACS_linefeed_code
:
1775 if (coding
->eol_type
== CODING_EOL_LF
1776 || coding
->eol_type
== CODING_EOL_AUTOMATIC
)
1778 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1779 *dst
++ = '\r', *dst
++ = '\n';
1784 case EMACS_leading_code_2
:
1786 if (sjis_p
&& c1
== charset_katakana_jisx0201
)
1789 *dst
++ = c1
, *dst
++ = c2
;
1792 case EMACS_leading_code_3
:
1793 TWO_MORE_BYTES (c2
, c3
);
1794 c2
&= 0x7F, c3
&= 0x7F;
1795 if (sjis_p
&& c1
== charset_jisx0208
)
1797 unsigned char s1
, s2
;
1799 ENCODE_SJIS (c2
, c3
, s1
, s2
);
1800 *dst
++ = s1
, *dst
++ = s2
;
1802 else if (!sjis_p
&& (c1
== charset_big5_1
|| c1
== charset_big5_2
))
1804 unsigned char b1
, b2
;
1806 ENCODE_BIG5 (c1
, c2
, c3
, b1
, b2
);
1807 *dst
++ = b1
, *dst
++ = b2
;
1810 *dst
++ = c1
, *dst
++ = c2
, *dst
++ = c3
;
1813 case EMACS_leading_code_4
:
1814 THREE_MORE_BYTES (c2
, c3
, c4
);
1815 *dst
++ = c1
, *dst
++ = c2
, *dst
++ = c3
, *dst
++ = c4
;
1818 case EMACS_leading_code_composition
:
1819 coding
->composing
= 1;
1822 default: /* i.e. case EMACS_invalid_code: */
1828 coding
->carryover_size
= src
- src_base
;
1829 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1834 *consumed
= src
- source
;
1835 return dst
- destination
;
1839 /*** 5. End-of-line handlers ***/
1841 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1842 This function is called only when `coding->eol_type' is
1843 CODING_EOL_CRLF or CODING_EOL_CR. */
1845 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
1846 struct coding_system
*coding
;
1847 unsigned char *source
, *destination
;
1848 int src_bytes
, dst_bytes
;
1851 unsigned char *src
= source
;
1852 unsigned char *src_end
= source
+ src_bytes
;
1853 unsigned char *dst
= destination
;
1854 unsigned char *dst_end
= destination
+ dst_bytes
;
1857 switch (coding
->eol_type
)
1859 case CODING_EOL_CRLF
:
1861 /* Since the maximum bytes produced by each loop is 2, we
1862 subtract 1 from DST_END to assure overflow checking is
1863 necessary only at the head of loop. */
1864 unsigned char *adjusted_dst_end
= dst_end
- 1;
1866 while (src
< src_end
&& dst
< adjusted_dst_end
)
1868 unsigned char *src_base
= src
;
1869 unsigned char c
= *src
++;
1882 coding
->carryover_size
= src
- src_base
;
1883 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1887 *consumed
= src
- source
;
1888 produced
= dst
- destination
;
1893 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
1894 bcopy (source
, destination
, produced
);
1895 dst_end
= destination
+ produced
;
1896 while (dst
< dst_end
)
1897 if (*dst
++ == '\r') dst
[-1] = '\n';
1898 *consumed
= produced
;
1901 default: /* i.e. case: CODING_EOL_LF */
1902 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
1903 bcopy (source
, destination
, produced
);
1904 *consumed
= produced
;
1911 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". Encode
1912 format of end-of-line according to `coding->eol_type'. If
1913 `coding->selective' is 1, code '\r' in source text also means
1916 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
1917 struct coding_system
*coding
;
1918 unsigned char *source
, *destination
;
1919 int src_bytes
, dst_bytes
;
1922 unsigned char *src
= source
;
1923 unsigned char *dst
= destination
;
1929 switch (coding
->eol_type
)
1932 case CODING_EOL_AUTOMATIC
:
1933 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
1934 bcopy (source
, destination
, produced
);
1935 if (coding
->selective
)
1939 if (*dst
++ == '\r') dst
[-1] = '\n';
1941 *consumed
= produced
;
1943 case CODING_EOL_CRLF
:
1946 unsigned char *src_end
= source
+ src_bytes
;
1947 unsigned char *dst_end
= destination
+ dst_bytes
;
1948 /* Since the maximum bytes produced by each loop is 2, we
1949 subtract 1 from DST_END to assure overflow checking is
1950 necessary only at the head of loop. */
1951 unsigned char *adjusted_dst_end
= dst_end
- 1;
1953 while (src
< src_end
&& dst
< adjusted_dst_end
)
1956 if (c
== '\n' || (c
== '\r' && coding
->selective
))
1957 *dst
++ = '\r', *dst
++ = '\n';
1961 produced
= dst
- destination
;
1962 *consumed
= src
- source
;
1966 default: /* i.e. case CODING_EOL_CR: */
1967 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
1968 bcopy (source
, destination
, produced
);
1972 if (*dst
++ == '\n') dst
[-1] = '\r';
1974 *consumed
= produced
;
1981 /*** 6. C library functions ***/
1983 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
1984 has a property `coding-system'. The value of this property is a
1985 vector of length 5 (called as coding-vector). Among elements of
1986 this vector, the first (element[0]) and the fifth (element[4])
1987 carry important information for decoding/encoding. Before
1988 decoding/encoding, this information should be set in fields of a
1989 structure of type `coding_system'.
1991 A value of property `coding-system' can be a symbol of another
1992 subsidiary coding-system. In that case, Emacs gets coding-vector
1995 `element[0]' contains information to be set in `coding->type'. The
1996 value and its meaning is as follows:
1998 0 -- coding_type_internal
1999 1 -- coding_type_sjis
2000 2 -- coding_type_iso2022
2001 3 -- coding_type_big5
2002 4 -- coding_type_ccl
2003 nil -- coding_type_no_conversion
2004 t -- coding_type_automatic
2006 `element[4]' contains information to be set in `coding->flags' and
2007 `coding->spec'. The meaning varies by `coding->type'.
2009 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2010 of length 32 (of which the first 15 sub-elements are used now).
2011 Meanings of these sub-elements are:
2013 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2014 If the value is an integer of valid charset, the charset is
2015 assumed to be designated to graphic register N initially.
2017 If the value is minus, it is a minus value of charset which
2018 reserves graphic register N, which means that the charset is
2019 not designated initially but should be designated to graphic
2020 register N just before encoding a character in that charset.
2022 If the value is nil, graphic register N is never used on
2025 sub-element[N] where N is 4 through 14: to be set in `coding->flags'
2026 Each value takes t or nil. See the section ISO2022 of
2027 `coding.h' for more information.
2029 If `coding->type' is `coding_type_big5', element[4] is t to denote
2030 BIG5-ETen or nil to denote BIG5-HKU.
2032 If `coding->type' takes the other value, element[4] is ignored.
2034 Emacs Lisp's coding system also carries information about format of
2035 end-of-line in a value of property `eol-type'. If the value is
2036 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2037 means CODING_EOL_CR. If it is not integer, it should be a vector
2038 of subsidiary coding systems of which property `eol-type' has one
2043 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2044 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2045 is set up so that no conversion is necessary and return -1, else
2049 setup_coding_system (coding_system
, coding
)
2050 Lisp_Object coding_system
;
2051 struct coding_system
*coding
;
2053 Lisp_Object type
, eol_type
;
2055 /* At first, set several fields default values. */
2056 coding
->require_flushing
= 0;
2057 coding
->last_block
= 0;
2058 coding
->selective
= 0;
2059 coding
->composing
= 0;
2060 coding
->direction
= 0;
2061 coding
->carryover_size
= 0;
2062 coding
->post_read_conversion
= coding
->pre_write_conversion
= Qnil
;
2064 Vlast_coding_system_used
= coding
->symbol
= coding_system
;
2066 /* Get value of property `coding-system' until we get a vector.
2067 While doing that, also get values of properties
2068 `post-read-conversion', `pre-write-conversion', and `eol-type'. */
2069 while (!NILP (coding_system
) && SYMBOLP (coding_system
))
2071 if (NILP (coding
->post_read_conversion
))
2072 coding
->post_read_conversion
= Fget (coding_system
,
2073 Qpost_read_conversion
);
2074 if (NILP (coding
->pre_write_conversion
))
2075 coding
->pre_write_conversion
= Fget (coding_system
,
2076 Qpre_write_conversion
);
2077 if (NILP (eol_type
))
2078 eol_type
= Fget (coding_system
, Qeol_type
);
2079 coding_system
= Fget (coding_system
, Qcoding_system
);
2081 if (!VECTORP (coding_system
)
2082 || XVECTOR (coding_system
)->size
!= 5)
2083 goto label_invalid_coding_system
;
2085 if (VECTORP (eol_type
))
2086 coding
->eol_type
= CODING_EOL_AUTOMATIC
;
2087 else if (XFASTINT (eol_type
) == 1)
2088 coding
->eol_type
= CODING_EOL_CRLF
;
2089 else if (XFASTINT (eol_type
) == 2)
2090 coding
->eol_type
= CODING_EOL_CR
;
2092 coding
->eol_type
= CODING_EOL_LF
;
2094 type
= XVECTOR (coding_system
)->contents
[0];
2095 switch (XFASTINT (type
))
2098 coding
->type
= coding_type_internal
;
2102 coding
->type
= coding_type_sjis
;
2106 coding
->type
= coding_type_iso2022
;
2108 Lisp_Object val
= XVECTOR (coding_system
)->contents
[4];
2110 int i
, charset
, default_reg_bits
= 0;
2112 if (!VECTORP (val
) || XVECTOR (val
)->size
!= 32)
2113 goto label_invalid_coding_system
;
2115 flags
= XVECTOR (val
)->contents
;
2117 = ((NILP (flags
[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM
)
2118 | (NILP (flags
[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL
)
2119 | (NILP (flags
[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL
)
2120 | (NILP (flags
[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS
)
2121 | (NILP (flags
[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT
)
2122 | (NILP (flags
[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT
)
2123 | (NILP (flags
[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN
)
2124 | (NILP (flags
[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS
)
2125 | (NILP (flags
[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION
)
2126 | (NILP (flags
[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL
)
2127 | (NILP (flags
[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL
));
2129 /* Invoke graphic register 0 to plane 0. */
2130 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
2131 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2132 CODING_SPEC_ISO_INVOCATION (coding
, 1)
2133 = (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
? -1 : 1);
2134 /* Not single shifting at first. */
2135 CODING_SPEC_ISO_SINGLE_SHIFTING(coding
) = 0;
2136 /* Beginning of buffer should also be regarded as bol. */
2137 CODING_SPEC_ISO_BOL(coding
) = 1;
2139 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2140 FLAGS[REG] can be one of below:
2141 integer CHARSET: CHARSET occupies register I,
2142 t: designate nothing to REG initially, but can be used
2144 list of integer, nil, or t: designate the first
2145 element (if integer) to REG initially, the remaining
2146 elements (if integer) is designated to REG on request,
2147 if an element is t, REG can be used by any charset,
2148 nil: REG is never used. */
2149 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2150 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) = -1;
2151 for (i
= 0; i
< 4; i
++)
2153 if (INTEGERP (flags
[i
])
2154 && (charset
= XINT (flags
[i
]), CHARSET_VALID_P (charset
))
2155 || (charset
= get_charset_id (flags
[i
])) >= 0)
2157 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2158 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) = i
;
2160 else if (EQ (flags
[i
], Qt
))
2162 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2163 default_reg_bits
|= 1 << i
;
2165 else if (CONSP (flags
[i
]))
2167 Lisp_Object tail
= flags
[i
];
2169 if (INTEGERP (XCONS (tail
)->car
)
2170 && (charset
= XINT (XCONS (tail
)->car
),
2171 CHARSET_VALID_P (charset
))
2172 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2174 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2175 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) =i
;
2178 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2179 tail
= XCONS (tail
)->cdr
;
2180 while (CONSP (tail
))
2182 if (INTEGERP (XCONS (tail
)->car
)
2183 && (charset
= XINT (XCONS (tail
)->car
),
2184 CHARSET_VALID_P (charset
))
2185 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2186 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2188 else if (EQ (XCONS (tail
)->car
, Qt
))
2189 default_reg_bits
|= 1 << i
;
2190 tail
= XCONS (tail
)->cdr
;
2194 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2196 CODING_SPEC_ISO_DESIGNATION (coding
, i
)
2197 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
);
2200 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
2202 /* REG 1 can be used only by locking shift in 7-bit env. */
2203 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
2204 default_reg_bits
&= ~2;
2205 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
2206 /* Without any shifting, only REG 0 and 1 can be used. */
2207 default_reg_bits
&= 3;
2210 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2211 if (CHARSET_VALID_P (charset
)
2212 && CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) < 0)
2214 /* We have not yet decided where to designate CHARSET. */
2215 int reg_bits
= default_reg_bits
;
2217 if (CHARSET_CHARS (charset
) == 96)
2218 /* A charset of CHARS96 can't be designated to REG 0. */
2222 /* There exist some default graphic register. */
2223 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2225 ? 0 : (reg_bits
& 2 ? 1 : (reg_bits
& 4 ? 2 : 3)));
2227 /* We anyway have to designate CHARSET to somewhere. */
2228 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2229 = (CHARSET_CHARS (charset
) == 94
2231 : ((coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
2232 || ! coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
2234 : (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
2238 coding
->require_flushing
= 1;
2242 coding
->type
= coding_type_big5
;
2244 = (NILP (XVECTOR (coding_system
)->contents
[4])
2245 ? CODING_FLAG_BIG5_HKU
2246 : CODING_FLAG_BIG5_ETEN
);
2250 coding
->type
= coding_type_ccl
;
2252 Lisp_Object val
= XVECTOR (coding_system
)->contents
[4];
2254 && VECTORP (XCONS (val
)->car
)
2255 && VECTORP (XCONS (val
)->cdr
))
2257 setup_ccl_program (&(coding
->spec
.ccl
.decoder
), XCONS (val
)->car
);
2258 setup_ccl_program (&(coding
->spec
.ccl
.encoder
), XCONS (val
)->cdr
);
2261 goto label_invalid_coding_system
;
2263 coding
->require_flushing
= 1;
2268 coding
->type
= coding_type_automatic
;
2270 coding
->type
= coding_type_no_conversion
;
2275 label_invalid_coding_system
:
2276 coding
->type
= coding_type_no_conversion
;
2277 coding
->symbol
= coding
->pre_write_conversion
= coding
->post_read_conversion
2282 /* Emacs has a mechanism to automatically detect a coding system if it
2283 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2284 it's impossible to distinguish some coding systems accurately
2285 because they use the same range of codes. So, at first, coding
2286 systems are categorized into the following 8 categories:
2288 o coding-category-internal
2290 The category for a coding system which has the same code range
2291 as Emacs' internal format. Assigned the coding-system (Lisp
2292 symbol) `internal' by default.
2294 o coding-category-sjis
2296 The category for a coding system which has the same code range
2297 as SJIS. Assigned the coding-system (Lisp
2298 symbol) `shift-jis' by default.
2300 o coding-category-iso-7
2302 The category for a coding system which has the same code range
2303 as ISO2022 of 7-bit environment. Assigned the coding-system
2304 (Lisp symbol) `iso-2022-7' by default.
2306 o coding-category-iso-8-1
2308 The category for a coding system which has the same code range
2309 as ISO2022 of 8-bit environment and graphic plane 1 used only
2310 for DIMENSION1 charset. Assigned the coding-system (Lisp
2311 symbol) `iso-8859-1' by default.
2313 o coding-category-iso-8-2
2315 The category for a coding system which has the same code range
2316 as ISO2022 of 8-bit environment and graphic plane 1 used only
2317 for DIMENSION2 charset. Assigned the coding-system (Lisp
2318 symbol) `euc-japan' by default.
2320 o coding-category-iso-else
2322 The category for a coding system which has the same code range
2323 as ISO2022 but does not belong to any of the above three
2324 categories. Assigned the coding-system (Lisp symbol)
2325 `iso-2022-ss2-7' by default.
2327 o coding-category-big5
2329 The category for a coding system which has the same code range
2330 as BIG5. Assigned the coding-system (Lisp symbol)
2331 `cn-big5' by default.
2333 o coding-category-binary
2335 The category for a coding system not categorized in any of the
2336 above. Assigned the coding-system (Lisp symbol)
2337 `no-conversion' by default.
2339 Each of them is a Lisp symbol whose value is an actual
2340 `coding-system' (also a Lisp symbol) assigned by a user.
2341 What Emacs does actually is to detect a category of coding system.
2342 Then, it uses a `coding-system' assigned to it. If Emacs can't
2343 narrow the candidates down to a single category, it selects the category of the
2344 highest priority. Priorities of categories are also specified by a
2345 user in a Lisp variable `coding-category-list'.
2349 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2350 If it detects possible coding systems, return an integer in which
2351 appropriate flag bits are set. Flag bits are defined by macros
2352 CODING_CATEGORY_MASK_XXX in `coding.h'. */
/* NOTE(review): this listing is a lossy extract -- the leading numbers
   are original file line numbers, and lines such as braces, `else',
   the declaration of MASK and several tests are absent, so the branch
   structure below is partly implied.
   Visible behavior: the first loop is bounded by SRC_END and stops on
   ISO_CODE_ESC, ISO_CODE_SI or ISO_CODE_SO (the test that precedes the
   `||' is not visible).  CODING_CATEGORY_MASK_ANY is returned on the
   "nothing other than ASCII" path.  Otherwise MASK is the bitwise OR of
   the detect_coding_iso2022 / detect_coding_sjis /
   detect_coding_internal / detect_coding_big5 results selected by the
   class of byte C; each detector is handed the range SRC..SRC_END.  */
2355 detect_coding_mask (src
, src_bytes
)
2359 register unsigned char c
;
2360 unsigned char *src_end
= src
+ src_bytes
;
2363 /* At first, skip all ASCII characters and control characters except
2364 for three ISO2022 specific control characters. */
2365 while (src
< src_end
)
2369 || (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
2375 /* We found nothing other than ASCII. There's nothing to do. */
2376 return CODING_CATEGORY_MASK_ANY
;
2378 /* The text seems to be encoded in some multilingual coding system.
2379 Now, try to find in which coding system the text is encoded. */
2381 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2382 /* C is an ISO2022 specific control code of C0. */
2383 mask
= detect_coding_iso2022 (src
, src_end
);
2385 else if (c
== ISO_CODE_SS2
|| c
== ISO_CODE_SS3
|| c
== ISO_CODE_CSI
)
2386 /* C is an ISO2022 specific control code of C1,
2387 or the first byte of SJIS's 2-byte character code,
2388 or a leading code of Emacs. */
2389 mask
= (detect_coding_iso2022 (src
, src_end
)
2390 | detect_coding_sjis (src
, src_end
)
2391 | detect_coding_internal (src
, src_end
));
2394 /* C is the first byte of SJIS character code,
2395 or a leading-code of Emacs. */
2396 mask
= (detect_coding_sjis (src
, src_end
)
2397 | detect_coding_internal (src
, src_end
));
2400 /* C is a character of ISO2022 in graphic plane right,
2401 or a SJIS's 1-byte character code (i.e. JISX0201),
2402 or the first byte of BIG5's 2-byte code. */
2403 mask
= (detect_coding_iso2022 (src
, src_end
)
2404 | detect_coding_sjis (src
, src_end
)
2405 | detect_coding_big5 (src
, src_end
));
2410 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2411 The information of the detected coding system is set in CODING. */
/* NOTE(review): lossy extract -- braces, `else'/`break' lines, the
   loop header over VAL and the declaration of IDX are not visible, so
   branch boundaries are implied.
   Visible flow: obtain MASK from detect_coding_mask.  IDX is then
   chosen in one of three ways: CODING_CATEGORY_IDX_BINARY for the
   "unknown coding system" case; or by walking Vcoding_category_list and
   reading each symbol's `Qcoding_category_index' property until one is
   below CODING_CATEGORY_IDX_MAX with its bit set in MASK; or, as a
   fallback, by scanning IDX upward from 0 for a bit set in MASK.
   Finally setup_coding_system is called with the value cell of
   coding_category_table[IDX] and CODING.
   Assumes every element of Vcoding_category_list is a symbol carrying
   an integer `Qcoding_category_index' property (XFASTINT is applied
   without a visible type check) -- TODO confirm.  */
2414 detect_coding (coding
, src
, src_bytes
)
2415 struct coding_system
*coding
;
2419 int mask
= detect_coding_mask (src
, src_bytes
);
2422 if (mask
== CODING_CATEGORY_MASK_ANY
)
2423 /* We found nothing other than ASCII. There's nothing to do. */
2427 /* The source text seems to be encoded in unknown coding system.
2428 Emacs regards the category of such a kind of coding system as
2429 `coding-category-binary'. We assume that a user has assigned
2430 an appropriate coding system for a `coding-category-binary'. */
2431 idx
= CODING_CATEGORY_IDX_BINARY
;
2434 /* We found some plausible coding systems. Let's use a coding
2435 system of the highest priority. */
2436 Lisp_Object val
= Vcoding_category_list
;
2441 idx
= XFASTINT (Fget (XCONS (val
)->car
, Qcoding_category_index
));
2442 if ((idx
< CODING_CATEGORY_IDX_MAX
) && (mask
& (1 << idx
)))
2444 val
= XCONS (val
)->cdr
;
2451 /* For unknown reason, `Vcoding_category_list' contains none
2452 of found categories. Let's use any of them. */
2453 for (idx
= 0; idx
< CODING_CATEGORY_IDX_MAX
; idx
++)
2454 if (mask
& (1 << idx
))
2458 setup_coding_system (XSYMBOL (coding_category_table
[idx
])->value
, coding
);
2461 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2462 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2463 CODING_EOL_CR, and CODING_EOL_AUTOMATIC. */
/* NOTE(review): lossy extract -- the lines that fetch each byte and
   compare it with '\n' / '\r' are not visible.
   What is visible: the scan is bounded by SRC_END = SRC + SRC_BYTES;
   CODING_EOL_CRLF is returned only when another byte is available
   (SRC < SRC_END) and that byte is '\n', so the look-ahead never reads
   past the end; CODING_EOL_AUTOMATIC is the final return, presumably
   the "no end-of-line seen" result -- confirm against the full
   source.  */
2466 detect_eol_type (src
, src_bytes
)
2470 unsigned char *src_end
= src
+ src_bytes
;
2473 while (src
< src_end
)
2477 return CODING_EOL_LF
;
2480 if (src
< src_end
&& *src
== '\n')
2481 return CODING_EOL_CRLF
;
2483 return CODING_EOL_CR
;
2486 return CODING_EOL_AUTOMATIC
;
2489 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2490 is encoded. If it detects an appropriate format of end-of-line, it
2491 sets the information in *CODING. */
/* NOTE(review): lossy extract -- the declaration of VAL and the
   statement taken when EOL_TYPE is CODING_EOL_AUTOMATIC are not
   visible (presumably an early return -- confirm).
   Visible flow: classify the text with detect_eol_type, read the
   `Qeol_type' property of CODING->symbol, and when that property is a
   vector of size 3 call setup_coding_system on element EOL_TYPE of it,
   re-initializing CODING in place.
   Assumes EOL_TYPE is a valid index into that 3-element vector at this
   point -- TODO confirm against the CODING_EOL_* values.  */
2494 detect_eol (coding
, src
, src_bytes
)
2495 struct coding_system
*coding
;
2500 int eol_type
= detect_eol_type (src
, src_bytes
);
2502 if (eol_type
== CODING_EOL_AUTOMATIC
)
2503 /* We found no end-of-line in the source text. */
2506 val
= Fget (coding
->symbol
, Qeol_type
);
2507 if (VECTORP (val
) && XVECTOR (val
)->size
== 3)
2508 setup_coding_system (XVECTOR (val
)->contents
[eol_type
], coding
);
2511 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2512 decoding, it may detect coding system and format of end-of-line if
2513 those are not yet decided. */
/* NOTE(review): lossy extract -- `break' lines, any `default' case,
   the final return, the declaration of PRODUCED/CONSUMED and the last
   argument of both decode_coding_sjis_big5 calls are not visible.
   Visible flow:
   - detect_coding runs first when CODING->type is
     coding_type_automatic, and detect_eol when CODING->eol_type is
     CODING_EOL_AUTOMATIC; both may rewrite *CODING before the switch.
   - CODING->carryover_size is reset to 0.
   - The no-conversion path copies the smaller of SRC_BYTES and
     DST_BYTES from SOURCE to DESTINATION with bcopy and stores that
     count in *CONSUMED.
   - coding_type_internal and coding_type_automatic jump to that same
     path when eol_type is CODING_EOL_LF or CODING_EOL_AUTOMATIC, and
     call decode_eol otherwise.
   - sjis, iso2022, big5 and ccl types delegate to their decoders; the
     CCL case uses CODING->spec.ccl.decoder.  */
2516 decode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2517 struct coding_system
*coding
;
2518 unsigned char *source
, *destination
;
2519 int src_bytes
, dst_bytes
;
2530 if (coding
->type
== coding_type_automatic
)
2531 detect_coding (coding
, source
, src_bytes
);
2533 if (coding
->eol_type
== CODING_EOL_AUTOMATIC
)
2534 detect_eol (coding
, source
, src_bytes
);
2536 coding
->carryover_size
= 0;
2537 switch (coding
->type
)
2539 case coding_type_no_conversion
:
2540 label_no_conversion
:
2541 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2542 bcopy (source
, destination
, produced
);
2543 *consumed
= produced
;
2546 case coding_type_internal
:
2547 case coding_type_automatic
:
2548 if (coding
->eol_type
== CODING_EOL_LF
2549 || coding
->eol_type
== CODING_EOL_AUTOMATIC
)
2550 goto label_no_conversion
;
2551 produced
= decode_eol (coding
, source
, destination
,
2552 src_bytes
, dst_bytes
, consumed
);
2555 case coding_type_sjis
:
2556 produced
= decode_coding_sjis_big5 (coding
, source
, destination
,
2557 src_bytes
, dst_bytes
, consumed
,
2561 case coding_type_iso2022
:
2562 produced
= decode_coding_iso2022 (coding
, source
, destination
,
2563 src_bytes
, dst_bytes
, consumed
);
2566 case coding_type_big5
:
2567 produced
= decode_coding_sjis_big5 (coding
, source
, destination
,
2568 src_bytes
, dst_bytes
, consumed
,
2572 case coding_type_ccl
:
2573 produced
= ccl_driver (&coding
->spec
.ccl
.decoder
, source
, destination
,
2574 src_bytes
, dst_bytes
, consumed
);
2581 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
/* NOTE(review): lossy extract -- `break' lines, any `default' case,
   the final return, the loop header around the '\015' rewrite and the
   last argument of both encode_coding_sjis_big5 calls are not visible.
   Visible flow:
   - CODING->carryover_size is reset to 0, then the code dispatches on
     CODING->type.
   - The no-conversion path copies the smaller of SRC_BYTES and
     DST_BYTES with bcopy; when CODING->selective is set it then
     rewrites '\015' bytes to '\n' in DESTINATION (P and PEND delimit
     the copied bytes); the count is stored in *CONSUMED.
   - coding_type_internal and coding_type_automatic jump to that path
     when eol_type is CODING_EOL_LF or CODING_EOL_AUTOMATIC, and call
     encode_eol otherwise.
   - sjis, iso2022, big5 and ccl types delegate to their encoders; the
     CCL case uses CODING->spec.ccl.encoder.
   Unlike decode_coding, no detection step is visible here.  */
2584 encode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2585 struct coding_system
*coding
;
2586 unsigned char *source
, *destination
;
2587 int src_bytes
, dst_bytes
;
2592 coding
->carryover_size
= 0;
2593 switch (coding
->type
)
2595 case coding_type_no_conversion
:
2596 label_no_conversion
:
2597 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2600 bcopy (source
, destination
, produced
);
2601 if (coding
->selective
)
2603 unsigned char *p
= destination
, *pend
= destination
+ produced
;
2605 if (*p
++ == '\015') p
[-1] = '\n';
2608 *consumed
= produced
;
2611 case coding_type_internal
:
2612 case coding_type_automatic
:
2613 if (coding
->eol_type
== CODING_EOL_LF
2614 || coding
->eol_type
== CODING_EOL_AUTOMATIC
)
2615 goto label_no_conversion
;
2616 produced
= encode_eol (coding
, source
, destination
,
2617 src_bytes
, dst_bytes
, consumed
);
2620 case coding_type_sjis
:
2621 produced
= encode_coding_sjis_big5 (coding
, source
, destination
,
2622 src_bytes
, dst_bytes
, consumed
,
2626 case coding_type_iso2022
:
2627 produced
= encode_coding_iso2022 (coding
, source
, destination
,
2628 src_bytes
, dst_bytes
, consumed
);
2631 case coding_type_big5
:
2632 produced
= encode_coding_sjis_big5 (coding
, source
, destination
,
2633 src_bytes
, dst_bytes
, consumed
,
2637 case coding_type_ccl
:
2638 produced
= ccl_driver (&coding
->spec
.ccl
.encoder
, source
, destination
,
2639 src_bytes
, dst_bytes
, consumed
);
2646 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2648 /* Return maximum size (bytes) of a buffer enough for decoding
2649 SRC_BYTES of text encoded in CODING. */
/* NOTE(review): lossy extract -- the declaration of MAGNIFICATION and
   the values assigned for the iso2022 and fall-through cases are not
   visible; only the CCL case is, where MAGNIFICATION comes from
   CODING->spec.ccl.decoder.buf_magnification.
   The result is SRC_BYTES * MAGNIFICATION plus
   CONVERSION_BUFFER_EXTRA_ROOM (256).  No overflow check is visible --
   assumes callers pass sizes for which the product fits in the return
   type; TODO confirm.  */
2652 decoding_buffer_size (coding
, src_bytes
)
2653 struct coding_system
*coding
;
2658 if (coding
->type
== coding_type_iso2022
)
2660 else if (coding
->type
== coding_type_ccl
)
2661 magnification
= coding
->spec
.ccl
.decoder
.buf_magnification
;
2665 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
2668 /* Return maximum size (bytes) of a buffer enough for encoding
2669 SRC_BYTES of text to CODING. */
/* NOTE(review): lossy extract -- the declaration of MAGNIFICATION and
   the value used for non-CCL coding systems are not visible; for
   coding_type_ccl it comes from
   CODING->spec.ccl.encoder.buf_magnification.
   The result is SRC_BYTES * MAGNIFICATION plus
   CONVERSION_BUFFER_EXTRA_ROOM.  No overflow check is visible --
   assumes the product fits in the return type; TODO confirm.  */
2672 encoding_buffer_size (coding
, src_bytes
)
2673 struct coding_system
*coding
;
2678 if (coding
->type
== coding_type_ccl
)
2679 magnification
= coding
->spec
.ccl
.encoder
.buf_magnification
;
2683 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
2686 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2687 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2690 char *conversion_buffer
;
2691 int conversion_buffer_size
;
2693 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2694 or decoding. Sufficient memory is allocated automatically. If we
2695 run out of memory, return NULL. */
/* NOTE(review): lossy extract -- the closing `#endif', the return type,
   the declaration of BUF and all braces are not visible.
   Visible behavior: the shared buffer `conversion_buffer' is reused as
   long as SIZE does not exceed `conversion_buffer_size'.  Otherwise a
   new size is found by doubling the current size until it reaches
   SIZE, a fresh block is obtained with xmalloc, the old block is
   released with xfree, and both globals are updated.  The old contents
   are not copied, and every caller receives the same shared pointer.
   NOTE(review): no NULL-returning path is visible even though the
   header comment promises one; whether xmalloc can return NULL is not
   shown here -- confirm and reconcile the comment.
   Assumes `conversion_buffer_size' is positive before the first call
   (presumably initialized elsewhere from
   MINIMUM_CONVERSION_BUFFER_SIZE); with 0 the doubling loop would not
   terminate -- TODO confirm.  */
2698 get_conversion_buffer (size
)
2701 if (size
> conversion_buffer_size
)
2704 int real_size
= conversion_buffer_size
* 2;
2706 while (real_size
< size
) real_size
*= 2;
2707 buf
= (char *) xmalloc (real_size
);
2708 xfree (conversion_buffer
);
2709 conversion_buffer
= buf
;
2710 conversion_buffer_size
= real_size
;
2712 return conversion_buffer
;
2717 /*** 7. Emacs Lisp library functions ***/
/* NOTE(review): lossy extract -- the arity line of the DEFUN, the
   parameter declaration of OBJ and the two result operands of the
   final conditional are not visible.
   Visible behavior: while OBJ is a non-nil symbol it is replaced by its
   `Qcoding_system' property, so aliases are followed until a non-symbol
   or nil is reached.  The return expression then tests whether OBJ is
   nil, is not a vector, or is a vector whose size is not 5.
   NOTE(review): no guard against a cyclic property chain is visible;
   such a chain would keep this loop running -- confirm whether callers
   can create one.  */
2719 DEFUN ("coding-system-vector", Fcoding_system_vector
, Scoding_system_vector
,
2721 "Return coding-vector of CODING-SYSTEM.\n\
2722 If CODING-SYSTEM is not a valid coding-system, return nil.")
2726 while (SYMBOLP (obj
) && !NILP (obj
))
2727 obj
= Fget (obj
, Qcoding_system
);
2728 return ((NILP (obj
) || !VECTORP (obj
) || XVECTOR (obj
)->size
!= 5)
/* Predicate: returns Qt when OBJ is nil or when Fcoding_system_vector
   yields a non-nil value for it, and Qnil otherwise.
   NOTE(review): lossy extract -- the parameter declaration of OBJ and
   the braces are not visible.  */
2732 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
2733 "Return t if OBJECT is nil or a coding-system.\n\
2734 See document of make-coding-system for coding-system object.")
2738 return ((NILP (obj
) || !NILP (Fcoding_system_vector (obj
))) ? Qt
: Qnil
);
/* NOTE(review): lossy extract -- the parameter declaration of PROMPT,
   the declaration of VAL, the opening `do' and the remaining
   Fcompleting_read arguments are not visible.
   Visible behavior: reads a name with Fcompleting_read over Vobarray
   using `Qcoding_system_vector' as the predicate, repeats while the
   returned string is empty, and returns the interned symbol
   (Fintern with a nil obarray argument).  */
2741 DEFUN ("read-non-nil-coding-system",
2742 Fread_non_nil_coding_system
, Sread_non_nil_coding_system
, 1, 1, 0,
2743 "Read a coding system from the minibuffer, prompting with string PROMPT.")
2749 val
= Fcompleting_read (prompt
, Vobarray
, Qcoding_system_vector
,
2751 } while (XSTRING (val
)->size
== 0);
2752 return (Fintern (val
, Qnil
));
/* NOTE(review): lossy extract -- the parameter declaration of PROMPT
   and the remaining Fcompleting_read arguments are not visible.
   Visible behavior: reads a name with Fcompleting_read over Vobarray
   using `Qcoding_system_p' as the predicate; an empty answer yields
   Qnil, anything else is interned with Fintern and returned.  Unlike
   read-non-nil-coding-system, the prompt is not repeated.  */
2755 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 1, 0,
2756 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2760 Lisp_Object val
= Fcompleting_read (prompt
, Vobarray
, Qcoding_system_p
,
2762 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
/* NOTE(review): lossy extract -- the arity line of the DEFUN, the
   parameter list line and the braces are not visible.
   Visible behavior: CHECK_SYMBOL is applied to CODING_SYSTEM, the
   symbol itself is returned when Fcoding_system_p accepts it, and
   otherwise Fsignal is called with `Qcoding_system_error'.
   Because Fcoding_system_p accepts nil, nil passes this check.
   NOTE(review): Fsignal receives the bare symbol as its data argument;
   the usual Emacs convention is that signal data is a list -- confirm
   whether a one-element list was intended.  */
2765 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
2767 "Check validity of CODING-SYSTEM.\n\
2768 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2769 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2770 The value of property should be a vector of length 5.")
2772 Lisp_Object coding_system
;
2774 CHECK_SYMBOL (coding_system
, 0);
2775 if (!NILP (Fcoding_system_p (coding_system
)))
2776 return coding_system
;
2778 Fsignal (Qcoding_system_error
, coding_system
);
/* NOTE(review): lossy extract -- the arity line of the DEFUN, the
   parameter declarations, the declarations of BEG/END/VAL/VAL2/IDX,
   all braces and `else' lines, the terminators of two interior
   comments, and the final return are not visible.
   Visible flow:
   - validate_region normalizes B and E; when the region straddles GPT
     the gap is moved to END so that POS_ADDR (BEG) addresses END - BEG
     contiguous bytes.
   - detect_coding_mask and detect_eol_type are both run over that
     range.
   - When the mask is CODING_CATEGORY_MASK_ANY, VAL starts as the symbol
     `automatic-conversion' and, if an end-of-line format was detected,
     is replaced by element EOL_TYPE of that symbol's `Qeol_type'
     property.
   - Otherwise Vcoding_category_list is walked; for every category
     whose index bit is set in the mask, the category symbol's value is
     consed onto VAL.  A second pass builds the result list again,
     substituting element EOL_TYPE of each entry's `Qeol_type' vector
     when an end-of-line format was detected.
   The guards around the two XVECTOR (...)->contents[EOL_TYPE] accesses
   are not visible here -- TODO confirm they check VECTORP.  */
2781 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
2783 "Detect coding-system of the text in the region between START and END.\n\
2784 Return a list of possible coding-systems ordered by priority.\n\
2785 If only ASCII characters are found, it returns `automatic-conversion'\n\
2786 or its subsidiary coding-system according to a detected end-of-line format.")
2790 int coding_mask
, eol_type
;
2794 validate_region (&b
, &e
);
2795 beg
= XINT (b
), end
= XINT (e
);
2796 if (beg
< GPT
&& end
>= GPT
) move_gap (end
);
2798 coding_mask
= detect_coding_mask (POS_ADDR (beg
), end
- beg
);
2799 eol_type
= detect_eol_type (POS_ADDR (beg
), end
- beg
);
2801 if (coding_mask
== CODING_CATEGORY_MASK_ANY
)
2803 val
= intern ("automatic-conversion");
2804 if (eol_type
!= CODING_EOL_AUTOMATIC
)
2806 Lisp_Object val2
= Fget (val
, Qeol_type
);
2808 val
= XVECTOR (val2
)->contents
[eol_type
];
2815 /* At first, gather possible coding-systems in VAL in a reverse
2818 for (val2
= Vcoding_category_list
;
2820 val2
= XCONS (val2
)->cdr
)
2823 = XFASTINT (Fget (XCONS (val2
)->car
, Qcoding_category_index
));
2824 if (coding_mask
& (1 << idx
))
2825 val
= Fcons (Fsymbol_value (XCONS (val2
)->car
), val
);
2828 /* Then, change the order of the list, while getting subsidiary
2832 for (; !NILP (val2
); val2
= XCONS (val2
)->cdr
)
2834 if (eol_type
== CODING_EOL_AUTOMATIC
)
2835 val
= Fcons (XCONS (val2
)->car
, val
);
2838 Lisp_Object val3
= Fget (XCONS (val2
)->car
, Qeol_type
);
2840 val
= Fcons (XVECTOR (val3
)->contents
[eol_type
], val
);
2842 val
= Fcons (XCONS (val2
)->car
, val
);
2850 /* Scan text in the region between *BEGP and *ENDP, skip characters
2851 which we never have to encode to (iff ENCODEP is 1) or decode from
2852 coding system CODING at the head and tail, then set BEGP and ENDP
2853 to the addresses of start and end of the text we actually convert. */
/* NOTE(review): lossy extract -- braces, `break'/`return' lines, the
   `default' label of the encoding switch, the declaration of C, the
   label_skip_tail label itself and the final stores back through BEGP
   and ENDP are not visible, so the case boundaries below are implied.
   Visible rules:
   - If CODING->eol_type is neither CODING_EOL_LF nor
     CODING_EOL_AUTOMATIC nothing is skipped (end-of-line conversion
     has to touch the text anyway).
   - Encoding: no-conversion, internal and automatic types need no
     conversion; CCL can skip nothing; for iso2022 with
     CODING_FLAG_ISO_DESIGNATE_AT_BOL the head skip tracks a
     beginning-of-line position BOL while passing bytes below 0x80;
     otherwise bytes below 0x80 are skipped at both head and tail.
   - Decoding: no-conversion needs nothing; internal needs nothing with
     CODING_EOL_LF and otherwise skips everything except '\r' at both
     ends; sjis/big5 skip bytes below 0x80 at both ends and then adjust
     when END_ADDR moved (the adjustment statement is not visible; the
     adjacent comment says a trailing ASCII byte may be the second byte
     of a 2-byte code); CCL can skip nothing; the default (iso2022)
     case skips bytes below 0x80 at the head except ISO_CODE_CR,
     ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC.
   Every visible loop keeps BEG_ADDR < END_ADDR as its first
   condition.  */
2856 shrink_conversion_area (begp
, endp
, coding
, encodep
)
2857 unsigned char **begp
, **endp
;
2858 struct coding_system
*coding
;
2861 register unsigned char *beg_addr
= *begp
, *end_addr
= *endp
;
2863 if (coding
->eol_type
!= CODING_EOL_LF
2864 && coding
->eol_type
!= CODING_EOL_AUTOMATIC
)
2865 /* Since we anyway have to convert end-of-line format, it is not
2866 worth skipping at most 100 bytes or so. */
2869 if (encodep
) /* for encoding */
2871 switch (coding
->type
)
2873 case coding_type_no_conversion
:
2874 case coding_type_internal
:
2875 case coding_type_automatic
:
2876 /* We need no conversion. */
2879 case coding_type_ccl
:
2880 /* We can't skip any data. */
2882 case coding_type_iso2022
:
2883 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
2885 unsigned char *bol
= beg_addr
;
2886 while (beg_addr
< end_addr
&& *beg_addr
< 0x80)
2889 if (*(beg_addr
- 1) == '\n')
2893 goto label_skip_tail
;
2897 /* We can skip all ASCII characters at the head and tail. */
2898 while (beg_addr
< end_addr
&& *beg_addr
< 0x80) beg_addr
++;
2900 while (beg_addr
< end_addr
&& *(end_addr
- 1) < 0x80) end_addr
--;
2904 else /* for decoding */
2906 switch (coding
->type
)
2908 case coding_type_no_conversion
:
2909 /* We need no conversion. */
2912 case coding_type_internal
:
2913 if (coding
->eol_type
== CODING_EOL_LF
)
2915 /* We need no conversion. */
2919 /* We can skip all but carriage-return. */
2920 while (beg_addr
< end_addr
&& *beg_addr
!= '\r') beg_addr
++;
2921 while (beg_addr
< end_addr
&& *(end_addr
- 1) != '\r') end_addr
--;
2923 case coding_type_sjis
:
2924 case coding_type_big5
:
2925 /* We can skip all ASCII characters at the head. */
2926 while (beg_addr
< end_addr
&& *beg_addr
< 0x80) beg_addr
++;
2927 /* We can skip all ASCII characters at the tail except for
2928 the second byte of SJIS or BIG5 code. */
2929 while (beg_addr
< end_addr
&& *(end_addr
- 1) < 0x80) end_addr
--;
2930 if (end_addr
!= *endp
)
2933 case coding_type_ccl
:
2934 /* We can't skip any data. */
2936 default: /* i.e. case coding_type_iso2022: */
2940 /* We can skip all ASCII characters except for a few
2941 control codes at the head. */
2942 while (beg_addr
< end_addr
&& (c
= *beg_addr
) < 0x80
2943 && c
!= ISO_CODE_CR
&& c
!= ISO_CODE_SO
2944 && c
!= ISO_CODE_SI
&& c
!= ISO_CODE_ESC
)
2955 /* Encode to (iff ENCODEP is 1) or decode from coding system CODING a
2956 text between B and E. B and E are buffer positions. */
/* NOTE(review): lossy extract -- the parameter declarations of B, E and
   ENCODEP, the declarations of BUF/POS/INSVAL, all braces and `else'
   lines, the gap-moving statement, the "no conversion needed" return
   and several statements around the insert/delete step are not
   visible.
   Visible flow:
   - validate_region normalizes B and E into BEG and END; a test for
     the region straddling GPT follows (its body is not visible).
   - When encoding with a non-nil CODING->pre_write_conversion, that
     function is called with B and E via call2.  If it left a different
     buffer current, the original range is deleted with del_range and
     replaced by text inserted from the new buffer with
     insert_from_buffer.
   - shrink_conversion_area trims the byte range; BEG and END are then
     recomputed from the trimmed pointers.
   - The work buffer is sized by encoding_buffer_size or
     decoding_buffer_size and obtained from get_conversion_buffer;
     CODING->last_block is set to 1 and encode_coding or decode_coding
     writes into BUF.
   - LEN becomes PRODUCED plus the untouched head and tail lengths; the
     converted bytes are inserted with `insert' and the old text is
     removed with del_range (PT, PT + END - BEG).
   - When decoding with a non-nil CODING->post_read_conversion, that
     function is called with LEN via call1 and must return a number,
     which replaces LEN.
   - The result is make_number (LEN).  */
2959 code_convert_region (b
, e
, coding
, encodep
)
2961 struct coding_system
*coding
;
2964 int beg
, end
, len
, consumed
, produced
;
2966 unsigned char *begp
, *endp
;
2969 validate_region (&b
, &e
);
2970 beg
= XINT (b
), end
= XINT (e
);
2971 if (beg
< GPT
&& end
>= GPT
)
2974 if (encodep
&& !NILP (coding
->pre_write_conversion
))
2976 /* We must call a pre-conversion function which may put a new
2977 text to be converted in a new buffer. */
2978 struct buffer
*old
= current_buffer
, *new;
2981 call2 (coding
->pre_write_conversion
, b
, e
);
2982 if (old
!= current_buffer
)
2984 /* Replace the original text by the text just generated. */
2986 new = current_buffer
;
2987 set_buffer_internal (old
);
2988 del_range (beg
, end
);
2989 insert_from_buffer (new, 1, len
, 0);
2994 /* We may be able to shrink the conversion region. */
2995 begp
= POS_ADDR (beg
); endp
= begp
+ (end
- beg
);
2996 shrink_conversion_area (&begp
, &endp
, coding
, encodep
);
2999 /* We need no conversion. */
3003 beg
+= begp
- POS_ADDR (beg
);
3004 end
= beg
+ (endp
- begp
);
3007 len
= encoding_buffer_size (coding
, end
- beg
);
3009 len
= decoding_buffer_size (coding
, end
- beg
);
3010 buf
= get_conversion_buffer (len
);
3012 coding
->last_block
= 1;
3014 ? encode_coding (coding
, POS_ADDR (beg
), buf
, end
- beg
, len
,
3016 : decode_coding (coding
, POS_ADDR (beg
), buf
, end
- beg
, len
,
3019 len
= produced
+ (beg
- XINT (b
)) + (XINT (e
) - end
);
3022 insert (buf
, produced
);
3023 del_range (PT
, PT
+ end
- beg
);
3025 pos
= PT
+ (pos
- end
);
3031 if (!encodep
&& !NILP (coding
->post_read_conversion
))
3033 /* We must call a post-conversion function which may alter
3034 the text just converted. */
3039 insval
= call1 (coding
->post_read_conversion
, make_number (len
));
3040 CHECK_NUMBER (insval
, 0);
3041 len
= XINT (insval
);
3044 return make_number (len
);
/* Encode (ENCODEP non-zero) or decode the Lisp string STR with CODING
   and return the converted string.
   NOTE(review): lossy extract -- the return type, the declarations of
   ENCODEP/BUF/RESULT, all braces and `else' lines, the "no conversion
   needed" test, the final bcopy length argument and the statements at
   original lines 3110-3113 are not visible.
   Visible flow:
   - When a conversion hook applies (pre_write_conversion for encoding,
     post_read_conversion for decoding) the string is inserted into the
     temporary buffer " *code-converting-work*", converted there with
     code_convert_region over BEGV..ZV, read back with
     make_buffer_string, and the previous buffer is restored both
     explicitly and through record_unwind_protect / unbind_to.
   - Otherwise shrink_conversion_area trims the byte range.  On the
     no-conversion path STR itself is returned when NOCOPY is non-nil
     and a copy (Fcopy_sequence) when it is nil.
   - HEAD_SKIP and TAIL_SKIP are the untouched prefix and suffix
     lengths.  The work buffer is requested with room for
     LEN + HEAD_SKIP + TAIL_SKIP, the prefix is copied in with bcopy,
     the middle is converted by encode_coding or decode_coding with
     CODING->last_block set to 1, the suffix is copied after the
     PRODUCED bytes, and make_string builds the result.
   NOTE(review): a second `int len' is declared at original line 3066
   after the function-level one at line 3053; presumably it sits in a
   nested block and shadows the outer variable -- confirm.
   NOTE(review): BEGP/ENDP point into STR's data; whether anything
   between here and the bcopy calls can relocate string data is not
   visible (a `struct gcpro' is declared but its use is not shown) --
   confirm.  */
3048 code_convert_string (str
, coding
, encodep
, nocopy
)
3049 Lisp_Object str
, nocopy
;
3050 struct coding_system
*coding
;
3053 int len
, consumed
, produced
;
3055 unsigned char *begp
, *endp
;
3056 int head_skip
, tail_skip
;
3057 struct gcpro gcpro1
;
3059 if (encodep
&& !NILP (coding
->pre_write_conversion
)
3060 || !encodep
&& !NILP (coding
->post_read_conversion
))
3062 /* Since we have to call Lisp functions which assume target text
3063 is in a buffer, after setting a temporary buffer, call
3064 code_convert_region. */
3065 int count
= specpdl_ptr
- specpdl
;
3066 int len
= XSTRING (str
)->size
;
3068 struct buffer
*old
= current_buffer
;
3070 record_unwind_protect (Fset_buffer
, Fcurrent_buffer ());
3071 temp_output_buffer_setup (" *code-converting-work*");
3072 set_buffer_internal (XBUFFER (Vstandard_output
));
3073 insert_from_string (str
, 0, len
, 0);
3074 code_convert_region (make_number (BEGV
), make_number (ZV
),
3076 result
= make_buffer_string (BEGV
, ZV
, 0);
3077 set_buffer_internal (old
);
3078 return unbind_to (count
, result
);
3081 /* We may be able to shrink the conversion region. */
3082 begp
= XSTRING (str
)->data
;
3083 endp
= begp
+ XSTRING (str
)->size
;
3084 shrink_conversion_area (&begp
, &endp
, coding
, encodep
);
3087 /* We need no conversion. */
3088 return (NILP (nocopy
) ? Fcopy_sequence (str
) : str
);
3090 head_skip
= begp
- XSTRING (str
)->data
;
3091 tail_skip
= XSTRING (str
)->size
- head_skip
- (endp
- begp
);
3096 len
= encoding_buffer_size (coding
, endp
- begp
);
3098 len
= decoding_buffer_size (coding
, endp
- begp
);
3099 buf
= get_conversion_buffer (len
+ head_skip
+ tail_skip
);
3101 bcopy (XSTRING (str
)->data
, buf
, head_skip
);
3102 coding
->last_block
= 1;
3104 ? encode_coding (coding
, XSTRING (str
)->data
+ head_skip
,
3105 buf
+ head_skip
, endp
- begp
, len
, &consumed
)
3106 : decode_coding (coding
, XSTRING (str
)->data
+ head_skip
,
3107 buf
+ head_skip
, endp
- begp
, len
, &consumed
));
3108 bcopy (XSTRING (str
)->data
+ head_skip
+ (endp
- begp
),
3109 buf
+ head_skip
+ produced
,
3114 return make_string (buf
, head_skip
+ produced
+ tail_skip
);
/* Lisp entry point for decoding a buffer region.
   Visible behavior: B and E are checked with
   CHECK_NUMBER_COERCE_MARKER and CODING_SYSTEM with CHECK_SYMBOL.  A
   nil CODING_SYSTEM returns E - B (via XFASTINT) without touching the
   text.  Otherwise a local `struct coding_system' is filled in by
   setup_coding_system from the validated symbol; a negative result
   raises "Invalid coding-system", and success hands off to
   code_convert_region with ENCODEP 0.
   NOTE(review): lossy extract -- braces are not visible.  */
3117 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
3118 3, 3, "r\nzCoding system: ",
3119 "Decode current region by specified coding system.\n\
3120 When called from a program, takes three arguments:\n\
3121 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3122 Return length of decoded text.")
3123 (b
, e
, coding_system
)
3124 Lisp_Object b
, e
, coding_system
;
3126 struct coding_system coding
;
3128 CHECK_NUMBER_COERCE_MARKER (b
, 0);
3129 CHECK_NUMBER_COERCE_MARKER (e
, 1);
3130 CHECK_SYMBOL (coding_system
, 2);
3132 if (NILP (coding_system
))
3133 return make_number (XFASTINT (e
) - XFASTINT (b
));
3134 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3135 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3137 return code_convert_region (b
, e
, &coding
, 0);
/* Lisp entry point for encoding a buffer region.
   Visible behavior: B and E are checked with
   CHECK_NUMBER_COERCE_MARKER and CODING_SYSTEM with CHECK_SYMBOL.  A
   nil CODING_SYSTEM returns E - B (via XFASTINT) without touching the
   text.  Otherwise a local `struct coding_system' is filled in by
   setup_coding_system from the validated symbol; a negative result
   raises "Invalid coding-system", and success hands off to
   code_convert_region with ENCODEP 1.
   NOTE(review): lossy extract -- braces are not visible.  */
3140 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
3141 3, 3, "r\nzCoding system: ",
3142 "Encode current region by specified coding system.\n\
3143 When called from a program, takes three arguments:\n\
3144 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3145 Return length of encoded text.")
3146 (b
, e
, coding_system
)
3147 Lisp_Object b
, e
, coding_system
;
3149 struct coding_system coding
;
3151 CHECK_NUMBER_COERCE_MARKER (b
, 0);
3152 CHECK_NUMBER_COERCE_MARKER (e
, 1);
3153 CHECK_SYMBOL (coding_system
, 2);
3155 if (NILP (coding_system
))
3156 return make_number (XFASTINT (e
) - XFASTINT (b
));
3157 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3158 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3160 return code_convert_region (b
, e
, &coding
, 1);
/* Lisp entry point for decoding a string.
   NOTE(review): lossy extract -- the arity line of the DEFUN, the last
   line of the doc string (so the literal below is unterminated in this
   listing) and the braces are not visible.
   Visible behavior: STRING is checked with CHECK_STRING and
   CODING_SYSTEM with CHECK_SYMBOL.  A nil CODING_SYSTEM returns STRING
   itself when NOCOPY is non-nil and a copy otherwise.  Otherwise a
   local `struct coding_system' is filled in by setup_coding_system; a
   negative result raises "Invalid coding-system", and success hands
   off to code_convert_string with ENCODEP 0.  */
3163 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
3165 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3166 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3168 (string
, coding_system
, nocopy
)
3169 Lisp_Object string
, coding_system
, nocopy
;
3171 struct coding_system coding
;
3173 CHECK_STRING (string
, 0);
3174 CHECK_SYMBOL (coding_system
, 1);
3176 if (NILP (coding_system
))
3177 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
3178 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3179 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3181 return code_convert_string (string
, &coding
, 0, nocopy
);
/* Lisp entry point for encoding a string.
   NOTE(review): lossy extract -- the arity line of the DEFUN, the last
   line of the doc string (so the literal below is unterminated in this
   listing) and the braces are not visible.
   Visible behavior: STRING is checked with CHECK_STRING and
   CODING_SYSTEM with CHECK_SYMBOL.  A nil CODING_SYSTEM returns STRING
   itself when NOCOPY is non-nil and a copy otherwise.  Otherwise a
   local `struct coding_system' is filled in by setup_coding_system; a
   negative result raises "Invalid coding-system", and success hands
   off to code_convert_string with ENCODEP 1.  */
3184 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
3186 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3187 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3189 (string
, coding_system
, nocopy
)
3190 Lisp_Object string
, coding_system
, nocopy
;
3192 struct coding_system coding
;
3194 CHECK_STRING (string
, 0);
3195 CHECK_SYMBOL (coding_system
, 1);
3197 if (NILP (coding_system
))
3198 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
3199 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3200 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3202 return code_convert_string (string
, &coding
, 1, nocopy
);
/* NOTE(review): lossy extract -- the parameter declaration of CODE, the
   declaration of VAL, the braces and the final return are not visible.
   Visible behavior: CODE is checked with CHECK_NUMBER, then split into
   a high byte S1 (CODE >> 8) and a low byte S2 (CODE & 0xFF).
   DECODE_SJIS turns the pair into C1/C2, and VAL is set to the
   character built by MAKE_NON_ASCII_CHAR for `charset_jisx0208'.
   S1 is an unsigned char, so any bits of CODE above the low 16 are
   discarded by the assignment.  No range check on CODE is visible --
   assumes callers pass a valid 2-byte SJIS code; TODO confirm.  */
3205 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
3206 "Decode a JISX0208 character of shift-jis encoding.\n\
3207 CODE is the character code in SJIS.\n\
3208 Return the corresponding character.")
3212 unsigned char c1
, c2
, s1
, s2
;
3215 CHECK_NUMBER (code
, 0);
3216 s1
= (XFASTINT (code
)) >> 8, s2
= (XFASTINT (code
)) & 0xFF;
3217 DECODE_SJIS (s1
, s2
, c1
, c2
);
3218 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset_jisx0208
, c1
, c2
));
/* NOTE(review): lossy extract -- the parameter declaration of CH, the
   declarations of CHARSET and VAL, the braces, the `else' line and the
   final return are not visible.
   Visible behavior: CH is checked with CHECK_NUMBER and decomposed by
   SPLIT_CHAR into CHARSET, C1 and C2.  When CHARSET is
   `charset_jisx0208', ENCODE_SJIS produces S1/S2 and VAL becomes
   (S1 << 8) | S2.  The other visible assignment sets VAL to 0,
   presumably for characters outside that charset -- confirm.  */
3222 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
3223 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3224 Return the corresponding character code in SJIS.")
3229 unsigned char c1
, c2
, s1
, s2
;
3232 CHECK_NUMBER (ch
, 0);
3233 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
3234 if (charset
== charset_jisx0208
)
3236 ENCODE_SJIS (c1
, c2
, s1
, s2
);
3237 XSETFASTINT (val
, ((int)s1
<< 8) | s2
);
3240 XSETFASTINT (val
, 0);
/* NOTE(review): lossy extract -- the parameter declaration of CODE, the
   declarations of CHARSET and VAL, the braces and the final return are
   not visible.
   Visible behavior: CODE is checked with CHECK_NUMBER, then split into
   a high byte B1 (CODE >> 8) and a low byte B2 (CODE & 0xFF).
   DECODE_BIG5 derives CHARSET, C1 and C2 from the pair, and VAL is set
   to the character built by MAKE_NON_ASCII_CHAR.
   B1 is an unsigned char, so any bits of CODE above the low 16 are
   discarded by the assignment.  No range check on CODE is visible --
   assumes callers pass a valid 2-byte Big5 code; TODO confirm.  */
3244 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
3245 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3246 CODE is the character code in BIG5.\n\
3247 Return the corresponding character.")
3252 unsigned char b1
, b2
, c1
, c2
;
3255 CHECK_NUMBER (code
, 0);
3256 b1
= (XFASTINT (code
)) >> 8, b2
= (XFASTINT (code
)) & 0xFF;
3257 DECODE_BIG5 (b1
, b2
, charset
, c1
, c2
);
3258 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset
, c1
, c2
));
/* NOTE(review): lossy extract -- the parameter declaration of CH, the
   declarations of CHARSET and VAL, the braces, the `else' line and the
   final return are not visible.
   Visible behavior: CH is checked with CHECK_NUMBER and decomposed by
   SPLIT_CHAR into CHARSET, C1 and C2.  When CHARSET is
   `charset_big5_1' or `charset_big5_2', ENCODE_BIG5 produces B1/B2 and
   VAL becomes (B1 << 8) | B2.  The other visible assignment sets VAL
   to 0, presumably for characters outside those charsets -- confirm.  */
3262 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
3263 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3264 Return the corresponding character code in Big5.")
3269 unsigned char c1
, c2
, b1
, b2
;
3272 CHECK_NUMBER (ch
, 0);
3273 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
3274 if (charset
== charset_big5_1
|| charset
== charset_big5_2
)
3276 ENCODE_BIG5 (charset
, c1
, c2
, b1
, b2
);
3277 XSETFASTINT (val
, ((int)b1
<< 8) | b2
);
3280 XSETFASTINT (val
, 0);
/* NOTE(review): lossy extract -- the parameter list line, the braces,
   the body of the Finteractive_p test and the return are not visible.
   Visible behavior: CODING_SYSTEM is checked with CHECK_SYMBOL and
   validated by Fcheck_coding_system, the global `terminal_coding' is
   re-initialized from it with setup_coding_system, and
   `update_mode_lines' is incremented.
   NOTE(review): the result of setup_coding_system is ignored here,
   whereas the decode/encode entry points in this file treat a negative
   result as an error -- confirm that is intended.  */
3284 DEFUN ("set-terminal-coding-system",
3285 Fset_terminal_coding_system
, Sset_terminal_coding_system
, 1, 1,
3286 "zCoding-system for terminal display: ",
3287 "Set coding-system of your terminal to CODING-SYSTEM.\n\
3288 All outputs to terminal are encoded to this coding-system.")
3290 Lisp_Object coding_system
;
3292 CHECK_SYMBOL (coding_system
, 0);
3293 setup_coding_system (Fcheck_coding_system (coding_system
), &terminal_coding
);
3294 update_mode_lines
++;
3295 if (!NILP (Finteractive_p ()))
/* Accessor: returns the `symbol' member of the global
   `terminal_coding'.
   NOTE(review): lossy extract -- the empty parameter list and braces
   are not visible.  */
3300 DEFUN ("terminal-coding-system",
3301 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
3302 "Return coding-system of your terminal.")
3305 return terminal_coding
.symbol
;
/* NOTE(review): lossy extract -- the parameter list line, the braces
   and the return are not visible.
   Visible behavior: CODING_SYSTEM is checked with CHECK_SYMBOL and
   validated by Fcheck_coding_system, then the global `keyboard_coding'
   is re-initialized from it with setup_coding_system.
   NOTE(review): the result of setup_coding_system is ignored here,
   whereas the decode/encode entry points in this file treat a negative
   result as an error -- confirm that is intended.  */
3308 DEFUN ("set-keyboard-coding-system",
3309 Fset_keyboard_coding_system
, Sset_keyboard_coding_system
, 1, 1,
3310 "zCoding-system for keyboard input: ",
3311 "Set coding-system of what is sent from terminal keyboard to CODING-SYSTEM.\n\
3312 All inputs from terminal are decoded from this coding-system.")
3314 Lisp_Object coding_system
;
3316 CHECK_SYMBOL (coding_system
, 0);
3317 setup_coding_system (Fcheck_coding_system (coding_system
), &keyboard_coding
);
/* Accessor: returns the `symbol' member of the global
   `keyboard_coding'.
   NOTE(review): lossy extract -- the empty parameter list and braces
   are not visible.  */
3321 DEFUN ("keyboard-coding-system",
3322 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
3323 "Return coding-system of what is sent from terminal keyboard.")
3326 return keyboard_coding
.symbol
;
/* NOTE(review): lossy extract -- the arity line of the DEFUN, parts of
   the doc string, the NARGS/ARGS parameter declarations, the first
   NARGS test, braces, part of the matching condition and the tail of
   the return expression are not visible.
   Visible flow:
   - OPERATION is ARGS[0]; it must be a symbol whose `Qtarget_idx'
     property is an integer, otherwise an error is raised.
   - TARGET is ARGS[XINT (target_idx) + 1]; it must be a string, or an
     integer when OPERATION is `Qopen_network_stream'.
   - The entry for OPERATION is looked up in Vcoding_system_alist with
     Fassq and its cdr is scanned.  An element matches when its car is
     a string matching TARGET (fast_string_match >= 0), or when TARGET
     is an integer EQ to its car.
   - For a match, the element's cdr is used directly when it is a cons;
     otherwise, when it is a bound symbol, it is called with the list of
     all arguments.
   NOTE(review): the guard is `nargs < 1 + XINT (target_idx)' but the
   element read is ARGS[XINT (target_idx) + 1]; when NARGS equals
   XINT (target_idx) + 1 the guard passes and the index equals NARGS,
   which looks one past the end -- confirm whether `<=' was intended.
   NOTE(review): Fboundp (val) is used directly as a C truth value; the
   rest of this file tests Lisp results with NILP -- confirm.
   NOTE(review): call2 receives VAL plus a single list argument here,
   while the call2 use in code_convert_region passes a function plus
   two arguments -- confirm whether call1 was intended.
   NOTE(review): the message "Invalid first arguement" is misspelled; it
   is a runtime string and is left untouched in this comment-only
   edit.  */
3330 DEFUN ("find-coding-system", Ffind_coding_system
, Sfind_coding_system
,
3332 "Choose a coding system for a file operation based on file name.\n\
3333 The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3334 ENCODING-SYSTEM is the coding system to use for encoding\n\
3335 \(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3336 for decoding (in case OPERATION does decoding).\n\
3338 The first argument OPERATION specifies an I/O primitive:\n\
3339 For file I/O, `insert-file-contents' or `write-region'.\n\
3340 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3341 For network I/O, `open-network-stream'.\n\
3343 The remaining arguments should be the same arguments that were passed\n\
3344 to the primitive. Depending on which primitive, one of those arguments\n\
3345 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3346 whichever argument specifies the file name is TARGET.\n\
3348 TARGET has a meaning which depends on OPERATION:\n\
3349 For file I/O, TARGET is a file name.\n\
3350 For process I/O, TARGET is a process name.\n\
3351 For network I/O, TARGET is a service name or a port number\n\
3353 This function looks up what `coding-system-alist' specifies for\n\
3354 OPERATION and TARGET. It may specify a cons cell which represents\n\
3355 a particular coding system or it may have a function to call.\n\
3356 In the latter case, we call the function with one argument,\n\
3357 which is a list of all the arguments given to `find-coding-system'.")
3362 Lisp_Object operation
, target_idx
, target
, val
;
3363 register Lisp_Object chain
;
3366 error ("Too few arguments");
3367 operation
= args
[0];
3368 if (!SYMBOLP (operation
)
3369 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
3370 error ("Invalid first arguement");
3371 if (nargs
< 1 + XINT (target_idx
))
3372 error ("Too few arguments for operation: %s",
3373 XSYMBOL (operation
)->name
->data
);
3374 target
= args
[XINT (target_idx
) + 1];
3375 if (!(STRINGP (target
)
3376 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
3377 error ("Invalid %dth argument", XINT (target_idx
) + 1);
3379 chain
= Fassq (operation
, Vcoding_system_alist
);
3383 for (chain
= XCONS (chain
)->cdr
; CONSP (chain
); chain
= XCONS (chain
)->cdr
)
3385 Lisp_Object elt
= XCONS (chain
)->car
;
3388 && ((STRINGP (target
)
3389 && STRINGP (XCONS (elt
)->car
)
3390 && fast_string_match (XCONS (elt
)->car
, target
) >= 0)
3391 || (INTEGERP (target
) && EQ (target
, XCONS (elt
)->car
))))
3392 return (CONSP (val
= XCONS (elt
)->cdr
)
3394 : ((SYMBOLP (val
) && Fboundp (val
)
3395 ? call2 (val
, Flist (nargs
, args
))
3404 /*** 8. Post-amble ***/
3410 /* Emacs internal format specific initialize routine. */
3411 for (i
= 0; i
<= 0x20; i
++)
3412 emacs_code_class
[i
] = EMACS_control_code
;
3413 emacs_code_class
[0x0A] = EMACS_linefeed_code
;
3414 emacs_code_class
[0x0D] = EMACS_carriage_return_code
;
3415 for (i
= 0x21 ; i
< 0x7F; i
++)
3416 emacs_code_class
[i
] = EMACS_ascii_code
;
3417 emacs_code_class
[0x7F] = EMACS_control_code
;
3418 emacs_code_class
[0x80] = EMACS_leading_code_composition
;
3419 for (i
= 0x81; i
< 0xFF; i
++)
3420 emacs_code_class
[i
] = EMACS_invalid_code
;
3421 emacs_code_class
[LEADING_CODE_PRIVATE_11
] = EMACS_leading_code_3
;
3422 emacs_code_class
[LEADING_CODE_PRIVATE_12
] = EMACS_leading_code_3
;
3423 emacs_code_class
[LEADING_CODE_PRIVATE_21
] = EMACS_leading_code_4
;
3424 emacs_code_class
[LEADING_CODE_PRIVATE_22
] = EMACS_leading_code_4
;
3426 /* ISO2022 specific initialize routine. */
3427 for (i
= 0; i
< 0x20; i
++)
3428 iso_code_class
[i
] = ISO_control_code
;
3429 for (i
= 0x21; i
< 0x7F; i
++)
3430 iso_code_class
[i
] = ISO_graphic_plane_0
;
3431 for (i
= 0x80; i
< 0xA0; i
++)
3432 iso_code_class
[i
] = ISO_control_code
;
3433 for (i
= 0xA1; i
< 0xFF; i
++)
3434 iso_code_class
[i
] = ISO_graphic_plane_1
;
3435 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
3436 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
3437 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
3438 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
3439 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
3440 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
3441 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
3442 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
3443 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
3444 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
3446 conversion_buffer_size
= MINIMUM_CONVERSION_BUFFER_SIZE
;
3447 conversion_buffer
= (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE
);
3449 setup_coding_system (Qnil
, &keyboard_coding
);
3450 setup_coding_system (Qnil
, &terminal_coding
);
3457 Qtarget_idx
= intern ("target-idx");
3458 staticpro (&Qtarget_idx
);
3460 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
3461 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
3463 Qcall_process
= intern ("call-process");
3464 staticpro (&Qcall_process
);
3465 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
3467 Qcall_process_region
= intern ("call-process-region");
3468 staticpro (&Qcall_process_region
);
3469 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
3471 Qstart_process
= intern ("start-process");
3472 staticpro (&Qstart_process
);
3473 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
3475 Qopen_network_stream
= intern ("open-network-stream");
3476 staticpro (&Qopen_network_stream
);
3477 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
3479 Qcoding_system
= intern ("coding-system");
3480 staticpro (&Qcoding_system
);
3482 Qeol_type
= intern ("eol-type");
3483 staticpro (&Qeol_type
);
3485 Qbuffer_file_coding_system
= intern ("buffer-file-coding-system");
3486 staticpro (&Qbuffer_file_coding_system
);
3488 Qpost_read_conversion
= intern ("post-read-conversion");
3489 staticpro (&Qpost_read_conversion
);
3491 Qpre_write_conversion
= intern ("pre-write-conversion");
3492 staticpro (&Qpre_write_conversion
);
3494 Qcoding_system_vector
= intern ("coding-system-vector");
3495 staticpro (&Qcoding_system_vector
);
3497 Qcoding_system_p
= intern ("coding-system-p");
3498 staticpro (&Qcoding_system_p
);
3500 Qcoding_system_error
= intern ("coding-system-error");
3501 staticpro (&Qcoding_system_error
);
3503 Fput (Qcoding_system_error
, Qerror_conditions
,
3504 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
3505 Fput (Qcoding_system_error
, Qerror_message
,
3506 build_string ("Coding-system error"));
3508 Qcoding_category_index
= intern ("coding-category-index");
3509 staticpro (&Qcoding_category_index
);
3513 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3515 coding_category_table
[i
] = intern (coding_category_name
[i
]);
3516 staticpro (&coding_category_table
[i
]);
3517 Fput (coding_category_table
[i
], Qcoding_category_index
,
3522 defsubr (&Scoding_system_vector
);
3523 defsubr (&Scoding_system_p
);
3524 defsubr (&Sread_coding_system
);
3525 defsubr (&Sread_non_nil_coding_system
);
3526 defsubr (&Scheck_coding_system
);
3527 defsubr (&Sdetect_coding_region
);
3528 defsubr (&Sdecode_coding_region
);
3529 defsubr (&Sencode_coding_region
);
3530 defsubr (&Sdecode_coding_string
);
3531 defsubr (&Sencode_coding_string
);
3532 defsubr (&Sdecode_sjis_char
);
3533 defsubr (&Sencode_sjis_char
);
3534 defsubr (&Sdecode_big5_char
);
3535 defsubr (&Sencode_big5_char
);
3536 defsubr (&Sset_terminal_coding_system
);
3537 defsubr (&Sterminal_coding_system
);
3538 defsubr (&Sset_keyboard_coding_system
);
3539 defsubr (&Skeyboard_coding_system
);
3540 defsubr (&Sfind_coding_system
);
3542 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
3543 "List of coding-categories (symbols) ordered by priority.");
3547 Vcoding_category_list
= Qnil
;
3548 for (i
= CODING_CATEGORY_IDX_MAX
- 1; i
>= 0; i
--)
3549 Vcoding_category_list
3550 = Fcons (coding_category_table
[i
], Vcoding_category_list
);
3553 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
3554 "A variable of internal use only.\n\
3555 If the value is a coding system, it is used for decoding on read operation.\n\
3556 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3557 Vcoding_system_for_read
= Qnil
;
3559 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
3560 "A variable of internal use only.\n\
3561 If the value is a coding system, it is used for encoding on write operation.\n\
3562 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3563 Vcoding_system_for_write
= Qnil
;
3565 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
3566 "Coding-system used in the latest file or process I/O.");
3567 Vlast_coding_system_used
= Qnil
;
3569 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
3570 "Nested alist to decide a coding system for a specific I/O operation.\n\
3571 The format is ((OPERATION . ((REGEXP . CODING-SYSTEMS) ...)) ...).\n\
3573 OPERATION is one of the following Emacs I/O primitives:\n\
3574 For file I/O, insert-file-contents and write-region.\n\
3575 For process I/O, call-process, call-process-region, and start-process.\n\
3576 For network I/O, open-network-stream.\n\
3577 In addition, for process I/O, `process-argument' can be specified for\n\
3578 encoding arguments of the process.\n\
3580 REGEXP is a regular expression matching a target of OPERATION, where\n\
3581 target is a file name for file I/O operations, a process name for\n\
3582 process I/O operations, or a service name for network I/O\n\
3583 operations. REGEXP might be a port number for network I/O operation.\n\
3585 CODING-SYSTEMS is a cons of coding systems to encode and decode\n\
3586 character code on OPERATION, or a function symbol returning the cons.\n\
3587 See the documentation of `find-coding-system' for more detail.");
3588 Vcoding_system_alist
= Qnil
;
3590 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix
,
3591 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3592 eol_mnemonic_unix
= '.';
3594 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos
,
3595 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3596 eol_mnemonic_dos
= ':';
3598 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac
,
3599 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3600 eol_mnemonic_mac
= '\'';
3602 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
3603 "Mnemonic character indicating end-of-line format is not yet decided.");
3604 eol_mnemonic_undecided
= '-';
3606 DEFVAR_LISP ("alternate-charset-table", &Valternate_charset_table
,
3607 "Alist of charsets vs the alternate charsets.\n\
3608 While decoding, if a charset (car part of an element) is found,\n\
3609 decode it as the alternate charset (cdr part of the element).");
3610 Valternate_charset_table
= Qnil
;
3612 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist
,
3613 "Alist of charsets vs revision numbers.\n\
3614 While encoding, if a charset (car part of an element) is found,\n\
3615 designate it with the escape sequence identifing revision (cdr part of the element).");
3616 Vcharset_revision_alist
= Qnil
;