1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
25 2. Emacs' internal format (emacs-mule) handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
35 /*** GENERAL NOTE on CODING SYSTEM ***
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
44 0. Emacs' internal format (emacs-mule)
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
72 If a user wants to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
77 Emacs represents a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See section 6 for more details.
84 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
86 How end-of-line of a text is encoded depends on a system. For
87 instance, Unix's format is just one byte of `line-feed' code,
88 whereas DOS's format is two-byte sequence of `carriage-return' and
89 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
91 Since text characters encoding and end-of-line encoding are
92 independent, any coding system described above can take
93 any format of end-of-line. So, Emacs has information of format of
94 end-of-line in each coding-system. See section 6 for more details.
98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
100 These functions check if a text between SRC and SRC_END is encoded
101 in the coding system category XXX. Each returns an integer value in
102 which appropriate flag bits for the category XXX are set. The flag
103 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
104 template of these functions. */
107 detect_coding_emacs_mule (src
, src_end
)
108 unsigned char *src
, *src_end
;
114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
116 These functions decode SRC_BYTES length text at SOURCE encoded in
117 CODING to Emacs' internal format (emacs-mule). The resulting text
118 goes to a place pointed to by DESTINATION, the length of which should
119 not exceed DST_BYTES. The number of bytes actually processed is
120 returned as *CONSUMED. The return value is the length of the decoded
121 text. Below is a template of these functions. */
123 decode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
124 struct coding_system
*coding
;
125 unsigned char *source
, *destination
;
126 int src_bytes
, dst_bytes
;
133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
135 These functions encode SRC_BYTES length text at SOURCE of Emacs'
136 internal format (emacs-mule) to CODING. The resulting text goes to
137 a place pointed to by DESTINATION, the length of which should not
138 exceed DST_BYTES. The number of bytes actually processed is
139 returned as *CONSUMED. The return value is the length of the
140 encoded text. Below is a template of these functions. */
142 encode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
143 struct coding_system
*coding
;
144 unsigned char *source
, *destination
;
145 int src_bytes
, dst_bytes
;
152 /*** COMMONLY USED MACROS ***/
154 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
155 THREE_MORE_BYTES safely get one, two, and three bytes from the
156 source text respectively. If there are not enough bytes in the
157 source, they jump to `label_end_of_loop'. The caller should set
158 variables `src' and `src_end' to appropriate areas in advance. */
160 #define ONE_MORE_BYTE(c1) \
165 goto label_end_of_loop; \
168 #define TWO_MORE_BYTES(c1, c2) \
170 if (src + 1 < src_end) \
171 c1 = *src++, c2 = *src++; \
173 goto label_end_of_loop; \
176 #define THREE_MORE_BYTES(c1, c2, c3) \
178 if (src + 2 < src_end) \
179 c1 = *src++, c2 = *src++, c3 = *src++; \
181 goto label_end_of_loop; \
184 /* The following three macros DECODE_CHARACTER_ASCII,
185 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
186 the multi-byte form of a character of each class at the place
187 pointed by `dst'. The caller should set the variable `dst' to
188 point to an appropriate area and the variable `coding' to point to
189 the coding-system of the currently decoding text in advance. */
191 /* Decode one ASCII character C. */
193 #define DECODE_CHARACTER_ASCII(c) \
195 if (COMPOSING_P (coding->composing)) \
196 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
201 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
202 position-code is C. */
204 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
206 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
207 if (COMPOSING_P (coding->composing)) \
208 *dst++ = leading_code + 0x20; \
210 *dst++ = leading_code; \
211 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
212 *dst++ = leading_code; \
213 *dst++ = (c) | 0x80; \
216 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
217 position-codes are C1 and C2. */
219 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
221 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
222 *dst++ = (c2) | 0x80; \
226 /*** 1. Preamble ***/
240 #else /* not emacs */
244 #endif /* not emacs */
246 Lisp_Object Qcoding_system
, Qeol_type
;
247 Lisp_Object Qbuffer_file_coding_system
;
248 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
250 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
251 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
252 Lisp_Object Qstart_process
, Qopen_network_stream
;
253 Lisp_Object Qtarget_idx
;
255 /* Mnemonic character of each format of end-of-line. */
256 int eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
257 /* Mnemonic character to indicate format of end-of-line is not yet
259 int eol_mnemonic_undecided
;
261 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
262 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
267 Lisp_Object Qcoding_system_spec
, Qcoding_system_p
, Qcoding_system_error
;
269 /* Coding system emacs-mule is for converting only end-of-line format. */
270 Lisp_Object Qemacs_mule
;
272 /* Coding-systems are handed between Emacs Lisp programs and C internal
273 routines by the following three variables. */
274 /* Coding-system for reading files and receiving data from process. */
275 Lisp_Object Vcoding_system_for_read
;
276 /* Coding-system for writing files and sending data to process. */
277 Lisp_Object Vcoding_system_for_write
;
278 /* Coding-system actually used in the latest I/O. */
279 Lisp_Object Vlast_coding_system_used
;
281 /* Flag to inhibit code conversion of end-of-line format. */
282 int inhibit_eol_conversion
;
284 /* Coding-system that the terminal accepts for display. */
285 struct coding_system terminal_coding
;
287 /* Coding-system of what is sent from terminal keyboard. */
288 struct coding_system keyboard_coding
;
290 Lisp_Object Vfile_coding_system_alist
;
291 Lisp_Object Vprocess_coding_system_alist
;
292 Lisp_Object Vnetwork_coding_system_alist
;
296 Lisp_Object Qcoding_category_index
;
298 /* List of symbols `coding-category-xxx' ordered by priority. */
299 Lisp_Object Vcoding_category_list
;
301 /* Table of coding-systems currently assigned to each coding-category. */
302 Lisp_Object coding_category_table
[CODING_CATEGORY_IDX_MAX
];
304 /* Table of names of symbol for each coding-category. */
305 char *coding_category_name
[CODING_CATEGORY_IDX_MAX
] = {
306 "coding-category-emacs-mule",
307 "coding-category-sjis",
308 "coding-category-iso-7",
309 "coding-category-iso-8-1",
310 "coding-category-iso-8-2",
311 "coding-category-iso-else",
312 "coding-category-big5",
313 "coding-category-binary"
316 /* Flag to tell if we look up unification table on character code
318 Lisp_Object Venable_character_unification
;
319 /* Standard unification table to look up on decoding (reading). */
320 Lisp_Object Vstandard_character_unification_table_for_decode
;
321 /* Standard unification table to look up on encoding (writing). */
322 Lisp_Object Vstandard_character_unification_table_for_encode
;
324 Lisp_Object Qcharacter_unification_table
;
325 Lisp_Object Qcharacter_unification_table_for_decode
;
326 Lisp_Object Qcharacter_unification_table_for_encode
;
328 /* Alist of charsets vs revision number. */
329 Lisp_Object Vcharset_revision_alist
;
331 /* Default coding systems used for process I/O. */
332 Lisp_Object Vdefault_process_coding_system
;
335 /*** 2. Emacs internal format (emacs-mule) handlers ***/
337 /* Emacs' internal format for encoding multiple character sets is a
338 kind of multi-byte encoding, i.e. characters are encoded by
339 variable-length sequences of one-byte codes. ASCII characters
340 and control characters (e.g. `tab', `newline') are represented by
341 one-byte sequences which are their ASCII codes, in the range 0x00
342 through 0x7F. The other characters are represented by a sequence
343 of `base leading-code', optional `extended leading-code', and one
344 or two `position-code's. The length of the sequence is determined
345 by the base leading-code. Leading-code takes the range 0x80
346 through 0x9F, whereas extended leading-code and position-code take
347 the range 0xA0 through 0xFF. See `charset.h' for more details
348 about leading-code and position-code.
350 There's one exception to this rule. Special leading-code
351 `leading-code-composition' denotes that the following several
352 characters should be composed into one character. Leading-codes of
353 components (except for ASCII) are added 0x20. An ASCII character
354 component is represented by a 2-byte sequence of `0xA0' and
355 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
356 details of composite character. Hence, we can summarize the code
359 --- CODE RANGE of Emacs' internal format ---
360 (character set) (range)
362 ELSE (1st byte) 0x80 .. 0x9F
363 (rest bytes) 0xA0 .. 0xFF
364 ---------------------------------------------
368 enum emacs_code_class_type emacs_code_class
[256];
370 /* Go to the next statement only if *SRC is accessible and the code is
371 not less than 0xA0 (i.e. in the range 0xA0 through 0xFF). */
372 #define CHECK_CODE_RANGE_A0_FF \
374 if (src >= src_end) \
375 goto label_end_of_switch; \
376 else if (*src++ < 0xA0) \
380 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
381 Check if a text is encoded in Emacs' internal format. If it is,
382 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
385 detect_coding_emacs_mule (src
, src_end
)
386 unsigned char *src
, *src_end
;
391 while (src
< src_end
)
403 switch (emacs_code_class
[c
])
405 case EMACS_ascii_code
:
406 case EMACS_linefeed_code
:
409 case EMACS_control_code
:
410 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
414 case EMACS_invalid_code
:
417 case EMACS_leading_code_composition
: /* c == 0x80 */
419 CHECK_CODE_RANGE_A0_FF
;
424 case EMACS_leading_code_4
:
425 CHECK_CODE_RANGE_A0_FF
;
426 /* fall down to check it two more times ... */
428 case EMACS_leading_code_3
:
429 CHECK_CODE_RANGE_A0_FF
;
430 /* fall down to check it one more time ... */
432 case EMACS_leading_code_2
:
433 CHECK_CODE_RANGE_A0_FF
;
441 return CODING_CATEGORY_MASK_EMACS_MULE
;
445 /*** 3. ISO2022 handlers ***/
447 /* The following note describes the coding system ISO2022 briefly.
448 Since the intention of this note is to help in understanding of
449 the programs in this file, some parts are NOT ACCURATE or OVERLY
450 SIMPLIFIED. For the thorough understanding, please refer to the
451 original document of ISO2022.
453 ISO2022 provides many mechanisms to encode several character sets
454 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
455 all text is encoded by codes of less than 128. This may make the
456 encoded text a little bit longer, but the text gets more stability
457 to pass through several gateways (some of them strip off the MSB).
459 There are two kinds of character set: control character set and
460 graphic character set. The former contains control characters such
461 as `newline' and `escape' to provide control functions (control
462 functions are provided also by escape sequences). The latter
463 contains graphic characters such as 'A' and '-'. Emacs recognizes
464 two control character sets and many graphic character sets.
466 Graphic character sets are classified into one of the following
467 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
468 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
469 bytes (DIMENSION) and the number of characters in one dimension
470 (CHARS) of the set. In addition, each character set is assigned an
471 identification tag (called "final character" and denoted as <F>
472 hereafter) which is unique in each class. <F> of each character
473 set is decided by ECMA(*) when it is registered in ISO. Code range
474 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
476 Note (*): ECMA = European Computer Manufacturers Association
478 Here are examples of graphic character set [NAME(<F>)]:
479 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
480 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
481 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
482 o DIMENSION2_CHARS96 -- none for the moment
484 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
485 C0 [0x00..0x1F] -- control character plane 0
486 GL [0x20..0x7F] -- graphic character plane 0
487 C1 [0x80..0x9F] -- control character plane 1
488 GR [0xA0..0xFF] -- graphic character plane 1
490 A control character set is directly designated and invoked to C0 or
491 C1 by an escape sequence. The most common case is that ISO646's
492 control character set is designated/invoked to C0 and ISO6429's
493 control character set is designated/invoked to C1, and usually
494 these designations/invocations are omitted in a coded text. With
495 7-bit environment, only C0 can be used, and a control character for
496 C1 is encoded by an appropriate escape sequence to fit in the
497 environment. All control characters for C1 have
498 corresponding escape sequences defined.
500 A graphic character set is at first designated to one of four
501 graphic registers (G0 through G3), then these graphic registers are
502 invoked to GL or GR. These designations and invocations can be
503 done independently. The most common case is that G0 is invoked to
504 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
505 these invocations and designations are omitted in a coded text.
506 With 7-bit environment, only GL can be used.
508 When a graphic character set of CHARS94 is invoked to GL, code 0x20
509 and 0x7F of GL area work as control characters SPACE and DEL
510 respectively, and code 0xA0 and 0xFF of GR area should not be used.
512 There are two ways of invocation: locking-shift and single-shift.
513 With locking-shift, the invocation lasts until the next different
514 invocation, whereas with single-shift, the invocation works only
515 for the following character and doesn't affect locking-shift.
516 Invocations are done by the following control characters or escape
519 ----------------------------------------------------------------------
520 function control char escape sequence description
521 ----------------------------------------------------------------------
522 SI (shift-in) 0x0F none invoke G0 to GL
523 SO (shift-out) 0x0E none invoke G1 to GL
524 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
525 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
526 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
527 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
528 ----------------------------------------------------------------------
529 The first four are for locking-shift. Control characters for these
530 functions are defined by macros ISO_CODE_XXX in `coding.h'.
532 Designations are done by the following escape sequences.
533 ----------------------------------------------------------------------
534 escape sequence description
535 ----------------------------------------------------------------------
536 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
537 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
538 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
539 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
540 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
541 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
542 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
543 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
544 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
545 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
546 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
547 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
548 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
549 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
550 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
551 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
552 ----------------------------------------------------------------------
554 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
555 of dimension 1, chars 94, and final character <F>, and so on.
557 Note (*): Although these designations are not allowed in ISO2022,
558 Emacs accepts them on decoding, and produces them on encoding
559 CHARS96 character set in a coding system which is characterized as
560 7-bit environment, non-locking-shift, and non-single-shift.
562 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
563 '(' can be omitted. We call this "short-form" hereafter.
565 Now you may notice that there are a lot of ways for encoding the
566 same multilingual text in ISO2022. Actually, there exist many
567 coding systems such as Compound Text (used in X's inter client
568 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
569 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
570 localized platforms), and all of these are variants of ISO2022.
572 In addition to the above, Emacs handles two more kinds of escape
573 sequences: ISO6429's direction specification and Emacs' private
574 sequence for specifying character composition.
576 ISO6429's direction specification takes the following format:
577 o CSI ']' -- end of the current direction
578 o CSI '0' ']' -- end of the current direction
579 o CSI '1' ']' -- start of left-to-right text
580 o CSI '2' ']' -- start of right-to-left text
581 The control character CSI (0x9B: control sequence introducer) is
582 abbreviated to the escape sequence ESC '[' in 7-bit environment.
584 Character composition specification takes the following format:
585 o ESC '0' -- start character composition
586 o ESC '1' -- end character composition
587 Since these are not standard escape sequences of any ISO, the use
588 of them for these meaning is restricted to Emacs only. */
590 enum iso_code_class_type iso_code_class
[256];
592 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
593 Check if a text is encoded in ISO2022. If it is, return an
594 integer in which any of the following flag bits:
595 CODING_CATEGORY_MASK_ISO_7
596 CODING_CATEGORY_MASK_ISO_8_1
597 CODING_CATEGORY_MASK_ISO_8_2
598 CODING_CATEGORY_MASK_ISO_ELSE
599 are set. If a code which should never appear in ISO2022 is found,
603 detect_coding_iso2022 (src
, src_end
)
604 unsigned char *src
, *src_end
;
606 int mask
= (CODING_CATEGORY_MASK_ISO_7
607 | CODING_CATEGORY_MASK_ISO_8_1
608 | CODING_CATEGORY_MASK_ISO_8_2
609 | CODING_CATEGORY_MASK_ISO_ELSE
);
610 int g1
= 0; /* 1 iff designating to G1. */
613 while (src
< src_end
)
623 && ((c
>= '(' && c
<= '/')
624 || c
== '$' && ((*src
>= '(' && *src
<= '/')
625 || (*src
>= '@' && *src
<= 'B'))))
627 /* Valid designation sequence. */
628 if (c
== ')' || (c
== '$' && *src
== ')'))
631 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
636 else if (c
== 'N' || c
== 'O' || c
== 'n' || c
== 'o')
637 return CODING_CATEGORY_MASK_ISO_ELSE
;
642 return CODING_CATEGORY_MASK_ISO_ELSE
;
648 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
660 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
661 while (src
< src_end
&& *src
>= 0xA0)
663 if (count
& 1 && src
< src_end
)
664 mask
&= ~CODING_CATEGORY_MASK_ISO_8_2
;
673 /* Decode a character of which charset is CHARSET and the 1st position
674 code is C1. If dimension of CHARSET is 2, the 2nd position code is
675 fetched from SRC and set to C2. If CHARSET is negative, it means
676 that we are decoding ill formed text, and what we can do is just to
679 #define DECODE_ISO_CHARACTER(charset, c1) \
681 int c_alt, charset_alt = (charset); \
682 if (COMPOSING_HEAD_P (coding->composing)) \
684 *dst++ = LEADING_CODE_COMPOSITION; \
685 if (COMPOSING_WITH_RULE_P (coding->composing)) \
686 /* To tell composition rules are embeded. */ \
688 coding->composing += 2; \
690 if ((charset) >= 0) \
692 if (CHARSET_DIMENSION (charset) == 2) \
693 ONE_MORE_BYTE (c2); \
694 if (!NILP (unification_table) \
695 && ((c_alt = unify_char (unification_table, \
696 -1, (charset), c1, c2)) >= 0)) \
697 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
699 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
700 DECODE_CHARACTER_ASCII (c1); \
701 else if (CHARSET_DIMENSION (charset_alt) == 1) \
702 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
704 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
705 if (COMPOSING_WITH_RULE_P (coding->composing)) \
706 /* To tell a composition rule follows. */ \
707 coding->composing = COMPOSING_WITH_RULE_RULE; \
710 /* Set designation state into CODING. */
711 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
713 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
714 make_number (chars), \
715 make_number (final_char)); \
718 if (coding->direction == 1 \
719 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
720 charset = CHARSET_REVERSE_CHARSET (charset); \
721 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
725 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
728 decode_coding_iso2022 (coding
, source
, destination
,
729 src_bytes
, dst_bytes
, consumed
)
730 struct coding_system
*coding
;
731 unsigned char *source
, *destination
;
732 int src_bytes
, dst_bytes
;
735 unsigned char *src
= source
;
736 unsigned char *src_end
= source
+ src_bytes
;
737 unsigned char *dst
= destination
;
738 unsigned char *dst_end
= destination
+ dst_bytes
;
739 /* Since the maximum bytes produced by each loop is 7, we subtract 6
740 from DST_END to assure that overflow checking is necessary only
741 at the head of loop. */
742 unsigned char *adjusted_dst_end
= dst_end
- 6;
744 /* Charsets invoked to graphic plane 0 and 1 respectively. */
745 int charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
746 int charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
747 Lisp_Object unification_table
748 = coding
->character_unification_table_for_decode
;
750 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
751 unification_table
= Vstandard_character_unification_table_for_decode
;
753 while (src
< src_end
&& dst
< adjusted_dst_end
)
755 /* SRC_BASE remembers the start position in source in each loop.
756 The loop will be exited when there's not enough source text
757 to analyze long escape sequence or 2-byte code (within macros
758 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
759 to SRC_BASE before exiting. */
760 unsigned char *src_base
= src
;
763 switch (iso_code_class
[c1
])
765 case ISO_0x20_or_0x7F
:
766 if (!coding
->composing
767 && (charset0
< 0 || CHARSET_CHARS (charset0
) == 94))
769 /* This is SPACE or DEL. */
773 /* This is a graphic character, we fall down ... */
775 case ISO_graphic_plane_0
:
776 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
778 /* This is a composition rule. */
780 coding
->composing
= COMPOSING_WITH_RULE_TAIL
;
783 DECODE_ISO_CHARACTER (charset0
, c1
);
786 case ISO_0xA0_or_0xFF
:
787 if (charset1
< 0 || CHARSET_CHARS (charset1
) == 94)
793 /* This is a graphic character, we fall down ... */
795 case ISO_graphic_plane_1
:
796 DECODE_ISO_CHARACTER (charset1
, c1
);
799 case ISO_control_code
:
800 /* All ISO2022 control characters in this class have the
801 same representation in Emacs internal format. */
805 case ISO_carriage_return
:
806 if (coding
->eol_type
== CODING_EOL_CR
)
810 else if (coding
->eol_type
== CODING_EOL_CRLF
)
813 if (c1
== ISO_CODE_LF
)
828 if (CODING_SPEC_ISO_DESIGNATION (coding
, 1) < 0)
829 goto label_invalid_escape_sequence
;
830 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 1;
831 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
835 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
836 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
839 case ISO_single_shift_2_7
:
840 case ISO_single_shift_2
:
841 /* SS2 is handled as an escape sequence of ESC 'N' */
843 goto label_escape_sequence
;
845 case ISO_single_shift_3
:
846 /* SS3 is handled as an escape sequence of ESC 'O' */
848 goto label_escape_sequence
;
850 case ISO_control_sequence_introducer
:
851 /* CSI is handled as an escape sequence of ESC '[' ... */
853 goto label_escape_sequence
;
857 label_escape_sequence
:
858 /* Escape sequences handled by Emacs are invocation,
859 designation, direction specification, and character
860 composition specification. */
863 case '&': /* revision of following character set */
865 if (!(c1
>= '@' && c1
<= '~'))
866 goto label_invalid_escape_sequence
;
868 if (c1
!= ISO_CODE_ESC
)
869 goto label_invalid_escape_sequence
;
871 goto label_escape_sequence
;
873 case '$': /* designation of 2-byte character set */
875 if (c1
>= '@' && c1
<= 'B')
876 { /* designation of JISX0208.1978, GB2312.1980,
878 DECODE_DESIGNATION (0, 2, 94, c1
);
880 else if (c1
>= 0x28 && c1
<= 0x2B)
881 { /* designation of DIMENSION2_CHARS94 character set */
883 DECODE_DESIGNATION (c1
- 0x28, 2, 94, c2
);
885 else if (c1
>= 0x2C && c1
<= 0x2F)
886 { /* designation of DIMENSION2_CHARS96 character set */
888 DECODE_DESIGNATION (c1
- 0x2C, 2, 96, c2
);
891 goto label_invalid_escape_sequence
;
894 case 'n': /* invocation of locking-shift-2 */
895 if (CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
896 goto label_invalid_escape_sequence
;
897 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 2;
898 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
901 case 'o': /* invocation of locking-shift-3 */
902 if (CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
903 goto label_invalid_escape_sequence
;
904 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 3;
905 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
908 case 'N': /* invocation of single-shift-2 */
909 if (CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
910 goto label_invalid_escape_sequence
;
912 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 2);
913 DECODE_ISO_CHARACTER (charset
, c1
);
916 case 'O': /* invocation of single-shift-3 */
917 if (CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
918 goto label_invalid_escape_sequence
;
920 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 3);
921 DECODE_ISO_CHARACTER (charset
, c1
);
924 case '0': /* start composing without embeded rules */
925 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
928 case '1': /* end composing */
929 coding
->composing
= COMPOSING_NO
;
932 case '2': /* start composing with embeded rules */
933 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
936 case '[': /* specification of direction */
937 /* For the moment, nested direction is not supported.
938 So, the value of `coding->direction' is 0 or 1: 0
939 means left-to-right, 1 means right-to-left. */
943 case ']': /* end of the current direction */
944 coding
->direction
= 0;
946 case '0': /* end of the current direction */
947 case '1': /* start of left-to-right direction */
950 coding
->direction
= 0;
952 goto label_invalid_escape_sequence
;
955 case '2': /* start of right-to-left direction */
958 coding
->direction
= 1;
960 goto label_invalid_escape_sequence
;
964 goto label_invalid_escape_sequence
;
969 if (c1
>= 0x28 && c1
<= 0x2B)
970 { /* designation of DIMENSION1_CHARS94 character set */
972 DECODE_DESIGNATION (c1
- 0x28, 1, 94, c2
);
974 else if (c1
>= 0x2C && c1
<= 0x2F)
975 { /* designation of DIMENSION1_CHARS96 character set */
977 DECODE_DESIGNATION (c1
- 0x2C, 1, 96, c2
);
981 goto label_invalid_escape_sequence
;
984 /* We must update these variables now. */
985 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
986 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
989 label_invalid_escape_sequence
:
991 int length
= src
- src_base
;
993 bcopy (src_base
, dst
, length
);
1000 coding
->carryover_size
= src
- src_base
;
1001 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1006 /* If this is the last block of the text to be decoded, we had
1007 better just flush out all remaining codes in the text although
1008 they are not valid characters. */
1009 if (coding
->last_block
)
1011 bcopy (src
, dst
, src_end
- src
);
1012 dst
+= (src_end
- src
);
1015 *consumed
= src
- source
;
1016 return dst
- destination
;
1019 /* ISO2022 encoding stuff. */
1022 It is not enough to say just "ISO2022" on encoding, we have to
1023 specify more details. In Emacs, each coding-system of ISO2022
1024 variant has the following specifications:
1025 1. Initial designation to G0 thru G3.
1026 2. Allows short-form designation?
1027 3. ASCII should be designated to G0 before control characters?
1028 4. ASCII should be designated to G0 at end of line?
1029 5. 7-bit environment or 8-bit environment?
1030 6. Use locking-shift?
1031 7. Use Single-shift?
1032 And the following two are only for Japanese:
1033 8. Use ASCII in place of JIS0201-1976-Roman?
1034 9. Use JISX0208-1983 in place of JISX0208-1978?
1035 These specifications are encoded in `coding->flags' as flag bits
1036 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1040 /* Produce codes (escape sequence) for designating CHARSET to graphic
1041 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1042 the coding system CODING allows, produce designation sequence of
1045 #define ENCODE_DESIGNATION(charset, reg, coding) \
1047 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1048 char *intermediate_char_94 = "()*+"; \
1049 char *intermediate_char_96 = ",-./"; \
1051 = Fassq (make_number (charset), Vcharset_revision_alist); \
1052 if (! NILP (temp)) \
1054 *dst++ = ISO_CODE_ESC; \
1056 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1058 *dst++ = ISO_CODE_ESC; \
1059 if (CHARSET_DIMENSION (charset) == 1) \
1061 if (CHARSET_CHARS (charset) == 94) \
1062 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1064 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1069 if (CHARSET_CHARS (charset) == 94) \
1071 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1073 || final_char < '@' || final_char > 'B') \
1074 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1077 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1079 *dst++ = final_char; \
1080 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1083 /* The following two macros produce codes (control character or escape
1084 sequence) for ISO2022 single-shift functions (single-shift-2 and
1087 #define ENCODE_SINGLE_SHIFT_2 \
1089 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1090 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1092 *dst++ = ISO_CODE_SS2; \
1093 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1096 #define ENCODE_SINGLE_SHIFT_3 \
1098 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1099 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1101 *dst++ = ISO_CODE_SS3; \
1102 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1105 /* The following four macros produce codes (control character or
1106 escape sequence) for ISO2022 locking-shift functions (shift-in,
1107 shift-out, locking-shift-2, and locking-shift-3). */
1109 #define ENCODE_SHIFT_IN \
1111 *dst++ = ISO_CODE_SI; \
1112 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1115 #define ENCODE_SHIFT_OUT \
1117 *dst++ = ISO_CODE_SO; \
1118 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1121 #define ENCODE_LOCKING_SHIFT_2 \
1123 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1124 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1127 #define ENCODE_LOCKING_SHIFT_3 \
1129 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1130 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1133 /* Produce codes for a DIMENSION1 character whose character set is
1134 CHARSET and whose position-code is C1. Designation and invocation
1135 sequences are also produced in advance if necessary. */
1138 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1140 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1142 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1143 *dst++ = c1 & 0x7F; \
1145 *dst++ = c1 | 0x80; \
1146 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1149 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1151 *dst++ = c1 & 0x7F; \
1154 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1156 *dst++ = c1 | 0x80; \
1160 /* Since CHARSET is not yet invoked to any graphic planes, we \
1161 must invoke it, or, at first, designate it to some graphic \
1162 register. Then repeat the loop to actually produce the \
1164 dst = encode_invocation_designation (charset, coding, dst); \
1167 /* Produce codes for a DIMENSION2 character whose character set is
1168 CHARSET and whose position-codes are C1 and C2. Designation and
1169 invocation codes are also produced in advance if necessary. */
1171 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1173 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1175 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1176 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1178 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1179 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1182 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1184 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1187 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1189 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1193 /* Since CHARSET is not yet invoked to any graphic planes, we \
1194 must invoke it, or, at first, designate it to some graphic \
1195 register. Then repeat the loop to actually produce the \
1197 dst = encode_invocation_designation (charset, coding, dst); \
1200 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1202 int c_alt, charset_alt; \
1203 if (!NILP (unification_table) \
1204 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1206 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1208 charset_alt = charset; \
1209 if (CHARSET_DIMENSION (charset_alt) == 1) \
1210 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1212 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1215 /* Produce designation and invocation codes at a place pointed by DST
1216 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1220 encode_invocation_designation (charset
, coding
, dst
)
1222 struct coding_system
*coding
;
1225 int reg
; /* graphic register number */
1227 /* At first, check designations. */
1228 for (reg
= 0; reg
< 4; reg
++)
1229 if (charset
== CODING_SPEC_ISO_DESIGNATION (coding
, reg
))
1234 /* CHARSET is not yet designated to any graphic registers. */
1235 /* At first check the requested designation. */
1236 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1237 if (reg
== CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1238 /* Since CHARSET requests no special designation, designate it
1239 to graphic register 0. */
1242 ENCODE_DESIGNATION (charset
, reg
, coding
);
1245 if (CODING_SPEC_ISO_INVOCATION (coding
, 0) != reg
1246 && CODING_SPEC_ISO_INVOCATION (coding
, 1) != reg
)
1248 /* Since the graphic register REG is not invoked to any graphic
1249 planes, invoke it to graphic plane 0. */
1252 case 0: /* graphic register 0 */
1256 case 1: /* graphic register 1 */
1260 case 2: /* graphic register 2 */
1261 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1262 ENCODE_SINGLE_SHIFT_2
;
1264 ENCODE_LOCKING_SHIFT_2
;
1267 case 3: /* graphic register 3 */
1268 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1269 ENCODE_SINGLE_SHIFT_3
;
1271 ENCODE_LOCKING_SHIFT_3
;
/* The following three macros produce codes for indicating composition:
   the start of a composition without rule, the start of a composition
   with rule, and the end of a composition.  Each one stores two bytes
   at DST and advances DST past them.  The expansions are parenthesized
   so that each macro behaves as one expression wherever it is used.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
/* The following three macros produce codes for indicating the
   direction of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER stores the control sequence
   introducer at DST: the two bytes ESC '[' when the coding system has
   the flag bit CODING_FLAG_ISO_SEVEN_BITS set, otherwise the single
   byte ISO_CODE_CSI.  The flag is tested with `&' (as the other flag
   tests in this file do) so that the test also holds when further
   flag bits are set.  The macro is a single expression rather than an
   `if' statement so that the two direction macros below can chain
   more stores onto it with the comma operator.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                      \
  ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
   ? (*dst++ = ISO_CODE_ESC, *dst++ = '[')                      \
   : (*dst++ = ISO_CODE_CSI))

/* Introducer followed by '2' ']': start of right-to-left text.  */
#define ENCODE_DIRECTION_R2L    \
  (ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']')

/* Introducer followed by '0' ']': start of left-to-right text.  */
#define ENCODE_DIRECTION_L2R    \
  (ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']')
1299 /* Produce codes for designation and invocation to reset the graphic
1300 planes and registers to initial state. */
1301 #define ENCODE_RESET_PLANE_AND_REGISTER \
1304 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1306 for (reg = 0; reg < 4; reg++) \
1307 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1308 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1309 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1310 ENCODE_DESIGNATION \
1311 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1314 /* Produce designation sequences of charsets in the line started from
1315 *SRC to a place pointed by DSTP.
1317 If the current block ends before any end-of-line, we may fail to
1318 find all the necessary designations. */
1319 encode_designation_at_bol (coding
, table
, src
, src_end
, dstp
)
1320 struct coding_system
*coding
;
1322 unsigned char *src
, *src_end
, **dstp
;
1324 int charset
, c
, found
= 0, reg
;
1325 /* Table of charsets to be designated to each graphic register. */
1327 unsigned char *dst
= *dstp
;
1329 for (reg
= 0; reg
< 4; reg
++)
1332 while (src
< src_end
&& *src
!= '\n' && found
< 4)
1334 int bytes
= BYTES_BY_CHAR_HEAD (*src
);
1337 charset
= CHARSET_AT (src
);
1342 SPLIT_STRING(src
, bytes
, charset
, c1
, c2
);
1343 if ((c_alt
= unify_char (table
, -1, charset
, c1
, c2
)) >= 0)
1344 charset
= CHAR_CHARSET (c_alt
);
1347 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1348 if (r
[reg
] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1359 for (reg
= 0; reg
< 4; reg
++)
1361 && CODING_SPEC_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
1362 ENCODE_DESIGNATION (r
[reg
], reg
, coding
);
1367 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1370 encode_coding_iso2022 (coding
, source
, destination
,
1371 src_bytes
, dst_bytes
, consumed
)
1372 struct coding_system
*coding
;
1373 unsigned char *source
, *destination
;
1374 int src_bytes
, dst_bytes
;
1377 unsigned char *src
= source
;
1378 unsigned char *src_end
= source
+ src_bytes
;
1379 unsigned char *dst
= destination
;
1380 unsigned char *dst_end
= destination
+ dst_bytes
;
1381 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1382 from DST_END to assure overflow checking is necessary only at the
1384 unsigned char *adjusted_dst_end
= dst_end
- 19;
1385 Lisp_Object unification_table
1386 = coding
->character_unification_table_for_encode
;
1388 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1389 unification_table
= Vstandard_character_unification_table_for_encode
;
1391 while (src
< src_end
&& dst
< adjusted_dst_end
)
1393 /* SRC_BASE remembers the start position in source in each loop.
1394 The loop will be exited when there's not enough source text
1395 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1396 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1397 reset to SRC_BASE before exiting. */
1398 unsigned char *src_base
= src
;
1399 int charset
, c1
, c2
, c3
, c4
;
1401 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
1402 && CODING_SPEC_ISO_BOL (coding
))
1404 /* We have to produce designation sequences if any now. */
1405 encode_designation_at_bol (coding
, unification_table
,
1406 src
, src_end
, &dst
);
1407 CODING_SPEC_ISO_BOL (coding
) = 0;
1411 /* If we are seeing a component of a composite character, we are
1412 seeing a leading-code specially encoded for composition, or a
1413 composition rule if composing with rule. We must set C1
1414 to a normal leading-code or an ASCII code. If we are not at
1415 a composed character, we must reset the composition state. */
1416 if (COMPOSING_P (coding
->composing
))
1420 /* We are not in a composite character any longer. */
1421 coding
->composing
= COMPOSING_NO
;
1422 ENCODE_COMPOSITION_END
;
1426 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1429 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1432 else if (coding
->composing
== COMPOSING_WITH_RULE_HEAD
)
1433 coding
->composing
= COMPOSING_WITH_RULE_RULE
;
1436 /* This is an ASCII component. */
1441 /* This is a leading-code of non ASCII component. */
1446 /* Now encode one character. C1 is a control character, an
1447 ASCII character, or a leading-code of multi-byte character. */
1448 switch (emacs_code_class
[c1
])
1450 case EMACS_ascii_code
:
1451 ENCODE_ISO_CHARACTER (CHARSET_ASCII
, c1
, /* dummy */ c2
);
1454 case EMACS_control_code
:
1455 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1456 ENCODE_RESET_PLANE_AND_REGISTER
;
1460 case EMACS_carriage_return_code
:
1461 if (!coding
->selective
)
1463 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1464 ENCODE_RESET_PLANE_AND_REGISTER
;
1468 /* fall through to treat '\r' as '\n' ... */
1470 case EMACS_linefeed_code
:
1471 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_EOL
)
1472 ENCODE_RESET_PLANE_AND_REGISTER
;
1473 if (coding
->flags
& CODING_FLAG_ISO_INIT_AT_BOL
)
1474 bcopy (coding
->spec
.iso2022
.initial_designation
,
1475 coding
->spec
.iso2022
.current_designation
,
1476 sizeof coding
->spec
.iso2022
.initial_designation
);
1477 if (coding
->eol_type
== CODING_EOL_LF
1478 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
1479 *dst
++ = ISO_CODE_LF
;
1480 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1481 *dst
++ = ISO_CODE_CR
, *dst
++ = ISO_CODE_LF
;
1483 *dst
++ = ISO_CODE_CR
;
1484 CODING_SPEC_ISO_BOL (coding
) = 1;
1487 case EMACS_leading_code_2
:
1489 ENCODE_ISO_CHARACTER (c1
, c2
, /* dummy */ c3
);
1492 case EMACS_leading_code_3
:
1493 TWO_MORE_BYTES (c2
, c3
);
1494 if (c1
< LEADING_CODE_PRIVATE_11
)
1495 ENCODE_ISO_CHARACTER (c1
, c2
, c3
);
1497 ENCODE_ISO_CHARACTER (c2
, c3
, /* dummy */ c4
);
1500 case EMACS_leading_code_4
:
1501 THREE_MORE_BYTES (c2
, c3
, c4
);
1502 ENCODE_ISO_CHARACTER (c2
, c3
, c4
);
1505 case EMACS_leading_code_composition
:
1509 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1510 ENCODE_COMPOSITION_WITH_RULE_START
;
1514 /* Rewind one byte because it is a character code of
1515 composition elements. */
1517 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
1518 ENCODE_COMPOSITION_NO_RULE_START
;
1522 case EMACS_invalid_code
:
1528 coding
->carryover_size
= src
- src_base
;
1529 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1533 /* If this is the last block of the text to be encoded, we must
1534 reset graphic planes and registers to the initial state. */
1535 if (src
>= src_end
&& coding
->last_block
)
1537 ENCODE_RESET_PLANE_AND_REGISTER
;
1538 if (coding
->carryover_size
> 0
1539 && coding
->carryover_size
< (dst_end
- dst
))
1541 bcopy (coding
->carryover
, dst
, coding
->carryover_size
);
1542 dst
+= coding
->carryover_size
;
1543 coding
->carryover_size
= 0;
1546 *consumed
= src
- source
;
1547 return dst
- destination
;
1551 /*** 4. SJIS and BIG5 handlers ***/
1553 /* Although SJIS and BIG5 are not ISO coding systems, they are used
1554 quite widely. So, for the moment, Emacs supports them in the bare
1555 C code. But, in the future, they may be supported only by CCL. */
1557 /* SJIS is a coding system encoding three character sets: ASCII, right
1558 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1559 as is. A character of charset katakana-jisx0201 is encoded by
1560 "position-code + 0x80". A character of charset japanese-jisx0208
1561 is encoded in 2 bytes, but the two position-codes are divided and
1562 shifted so that they fit in the ranges below.
1564 --- CODE RANGE of SJIS ---
1565 (character set) (range)
1567 KATAKANA-JISX0201 0xA0 .. 0xDF
1568 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1569 (2nd byte) 0x40 .. 0xFF
1570 -------------------------------
1574 /* BIG5 is a coding system encoding two character sets: ASCII and
1575 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1576 character set and is encoded in two bytes.
1578 --- CODE RANGE of BIG5 ---
1579 (character set) (range)
1581 Big5 (1st byte) 0xA1 .. 0xFE
1582 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1583 --------------------------
1585 Since the number of characters in Big5 is larger than maximum
1586 characters in Emacs' charset (96x96), it can't be handled as one
1587 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1588 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1589 contains frequently used characters and the latter contains less
1590 frequently used characters. */
1592 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1593 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1594 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
1595 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
/* Number of Big5 characters which have the same code in 1st byte:
   2nd bytes 0x40..0x7E plus 2nd bytes 0xA1..0xFE.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Decode the BIG5 byte pair B1, B2.  Store `charset_big5_1' or
   `charset_big5_2' in CHARSET and the two position-codes in C1 and C2.
   Rows starting at 1st byte 0xC9 belong to `charset_big5_2'; this is
   the inverse of the offset that ENCODE_BIG5 adds for that charset.
   B1 and B2 are each evaluated exactly once, so expressions may be
   passed for them.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    int big5_b1 = (b1);                                                 \
    int big5_b2 = (b2);                                                 \
    /* Linear index of the character within the whole Big5 table.  */   \
    unsigned int temp                                                   \
      = ((big5_b1 - 0xA1) * BIG5_SAME_ROW                               \
         + big5_b2 - (big5_b2 < 0x7F ? 0x40 : 0x62));                   \
    if (big5_b1 < 0xC9)                                                 \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Encode the character of CHARSET (`charset_big5_1' or
   `charset_big5_2') whose position-codes are C1 and C2 into the BIG5
   byte pair B1, B2.  All arguments are parenthesized in the expansion,
   so expressions such as `c & 0x7F' may be passed for C1 and C2.  B1
   and B2 must be lvalues.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* Skip the gap between the two 2nd-byte ranges.  */                \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
1625 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1627 int c_alt, charset_alt = (charset); \
1628 if (!NILP (unification_table) \
1629 && ((c_alt = unify_char (unification_table, \
1630 -1, (charset), c1, c2)) >= 0)) \
1631 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1632 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
1633 DECODE_CHARACTER_ASCII (c1); \
1634 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1635 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
1637 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1640 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1642 int c_alt, charset_alt; \
1643 if (!NILP (unification_table) \
1644 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1646 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1648 charset_alt = charset; \
1649 if (charset_alt == charset_ascii) \
1651 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1653 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
1656 *dst++ = charset_alt, *dst++ = c1; \
1660 c1 &= 0x7F, c2 &= 0x7F; \
1661 if (sjis_p && charset_alt == charset_jisx0208) \
1663 unsigned char s1, s2; \
1665 ENCODE_SJIS (c1, c2, s1, s2); \
1666 *dst++ = s1, *dst++ = s2; \
1669 && (charset_alt == charset_big5_1 \
1670 || charset_alt == charset_big5_2)) \
1672 unsigned char b1, b2; \
1674 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
1675 *dst++ = b1, *dst++ = b2; \
1678 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
1682 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1683 Check if a text is encoded in SJIS. If it is, return
1684 CODING_CATEGORY_MASK_SJIS, else return 0. */
1687 detect_coding_sjis (src
, src_end
)
1688 unsigned char *src
, *src_end
;
1692 while (src
< src_end
)
1695 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
1697 if ((c
>= 0x80 && c
< 0xA0) || c
>= 0xE0)
1699 if (src
< src_end
&& *src
++ < 0x40)
1703 return CODING_CATEGORY_MASK_SJIS
;
1706 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1707 Check if a text is encoded in BIG5. If it is, return
1708 CODING_CATEGORY_MASK_BIG5, else return 0. */
1711 detect_coding_big5 (src
, src_end
)
1712 unsigned char *src
, *src_end
;
1716 while (src
< src_end
)
1719 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
1726 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
1730 return CODING_CATEGORY_MASK_BIG5
;
1733 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1734 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
1737 decode_coding_sjis_big5 (coding
, source
, destination
,
1738 src_bytes
, dst_bytes
, consumed
, sjis_p
)
1739 struct coding_system
*coding
;
1740 unsigned char *source
, *destination
;
1741 int src_bytes
, dst_bytes
;
1745 unsigned char *src
= source
;
1746 unsigned char *src_end
= source
+ src_bytes
;
1747 unsigned char *dst
= destination
;
1748 unsigned char *dst_end
= destination
+ dst_bytes
;
1749 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1750 from DST_END to assure overflow checking is necessary only at the
1752 unsigned char *adjusted_dst_end
= dst_end
- 3;
1753 Lisp_Object unification_table
1754 = coding
->character_unification_table_for_decode
;
1756 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1757 unification_table
= Vstandard_character_unification_table_for_decode
;
1759 while (src
< src_end
&& dst
< adjusted_dst_end
)
1761 /* SRC_BASE remembers the start position in source in each loop.
1762 The loop will be exited when there's not enough source text
1763 to analyze two-byte character (within macro ONE_MORE_BYTE).
1764 In that case, SRC is reset to SRC_BASE before exiting. */
1765 unsigned char *src_base
= src
;
1766 unsigned char c1
= *src
++, c2
, c3
, c4
;
1770 if (coding
->eol_type
== CODING_EOL_CRLF
)
1776 /* To process C2 again, SRC is subtracted by 1. */
1785 DECODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
1786 else if (c1
< 0xA0 || c1
>= 0xE0)
1788 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1792 DECODE_SJIS (c1
, c2
, c3
, c4
);
1793 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208
, c3
, c4
);
1795 else if (c1
>= 0xE0 && c1
< 0xFF)
1800 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
1801 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
1803 else /* Invalid code */
1808 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1810 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201
, c1
, /* dummy */ c2
);
1816 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
1817 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
1823 coding
->carryover_size
= src
- src_base
;
1824 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1829 *consumed
= src
- source
;
1830 return dst
- destination
;
1833 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1834 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1835 `charset_jisx0208', `charset_big5_1', and `charset_big5_2'. We are
1836 sure that all these charsets are registered as official charsets
1837 (i.e. do not have extended leading-codes). Characters of other
1838 charsets are produced without any encoding. If SJIS_P is 1, encode
1839 SJIS text, else encode BIG5 text. */
1842 encode_coding_sjis_big5 (coding
, source
, destination
,
1843 src_bytes
, dst_bytes
, consumed
, sjis_p
)
1844 struct coding_system
*coding
;
1845 unsigned char *source
, *destination
;
1846 int src_bytes
, dst_bytes
;
1850 unsigned char *src
= source
;
1851 unsigned char *src_end
= source
+ src_bytes
;
1852 unsigned char *dst
= destination
;
1853 unsigned char *dst_end
= destination
+ dst_bytes
;
1854 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1855 from DST_END to assure overflow checking is necessary only at the
1857 unsigned char *adjusted_dst_end
= dst_end
- 1;
1858 Lisp_Object unification_table
1859 = coding
->character_unification_table_for_encode
;
1861 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1862 unification_table
= Vstandard_character_unification_table_for_encode
;
1864 while (src
< src_end
&& dst
< adjusted_dst_end
)
1866 /* SRC_BASE remembers the start position in source in each loop.
1867 The loop will be exited when there's not enough source text
1868 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1869 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1871 unsigned char *src_base
= src
;
1872 unsigned char c1
= *src
++, c2
, c3
, c4
;
1874 if (coding
->composing
)
1881 else if (c1
>= 0xA0)
1884 coding
->composing
= 0;
1887 switch (emacs_code_class
[c1
])
1889 case EMACS_ascii_code
:
1890 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
1893 case EMACS_control_code
:
1897 case EMACS_carriage_return_code
:
1898 if (!coding
->selective
)
1903 /* fall through to treat '\r' as '\n' ... */
1905 case EMACS_linefeed_code
:
1906 if (coding
->eol_type
== CODING_EOL_LF
1907 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
1909 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1910 *dst
++ = '\r', *dst
++ = '\n';
1915 case EMACS_leading_code_2
:
1917 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, /* dummy */ c3
);
1920 case EMACS_leading_code_3
:
1921 TWO_MORE_BYTES (c2
, c3
);
1922 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, c3
);
1925 case EMACS_leading_code_4
:
1926 THREE_MORE_BYTES (c2
, c3
, c4
);
1927 ENCODE_SJIS_BIG5_CHARACTER (c2
, c3
, c4
);
1930 case EMACS_leading_code_composition
:
1931 coding
->composing
= 1;
1934 default: /* i.e. case EMACS_invalid_code: */
1940 coding
->carryover_size
= src
- src_base
;
1941 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1946 *consumed
= src
- source
;
1947 return dst
- destination
;
1951 /*** 5. End-of-line handlers ***/
1953 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1954 This function is called only when `coding->eol_type' is
1955 CODING_EOL_CRLF or CODING_EOL_CR. */
1957 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
1958 struct coding_system
*coding
;
1959 unsigned char *source
, *destination
;
1960 int src_bytes
, dst_bytes
;
1963 unsigned char *src
= source
;
1964 unsigned char *src_end
= source
+ src_bytes
;
1965 unsigned char *dst
= destination
;
1966 unsigned char *dst_end
= destination
+ dst_bytes
;
1969 switch (coding
->eol_type
)
1971 case CODING_EOL_CRLF
:
1973 /* Since the maximum bytes produced by each loop is 2, we
1974 subtract 1 from DST_END to assure overflow checking is
1975 necessary only at the head of loop. */
1976 unsigned char *adjusted_dst_end
= dst_end
- 1;
1978 while (src
< src_end
&& dst
< adjusted_dst_end
)
1980 unsigned char *src_base
= src
;
1981 unsigned char c
= *src
++;
1994 coding
->carryover_size
= src
- src_base
;
1995 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1999 *consumed
= src
- source
;
2000 produced
= dst
- destination
;
2005 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2006 bcopy (source
, destination
, produced
);
2007 dst_end
= destination
+ produced
;
2008 while (dst
< dst_end
)
2009 if (*dst
++ == '\r') dst
[-1] = '\n';
2010 *consumed
= produced
;
2013 default: /* i.e. case: CODING_EOL_LF */
2014 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2015 bcopy (source
, destination
, produced
);
2016 *consumed
= produced
;
2023 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2024 format of end-of-line according to `coding->eol_type'. If
2025 `coding->selective' is 1, code '\r' in source text also means
2028 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2029 struct coding_system
*coding
;
2030 unsigned char *source
, *destination
;
2031 int src_bytes
, dst_bytes
;
2034 unsigned char *src
= source
;
2035 unsigned char *dst
= destination
;
2041 switch (coding
->eol_type
)
2044 case CODING_EOL_UNDECIDED
:
2045 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2046 bcopy (source
, destination
, produced
);
2047 if (coding
->selective
)
2051 if (*dst
++ == '\r') dst
[-1] = '\n';
2053 *consumed
= produced
;
2055 case CODING_EOL_CRLF
:
2058 unsigned char *src_end
= source
+ src_bytes
;
2059 unsigned char *dst_end
= destination
+ dst_bytes
;
2060 /* Since the maximum bytes produced by each loop is 2, we
2061 subtract 1 from DST_END to assure overflow checking is
2062 necessary only at the head of loop. */
2063 unsigned char *adjusted_dst_end
= dst_end
- 1;
2065 while (src
< src_end
&& dst
< adjusted_dst_end
)
2068 if (c
== '\n' || (c
== '\r' && coding
->selective
))
2069 *dst
++ = '\r', *dst
++ = '\n';
2073 produced
= dst
- destination
;
2074 *consumed
= src
- source
;
2078 default: /* i.e. case CODING_EOL_CR: */
2079 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2080 bcopy (source
, destination
, produced
);
2084 if (*dst
++ == '\n') dst
[-1] = '\r';
2086 *consumed
= produced
;
2093 /*** 6. C library functions ***/
2095 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2096 has a property `coding-system'. The value of this property is a
2097 vector of length 5 (called a coding-vector). Among elements of
2098 this vector, the first (element[0]) and the fifth (element[4])
2099 carry important information for decoding/encoding. Before
2100 decoding/encoding, this information should be set in fields of a
2101 structure of type `coding_system'.
2103 A value of property `coding-system' can be a symbol of another
2104 subsidiary coding-system. In that case, Emacs gets coding-vector
2107 `element[0]' contains information to be set in `coding->type'. The
2108 values and their meanings are as follows:
2110 0 -- coding_type_emacs_mule
2111 1 -- coding_type_sjis
2112 2 -- coding_type_iso2022
2113 3 -- coding_type_big5
2114 4 -- coding_type_ccl encoder/decoder written in CCL
2115 nil -- coding_type_no_conversion
2116 t -- coding_type_undecided (automatic conversion on decoding,
2117 no-conversion on encoding)
2119 `element[4]' contains information to be set in `coding->flags' and
2120 `coding->spec'. The meaning varies by `coding->type'.
2122 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2123 of length 32 (of which the first 13 sub-elements are used now).
2124 Meanings of these sub-elements are:
2126 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2127 If the value is an integer of valid charset, the charset is
2128 assumed to be designated to graphic register N initially.
2130 If the value is negative, it is the negated value of a charset which
2131 reserves graphic register N, which means that the charset is
2132 not designated initially but should be designated to graphic
2133 register N just before encoding a character in that charset.
2135 If the value is nil, graphic register N is never used on
2138 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2139 Each value takes t or nil. See the section ISO2022 of
2140 `coding.h' for more information.
2142 If `coding->type' is `coding_type_big5', element[4] is t to denote
2143 BIG5-ETen or nil to denote BIG5-HKU.
2145 If `coding->type' takes the other value, element[4] is ignored.
2147 Emacs Lisp's coding system also carries information about format of
2148 end-of-line in a value of property `eol-type'. If the value is
2149 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2150 means CODING_EOL_CR. If it is not integer, it should be a vector
2151 of subsidiary coding systems of which property `eol-type' has one
2156 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2157 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2158 is set up so that no conversion is necessary and return -1, else
2162 setup_coding_system (coding_system
, coding
)
2163 Lisp_Object coding_system
;
2164 struct coding_system
*coding
;
2166 Lisp_Object type
, eol_type
;
2168 /* At first, set several fields to default values. */
2169 coding
->require_flushing
= 0;
2170 coding
->last_block
= 0;
2171 coding
->selective
= 0;
2172 coding
->composing
= 0;
2173 coding
->direction
= 0;
2174 coding
->carryover_size
= 0;
2175 coding
->post_read_conversion
= coding
->pre_write_conversion
= Qnil
;
2176 coding
->character_unification_table_for_decode
= Qnil
;
2177 coding
->character_unification_table_for_encode
= Qnil
;
2179 Vlast_coding_system_used
= coding
->symbol
= coding_system
;
2181 /* Get value of property `coding-system' until we get a vector.
2182 While doing that, also get values of properties
2183 `post-read-conversion', `pre-write-conversion',
2184 `character-unification-table-for-decode',
2185 `character-unification-table-for-encode' and `eol-type'. */
2186 while (!NILP (coding_system
) && SYMBOLP (coding_system
))
2188 if (NILP (coding
->post_read_conversion
))
2189 coding
->post_read_conversion
= Fget (coding_system
,
2190 Qpost_read_conversion
);
2191 if (NILP (coding
->pre_write_conversion
))
2192 coding
->pre_write_conversion
= Fget (coding_system
,
2193 Qpre_write_conversion
);
2194 if (!inhibit_eol_conversion
&& NILP (eol_type
))
2195 eol_type
= Fget (coding_system
, Qeol_type
);
2197 if (NILP (coding
->character_unification_table_for_decode
))
2198 coding
->character_unification_table_for_decode
2199 = Fget (coding_system
, Qcharacter_unification_table_for_decode
);
2201 if (NILP (coding
->character_unification_table_for_encode
))
2202 coding
->character_unification_table_for_encode
2203 = Fget (coding_system
, Qcharacter_unification_table_for_encode
);
2205 coding_system
= Fget (coding_system
, Qcoding_system
);
2208 while (!NILP (coding
->character_unification_table_for_decode
)
2209 && SYMBOLP (coding
->character_unification_table_for_decode
))
2210 coding
->character_unification_table_for_decode
2211 = Fget (coding
->character_unification_table_for_decode
,
2212 Qcharacter_unification_table_for_decode
);
2213 if (!NILP (coding
->character_unification_table_for_decode
)
2214 && !CHAR_TABLE_P (coding
->character_unification_table_for_decode
))
2215 coding
->character_unification_table_for_decode
= Qnil
;
2217 while (!NILP (coding
->character_unification_table_for_encode
)
2218 && SYMBOLP (coding
->character_unification_table_for_encode
))
2219 coding
->character_unification_table_for_encode
2220 = Fget (coding
->character_unification_table_for_encode
,
2221 Qcharacter_unification_table_for_encode
);
2222 if (!NILP (coding
->character_unification_table_for_encode
)
2223 && !CHAR_TABLE_P (coding
->character_unification_table_for_encode
))
2224 coding
->character_unification_table_for_encode
= Qnil
;
2226 if (!VECTORP (coding_system
)
2227 || XVECTOR (coding_system
)->size
!= 5)
2228 goto label_invalid_coding_system
;
2230 if (VECTORP (eol_type
))
2231 coding
->eol_type
= CODING_EOL_UNDECIDED
;
2232 else if (XFASTINT (eol_type
) == 1)
2233 coding
->eol_type
= CODING_EOL_CRLF
;
2234 else if (XFASTINT (eol_type
) == 2)
2235 coding
->eol_type
= CODING_EOL_CR
;
2237 coding
->eol_type
= CODING_EOL_LF
;
2239 type
= XVECTOR (coding_system
)->contents
[0];
2240 switch (XFASTINT (type
))
2243 coding
->type
= coding_type_emacs_mule
;
2247 coding
->type
= coding_type_sjis
;
2251 coding
->type
= coding_type_iso2022
;
2253 Lisp_Object val
= XVECTOR (coding_system
)->contents
[4];
2255 int i
, charset
, default_reg_bits
= 0;
2257 if (!VECTORP (val
) || XVECTOR (val
)->size
!= 32)
2258 goto label_invalid_coding_system
;
2260 flags
= XVECTOR (val
)->contents
;
2262 = ((NILP (flags
[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM
)
2263 | (NILP (flags
[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL
)
2264 | (NILP (flags
[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL
)
2265 | (NILP (flags
[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS
)
2266 | (NILP (flags
[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT
)
2267 | (NILP (flags
[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT
)
2268 | (NILP (flags
[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN
)
2269 | (NILP (flags
[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS
)
2270 | (NILP (flags
[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION
)
2271 | (NILP (flags
[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL
)
2272 | (NILP (flags
[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL
));
2274 /* Invoke graphic register 0 to plane 0. */
2275 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
2276 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2277 CODING_SPEC_ISO_INVOCATION (coding
, 1)
2278 = (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
? -1 : 1);
2279 /* Not single shifting at first. */
2280 CODING_SPEC_ISO_SINGLE_SHIFTING(coding
) = 0;
2281 /* Beginning of buffer should also be regarded as bol. */
2282 CODING_SPEC_ISO_BOL(coding
) = 1;
2284 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2285 FLAGS[REG] can be one of below:
2286 integer CHARSET: CHARSET occupies register I,
2287 t: designate nothing to REG initially, but can be used
2289 list of integer, nil, or t: designate the first
2290 element (if integer) to REG initially, the remaining
2291 elements (if integer) is designated to REG on request,
2292 if an element is t, REG can be used by any charset,
2293 nil: REG is never used. */
2294 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2295 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2296 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
;
2297 for (i
= 0; i
< 4; i
++)
2299 if (INTEGERP (flags
[i
])
2300 && (charset
= XINT (flags
[i
]), CHARSET_VALID_P (charset
))
2301 || (charset
= get_charset_id (flags
[i
])) >= 0)
2303 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2304 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) = i
;
2306 else if (EQ (flags
[i
], Qt
))
2308 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2309 default_reg_bits
|= 1 << i
;
2311 else if (CONSP (flags
[i
]))
2313 Lisp_Object tail
= flags
[i
];
2315 if (INTEGERP (XCONS (tail
)->car
)
2316 && (charset
= XINT (XCONS (tail
)->car
),
2317 CHARSET_VALID_P (charset
))
2318 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2320 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2321 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) =i
;
2324 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2325 tail
= XCONS (tail
)->cdr
;
2326 while (CONSP (tail
))
2328 if (INTEGERP (XCONS (tail
)->car
)
2329 && (charset
= XINT (XCONS (tail
)->car
),
2330 CHARSET_VALID_P (charset
))
2331 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2332 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2334 else if (EQ (XCONS (tail
)->car
, Qt
))
2335 default_reg_bits
|= 1 << i
;
2336 tail
= XCONS (tail
)->cdr
;
2340 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2342 CODING_SPEC_ISO_DESIGNATION (coding
, i
)
2343 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
);
2346 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
2348 /* REG 1 can be used only by locking shift in 7-bit env. */
2349 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
2350 default_reg_bits
&= ~2;
2351 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
2352 /* Without any shifting, only REG 0 and 1 can be used. */
2353 default_reg_bits
&= 3;
2356 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2357 if (CHARSET_VALID_P (charset
)
2358 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2359 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
))
2361 /* We have not yet decided where to designate CHARSET. */
2362 int reg_bits
= default_reg_bits
;
2364 if (CHARSET_CHARS (charset
) == 96)
2365 /* A charset of CHARS96 can't be designated to REG 0. */
2369 /* There exist some default graphic register. */
2370 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2372 ? 0 : (reg_bits
& 2 ? 1 : (reg_bits
& 4 ? 2 : 3)));
2374 /* We anyway have to designate CHARSET to somewhere. */
2375 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2376 = (CHARSET_CHARS (charset
) == 94
2378 : ((coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
2379 || ! coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
2381 : (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
2385 coding
->require_flushing
= 1;
2389 coding
->type
= coding_type_big5
;
2391 = (NILP (XVECTOR (coding_system
)->contents
[4])
2392 ? CODING_FLAG_BIG5_HKU
2393 : CODING_FLAG_BIG5_ETEN
);
2397 coding
->type
= coding_type_ccl
;
2399 Lisp_Object val
= XVECTOR (coding_system
)->contents
[4];
2401 && VECTORP (XCONS (val
)->car
)
2402 && VECTORP (XCONS (val
)->cdr
))
2404 setup_ccl_program (&(coding
->spec
.ccl
.decoder
), XCONS (val
)->car
);
2405 setup_ccl_program (&(coding
->spec
.ccl
.encoder
), XCONS (val
)->cdr
);
2408 goto label_invalid_coding_system
;
2410 coding
->require_flushing
= 1;
2415 coding
->type
= coding_type_undecided
;
2417 coding
->type
= coding_type_no_conversion
;
2422 label_invalid_coding_system
:
2423 coding
->type
= coding_type_no_conversion
;
2424 coding
->eol_type
= CODING_EOL_LF
;
2425 coding
->symbol
= coding
->pre_write_conversion
= coding
->post_read_conversion
2430 /* Emacs has a mechanism to automatically detect a coding system if it
2431 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2432 it's impossible to distinguish some coding systems accurately
2433 because they use the same range of codes. So, at first, coding
2434 systems are categorized into 8, those are:
2436 o coding-category-emacs-mule
2438 The category for a coding system which has the same code range
2439 as Emacs' internal format. Assigned the coding-system (Lisp
2440 symbol) `emacs-mule' by default.
2442 o coding-category-sjis
2444 The category for a coding system which has the same code range
2445 as SJIS. Assigned the coding-system (Lisp
2446 symbol) `shift-jis' by default.
2448 o coding-category-iso-7
2450 The category for a coding system which has the same code range
2451 as ISO2022 of 7-bit environment. Assigned the coding-system
2452 (Lisp symbol) `iso-2022-7' by default.
2454 o coding-category-iso-8-1
2456 The category for a coding system which has the same code range
2457 as ISO2022 of 8-bit environment and graphic plane 1 used only
2458 for DIMENSION1 charset. Assigned the coding-system (Lisp
2459 symbol) `iso-8859-1' by default.
2461 o coding-category-iso-8-2
2463 The category for a coding system which has the same code range
2464 as ISO2022 of 8-bit environment and graphic plane 1 used only
2465 for DIMENSION2 charset. Assigned the coding-system (Lisp
2466 symbol) `euc-japan' by default.
2468 o coding-category-iso-else
2470 The category for a coding system which has the same code range
2471 as ISO2022 but does not belong to any of the above three
2472 categories. Assigned the coding-system (Lisp symbol)
2473 `iso-2022-ss2-7' by default.
2475 o coding-category-big5
2477 The category for a coding system which has the same code range
2478 as BIG5. Assigned the coding-system (Lisp symbol)
2479 `cn-big5' by default.
2481 o coding-category-binary
2483 The category for a coding system not categorized in any of the
2484 above. Assigned the coding-system (Lisp symbol)
2485 `no-conversion' by default.
2487 Each of them is a Lisp symbol and the value is an actual
2488 `coding-system's (this is also a Lisp symbol) assigned by a user.
2489 What Emacs does actually is to detect a category of coding system.
2490 Then, it uses a `coding-system' assigned to it. If Emacs can't
2491 decide only one possible category, it selects a category of the
2492 highest priority. Priorities of categories are also specified by a
2493 user in a Lisp variable `coding-category-list'.
2497 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2498 If it detects possible coding systems, return an integer in which
2499 appropriate flag bits are set. Flag bits are defined by macros
2500 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2503 detect_coding_mask (src
, src_bytes
)
2507 register unsigned char c
;
2508 unsigned char *src_end
= src
+ src_bytes
;
2511 /* At first, skip all ASCII characters and control characters except
2512 for three ISO2022 specific control characters. */
2513 label_loop_detect_coding
:
2514 while (src
< src_end
)
2518 || (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
2524 /* We found nothing other than ASCII. There's nothing to do. */
2525 return CODING_CATEGORY_MASK_ANY
;
2527 /* The text seems to be encoded in some multilingual coding system.
2528 Now, try to find in which coding system the text is encoded. */
2531 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2532 /* C is an ISO2022 specific control code of C0. */
2533 mask
= detect_coding_iso2022 (src
, src_end
);
2535 if (mask
== CODING_CATEGORY_MASK_ANY
)
2536 /* No valid ISO2022 code follows C. Try again. */
2537 goto label_loop_detect_coding
;
2539 else if (c
== ISO_CODE_SS2
|| c
== ISO_CODE_SS3
|| c
== ISO_CODE_CSI
)
2540 /* C is an ISO2022 specific control code of C1,
2541 or the first byte of SJIS's 2-byte character code,
2542 or a leading code of Emacs. */
2543 mask
= (detect_coding_iso2022 (src
, src_end
)
2544 | detect_coding_sjis (src
, src_end
)
2545 | detect_coding_emacs_mule (src
, src_end
));
2548 /* C is the first byte of SJIS character code,
2549 or a leading-code of Emacs. */
2550 mask
= (detect_coding_sjis (src
, src_end
)
2551 | detect_coding_emacs_mule (src
, src_end
));
2554 /* C is a character of ISO2022 in graphic plane right,
2555 or a SJIS's 1-byte character code (i.e. JISX0201),
2556 or the first byte of BIG5's 2-byte code. */
2557 mask
= (detect_coding_iso2022 (src
, src_end
)
2558 | detect_coding_sjis (src
, src_end
)
2559 | detect_coding_big5 (src
, src_end
));
2564 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2565 The information of the detected coding system is set in CODING. */
2568 detect_coding (coding
, src
, src_bytes
)
2569 struct coding_system
*coding
;
2573 int mask
= detect_coding_mask (src
, src_bytes
);
2576 if (mask
== CODING_CATEGORY_MASK_ANY
)
2577 /* We found nothing other than ASCII. There's nothing to do. */
2581 /* The source text seems to be encoded in unknown coding system.
2582 Emacs regards the category of such a kind of coding system as
2583 `coding-category-binary'. We assume that a user has assigned
2584 an appropriate coding system for a `coding-category-binary'. */
2585 idx
= CODING_CATEGORY_IDX_BINARY
;
2588 /* We found some plausible coding systems. Let's use a coding
2589 system of the highest priority. */
2590 Lisp_Object val
= Vcoding_category_list
;
2595 idx
= XFASTINT (Fget (XCONS (val
)->car
, Qcoding_category_index
));
2596 if ((idx
< CODING_CATEGORY_IDX_MAX
) && (mask
& (1 << idx
)))
2598 val
= XCONS (val
)->cdr
;
2605 /* For unknown reason, `Vcoding_category_list' contains none
2606 of found categories. Let's use any of them. */
2607 for (idx
= 0; idx
< CODING_CATEGORY_IDX_MAX
; idx
++)
2608 if (mask
& (1 << idx
))
2612 setup_coding_system (XSYMBOL (coding_category_table
[idx
])->value
, coding
);
2615 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2616 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2617 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
2620 detect_eol_type (src
, src_bytes
)
2624 unsigned char *src_end
= src
+ src_bytes
;
2627 while (src
< src_end
)
2631 return CODING_EOL_LF
;
2634 if (src
< src_end
&& *src
== '\n')
2635 return CODING_EOL_CRLF
;
2637 return CODING_EOL_CR
;
2640 return CODING_EOL_UNDECIDED
;
2643 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2644 is encoded. If it detects an appropriate format of end-of-line, it
2645 sets the information in *CODING. */
2648 detect_eol (coding
, src
, src_bytes
)
2649 struct coding_system
*coding
;
2654 int eol_type
= detect_eol_type (src
, src_bytes
);
2656 if (eol_type
== CODING_EOL_UNDECIDED
)
2657 /* We found no end-of-line in the source text. */
2660 val
= Fget (coding
->symbol
, Qeol_type
);
2661 if (VECTORP (val
) && XVECTOR (val
)->size
== 3)
2662 setup_coding_system (XVECTOR (val
)->contents
[eol_type
], coding
);
2665 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2666 decoding, it may detect coding system and format of end-of-line if
2667 those are not yet decided. */
2670 decode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2671 struct coding_system
*coding
;
2672 unsigned char *source
, *destination
;
2673 int src_bytes
, dst_bytes
;
2684 if (coding
->type
== coding_type_undecided
)
2685 detect_coding (coding
, source
, src_bytes
);
2687 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
2688 detect_eol (coding
, source
, src_bytes
);
2690 coding
->carryover_size
= 0;
2691 switch (coding
->type
)
2693 case coding_type_no_conversion
:
2694 label_no_conversion
:
2695 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2696 bcopy (source
, destination
, produced
);
2697 *consumed
= produced
;
2700 case coding_type_emacs_mule
:
2701 case coding_type_undecided
:
2702 if (coding
->eol_type
== CODING_EOL_LF
2703 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2704 goto label_no_conversion
;
2705 produced
= decode_eol (coding
, source
, destination
,
2706 src_bytes
, dst_bytes
, consumed
);
2709 case coding_type_sjis
:
2710 produced
= decode_coding_sjis_big5 (coding
, source
, destination
,
2711 src_bytes
, dst_bytes
, consumed
,
2715 case coding_type_iso2022
:
2716 produced
= decode_coding_iso2022 (coding
, source
, destination
,
2717 src_bytes
, dst_bytes
, consumed
);
2720 case coding_type_big5
:
2721 produced
= decode_coding_sjis_big5 (coding
, source
, destination
,
2722 src_bytes
, dst_bytes
, consumed
,
2726 case coding_type_ccl
:
2727 produced
= ccl_driver (&coding
->spec
.ccl
.decoder
, source
, destination
,
2728 src_bytes
, dst_bytes
, consumed
);
2735 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2738 encode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2739 struct coding_system
*coding
;
2740 unsigned char *source
, *destination
;
2741 int src_bytes
, dst_bytes
;
2746 coding
->carryover_size
= 0;
2747 switch (coding
->type
)
2749 case coding_type_no_conversion
:
2750 label_no_conversion
:
2751 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2754 bcopy (source
, destination
, produced
);
2755 if (coding
->selective
)
2757 unsigned char *p
= destination
, *pend
= destination
+ produced
;
2759 if (*p
++ == '\015') p
[-1] = '\n';
2762 *consumed
= produced
;
2765 case coding_type_emacs_mule
:
2766 case coding_type_undecided
:
2767 if (coding
->eol_type
== CODING_EOL_LF
2768 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2769 goto label_no_conversion
;
2770 produced
= encode_eol (coding
, source
, destination
,
2771 src_bytes
, dst_bytes
, consumed
);
2774 case coding_type_sjis
:
2775 produced
= encode_coding_sjis_big5 (coding
, source
, destination
,
2776 src_bytes
, dst_bytes
, consumed
,
2780 case coding_type_iso2022
:
2781 produced
= encode_coding_iso2022 (coding
, source
, destination
,
2782 src_bytes
, dst_bytes
, consumed
);
2785 case coding_type_big5
:
2786 produced
= encode_coding_sjis_big5 (coding
, source
, destination
,
2787 src_bytes
, dst_bytes
, consumed
,
2791 case coding_type_ccl
:
2792 produced
= ccl_driver (&coding
->spec
.ccl
.encoder
, source
, destination
,
2793 src_bytes
, dst_bytes
, consumed
);
2800 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2802 /* Return maximum size (bytes) of a buffer enough for decoding
2803 SRC_BYTES of text encoded in CODING. */
2806 decoding_buffer_size (coding
, src_bytes
)
2807 struct coding_system
*coding
;
2812 if (coding
->type
== coding_type_iso2022
)
2814 else if (coding
->type
== coding_type_ccl
)
2815 magnification
= coding
->spec
.ccl
.decoder
.buf_magnification
;
2819 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
2822 /* Return maximum size (bytes) of a buffer enough for encoding
2823 SRC_BYTES of text to CODING. */
2826 encoding_buffer_size (coding
, src_bytes
)
2827 struct coding_system
*coding
;
2832 if (coding
->type
== coding_type_ccl
)
2833 magnification
= coding
->spec
.ccl
.encoder
.buf_magnification
;
2837 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
2840 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2841 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2844 char *conversion_buffer
;
2845 int conversion_buffer_size
;
2847 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2848 or decoding. Sufficient memory is allocated automatically. If we
2849 run out of memory, return NULL. */
2852 get_conversion_buffer (size
)
2855 if (size
> conversion_buffer_size
)
2858 int real_size
= conversion_buffer_size
* 2;
2860 while (real_size
< size
) real_size
*= 2;
2861 buf
= (char *) xmalloc (real_size
);
2862 xfree (conversion_buffer
);
2863 conversion_buffer
= buf
;
2864 conversion_buffer_size
= real_size
;
2866 return conversion_buffer
;
2871 /*** 7. Emacs Lisp library functions ***/
2873 DEFUN ("coding-system-spec", Fcoding_system_spec
, Scoding_system_spec
,
2875 "Return coding-spec of CODING-SYSTEM.\n\
2876 If CODING-SYSTEM is not a valid coding-system, return nil.")
2880 while (SYMBOLP (obj
) && !NILP (obj
))
2881 obj
= Fget (obj
, Qcoding_system
);
2882 return ((NILP (obj
) || !VECTORP (obj
) || XVECTOR (obj
)->size
!= 5)
2886 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
2887 "Return t if OBJECT is nil or a coding-system.\n\
2888 See document of make-coding-system for coding-system object.")
2892 return ((NILP (obj
) || !NILP (Fcoding_system_spec (obj
))) ? Qt
: Qnil
);
2895 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
2896 Sread_non_nil_coding_system
, 1, 1, 0,
2897 "Read a coding system from the minibuffer, prompting with string PROMPT.")
2904 val
= Fcompleting_read (prompt
, Vobarray
, Qcoding_system_spec
,
2905 Qt
, Qnil
, Qnil
, Qnil
);
2907 while (XSTRING (val
)->size
== 0);
2908 return (Fintern (val
, Qnil
));
2911 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 1, 0,
2912 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2916 Lisp_Object val
= Fcompleting_read (prompt
, Vobarray
, Qcoding_system_p
,
2917 Qt
, Qnil
, Qnil
, Qnil
);
2918 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
2921 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
2923 "Check validity of CODING-SYSTEM.\n\
2924 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2925 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2926 The value of property should be a vector of length 5.")
2928 Lisp_Object coding_system
;
2930 CHECK_SYMBOL (coding_system
, 0);
2931 if (!NILP (Fcoding_system_p (coding_system
)))
2932 return coding_system
;
2934 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
2937 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
2939 "Detect coding-system of the text in the region between START and END.\n\
2940 Return a list of possible coding-systems ordered by priority.\n\
2941 If only ASCII characters are found, it returns `undecided'\n\
2942 or its subsidiary coding-system according to a detected end-of-line format.")
2946 int coding_mask
, eol_type
;
2950 validate_region (&b
, &e
);
2951 beg
= XINT (b
), end
= XINT (e
);
2952 if (beg
< GPT
&& end
>= GPT
) move_gap (end
);
2954 coding_mask
= detect_coding_mask (POS_ADDR (beg
), end
- beg
);
2955 eol_type
= detect_eol_type (POS_ADDR (beg
), end
- beg
);
2957 if (coding_mask
== CODING_CATEGORY_MASK_ANY
)
2959 val
= intern ("undecided");
2960 if (eol_type
!= CODING_EOL_UNDECIDED
)
2962 Lisp_Object val2
= Fget (val
, Qeol_type
);
2964 val
= XVECTOR (val2
)->contents
[eol_type
];
2971 /* At first, gather possible coding-systems in VAL in a reverse
2974 for (val2
= Vcoding_category_list
;
2976 val2
= XCONS (val2
)->cdr
)
2979 = XFASTINT (Fget (XCONS (val2
)->car
, Qcoding_category_index
));
2980 if (coding_mask
& (1 << idx
))
2981 val
= Fcons (Fsymbol_value (XCONS (val2
)->car
), val
);
2984 /* Then, change the order of the list, while getting subsidiary
2988 for (; !NILP (val2
); val2
= XCONS (val2
)->cdr
)
2990 if (eol_type
== CODING_EOL_UNDECIDED
)
2991 val
= Fcons (XCONS (val2
)->car
, val
);
2994 Lisp_Object val3
= Fget (XCONS (val2
)->car
, Qeol_type
);
2996 val
= Fcons (XVECTOR (val3
)->contents
[eol_type
], val
);
2998 val
= Fcons (XCONS (val2
)->car
, val
);
3006 /* Scan text in the region between *BEGP and *ENDP, skip characters
3007 which we never have to encode to (iff ENCODEP is 1) or decode from
3008 coding system CODING at the head and tail, then set BEGP and ENDP
3009 to the addresses of start and end of the text we actually convert. */
3012 shrink_conversion_area (begp
, endp
, coding
, encodep
)
3013 unsigned char **begp
, **endp
;
3014 struct coding_system
*coding
;
3017 register unsigned char *beg_addr
= *begp
, *end_addr
= *endp
;
3019 if (coding
->eol_type
!= CODING_EOL_LF
3020 && coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3021 /* Since we anyway have to convert end-of-line format, it is not
3022 worth skipping at most 100 bytes or so. */
3025 if (encodep
) /* for encoding */
3027 switch (coding
->type
)
3029 case coding_type_no_conversion
:
3030 case coding_type_emacs_mule
:
3031 case coding_type_undecided
:
3032 /* We need no conversion. */
3035 case coding_type_ccl
:
3036 /* We can't skip any data. */
3038 case coding_type_iso2022
:
3039 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
3041 unsigned char *bol
= beg_addr
;
3042 while (beg_addr
< end_addr
&& *beg_addr
< 0x80)
3045 if (*(beg_addr
- 1) == '\n')
3049 goto label_skip_tail
;
3053 /* We can skip all ASCII characters at the head and tail. */
3054 while (beg_addr
< end_addr
&& *beg_addr
< 0x80) beg_addr
++;
3056 while (beg_addr
< end_addr
&& *(end_addr
- 1) < 0x80) end_addr
--;
3060 else /* for decoding */
3062 switch (coding
->type
)
3064 case coding_type_no_conversion
:
3065 /* We need no conversion. */
3068 case coding_type_emacs_mule
:
3069 if (coding
->eol_type
== CODING_EOL_LF
)
3071 /* We need no conversion. */
3075 /* We can skip all but carriage-return. */
3076 while (beg_addr
< end_addr
&& *beg_addr
!= '\r') beg_addr
++;
3077 while (beg_addr
< end_addr
&& *(end_addr
- 1) != '\r') end_addr
--;
3079 case coding_type_sjis
:
3080 case coding_type_big5
:
3081 /* We can skip all ASCII characters at the head. */
3082 while (beg_addr
< end_addr
&& *beg_addr
< 0x80) beg_addr
++;
3083 /* We can skip all ASCII characters at the tail except for
3084 the second byte of SJIS or BIG5 code. */
3085 while (beg_addr
< end_addr
&& *(end_addr
- 1) < 0x80) end_addr
--;
3086 if (end_addr
!= *endp
)
3089 case coding_type_ccl
:
3090 /* We can't skip any data. */
3092 default: /* i.e. case coding_type_iso2022: */
3096 /* We can skip all ASCII characters except for a few
3097 control codes at the head. */
3098 while (beg_addr
< end_addr
&& (c
= *beg_addr
) < 0x80
3099 && c
!= ISO_CODE_CR
&& c
!= ISO_CODE_SO
3100 && c
!= ISO_CODE_SI
&& c
!= ISO_CODE_ESC
)
3111 /* Encode to (iff ENCODEP is 1) or decode from coding system CODING a
3112 text between B and E. B and E are buffer positions. */
3115 code_convert_region (b
, e
, coding
, encodep
)
3117 struct coding_system
*coding
;
3120 int beg
, end
, len
, consumed
, produced
;
3122 unsigned char *begp
, *endp
;
3125 validate_region (&b
, &e
);
3126 beg
= XINT (b
), end
= XINT (e
);
3127 if (beg
< GPT
&& end
>= GPT
)
3130 if (encodep
&& !NILP (coding
->pre_write_conversion
))
3132 /* We must call a pre-conversion function which may put a new
3133 text to be converted in a new buffer. */
3134 struct buffer
*old
= current_buffer
, *new;
3137 call2 (coding
->pre_write_conversion
, b
, e
);
3138 if (old
!= current_buffer
)
3140 /* Replace the original text by the text just generated. */
3142 new = current_buffer
;
3143 set_buffer_internal (old
);
3144 del_range (beg
, end
);
3145 insert_from_buffer (new, 1, len
, 0);
3150 /* We may be able to shrink the conversion region. */
3151 begp
= POS_ADDR (beg
); endp
= begp
+ (end
- beg
);
3152 shrink_conversion_area (&begp
, &endp
, coding
, encodep
);
3155 /* We need no conversion. */
3159 beg
+= begp
- POS_ADDR (beg
);
3160 end
= beg
+ (endp
- begp
);
3163 len
= encoding_buffer_size (coding
, end
- beg
);
3165 len
= decoding_buffer_size (coding
, end
- beg
);
3166 buf
= get_conversion_buffer (len
);
3168 coding
->last_block
= 1;
3170 ? encode_coding (coding
, POS_ADDR (beg
), buf
, end
- beg
, len
,
3172 : decode_coding (coding
, POS_ADDR (beg
), buf
, end
- beg
, len
,
3175 len
= produced
+ (beg
- XINT (b
)) + (XINT (e
) - end
);
3178 insert (buf
, produced
);
3179 del_range (PT
, PT
+ end
- beg
);
3181 pos
= PT
+ (pos
- end
);
3187 if (!encodep
&& !NILP (coding
->post_read_conversion
))
3189 /* We must call a post-conversion function which may alter
3190 the text just converted. */
3195 insval
= call1 (coding
->post_read_conversion
, make_number (len
));
3196 CHECK_NUMBER (insval
, 0);
3197 len
= XINT (insval
);
3200 return make_number (len
);
3204 code_convert_string (str
, coding
, encodep
, nocopy
)
3205 Lisp_Object str
, nocopy
;
3206 struct coding_system
*coding
;
3209 int len
, consumed
, produced
;
3211 unsigned char *begp
, *endp
;
3212 int head_skip
, tail_skip
;
3213 struct gcpro gcpro1
;
3215 if (encodep
&& !NILP (coding
->pre_write_conversion
)
3216 || !encodep
&& !NILP (coding
->post_read_conversion
))
3218 /* Since we have to call Lisp functions which assume target text
3219 is in a buffer, after setting a temporary buffer, call
3220 code_convert_region. */
3221 int count
= specpdl_ptr
- specpdl
;
3222 int len
= XSTRING (str
)->size
;
3224 struct buffer
*old
= current_buffer
;
3226 record_unwind_protect (Fset_buffer
, Fcurrent_buffer ());
3227 temp_output_buffer_setup (" *code-converting-work*");
3228 set_buffer_internal (XBUFFER (Vstandard_output
));
3229 insert_from_string (str
, 0, len
, 0);
3230 code_convert_region (make_number (BEGV
), make_number (ZV
),
3232 result
= make_buffer_string (BEGV
, ZV
, 0);
3233 set_buffer_internal (old
);
3234 return unbind_to (count
, result
);
3237 /* We may be able to shrink the conversion region. */
3238 begp
= XSTRING (str
)->data
;
3239 endp
= begp
+ XSTRING (str
)->size
;
3240 shrink_conversion_area (&begp
, &endp
, coding
, encodep
);
3243 /* We need no conversion. */
3244 return (NILP (nocopy
) ? Fcopy_sequence (str
) : str
);
3246 head_skip
= begp
- XSTRING (str
)->data
;
3247 tail_skip
= XSTRING (str
)->size
- head_skip
- (endp
- begp
);
3252 len
= encoding_buffer_size (coding
, endp
- begp
);
3254 len
= decoding_buffer_size (coding
, endp
- begp
);
3255 buf
= get_conversion_buffer (len
+ head_skip
+ tail_skip
);
3257 bcopy (XSTRING (str
)->data
, buf
, head_skip
);
3258 coding
->last_block
= 1;
3260 ? encode_coding (coding
, XSTRING (str
)->data
+ head_skip
,
3261 buf
+ head_skip
, endp
- begp
, len
, &consumed
)
3262 : decode_coding (coding
, XSTRING (str
)->data
+ head_skip
,
3263 buf
+ head_skip
, endp
- begp
, len
, &consumed
));
3264 bcopy (XSTRING (str
)->data
+ head_skip
+ (endp
- begp
),
3265 buf
+ head_skip
+ produced
,
3270 return make_string (buf
, head_skip
+ produced
+ tail_skip
);
3273 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
3274 3, 3, "r\nzCoding system: ",
3275 "Decode current region by specified coding system.\n\
3276 When called from a program, takes three arguments:\n\
3277 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3278 Return length of decoded text.")
3279 (b
, e
, coding_system
)
3280 Lisp_Object b
, e
, coding_system
;
3282 struct coding_system coding
;
3284 CHECK_NUMBER_COERCE_MARKER (b
, 0);
3285 CHECK_NUMBER_COERCE_MARKER (e
, 1);
3286 CHECK_SYMBOL (coding_system
, 2);
3288 if (NILP (coding_system
))
3289 return make_number (XFASTINT (e
) - XFASTINT (b
));
3290 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3291 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3293 return code_convert_region (b
, e
, &coding
, 0);
3296 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
3297 3, 3, "r\nzCoding system: ",
3298 "Encode current region by specified coding system.\n\
3299 When called from a program, takes three arguments:\n\
3300 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3301 Return length of encoded text.")
3302 (b
, e
, coding_system
)
3303 Lisp_Object b
, e
, coding_system
;
3305 struct coding_system coding
;
3307 CHECK_NUMBER_COERCE_MARKER (b
, 0);
3308 CHECK_NUMBER_COERCE_MARKER (e
, 1);
3309 CHECK_SYMBOL (coding_system
, 2);
3311 if (NILP (coding_system
))
3312 return make_number (XFASTINT (e
) - XFASTINT (b
));
3313 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3314 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3316 return code_convert_region (b
, e
, &coding
, 1);
3319 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
3321 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3322 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3324 (string
, coding_system
, nocopy
)
3325 Lisp_Object string
, coding_system
, nocopy
;
3327 struct coding_system coding
;
3329 CHECK_STRING (string
, 0);
3330 CHECK_SYMBOL (coding_system
, 1);
3332 if (NILP (coding_system
))
3333 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
3334 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3335 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3337 return code_convert_string (string
, &coding
, 0, nocopy
);
3340 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
3342 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3343 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3345 (string
, coding_system
, nocopy
)
3346 Lisp_Object string
, coding_system
, nocopy
;
3348 struct coding_system coding
;
3350 CHECK_STRING (string
, 0);
3351 CHECK_SYMBOL (coding_system
, 1);
3353 if (NILP (coding_system
))
3354 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
3355 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3356 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3358 return code_convert_string (string
, &coding
, 1, nocopy
);
3361 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
3362 "Decode a JISX0208 character of shift-jis encoding.\n\
3363 CODE is the character code in SJIS.\n\
3364 Return the corresponding character.")
3368 unsigned char c1
, c2
, s1
, s2
;
3371 CHECK_NUMBER (code
, 0);
3372 s1
= (XFASTINT (code
)) >> 8, s2
= (XFASTINT (code
)) & 0xFF;
3373 DECODE_SJIS (s1
, s2
, c1
, c2
);
3374 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset_jisx0208
, c1
, c2
));
3378 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
3379 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3380 Return the corresponding character code in SJIS.")
3384 int charset
, c1
, c2
, s1
, s2
;
3387 CHECK_NUMBER (ch
, 0);
3388 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
3389 if (charset
== charset_jisx0208
)
3391 ENCODE_SJIS (c1
, c2
, s1
, s2
);
3392 XSETFASTINT (val
, (s1
<< 8) | s2
);
3395 XSETFASTINT (val
, 0);
3399 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
3400 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3401 CODE is the character code in BIG5.\n\
3402 Return the corresponding character.")
3407 unsigned char b1
, b2
, c1
, c2
;
3410 CHECK_NUMBER (code
, 0);
3411 b1
= (XFASTINT (code
)) >> 8, b2
= (XFASTINT (code
)) & 0xFF;
3412 DECODE_BIG5 (b1
, b2
, charset
, c1
, c2
);
3413 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset
, c1
, c2
));
3417 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
3418 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3419 Return the corresponding character code in Big5.")
3423 int charset
, c1
, c2
, b1
, b2
;
3426 CHECK_NUMBER (ch
, 0);
3427 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
3428 if (charset
== charset_big5_1
|| charset
== charset_big5_2
)
3430 ENCODE_BIG5 (charset
, c1
, c2
, b1
, b2
);
3431 XSETFASTINT (val
, (b1
<< 8) | b2
);
3434 XSETFASTINT (val
, 0);
3438 DEFUN ("set-terminal-coding-system-internal",
3439 Fset_terminal_coding_system_internal
,
3440 Sset_terminal_coding_system_internal
, 1, 1, 0, "")
3442 Lisp_Object coding_system
;
3444 CHECK_SYMBOL (coding_system
, 0);
3445 setup_coding_system (Fcheck_coding_system (coding_system
), &terminal_coding
);
3449 DEFUN ("terminal-coding-system",
3450 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
3451 "Return coding-system of your terminal.")
3454 return terminal_coding
.symbol
;
3457 DEFUN ("set-keyboard-coding-system-internal",
3458 Fset_keyboard_coding_system_internal
,
3459 Sset_keyboard_coding_system_internal
, 1, 1, 0, "")
3461 Lisp_Object coding_system
;
3463 CHECK_SYMBOL (coding_system
, 0);
3464 setup_coding_system (Fcheck_coding_system (coding_system
), &keyboard_coding
);
3468 DEFUN ("keyboard-coding-system",
3469 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
3470 "Return coding-system of what is sent from terminal keyboard.")
3473 return keyboard_coding
.symbol
;
3477 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
3478 Sfind_operation_coding_system
, 1, MANY
, 0,
3479 "Choose a coding system for an operation based on the target name.\n\
3480 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3481 DECODING-SYSTEM is the coding system to use for decoding\n\
3482 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3483 for encoding (in case OPERATION does encoding).\n\
3485 The first argument OPERATION specifies an I/O primitive:\n\
3486 For file I/O, `insert-file-contents' or `write-region'.\n\
3487 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3488 For network I/O, `open-network-stream'.\n\
3490 The remaining arguments should be the same arguments that were passed\n\
3491 to the primitive. Depending on which primitive, one of those arguments\n\
3492 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3493 whichever argument specifies the file name is TARGET.\n\
3495 TARGET has a meaning which depends on OPERATION:\n\
3496 For file I/O, TARGET is a file name.\n\
3497 For process I/O, TARGET is a process name.\n\
3498 For network I/O, TARGET is a service name or a port number\n\
3500 This function looks up what specified for TARGET in,\n\
3501 `file-coding-system-alist', `process-coding-system-alist',\n\
3502 or `network-coding-system-alist' depending on OPERATION.\n\
3503 They may specify a coding system, a cons of coding systems,\n\
3504 or a function symbol to call.\n\
3505 In the last case, we call the function with one argument,\n\
3506 which is a list of all the arguments given to this function.")
3511 Lisp_Object operation
, target_idx
, target
, val
;
3512 register Lisp_Object chain
;
3515 error ("Too few arguments");
3516 operation
= args
[0];
3517 if (!SYMBOLP (operation
)
3518 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
3519 error ("Invalid first arguement");
3520 if (nargs
< 1 + XINT (target_idx
))
3521 error ("Too few arguments for operation: %s",
3522 XSYMBOL (operation
)->name
->data
);
3523 target
= args
[XINT (target_idx
) + 1];
3524 if (!(STRINGP (target
)
3525 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
3526 error ("Invalid %dth argument", XINT (target_idx
) + 1);
3528 chain
= ((EQ (operation
, Qinsert_file_contents
)
3529 || EQ (operation
, Qwrite_region
))
3530 ? Vfile_coding_system_alist
3531 : (EQ (operation
, Qopen_network_stream
)
3532 ? Vnetwork_coding_system_alist
3533 : Vprocess_coding_system_alist
));
3537 for (; CONSP (chain
); chain
= XCONS (chain
)->cdr
)
3539 Lisp_Object elt
= XCONS (chain
)->car
;
3542 && ((STRINGP (target
)
3543 && STRINGP (XCONS (elt
)->car
)
3544 && fast_string_match (XCONS (elt
)->car
, target
) >= 0)
3545 || (INTEGERP (target
) && EQ (target
, XCONS (elt
)->car
))))
3547 val
= XCONS (elt
)->cdr
;
3550 if (! SYMBOLP (val
))
3552 if (! NILP (Fcoding_system_p (val
)))
3553 return Fcons (val
, val
);
3554 if (!NILP (Fboundp (val
)))
3555 return call1 (val
, Flist (nargs
, args
));
3565 /*** 8. Post-amble ***/
3571 /* Emacs' internal format specific initialize routine. */
3572 for (i
= 0; i
<= 0x20; i
++)
3573 emacs_code_class
[i
] = EMACS_control_code
;
3574 emacs_code_class
[0x0A] = EMACS_linefeed_code
;
3575 emacs_code_class
[0x0D] = EMACS_carriage_return_code
;
3576 for (i
= 0x21 ; i
< 0x7F; i
++)
3577 emacs_code_class
[i
] = EMACS_ascii_code
;
3578 emacs_code_class
[0x7F] = EMACS_control_code
;
3579 emacs_code_class
[0x80] = EMACS_leading_code_composition
;
3580 for (i
= 0x81; i
< 0xFF; i
++)
3581 emacs_code_class
[i
] = EMACS_invalid_code
;
3582 emacs_code_class
[LEADING_CODE_PRIVATE_11
] = EMACS_leading_code_3
;
3583 emacs_code_class
[LEADING_CODE_PRIVATE_12
] = EMACS_leading_code_3
;
3584 emacs_code_class
[LEADING_CODE_PRIVATE_21
] = EMACS_leading_code_4
;
3585 emacs_code_class
[LEADING_CODE_PRIVATE_22
] = EMACS_leading_code_4
;
3587 /* ISO2022 specific initialize routine. */
3588 for (i
= 0; i
< 0x20; i
++)
3589 iso_code_class
[i
] = ISO_control_code
;
3590 for (i
= 0x21; i
< 0x7F; i
++)
3591 iso_code_class
[i
] = ISO_graphic_plane_0
;
3592 for (i
= 0x80; i
< 0xA0; i
++)
3593 iso_code_class
[i
] = ISO_control_code
;
3594 for (i
= 0xA1; i
< 0xFF; i
++)
3595 iso_code_class
[i
] = ISO_graphic_plane_1
;
3596 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
3597 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
3598 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
3599 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
3600 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
3601 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
3602 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
3603 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
3604 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
3605 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
3607 conversion_buffer_size
= MINIMUM_CONVERSION_BUFFER_SIZE
;
3608 conversion_buffer
= (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE
);
3610 setup_coding_system (Qnil
, &keyboard_coding
);
3611 setup_coding_system (Qnil
, &terminal_coding
);
3613 #if defined (MSDOS) || defined (WINDOWSNT)
3614 system_eol_type
= CODING_EOL_CRLF
;
3616 system_eol_type
= CODING_EOL_LF
;
3624 Qtarget_idx
= intern ("target-idx");
3625 staticpro (&Qtarget_idx
);
3627 /* Target FILENAME is the first argument. */
3628 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
3629 /* Target FILENAME is the third argument. */
3630 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
3632 Qcall_process
= intern ("call-process");
3633 staticpro (&Qcall_process
);
3634 /* Target PROGRAM is the first argument. */
3635 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
3637 Qcall_process_region
= intern ("call-process-region");
3638 staticpro (&Qcall_process_region
);
3639 /* Target PROGRAM is the third argument. */
3640 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
3642 Qstart_process
= intern ("start-process");
3643 staticpro (&Qstart_process
);
3644 /* Target PROGRAM is the third argument. */
3645 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
3647 Qopen_network_stream
= intern ("open-network-stream");
3648 staticpro (&Qopen_network_stream
);
3649 /* Target SERVICE is the fourth argument. */
3650 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
3652 Qcoding_system
= intern ("coding-system");
3653 staticpro (&Qcoding_system
);
3655 Qeol_type
= intern ("eol-type");
3656 staticpro (&Qeol_type
);
3658 Qbuffer_file_coding_system
= intern ("buffer-file-coding-system");
3659 staticpro (&Qbuffer_file_coding_system
);
3661 Qpost_read_conversion
= intern ("post-read-conversion");
3662 staticpro (&Qpost_read_conversion
);
3664 Qpre_write_conversion
= intern ("pre-write-conversion");
3665 staticpro (&Qpre_write_conversion
);
3667 Qcoding_system_spec
= intern ("coding-system-spec");
3668 staticpro (&Qcoding_system_spec
);
3670 Qcoding_system_p
= intern ("coding-system-p");
3671 staticpro (&Qcoding_system_p
);
3673 Qcoding_system_error
= intern ("coding-system-error");
3674 staticpro (&Qcoding_system_error
);
3676 Fput (Qcoding_system_error
, Qerror_conditions
,
3677 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
3678 Fput (Qcoding_system_error
, Qerror_message
,
3679 build_string ("Invalid coding system"));
3681 Qcoding_category_index
= intern ("coding-category-index");
3682 staticpro (&Qcoding_category_index
);
3686 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3688 coding_category_table
[i
] = intern (coding_category_name
[i
]);
3689 staticpro (&coding_category_table
[i
]);
3690 Fput (coding_category_table
[i
], Qcoding_category_index
,
3695 Qcharacter_unification_table
= intern ("character-unification-table");
3696 staticpro (&Qcharacter_unification_table
);
3697 Fput (Qcharacter_unification_table
, Qchar_table_extra_slots
,
3700 Qcharacter_unification_table_for_decode
3701 = intern ("character-unification-table-for-decode");
3702 staticpro (&Qcharacter_unification_table_for_decode
);
3704 Qcharacter_unification_table_for_encode
3705 = intern ("character-unification-table-for-encode");
3706 staticpro (&Qcharacter_unification_table_for_encode
);
3708 Qemacs_mule
= intern ("emacs-mule");
3709 staticpro (&Qemacs_mule
);
3711 defsubr (&Scoding_system_spec
);
3712 defsubr (&Scoding_system_p
);
3713 defsubr (&Sread_coding_system
);
3714 defsubr (&Sread_non_nil_coding_system
);
3715 defsubr (&Scheck_coding_system
);
3716 defsubr (&Sdetect_coding_region
);
3717 defsubr (&Sdecode_coding_region
);
3718 defsubr (&Sencode_coding_region
);
3719 defsubr (&Sdecode_coding_string
);
3720 defsubr (&Sencode_coding_string
);
3721 defsubr (&Sdecode_sjis_char
);
3722 defsubr (&Sencode_sjis_char
);
3723 defsubr (&Sdecode_big5_char
);
3724 defsubr (&Sencode_big5_char
);
3725 defsubr (&Sset_terminal_coding_system_internal
);
3726 defsubr (&Sterminal_coding_system
);
3727 defsubr (&Sset_keyboard_coding_system_internal
);
3728 defsubr (&Skeyboard_coding_system
);
3729 defsubr (&Sfind_operation_coding_system
);
3731 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
3732 "List of coding-categories (symbols) ordered by priority.");
3736 Vcoding_category_list
= Qnil
;
3737 for (i
= CODING_CATEGORY_IDX_MAX
- 1; i
>= 0; i
--)
3738 Vcoding_category_list
3739 = Fcons (coding_category_table
[i
], Vcoding_category_list
);
3742 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
3743 "A variable of internal use only.\n\
3744 If the value is a coding system, it is used for decoding on read operation.\n\
3745 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3746 Vcoding_system_for_read
= Qnil
;
3748 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
3749 "A variable of internal use only.\n\
3750 If the value is a coding system, it is used for encoding on write operation.\n\
3751 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3752 Vcoding_system_for_write
= Qnil
;
3754 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
3755 "Coding-system used in the latest file or process I/O.");
3756 Vlast_coding_system_used
= Qnil
;
3758 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
3759 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3760 inhibit_eol_conversion
= 0;
3762 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
3763 "Alist to decide a coding system to use for a file I/O operation.\n\
3764 The format is ((PATTERN . VAL) ...),\n\
3765 where PATTERN is a regular expression matching a file name,\n\
3766 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3767 If VAL is a coding system, it is used for both decoding and encoding\n\
3768 the file contents.\n\
3769 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3770 and the cdr part is used for encoding.\n\
3771 If VAL is a function symbol, the function must return a coding system\n\
3772 or a cons of coding systems which are used as above.\n\
3774 See also the function `find-operation-coding-system'.");
3775 Vfile_coding_system_alist
= Qnil
;
3777 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
3778 "Alist to decide a coding system to use for a process I/O operation.\n\
3779 The format is ((PATTERN . VAL) ...),\n\
3780 where PATTERN is a regular expression matching a program name,\n\
3781 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3782 If VAL is a coding system, it is used for both decoding what received\n\
3783 from the program and encoding what sent to the program.\n\
3784 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3785 and the cdr part is used for encoding.\n\
3786 If VAL is a function symbol, the function must return a coding system\n\
3787 or a cons of coding systems which are used as above.\n\
3789 See also the function `find-operation-coding-system'.");
3790 Vprocess_coding_system_alist
= Qnil
;
3792 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
3793 "Alist to decide a coding system to use for a network I/O operation.\n\
3794 The format is ((PATTERN . VAL) ...),\n\
3795 where PATTERN is a regular expression matching a network service name\n\
3796 or is a port number to connect to,\n\
3797 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3798 If VAL is a coding system, it is used for both decoding what received\n\
3799 from the network stream and encoding what sent to the network stream.\n\
3800 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3801 and the cdr part is used for encoding.\n\
3802 If VAL is a function symbol, the function must return a coding system\n\
3803 or a cons of coding systems which are used as above.\n\
3805 See also the function `find-operation-coding-system'.");
3806 Vnetwork_coding_system_alist
= Qnil
;
3808 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix
,
3809 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3810 eol_mnemonic_unix
= ':';
3812 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos
,
3813 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3814 eol_mnemonic_dos
= '\\';
3816 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac
,
3817 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3818 eol_mnemonic_mac
= '/';
3820 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
3821 "Mnemonic character indicating end-of-line format is not yet decided.");
3822 eol_mnemonic_undecided
= ':';
3824 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification
,
3825 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3826 Venable_character_unification
= Qt
;
3828 DEFVAR_LISP ("standard-character-unification-table-for-decode",
3829 &Vstandard_character_unification_table_for_decode
,
3830 "Table for unifying characters when reading.");
3831 Vstandard_character_unification_table_for_decode
= Qnil
;
3833 DEFVAR_LISP ("standard-character-unification-table-for-encode",
3834 &Vstandard_character_unification_table_for_encode
,
3835 "Table for unifying characters when writing.");
3836 Vstandard_character_unification_table_for_encode
= Qnil
;
3838 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist
,
3839 "Alist of charsets vs revision numbers.\n\
3840 While encoding, if a charset (car part of an element) is found,\n\
3841 designate it with the escape sequence identifing revision (cdr part of the element).");
3842 Vcharset_revision_alist
= Qnil
;
3844 DEFVAR_LISP ("default-process-coding-system",
3845 &Vdefault_process_coding_system
,
3846 "Cons of coding systems used for process I/O by default.\n\
3847 The car part is used for decoding a process output,\n\
3848 the cdr part is used for encoding a text to be sent to a process.");
3849 Vdefault_process_coding_system
= Qnil
;