Don't create faces if make-face isn't defined.
[emacs.git] / src / coding.c
blobbcc603a2c636f62f8b4effb30a25ee6ad65f48df
1 /* Coding system handler (conversion, detection, and etc).
2 Ver.1.0.
3 Copyright (C) 1995 Free Software Foundation, Inc.
4 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
6 This file is part of GNU Emacs.
8 GNU Emacs is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
13 GNU Emacs is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GNU Emacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /*** TABLE OF CONTENTS ***
25 1. Preamble
26 2. Emacs' internal format handlers
27 3. ISO2022 handlers
28 4. Shift-JIS and BIG5 handlers
29 5. End-of-line handlers
30 6. C library functions
31 7. Emacs Lisp library functions
32 8. Post-amble
36 /*** GENERAL NOTE on CODING SYSTEM ***
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
41 Emacs' internal format, and when we say "encode", it means
42 converting Emacs' internal format to some other coding system.
44 0. Emacs' internal format
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in the section 2.
49 1. ISO2022
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and such coding
53 systems used in Internet communication as ISO-2022-JP are all
54 variants of ISO2022. Details are described in the section 3.
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 the section 4.
62 3. BIG5
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in the section 4. In this file, when written as "BIG5"
67 (all uppercase), it means the coding system, and when written as
68 "Big5" (capitalized), it means the character set.
70 4. Else
72 If a user wants to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
77 Emacs represents a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See section 6 for more
81 detail.
85 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
87 How end-of-line of a text is encoded depends on a system. For
88 instance, Unix's format is just one byte of `line-feed' code,
89 whereas DOS's format is two bytes sequence of `carriage-return' and
90 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
92 Since how characters in a text are encoded and how end-of-line is
93 encoded are independent, any coding system described above can take
94 any format of end-of-line. So, Emacs has information of format of
95 end-of-line in each coding-system. See section 6 for more
96 detail.
100 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
102 These functions check if a text between SRC and SRC_END is encoded
103 in the coding system category XXX. Each returns an integer value in
104 which the appropriate flag bits for the category XXX are set. The flag
105 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
106 template of these functions. */
107 #if 0
109 detect_coding_internal (src, src_end)
110 unsigned char *src, *src_end;
114 #endif
116 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
118 These functions decode SRC_BYTES length text at SOURCE encoded in
119 CODING to Emacs' internal format. The resulting text goes to a
120 place pointed by DESTINATION, the length of which should not exceed
121 DST_BYTES. The number of bytes actually processed is returned in
122 *CONSUMED. The return value is the length of the decoded text.
123 Below is a template of these functions. */
124 #if 0
125 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
126 struct coding_system *coding;
127 unsigned char *source, *destination;
128 int src_bytes, dst_bytes;
129 int *consumed;
133 #endif
135 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
137 These functions encode SRC_BYTES length text at SOURCE of Emacs
138 internal format to CODING. The resulting text goes to a place
139 pointed by DESTINATION, the length of which should not exceed
140 DST_BYTES. The number of bytes actually processed is returned in
141 *CONSUMED. The return value is the length of the encoded text.
142 Below is a template of these functions. */
143 #if 0
144 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
148 int *consumed;
152 #endif
154 /*** COMMONLY USED MACROS ***/
156 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
157 THREE_MORE_BYTES safely get one, two, and three bytes from the
158 source text respectively. If there are not enough bytes in the
159 source, they jump to `label_end_of_loop'. The caller should set
160 variables `src' and `src_end' to appropriate areas in advance. */
/* Fetch the next source byte into C1 and advance `src' past it.
   Jumps to `label_end_of_loop' (without touching `src' or C1) when
   no byte is left before `src_end'.  */
#define ONE_MORE_BYTE(c1)               \
  do {                                  \
    if (src >= src_end)                 \
      goto label_end_of_loop;           \
    c1 = *src++;                        \
  } while (0)
/* Fetch the next two source bytes into C1 and C2 (in that order) and
   advance `src' past both.  Jumps to `label_end_of_loop' without
   consuming anything when fewer than two bytes remain.  */
#define TWO_MORE_BYTES(c1, c2)          \
  do {                                  \
    if (src + 1 >= src_end)             \
      goto label_end_of_loop;           \
    c1 = *src++;                        \
    c2 = *src++;                        \
  } while (0)
/* Fetch the next three source bytes into C1, C2 and C3 (in that
   order) and advance `src' past them.  Jumps to `label_end_of_loop'
   without consuming anything when fewer than three bytes remain.  */
#define THREE_MORE_BYTES(c1, c2, c3)    \
  do {                                  \
    if (src + 2 >= src_end)             \
      goto label_end_of_loop;           \
    c1 = *src++;                        \
    c2 = *src++;                        \
    c3 = *src++;                        \
  } while (0)
186 /* The following three macros DECODE_CHARACTER_ASCII,
187 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
188 the multi-byte form of a character of each class at the place
189 pointed by `dst'. The caller should set the variable `dst' to
190 point to an appropriate area and the variable `coding' to point to
191 the coding-system of the currently decoding text in advance. */
/* Store one ASCII character C at `dst' and advance `dst'.  Outside a
   composition the byte is stored as is; while composing it is stored
   as the two bytes 0xA0 and C with the high bit set.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (!COMPOSING_P (coding->composing))       \
      *dst++ = (c);                             \
    else                                        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
  } while (0)
/* Store at `dst' the multi-byte form of one DIMENSION1 character
   whose charset is CHARSET and whose position-code is C, advancing
   `dst'.  The base leading-code is raised by 0x20 while composing;
   the extended leading-code is emitted only when it is non-zero; the
   position-code is stored with its high bit set.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    *dst++ = (COMPOSING_P (coding->composing)                           \
              ? leading_code + 0x20                                     \
              : leading_code);                                          \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
  } while (0)
/* Store at `dst' the multi-byte form of one DIMENSION2 character
   whose charset is CHARSET and whose position-codes are C1 and C2,
   advancing `dst'.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)            \
  do {                                                          \
    /* Leading code(s) followed by the first position-code.  */ \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);                  \
    /* Second position-code, high bit set.  */                  \
    dst[0] = (c2) | 0x80;                                       \
    dst++;                                                      \
  } while (0)
228 /*** 1. Preamble ***/
230 #include <stdio.h>
232 #ifdef emacs
234 #include <config.h>
235 #include "lisp.h"
236 #include "buffer.h"
237 #include "charset.h"
238 #include "ccl.h"
239 #include "coding.h"
240 #include "window.h"
242 #else /* not emacs */
244 #include "mulelib.h"
246 #endif /* not emacs */
248 Lisp_Object Qcoding_system, Qeol_type;
249 Lisp_Object Qbuffer_file_coding_system;
250 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
252 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
253 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
254 Lisp_Object Qstart_process, Qopen_network_stream;
255 Lisp_Object Qtarget_idx;
257 /* Mnemonic character of each format of end-of-line. */
258 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
259 /* Mnemonic character to indicate format of end-of-line is not yet
260 decided. */
261 int eol_mnemonic_undecided;
#ifdef emacs

Lisp_Object Qcoding_system_vector;
Lisp_Object Qcoding_system_p;
Lisp_Object Qcoding_system_error;

/* Coding-systems are handed between Emacs Lisp programs and C
   internal routines through the following three variables.  */

/* Coding-system for reading files and receiving data from a
   process.  */
Lisp_Object Vcoding_system_for_read;

/* Coding-system for writing files and sending data to a process.  */
Lisp_Object Vcoding_system_for_write;

/* Coding-system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;

/* Coding-system the terminal accepts for display.  */
struct coding_system terminal_coding;

/* Coding-system of what is sent from the terminal keyboard.  */
struct coding_system keyboard_coding;

Lisp_Object Vcoding_system_alist;

#endif /* emacs */
286 Lisp_Object Qcoding_category_index;
288 /* List of symbols `coding-category-xxx' ordered by priority. */
289 Lisp_Object Vcoding_category_list;
291 /* Table of coding-systems currently assigned to each coding-category. */
292 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
294 /* Table of names of symbol for each coding-category. */
295 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
296 "coding-category-internal",
297 "coding-category-sjis",
298 "coding-category-iso-7",
299 "coding-category-iso-8-1",
300 "coding-category-iso-8-2",
301 "coding-category-iso-else",
302 "coding-category-big5",
303 "coding-category-binary"
306 /* Flag to tell if we look up unification table on character code
307 conversion. */
308 Lisp_Object Venable_character_unification;
309 /* Standard unification table to look up on reading (decoding). */
310 Lisp_Object Vstandard_character_unification_table_for_read;
311 /* Standard unification table to look up on writing (encoding). */
312 Lisp_Object Vstandard_character_unification_table_for_write;
314 Lisp_Object Qcharacter_unification_table;
316 /* Alist of charsets vs revision number. */
317 Lisp_Object Vcharset_revision_alist;
320 /*** 2. Emacs internal format handlers ***/
322 /* Emacs' internal format for encoding multiple character sets is a
323 kind of multi-byte encoding, i.e. encoding a character by a sequence
324 of one-byte codes of variable length. ASCII characters and control
325 characters (e.g. `tab', `newline') are represented by one-byte as
326 is. It takes the range 0x00 through 0x7F. The other characters
327 are represented by a sequence of `base leading-code', optional
328 `extended leading-code', and one or two `position-code's. Length
329 of the sequence is decided by the base leading-code. Leading-code
330 takes the range 0x80 through 0x9F, whereas extended leading-code
331 and position-code take the range 0xA0 through 0xFF. See the
332 document of `charset.h' for more detail about leading-code and
333 position-code.
335 There's one exception in this rule. Special leading-code
336 `leading-code-composition' denotes that the following several
337 characters should be composed into one character. Leading-codes of
338 components (except for ASCII) are added 0x20. An ASCII character
339 component is represented by a 2-byte sequence of `0xA0' and
340 `ASCII-code + 0x80'. See also the document in `charset.h' for the
341 detail of composite character. Hence, we can summarize the code
342 range as follows:
344 --- CODE RANGE of Emacs' internal format ---
345 (character set) (range)
346 ASCII 0x00 .. 0x7F
347 ELSE (1st byte) 0x80 .. 0x9F
348 (rest bytes) 0xA0 .. 0xFF
349 ---------------------------------------------
353 enum emacs_code_class_type emacs_code_class[256];
/* Continue with the next statement only if *SRC is accessible and
   its value is 0xA0 or greater; SRC is advanced past that byte.  If
   no byte is left, jump to `label_end_of_switch'; if the byte is
   below 0xA0, make the enclosing function return 0.  */
#define CHECK_CODE_RANGE_A0_FF          \
  do {                                  \
    if (src >= src_end)                 \
      goto label_end_of_switch;         \
    if (*src++ < 0xA0)                  \
      return 0;                         \
  } while (0)
365 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
366 Check if a text is encoded in Emacs' internal format. If it is,
367 return CODING_CATEGORY_MASK_INTERNAL, else return 0. */
370 detect_coding_internal (src, src_end)
371 unsigned char *src, *src_end;
373 unsigned char c;
374 int composing = 0;
376 while (src < src_end)
378 c = *src++;
380 if (composing)
382 if (c < 0xA0)
383 composing = 0;
384 else
385 c -= 0x20;
388 switch (emacs_code_class[c])
390 case EMACS_ascii_code:
391 case EMACS_linefeed_code:
392 break;
394 case EMACS_control_code:
395 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
396 return 0;
397 break;
399 case EMACS_invalid_code:
400 return 0;
402 case EMACS_leading_code_composition: /* c == 0x80 */
403 if (composing)
404 CHECK_CODE_RANGE_A0_FF;
405 else
406 composing = 1;
407 break;
409 case EMACS_leading_code_4:
410 CHECK_CODE_RANGE_A0_FF;
411 /* fall down to check it two more times ... */
413 case EMACS_leading_code_3:
414 CHECK_CODE_RANGE_A0_FF;
415 /* fall down to check it one more time ... */
417 case EMACS_leading_code_2:
418 CHECK_CODE_RANGE_A0_FF;
419 break;
421 default:
422 label_end_of_switch:
423 break;
426 return CODING_CATEGORY_MASK_INTERNAL;
430 /*** 3. ISO2022 handlers ***/
432 /* The following note describes the coding system ISO2022 briefly.
433 Since the intention of this note is to help understanding of the
434 programs in this file, some parts are NOT ACCURATE or OVERLY
435 SIMPLIFIED. For a thorough understanding, please refer to the
436 original document of ISO2022.
438 ISO2022 provides many mechanisms to encode several character sets
439 in 7-bit and 8-bit environments. If one chooses a 7-bit environment,
440 all text is encoded by codes of less than 128. This may make the
441 encoded text a little bit longer, but the text gets more stability
442 to pass through several gateways (some of them strip the MSB off).
444 There are two kinds of character sets: control character sets and
445 graphic character sets. The former contain control characters such
446 as `newline' and `escape' to provide control functions (control
447 functions are provided also by escape sequences). The latter
448 contain graphic characters such as 'A' and '-'. Emacs recognizes
449 two control character sets and many graphic character sets.
451 Graphic character sets are classified into one of the following
452 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
453 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
454 bytes (DIMENSION) and the number of characters in one dimension
455 (CHARS) of the set. In addition, each character set is assigned an
456 identification tag (called "final character" and denoted as <F>
457 here after) which is unique in each class. <F> of each character
458 set is decided by ECMA(*) when it is registered in ISO. Code range
459 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
461 Note (*): ECMA = European Computer Manufacturers Association
463 Here are examples of graphic character set [NAME(<F>)]:
464 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
465 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
466 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
467 o DIMENSION2_CHARS96 -- none for the moment
469 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
470 C0 [0x00..0x1F] -- control character plane 0
471 GL [0x20..0x7F] -- graphic character plane 0
472 C1 [0x80..0x9F] -- control character plane 1
473 GR [0xA0..0xFF] -- graphic character plane 1
475 A control character set is directly designated and invoked to C0 or
476 C1 by an escape sequence. The most common case is that ISO646's
477 control character set is designated/invoked to C0 and ISO6429's
478 control character set is designated/invoked to C1, and usually
479 these designations/invocations are omitted in a coded text. With
480 7-bit environment, only C0 can be used, and a control character for
481 C1 is encoded by an appropriate escape sequence to fit in the
482 environment. All control characters for C1 have corresponding
483 escape sequences defined.
485 A graphic character set is at first designated to one of four
486 graphic registers (G0 through G3), then these graphic registers are
487 invoked to GL or GR. These designations and invocations can be
488 done independently. The most common case is that G0 is invoked to
489 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
490 these invocations and designations are omitted in a coded text.
491 With 7-bit environment, only GL can be used.
493 When a graphic character set of CHARS94 is invoked to GL, code 0x20
494 and 0x7F of GL area work as control characters SPACE and DEL
495 respectively, and code 0xA0 and 0xFF of GR area should not be used.
497 There are two ways of invocation: locking-shift and single-shift.
498 With locking-shift, the invocation lasts until the next different
499 invocation, whereas with single-shift, the invocation works only
500 for the following character and doesn't affect locking-shift.
501 Invocations are done by the following control characters or escape
502 sequences.
504 ----------------------------------------------------------------------
505 function control char escape sequence description
506 ----------------------------------------------------------------------
507 SI (shift-in) 0x0F none invoke G0 to GL
508 SO (shift-out) 0x0E none invoke G1 to GL
509 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
510 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
511 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
512 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
513 ----------------------------------------------------------------------
514 The first four are for locking-shift. Control characters for these
515 functions are defined by macros ISO_CODE_XXX in `coding.h'.
517 Designations are done by the following escape sequences.
518 ----------------------------------------------------------------------
519 escape sequence description
520 ----------------------------------------------------------------------
521 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
522 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
523 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
524 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
525 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
526 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
527 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
528 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
529 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
530 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
531 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
532 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
533 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
534 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
535 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
536 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
537 ----------------------------------------------------------------------
539 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
540 of dimension 1, chars 94, and final character <F>, and etc.
542 Note (*): Although these designations are not allowed in ISO2022,
543 Emacs accepts them on decoding, and produces them on encoding
544 CHARS96 character set in a coding system which is characterized as
545 7-bit environment, non-locking-shift, and non-single-shift.
547 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
548 '(' can be omitted. We call this as "short-form" here after.
550 Now you may notice that there are a lot of ways for encoding the
551 same multilingual text in ISO2022. Actually, there exist many
552 coding systems such as Compound Text (used in X's inter client
553 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
554 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
555 localized platforms), and all of these are variants of ISO2022.
557 In addition to the above, Emacs handles two more kinds of escape
558 sequences: ISO6429's direction specification and Emacs' private
559 sequence for specifying character composition.
561 ISO6429's direction specification takes the following format:
562 o CSI ']' -- end of the current direction
563 o CSI '0' ']' -- end of the current direction
564 o CSI '1' ']' -- start of left-to-right text
565 o CSI '2' ']' -- start of right-to-left text
566 The control character CSI (0x9B: control sequence introducer) is
567 abbreviated to the escape sequence ESC '[' in 7-bit environment.
569 Character composition specification takes the following format:
570 o ESC '0' -- start character composition
571 o ESC '1' -- end character composition
572 Since these are not standard escape sequences of any ISO, the use
573 of them for these meaning is restricted to Emacs only. */
575 enum iso_code_class_type iso_code_class[256];
577 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
578 Check if a text is encoded in ISO2022. If it is, returns an
579 integer in which appropriate flag bits any of:
580 CODING_CATEGORY_MASK_ISO_7
581 CODING_CATEGORY_MASK_ISO_8_1
582 CODING_CATEGORY_MASK_ISO_8_2
583 CODING_CATEGORY_MASK_ISO_ELSE
584 are set. If a code which should never appear in ISO2022 is found,
585 returns 0. */
588 detect_coding_iso2022 (src, src_end)
589 unsigned char *src, *src_end;
591 int mask = CODING_CATEGORY_MASK_ANY;
592 int g1 = 0; /* 1 iff designating to G1. */
593 int c, i;
595 while (src < src_end)
597 c = *src++;
598 switch (c)
600 case ISO_CODE_ESC:
601 if (src >= src_end)
602 break;
603 c = *src++;
604 if (src < src_end
605 && ((c >= '(' && c <= '/')
606 || c == '$' && ((*src >= '(' && *src <= '/')
607 || (*src >= '@' && *src <= 'B'))))
609 /* Valid designation sequence. */
610 mask &= (CODING_CATEGORY_MASK_ISO_7
611 | CODING_CATEGORY_MASK_ISO_8_1
612 | CODING_CATEGORY_MASK_ISO_8_2
613 | CODING_CATEGORY_MASK_ISO_ELSE);
614 if (c == ')' || (c == '$' && *src == ')'))
616 g1 = 1;
617 mask &= ~CODING_CATEGORY_MASK_ISO_7;
619 src++;
620 break;
622 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
623 return CODING_CATEGORY_MASK_ISO_ELSE;
624 break;
626 case ISO_CODE_SO:
627 if (g1)
628 return CODING_CATEGORY_MASK_ISO_ELSE;
629 break;
631 case ISO_CODE_CSI:
632 case ISO_CODE_SS2:
633 case ISO_CODE_SS3:
634 mask &= ~CODING_CATEGORY_MASK_ISO_7;
635 break;
637 default:
638 if (c < 0x80)
639 break;
640 else if (c < 0xA0)
641 return 0;
642 else
644 int count = 1;
646 mask &= ~CODING_CATEGORY_MASK_ISO_7;
647 while (src < src_end && *src >= 0xA0)
648 count++, src++;
649 if (count & 1 && src < src_end)
650 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
652 break;
656 return mask;
/* Decode a character whose charset is CHARSET and whose 1st
   position-code is C1, storing its multi-byte form at `dst'.  If the
   dimension of CHARSET is 2, the 2nd position-code is fetched from
   `src' into `c2' (jumping to `label_end_of_loop' if it is missing).
   If CHARSET is negative, we are decoding ill-formed text and C1 is
   simply emitted the way an ASCII character is.

   At the head of a composition, LEADING_CODE_COMPOSITION (plus 0xFF
   when rules are embedded) is emitted first and `coding->composing'
   is advanced by 2.  When `unification_table' is non-nil and
   `unify_char' yields a non-negative character, that character
   replaces the charset and position-codes.

   Besides the macro arguments, this uses the caller's variables
   `coding', `src', `src_end', `dst', `c2' and `unification_table'.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int c_alt;                                                          \
    int charset_alt = (charset);                                        \
                                                                        \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          {                                                             \
            /* Marker telling that composition rules are embedded.  */  \
            *dst++ = 0xFF;                                              \
          }                                                             \
        coding->composing += 2;                                         \
      }                                                                 \
    if ((charset) >= 0)                                                 \
      {                                                                 \
        if (CHARSET_DIMENSION (charset) == 2)                           \
          ONE_MORE_BYTE (c2);                                           \
        if (!NILP (unification_table))                                  \
          {                                                             \
            c_alt = unify_char (unification_table,                      \
                                -1, (charset), c1, c2);                 \
            if (c_alt >= 0)                                             \
              SPLIT_CHAR (c_alt, charset_alt, c1, c2);                  \
          }                                                             \
      }                                                                 \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      {                                                                 \
        /* The next graphic byte is a composition rule.  */             \
        coding->composing = COMPOSING_WITH_RULE_RULE;                   \
      }                                                                 \
  } while (0)
/* Record in CODING that the charset identified by DIMENSION, CHARS
   and FINAL_CHAR is designated to graphic register REG.  Nothing is
   changed when ISO_CHARSET_TABLE has no (non-negative) charset for
   that triple.  While `coding->direction' is 1, the reverse charset
   is designated instead whenever one exists.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
  do {                                                                  \
    int charset = ISO_CHARSET_TABLE (dimension, chars, final_char);     \
    if (charset >= 0)                                                   \
      {                                                                 \
        if (coding->direction == 1)                                     \
          {                                                             \
            if (CHARSET_REVERSE_CHARSET (charset) >= 0)                 \
              charset = CHARSET_REVERSE_CHARSET (charset);              \
          }                                                             \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
      }                                                                 \
  } while (0)
709 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
712 decode_coding_iso2022 (coding, source, destination,
713 src_bytes, dst_bytes, consumed)
714 struct coding_system *coding;
715 unsigned char *source, *destination;
716 int src_bytes, dst_bytes;
717 int *consumed;
719 unsigned char *src = source;
720 unsigned char *src_end = source + src_bytes;
721 unsigned char *dst = destination;
722 unsigned char *dst_end = destination + dst_bytes;
723 /* Since the maximum bytes produced by each loop is 7, we subtract 6
724 from DST_END to assure that overflow checking is necessary only
725 at the head of loop. */
726 unsigned char *adjusted_dst_end = dst_end - 6;
727 int charset;
728 /* Charsets invoked to graphic plane 0 and 1 respectively. */
729 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
730 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
731 Lisp_Object unification_table = coding->character_unification_table;
733 if (!NILP (Venable_character_unification) && NILP (unification_table))
734 unification_table = Vstandard_character_unification_table_for_read;
736 while (src < src_end && dst < adjusted_dst_end)
738 /* SRC_BASE remembers the start position in source in each loop.
739 The loop will be exited when there's not enough source text
740 to analyze long escape sequence or 2-byte code (within macros
741 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
742 to SRC_BASE before exiting. */
743 unsigned char *src_base = src;
744 int c1 = *src++, c2;
746 switch (iso_code_class [c1])
748 case ISO_0x20_or_0x7F:
749 if (!coding->composing
750 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
752 /* This is SPACE or DEL. */
753 *dst++ = c1;
754 break;
756 /* This is a graphic character, we fall down ... */
758 case ISO_graphic_plane_0:
759 if (coding->composing == COMPOSING_WITH_RULE_RULE)
761 /* This is a composition rule. */
762 *dst++ = c1 | 0x80;
763 coding->composing = COMPOSING_WITH_RULE_TAIL;
765 else
766 DECODE_ISO_CHARACTER (charset0, c1);
767 break;
769 case ISO_0xA0_or_0xFF:
770 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
772 /* Invalid code. */
773 *dst++ = c1;
774 break;
776 /* This is a graphic character, we fall down ... */
778 case ISO_graphic_plane_1:
779 DECODE_ISO_CHARACTER (charset1, c1);
780 break;
782 case ISO_control_code:
783 /* All ISO2022 control characters in this class have the
784 same representation in Emacs internal format. */
785 *dst++ = c1;
786 break;
788 case ISO_carriage_return:
789 if (coding->eol_type == CODING_EOL_CR)
791 *dst++ = '\n';
793 else if (coding->eol_type == CODING_EOL_CRLF)
795 ONE_MORE_BYTE (c1);
796 if (c1 == ISO_CODE_LF)
797 *dst++ = '\n';
798 else
800 src--;
801 *dst++ = c1;
804 else
806 *dst++ = c1;
808 break;
810 case ISO_shift_out:
811 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
812 goto label_invalid_escape_sequence;
813 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
814 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
815 break;
817 case ISO_shift_in:
818 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
819 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
820 break;
822 case ISO_single_shift_2_7:
823 case ISO_single_shift_2:
824 /* SS2 is handled as an escape sequence of ESC 'N' */
825 c1 = 'N';
826 goto label_escape_sequence;
828 case ISO_single_shift_3:
829 /* SS2 is handled as an escape sequence of ESC 'O' */
830 c1 = 'O';
831 goto label_escape_sequence;
833 case ISO_control_sequence_introducer:
834 /* CSI is handled as an escape sequence of ESC '[' ... */
835 c1 = '[';
836 goto label_escape_sequence;
838 case ISO_escape:
839 ONE_MORE_BYTE (c1);
840 label_escape_sequence:
841 /* Escape sequences handled by Emacs are invocation,
842 designation, direction specification, and character
843 composition specification. */
844 switch (c1)
846 case '&': /* revision of following character set */
847 ONE_MORE_BYTE (c1);
848 if (!(c1 >= '@' && c1 <= '~'))
849 goto label_invalid_escape_sequence;
850 ONE_MORE_BYTE (c1);
851 if (c1 != ISO_CODE_ESC)
852 goto label_invalid_escape_sequence;
853 ONE_MORE_BYTE (c1);
854 goto label_escape_sequence;
856 case '$': /* designation of 2-byte character set */
857 ONE_MORE_BYTE (c1);
858 if (c1 >= '@' && c1 <= 'B')
859 { /* designation of JISX0208.1978, GB2312.1980,
860 or JISX0208.1980 */
861 DECODE_DESIGNATION (0, 2, 94, c1);
863 else if (c1 >= 0x28 && c1 <= 0x2B)
864 { /* designation of DIMENSION2_CHARS94 character set */
865 ONE_MORE_BYTE (c2);
866 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
868 else if (c1 >= 0x2C && c1 <= 0x2F)
869 { /* designation of DIMENSION2_CHARS96 character set */
870 ONE_MORE_BYTE (c2);
871 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
873 else
874 goto label_invalid_escape_sequence;
875 break;
877 case 'n': /* invocation of locking-shift-2 */
878 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
879 goto label_invalid_escape_sequence;
880 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
881 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
882 break;
884 case 'o': /* invocation of locking-shift-3 */
885 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
886 goto label_invalid_escape_sequence;
887 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
888 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
889 break;
891 case 'N': /* invocation of single-shift-2 */
892 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
893 goto label_invalid_escape_sequence;
894 ONE_MORE_BYTE (c1);
895 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
896 DECODE_ISO_CHARACTER (charset, c1);
897 break;
899 case 'O': /* invocation of single-shift-3 */
900 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
901 goto label_invalid_escape_sequence;
902 ONE_MORE_BYTE (c1);
903 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
904 DECODE_ISO_CHARACTER (charset, c1);
905 break;
907 case '0': /* start composing without embeded rules */
908 coding->composing = COMPOSING_NO_RULE_HEAD;
909 break;
911 case '1': /* end composing */
912 coding->composing = COMPOSING_NO;
913 break;
915 case '2': /* start composing with embeded rules */
916 coding->composing = COMPOSING_WITH_RULE_HEAD;
917 break;
919 case '[': /* specification of direction */
920 /* For the moment, nested direction is not supported.
921 So, the value of `coding->direction' is 0 or 1: 0
922 means left-to-right, 1 means right-to-left. */
923 ONE_MORE_BYTE (c1);
924 switch (c1)
926 case ']': /* end of the current direction */
927 coding->direction = 0;
929 case '0': /* end of the current direction */
930 case '1': /* start of left-to-right direction */
931 ONE_MORE_BYTE (c1);
932 if (c1 == ']')
933 coding->direction = 0;
934 else
935 goto label_invalid_escape_sequence;
936 break;
938 case '2': /* start of right-to-left direction */
939 ONE_MORE_BYTE (c1);
940 if (c1 == ']')
941 coding->direction= 1;
942 else
943 goto label_invalid_escape_sequence;
944 break;
946 default:
947 goto label_invalid_escape_sequence;
949 break;
951 default:
952 if (c1 >= 0x28 && c1 <= 0x2B)
953 { /* designation of DIMENSION1_CHARS94 character set */
954 ONE_MORE_BYTE (c2);
955 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
957 else if (c1 >= 0x2C && c1 <= 0x2F)
958 { /* designation of DIMENSION1_CHARS96 character set */
959 ONE_MORE_BYTE (c2);
960 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
962 else
964 goto label_invalid_escape_sequence;
967 /* We must update these variables now. */
968 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
969 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
970 break;
972 label_invalid_escape_sequence:
974 int length = src - src_base;
976 bcopy (src_base, dst, length);
977 dst += length;
980 continue;
982 label_end_of_loop:
983 coding->carryover_size = src - src_base;
984 bcopy (src_base, coding->carryover, coding->carryover_size);
985 src = src_base;
986 break;
989 /* If this is the last block of the text to be decoded, we had
990 better just flush out all remaining codes in the text although
991 they are not valid characters. */
992 if (coding->last_block)
994 bcopy (src, dst, src_end - src);
995 dst += (src_end - src);
996 src = src_end;
998 *consumed = src - source;
999 return dst - destination;
1002 /* ISO2022 encoding stuff. */
1005 It is not enough to say just "ISO2022" on encoding, but we have to
1006 specify more details. In Emacs, each coding-system of ISO2022
1007 variant has the following specifications:
1008 1. Initial designation to G0 thru G3.
1009 2. Allows short-form designation?
1010 3. ASCII should be designated to G0 before control characters?
1011 4. ASCII should be designated to G0 at end of line?
1012 5. 7-bit environment or 8-bit environment?
1013 6. Use locking-shift?
1014 7. Use Single-shift?
1015 And the following two are only for Japanese:
1016 8. Use ASCII in place of JIS0201-1976-Roman?
1017 9. Use JISX0208-1983 in place of JISX0208-1978?
1018 These specifications are encoded in `coding->flags' as flag bits
1019 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1020 detail.
1023 /* Produce codes (escape sequence) for designating CHARSET to graphic
1024 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1025 the coding system CODING allows, produce designation sequence of
1026 short-form. */
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, then record the designation in CODING.  The sequence
   is: an optional revision announcement (ESC '&' <revision + '@'>)
   when CHARSET has an entry in Vcharset_revision_alist, ESC, '$' for
   a multi-dimension charset, an intermediate character selected by
   REG, and the final character of CHARSET.  For a multi-dimension
   94-char set the intermediate character is omitted when CODING
   allows the short form, REG is 0, and the final character is '@',
   'A' or 'B'.  Expects the variable `dst' to be in scope.  */
#define ENCODE_DESIGNATION(charset, reg, coding)                         \
  do {                                                                   \
    unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset);         \
    const char *inter_94 = "()*+";                                       \
    const char *inter_96 = ",-./";                                       \
    Lisp_Object revision                                                 \
      = Fassq (make_number (charset), Vcharset_revision_alist);          \
    if (! NILP (revision))                                               \
      {                                                                  \
        dst[0] = ISO_CODE_ESC;                                           \
        dst[1] = '&';                                                    \
        dst[2] = XINT (XCONS (revision)->cdr) + '@';                     \
        dst += 3;                                                        \
      }                                                                  \
    *dst++ = ISO_CODE_ESC;                                               \
    if (CHARSET_DIMENSION (charset) != 1)                                \
      *dst++ = '$';                                                      \
    if (CHARSET_CHARS (charset) != 94)                                   \
      *dst++ = (unsigned char) (inter_96[reg]);                          \
    else if (CHARSET_DIMENSION (charset) == 1                            \
             || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)           \
             || reg != 0                                                 \
             || final_byte < '@' || final_byte > 'B')                    \
      *dst++ = (unsigned char) (inter_94[reg]);                          \
    *dst++ = final_byte;                                                 \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                 \
  } while (0)
1066 /* The following two macros produce codes (control character or escape
1067 sequence) for ISO2022 single-shift functions (single-shift-2 and
1068 single-shift-3). */
/* Emit single-shift-2: ESC 'N' when CODING is flagged as 7-bit, the
   one-byte SS2 control otherwise, and mark CODING as being in the
   middle of a single shift.  Expects `coding' and `dst' in scope.  */
#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'N';                                   \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_SS2;                          \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)

/* Same as ENCODE_SINGLE_SHIFT_2 but for single-shift-3: ESC 'O' or
   the one-byte SS3 control.  */
#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'O';                                   \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_SS3;                          \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
1088 /* The following four macros produce codes (control character or
1089 escape sequence) for ISO2022 locking-shift functions (shift-in,
1090 shift-out, locking-shift-2, and locking-shift-3). */
/* Each macro below emits one locking-shift function at DST and
   records in CODING which graphic register is now invoked to graphic
   plane 0.  They expect `coding' and `dst' to be in scope.  */

/* Shift-in: invoke graphic register 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

/* Shift-out: invoke graphic register 1.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

/* Locking-shift-2 (ESC 'n'): invoke graphic register 2.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
  } while (0)

/* Locking-shift-3 (ESC 'o'): invoke graphic register 3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
  } while (0)
1116 /* Produce codes for a DIMENSION1 character of which character set is
1117 CHARSET and position-code is C1. Designation and invocation
1118 sequences are also produced in advance if necessary. */
/* Emit the one-byte position code C1 of a character of CHARSET.
   The loop body runs until the byte has been written: if a single
   shift is pending, or CHARSET is already invoked to graphic plane 0
   or 1, the byte is written (7-bit or with the high bit set, as
   appropriate) and the loop ends; otherwise
   encode_invocation_designation is called to emit the needed
   designation/invocation and the test is repeated.  Expects `coding'
   and `dst' to be in scope.  */
#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                     \
  do {                                                                   \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                        \
      {                                                                  \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)           \
                  ? (c1 & 0x7F) : (c1 | 0x80));                          \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                    \
        break;                                                           \
      }                                                                  \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))            \
      {                                                                  \
        *dst++ = c1 & 0x7F;                                              \
        break;                                                           \
      }                                                                  \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))            \
      {                                                                  \
        *dst++ = c1 | 0x80;                                              \
        break;                                                           \
      }                                                                  \
    /* CHARSET is not invoked to any graphic plane yet: invoke it       \
       (designating it to a graphic register first if necessary),       \
       then go round again to produce the character itself.  */         \
    dst = encode_invocation_designation (charset, coding, dst);          \
  } while (1)
1150 /* Produce codes for a DIMENSION2 character of which character set is
1151 CHARSET and position-codes are C1 and C2. Designation and
1152 invocation codes are also produced in advance if necessary. */
/* Emit the two position codes C1 and C2 of a character of CHARSET.
   Works like ENCODE_ISO_CHARACTER_DIMENSION1: the bytes are written
   as soon as a single shift is pending or CHARSET is invoked to
   graphic plane 0 or 1; until then encode_invocation_designation is
   called and the test repeated.  Expects `coding' and `dst' to be in
   scope.  */
#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                 \
  do {                                                                   \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                        \
      {                                                                  \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                  \
          {                                                              \
            *dst++ = c1 & 0x7F;                                          \
            *dst++ = c2 & 0x7F;                                          \
          }                                                              \
        else                                                             \
          {                                                              \
            *dst++ = c1 | 0x80;                                          \
            *dst++ = c2 | 0x80;                                          \
          }                                                              \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                    \
        break;                                                           \
      }                                                                  \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))            \
      {                                                                  \
        *dst++ = c1 & 0x7F;                                              \
        *dst++ = c2 & 0x7F;                                              \
        break;                                                           \
      }                                                                  \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))            \
      {                                                                  \
        *dst++ = c1 | 0x80;                                              \
        *dst++ = c2 | 0x80;                                              \
        break;                                                           \
      }                                                                  \
    /* CHARSET is not invoked to any graphic plane yet: invoke it       \
       (designating it to a graphic register first if necessary),       \
       then go round again to produce the character itself.  */         \
    dst = encode_invocation_designation (charset, coding, dst);          \
  } while (1)
/* Encode one character of CHARSET whose position codes are C1 and C2
   (C2 is a dummy for a one-dimension charset).  When a unification
   table is in effect and unify_char yields an alternative character
   (a non-negative value), that character's charset and position
   codes are used instead; SPLIT_CHAR then overwrites C1 and C2, so
   both must be lvalues.  A negative result from unify_char means
   that no alternative exists and the character is encoded as given;
   this is the same convention encode_designation_at_bol relies on.
   Expects `unification_table', `coding' and `dst' to be in scope.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)                              \
  do {                                                                     \
    int c_alt, charset_alt;                                                \
    if (!NILP (unification_table)                                          \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                         \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                             \
    else                                                                   \
      charset_alt = charset;                                               \
    if (CHARSET_DIMENSION (charset_alt) == 1)                              \
      ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                   \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);               \
  } while (0)
1198 /* Produce designation and invocation codes at a place pointed by DST
1199 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1200 Return new DST. */
1202 unsigned char *
1203 encode_invocation_designation (charset, coding, dst)
1204 int charset;
1205 struct coding_system *coding;
1206 unsigned char *dst;
1208 int reg; /* graphic register number */
1210 /* At first, check designations. */
1211 for (reg = 0; reg < 4; reg++)
1212 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1213 break;
1215 if (reg >= 4)
1217 /* CHARSET is not yet designated to any graphic registers. */
1218 /* At first check the requested designation. */
1219 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1220 if (reg < 0)
1221 /* Since CHARSET requests no special designation, designate to
1222 graphic register 0. */
1223 reg = 0;
1225 ENCODE_DESIGNATION (charset, reg, coding);
1228 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1229 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1231 /* Since the graphic register REG is not invoked to any graphic
1232 planes, invoke it to graphic plane 0. */
1233 switch (reg)
1235 case 0: /* graphic register 0 */
1236 ENCODE_SHIFT_IN;
1237 break;
1239 case 1: /* graphic register 1 */
1240 ENCODE_SHIFT_OUT;
1241 break;
1243 case 2: /* graphic register 2 */
1244 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1245 ENCODE_SINGLE_SHIFT_2;
1246 else
1247 ENCODE_LOCKING_SHIFT_2;
1248 break;
1250 case 3: /* graphic register 3 */
1251 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1252 ENCODE_SINGLE_SHIFT_3;
1253 else
1254 ENCODE_LOCKING_SHIFT_3;
1255 break;
1258 return dst;
/* The following three macros produce the two-byte escape sequences
   that mark composition: start without embedded rules (ESC '0'),
   start with embedded rules (ESC '2'), and end (ESC '1').  Each is an
   expression that advances `dst', which must be in scope.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
/* The following three macros produce codes for indicating direction
   of text.  Each expands to a single statement and expects the
   variables `coding' and `dst' to be in scope.  */

/* Produce a control sequence introducer: ESC '[' when the 7-bit flag
   is set in CODING->flags, the one-byte CSI otherwise.  The flag is
   tested as a bit, like every other use of
   CODING_FLAG_ISO_SEVEN_BITS in this file; comparing the whole flag
   word for equality would select the 8-bit CSI whenever any other
   flag is set as well.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

/* Start of right-to-left text: CSI '2' ']'.  */
#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2', *dst++ = ']';                         \
  } while (0)

/* Back to left-to-right text: CSI '0' ']'.  */
#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0', *dst++ = ']';                         \
  } while (0)
1282 /* Produce codes for designation and invocation to reset the graphic
1283 planes and registers to initial state. */
/* Bring the graphic planes and registers of CODING back to their
   initial state: produce shift-in unless graphic register 0 is
   already invoked to plane 0, then re-designate every register that
   has an initial designation differing from its current one.
   Expects `coding' and `dst' to be in scope.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                    \
  do {                                                                     \
    int reset_reg = 0;                                                     \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                       \
      ENCODE_SHIFT_IN;                                                     \
    while (reset_reg < 4)                                                  \
      {                                                                    \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg) >= 0   \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reset_reg)            \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding,            \
                                                        reset_reg)))       \
          ENCODE_DESIGNATION                                               \
            (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg),      \
             reset_reg, coding);                                           \
        reset_reg++;                                                       \
      }                                                                    \
  } while (0)
1297 /* Produce designation sequences of charsets in the line started from
1298 *SRC to a place pointed by DSTP.
1300 If the current block ends before any end-of-line, we may fail to
1301 find all the necessary *designations. */
1302 encode_designation_at_bol (coding, table, src, src_end, dstp)
1303 struct coding_system *coding;
1304 Lisp_Object table;
1305 unsigned char *src, *src_end, **dstp;
1307 int charset, c, found = 0, reg;
1308 /* Table of charsets to be designated to each graphic register. */
1309 int r[4];
1310 unsigned char *dst = *dstp;
1312 for (reg = 0; reg < 4; reg++)
1313 r[reg] = -1;
1315 while (src < src_end && *src != '\n' && found < 4)
1317 int bytes = BYTES_BY_CHAR_HEAD (*src);
1319 if (NILP (table))
1320 charset = CHARSET_AT (src);
1321 else
1323 int c_alt, c1, c2;
1325 SPLIT_STRING(src, bytes, charset, c1, c2);
1326 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1327 charset = CHAR_CHARSET (c_alt);
1330 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1331 if (r[reg] < 0)
1333 found++;
1334 r[reg] = charset;
1337 src += bytes;
1340 if (found)
1342 for (reg = 0; reg < 4; reg++)
1343 if (r[reg] >= 0
1344 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1345 ENCODE_DESIGNATION (r[reg], reg, coding);
1346 *dstp = dst;
1350 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1353 encode_coding_iso2022 (coding, source, destination,
1354 src_bytes, dst_bytes, consumed)
1355 struct coding_system *coding;
1356 unsigned char *source, *destination;
1357 int src_bytes, dst_bytes;
1358 int *consumed;
1360 unsigned char *src = source;
1361 unsigned char *src_end = source + src_bytes;
1362 unsigned char *dst = destination;
1363 unsigned char *dst_end = destination + dst_bytes;
1364 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1365 from DST_END to assure overflow checking is necessary only at the
1366 head of loop. */
1367 unsigned char *adjusted_dst_end = dst_end - 19;
1368 Lisp_Object unification_table = coding->character_unification_table;
1370 if (!NILP (Venable_character_unification) && NILP (unification_table))
1371 unification_table = Vstandard_character_unification_table_for_write;
1373 while (src < src_end && dst < adjusted_dst_end)
1375 /* SRC_BASE remembers the start position in source in each loop.
1376 The loop will be exited when there's not enough source text
1377 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1378 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1379 reset to SRC_BASE before exiting. */
1380 unsigned char *src_base = src;
1381 int charset, c1, c2, c3, c4;
1383 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1384 && CODING_SPEC_ISO_BOL (coding))
1386 /* We have to produce designation sequences if any now. */
1387 encode_designation_at_bol (coding, unification_table,
1388 src, src_end, &dst);
1389 CODING_SPEC_ISO_BOL (coding) = 0;
1392 c1 = *src++;
1393 /* If we are seeing a component of a composite character, we are
1394 seeing a leading-code specially encoded for composition, or a
1395 composition rule if composing with rule. We must set C1
1396 to a normal leading-code or an ASCII code. If we are not at
1397 a composed character, we must reset the composition state. */
1398 if (COMPOSING_P (coding->composing))
1400 if (c1 < 0xA0)
1402 /* We are not in a composite character any longer. */
1403 coding->composing = COMPOSING_NO;
1404 ENCODE_COMPOSITION_END;
1406 else
1408 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1410 *dst++ = c1 & 0x7F;
1411 coding->composing = COMPOSING_WITH_RULE_HEAD;
1412 continue;
1414 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1415 coding->composing = COMPOSING_WITH_RULE_RULE;
1416 if (c1 == 0xA0)
1418 /* This is an ASCII component. */
1419 ONE_MORE_BYTE (c1);
1420 c1 &= 0x7F;
1422 else
1423 /* This is a leading-code of non ASCII component. */
1424 c1 -= 0x20;
1428 /* Now encode one character. C1 is a control character, an
1429 ASCII character, or a leading-code of multi-byte character. */
1430 switch (emacs_code_class[c1])
1432 case EMACS_ascii_code:
1433 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1434 break;
1436 case EMACS_control_code:
1437 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1438 ENCODE_RESET_PLANE_AND_REGISTER;
1439 *dst++ = c1;
1440 break;
1442 case EMACS_carriage_return_code:
1443 if (!coding->selective)
1445 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1446 ENCODE_RESET_PLANE_AND_REGISTER;
1447 *dst++ = c1;
1448 break;
1450 /* fall down to treat '\r' as '\n' ... */
1452 case EMACS_linefeed_code:
1453 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1454 ENCODE_RESET_PLANE_AND_REGISTER;
1455 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1456 bcopy (coding->spec.iso2022.initial_designation,
1457 coding->spec.iso2022.current_designation,
1458 sizeof coding->spec.iso2022.initial_designation);
1459 if (coding->eol_type == CODING_EOL_LF
1460 || coding->eol_type == CODING_EOL_AUTOMATIC)
1461 *dst++ = ISO_CODE_LF;
1462 else if (coding->eol_type == CODING_EOL_CRLF)
1463 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1464 else
1465 *dst++ = ISO_CODE_CR;
1466 CODING_SPEC_ISO_BOL (coding) = 1;
1467 break;
1469 case EMACS_leading_code_2:
1470 ONE_MORE_BYTE (c2);
1471 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1472 break;
1474 case EMACS_leading_code_3:
1475 TWO_MORE_BYTES (c2, c3);
1476 if (c1 < LEADING_CODE_PRIVATE_11)
1477 ENCODE_ISO_CHARACTER (c1, c2, c3);
1478 else
1479 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1480 break;
1482 case EMACS_leading_code_4:
1483 THREE_MORE_BYTES (c2, c3, c4);
1484 ENCODE_ISO_CHARACTER (c2, c3, c4);
1485 break;
1487 case EMACS_leading_code_composition:
1488 ONE_MORE_BYTE (c1);
1489 if (c1 == 0xFF)
1491 coding->composing = COMPOSING_WITH_RULE_HEAD;
1492 ENCODE_COMPOSITION_WITH_RULE_START;
1494 else
1496 /* Rewind one byte because it is a character code of
1497 composition elements. */
1498 src--;
1499 coding->composing = COMPOSING_NO_RULE_HEAD;
1500 ENCODE_COMPOSITION_NO_RULE_START;
1502 break;
1504 case EMACS_invalid_code:
1505 *dst++ = c1;
1506 break;
1508 continue;
1509 label_end_of_loop:
1510 coding->carryover_size = src - src_base;
1511 bcopy (src_base, coding->carryover, coding->carryover_size);
1512 break;
1515 /* If this is the last block of the text to be encoded, we must
1516 reset graphic planes and registers to the initial state. */
1517 if (src >= src_end && coding->last_block)
1519 ENCODE_RESET_PLANE_AND_REGISTER;
1520 if (coding->carryover_size > 0
1521 && coding->carryover_size < (dst_end - dst))
1523 bcopy (coding->carryover, dst, coding->carryover_size);
1524 dst += coding->carryover_size;
1525 coding->carryover_size = 0;
1528 *consumed = src - source;
1529 return dst - destination;
1533 /*** 4. SJIS and BIG5 handlers ***/
1535 /* Although SJIS and BIG5 are not ISO coding systems, they are used
1536 quite widely. So, for the moment, Emacs supports them in the bare
1537 C code. But, in the future, they may be supported only by CCL. */
1539 /* SJIS is a coding system encoding three character sets: ASCII, right
1540 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1541 as is. A character of charset katakana-jisx0201 is encoded by
1542 "position-code + 0x80". A character of charset japanese-jisx0208
1543 is encoded in 2-byte but two position-codes are divided and shifted
1544 so that it fit in the range below.
1546 --- CODE RANGE of SJIS ---
1547 (character set) (range)
1548 ASCII 0x00 .. 0x7F
1549 KATAKANA-JISX0201 0xA0 .. 0xDF
1550 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1551 (2nd byte) 0x40 .. 0xFF
1552 -------------------------------
1556 /* BIG5 is a coding system encoding two character sets: ASCII and
1557 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1558 character set and is encoded in two-byte.
1560 --- CODE RANGE of BIG5 ---
1561 (character set) (range)
1562 ASCII 0x00 .. 0x7F
1563 Big5 (1st byte) 0xA1 .. 0xFE
1564 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1565 --------------------------
1567 Since the number of characters in Big5 is larger than maximum
1568 characters in Emacs' charset (96x96), it can't be handled as one
1569 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1570 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1571 contains frequently used characters and the latter contains less
1572 frequently used characters. */
1574 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1575 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1576 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
1577 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
/* Number of Big5 characters which have the same code in 1st byte:
   94 second bytes in 0xA1..0xFE plus 63 in 0x40..0x7E.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 into an Emacs charset and
   position codes.  CHARSET, C1 and C2 must be lvalues; CHARSET is
   set to charset_big5_1 when B1 < 0xC9 and to charset_big5_2
   otherwise.  Every argument is parenthesized in the expansion so
   that expressions can be passed safely; B1 and B2 are evaluated
   more than once and must be free of side effects.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                             \
  do {                                                                   \
    unsigned int temp                                                    \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                   \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                          \
    if ((b1) < 0xC9)                                                     \
      (charset) = charset_big5_1;                                        \
    else                                                                 \
      {                                                                  \
        (charset) = charset_big5_2;                                      \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                           \
      }                                                                  \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                  \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                  \
  } while (0)

/* Inverse of DECODE_BIG5: convert position codes C1, C2 of CHARSET
   (charset_big5_1 or charset_big5_2) into the BIG5 byte pair B1, B2,
   which must be lvalues.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                             \
  do {                                                                   \
    unsigned int temp                                                    \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                   \
    if ((charset) == charset_big5_2)                                     \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                             \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                  \
    (b2) = temp % BIG5_SAME_ROW;                                         \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                   \
  } while (0)
1607 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1608 Check if a text is encoded in SJIS. If it is, return
1609 CODING_CATEGORY_MASK_SJIS, else return 0. */
1612 detect_coding_sjis (src, src_end)
1613 unsigned char *src, *src_end;
1615 unsigned char c;
1617 while (src < src_end)
1619 c = *src++;
1620 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1621 return 0;
1622 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1624 if (src < src_end && *src++ < 0x40)
1625 return 0;
1628 return CODING_CATEGORY_MASK_SJIS;
1631 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1632 Check if a text is encoded in BIG5. If it is, return
1633 CODING_CATEGORY_MASK_BIG5, else return 0. */
1636 detect_coding_big5 (src, src_end)
1637 unsigned char *src, *src_end;
1639 unsigned char c;
1641 while (src < src_end)
1643 c = *src++;
1644 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1645 return 0;
1646 if (c >= 0xA1)
1648 if (src >= src_end)
1649 break;
1650 c = *src++;
1651 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1652 return 0;
1655 return CODING_CATEGORY_MASK_BIG5;
1658 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1659 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
1662 decode_coding_sjis_big5 (coding, source, destination,
1663 src_bytes, dst_bytes, consumed, sjis_p)
1664 struct coding_system *coding;
1665 unsigned char *source, *destination;
1666 int src_bytes, dst_bytes;
1667 int *consumed;
1668 int sjis_p;
1670 unsigned char *src = source;
1671 unsigned char *src_end = source + src_bytes;
1672 unsigned char *dst = destination;
1673 unsigned char *dst_end = destination + dst_bytes;
1674 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1675 from DST_END to assure overflow checking is necessary only at the
1676 head of loop. */
1677 unsigned char *adjusted_dst_end = dst_end - 3;
1679 while (src < src_end && dst < adjusted_dst_end)
1681 /* SRC_BASE remembers the start position in source in each loop.
1682 The loop will be exited when there's not enough source text
1683 to analyze two-byte character (within macro ONE_MORE_BYTE).
1684 In that case, SRC is reset to SRC_BASE before exiting. */
1685 unsigned char *src_base = src;
1686 unsigned char c1 = *src++, c2, c3, c4;
1688 if (c1 == '\r')
1690 if (coding->eol_type == CODING_EOL_CRLF)
1692 ONE_MORE_BYTE (c2);
1693 if (c2 == '\n')
1694 *dst++ = c2;
1695 else
1696 /* To process C2 again, SRC is subtracted by 1. */
1697 *dst++ = c1, src--;
1699 else
1700 *dst++ = c1;
1702 else if (c1 < 0x80)
1703 *dst++ = c1;
1704 else if (c1 < 0xA0 || c1 >= 0xE0)
1706 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1707 if (sjis_p)
1709 ONE_MORE_BYTE (c2);
1710 DECODE_SJIS (c1, c2, c3, c4);
1711 DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
1713 else if (c1 >= 0xE0 && c1 < 0xFF)
1715 int charset;
1717 ONE_MORE_BYTE (c2);
1718 DECODE_BIG5 (c1, c2, charset, c3, c4);
1719 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1721 else /* Invalid code */
1722 *dst++ = c1;
1724 else
1726 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1727 if (sjis_p)
1728 DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
1729 else
1731 int charset;
1733 ONE_MORE_BYTE (c2);
1734 DECODE_BIG5 (c1, c2, charset, c3, c4);
1735 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1738 continue;
1740 label_end_of_loop:
1741 coding->carryover_size = src - src_base;
1742 bcopy (src_base, coding->carryover, coding->carryover_size);
1743 src = src_base;
1744 break;
1747 *consumed = src - source;
1748 return dst - destination;
1751 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1752 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1753 `charset_jisx0208', `charset_big5_1', and `charset_big5_2'. We are
1754 sure that all these charsets are registered as official charset
1755 (i.e. do not have extended leading-codes). Characters of other
1756 charsets are produced without any encoding. If SJIS_P is 1, encode
1757 SJIS text, else encode BIG5 text. */
1760 encode_coding_sjis_big5 (coding, source, destination,
1761 src_bytes, dst_bytes, consumed, sjis_p)
1762 struct coding_system *coding;
1763 unsigned char *source, *destination;
1764 int src_bytes, dst_bytes;
1765 int *consumed;
1766 int sjis_p;
1768 unsigned char *src = source;
1769 unsigned char *src_end = source + src_bytes;
1770 unsigned char *dst = destination;
1771 unsigned char *dst_end = destination + dst_bytes;
1772 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1773 from DST_END to assure overflow checking is necessary only at the
1774 head of loop. */
1775 unsigned char *adjusted_dst_end = dst_end - 1;
1777 while (src < src_end && dst < adjusted_dst_end)
1779 /* SRC_BASE remembers the start position in source in each loop.
1780 The loop will be exited when there's not enough source text
1781 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1782 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1783 before exiting. */
1784 unsigned char *src_base = src;
1785 unsigned char c1 = *src++, c2, c3, c4;
1787 if (coding->composing)
1789 if (c1 == 0xA0)
1791 ONE_MORE_BYTE (c1);
1792 c1 &= 0x7F;
1794 else if (c1 >= 0xA0)
1795 c1 -= 0x20;
1796 else
1797 coding->composing = 0;
1800 switch (emacs_code_class[c1])
1802 case EMACS_ascii_code:
1803 case EMACS_control_code:
1804 *dst++ = c1;
1805 break;
1807 case EMACS_carriage_return_code:
1808 if (!coding->selective)
1810 *dst++ = c1;
1811 break;
1813 /* fall down to treat '\r' as '\n' ... */
1815 case EMACS_linefeed_code:
1816 if (coding->eol_type == CODING_EOL_LF
1817 || coding->eol_type == CODING_EOL_AUTOMATIC)
1818 *dst++ = '\n';
1819 else if (coding->eol_type == CODING_EOL_CRLF)
1820 *dst++ = '\r', *dst++ = '\n';
1821 else
1822 *dst++ = '\r';
1823 break;
1825 case EMACS_leading_code_2:
1826 ONE_MORE_BYTE (c2);
1827 if (sjis_p && c1 == charset_katakana_jisx0201)
1828 *dst++ = c2;
1829 else
1830 *dst++ = c1, *dst++ = c2;
1831 break;
1833 case EMACS_leading_code_3:
1834 TWO_MORE_BYTES (c2, c3);
1835 c2 &= 0x7F, c3 &= 0x7F;
1836 if (sjis_p && c1 == charset_jisx0208)
1838 unsigned char s1, s2;
1840 ENCODE_SJIS (c2, c3, s1, s2);
1841 *dst++ = s1, *dst++ = s2;
1843 else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
1845 unsigned char b1, b2;
1847 ENCODE_BIG5 (c1, c2, c3, b1, b2);
1848 *dst++ = b1, *dst++ = b2;
1850 else
1851 *dst++ = c1, *dst++ = c2, *dst++ = c3;
1852 break;
1854 case EMACS_leading_code_4:
1855 THREE_MORE_BYTES (c2, c3, c4);
1856 *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
1857 break;
1859 case EMACS_leading_code_composition:
1860 coding->composing = 1;
1861 break;
1863 default: /* i.e. case EMACS_invalid_code: */
1864 *dst++ = c1;
1866 continue;
1868 label_end_of_loop:
1869 coding->carryover_size = src - src_base;
1870 bcopy (src_base, coding->carryover, coding->carryover_size);
1871 src = src_base;
1872 break;
1875 *consumed = src - source;
1876 return dst - destination;
1880 /*** 5. End-of-line handlers ***/
1882 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1883 This function is called only when `coding->eol_type' is
1884 CODING_EOL_CRLF or CODING_EOL_CR. */
1886 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1887 struct coding_system *coding;
1888 unsigned char *source, *destination;
1889 int src_bytes, dst_bytes;
1890 int *consumed;
1892 unsigned char *src = source;
1893 unsigned char *src_end = source + src_bytes;
1894 unsigned char *dst = destination;
1895 unsigned char *dst_end = destination + dst_bytes;
1896 int produced;
1898 switch (coding->eol_type)
1900 case CODING_EOL_CRLF:
1902 /* Since the maximum bytes produced by each loop is 2, we
1903 subtract 1 from DST_END to assure overflow checking is
1904 necessary only at the head of loop. */
1905 unsigned char *adjusted_dst_end = dst_end - 1;
1907 while (src < src_end && dst < adjusted_dst_end)
1909 unsigned char *src_base = src;
1910 unsigned char c = *src++;
1911 if (c == '\r')
1913 ONE_MORE_BYTE (c);
1914 if (c != '\n')
1915 *dst++ = '\r';
1916 *dst++ = c;
1918 else
1919 *dst++ = c;
1920 continue;
1922 label_end_of_loop:
1923 coding->carryover_size = src - src_base;
1924 bcopy (src_base, coding->carryover, coding->carryover_size);
1925 src = src_base;
1926 break;
1928 *consumed = src - source;
1929 produced = dst - destination;
1930 break;
1933 case CODING_EOL_CR:
1934 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1935 bcopy (source, destination, produced);
1936 dst_end = destination + produced;
1937 while (dst < dst_end)
1938 if (*dst++ == '\r') dst[-1] = '\n';
1939 *consumed = produced;
1940 break;
1942 default: /* i.e. case: CODING_EOL_LF */
1943 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1944 bcopy (source, destination, produced);
1945 *consumed = produced;
1946 break;
1949 return produced;
1952 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
1953 format of end-of-line according to `coding->eol_type'. If
1954 `coding->selective' is 1, code '\r' in source text also means
1955 end-of-line. */
1957 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1958 struct coding_system *coding;
1959 unsigned char *source, *destination;
1960 int src_bytes, dst_bytes;
1961 int *consumed;
1963 unsigned char *src = source;
1964 unsigned char *dst = destination;
1965 int produced;
1967 if (src_bytes <= 0)
1968 return 0;
1970 switch (coding->eol_type)
1972 case CODING_EOL_LF:
1973 case CODING_EOL_AUTOMATIC:
1974 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1975 bcopy (source, destination, produced);
1976 if (coding->selective)
1978 int i = produced;
1979 while (i--)
1980 if (*dst++ == '\r') dst[-1] = '\n';
1982 *consumed = produced;
1984 case CODING_EOL_CRLF:
1986 unsigned char c;
1987 unsigned char *src_end = source + src_bytes;
1988 unsigned char *dst_end = destination + dst_bytes;
1989 /* Since the maximum bytes produced by each loop is 2, we
1990 subtract 1 from DST_END to assure overflow checking is
1991 necessary only at the head of loop. */
1992 unsigned char *adjusted_dst_end = dst_end - 1;
1994 while (src < src_end && dst < adjusted_dst_end)
1996 c = *src++;
1997 if (c == '\n' || (c == '\r' && coding->selective))
1998 *dst++ = '\r', *dst++ = '\n';
1999 else
2000 *dst++ = c;
2002 produced = dst - destination;
2003 *consumed = src - source;
2004 break;
2007 default: /* i.e. case CODING_EOL_CR: */
2008 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2009 bcopy (source, destination, produced);
2011 int i = produced;
2012 while (i--)
2013 if (*dst++ == '\n') dst[-1] = '\r';
2015 *consumed = produced;
2018 return produced;
2022 /*** 6. C library functions ***/
/* In Emacs Lisp, a coding system is represented by a Lisp symbol which
   has a property `coding-system'.  The value of this property is a
   vector of length 5 (called a coding-vector).  Among the elements of
   this vector, the first (element[0]) and the fifth (element[4])
   carry important information for decoding/encoding.  Before
   decoding/encoding, this information should be set in fields of a
   structure of type `coding_system'.

   The value of property `coding-system' can also be a symbol of
   another, subsidiary coding-system.  In that case, Emacs gets the
   coding-vector from that symbol.

   `element[0]' contains information to be set in `coding->type'.  The
   values and their meanings are as follows:

   0 -- coding_type_internal
   1 -- coding_type_sjis
   2 -- coding_type_iso2022
   3 -- coding_type_big5
   4 -- coding_type_ccl
   nil -- coding_type_no_conversion
   t -- coding_type_automatic

   `element[4]' contains information to be set in `coding->flags' and
   `coding->spec'.  The meaning varies by `coding->type'.

   If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 15 sub-elements are used now).
   Meanings of these sub-elements are:

   sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
	If the value is an integer of valid charset, the charset is
	assumed to be designated to graphic register N initially.

	If the value is t, nothing is designated to graphic register N
	initially, but the register can be used by any charset.

	If the value is a list, its first element (if a charset) is
	designated to graphic register N initially, and the remaining
	charsets are designated to graphic register N just before
	encoding a character in that charset; an element t means the
	register can be used by any charset.

	If the value is nil, graphic register N is never used on
	encoding.

   sub-element[N] where N is 4 through 14: to be set in `coding->flags'
	Each value takes t or nil.  See the section ISO2022 of
	`coding.h' for more information.

   If `coding->type' is `coding_type_big5', element[4] is t to denote
   BIG5-ETen or nil to denote BIG5-HKU.

   If `coding->type' is `coding_type_ccl', element[4] is a cons of two
   vectors: the CCL program for decoding and the one for encoding.

   If `coding->type' takes any other value, element[4] is ignored.

   Emacs Lisp's coding system also carries information about the format
   of end-of-line in the value of property `eol-type'.  If the value is
   an integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
   means CODING_EOL_CR.  If it is not an integer, it should be a vector
   of subsidiary coding systems whose property `eol-type' has one
   of the above values.

*/
2084 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2085 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2086 is setup so that no conversion is necessary and return -1, else
2087 return 0. */
2090 setup_coding_system (coding_system, coding)
2091 Lisp_Object coding_system;
2092 struct coding_system *coding;
2094 Lisp_Object type, eol_type;
2096 /* At first, set several fields default values. */
2097 coding->require_flushing = 0;
2098 coding->last_block = 0;
2099 coding->selective = 0;
2100 coding->composing = 0;
2101 coding->direction = 0;
2102 coding->carryover_size = 0;
2103 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2104 /* We have not yet implemented a way to specify unification table in
2105 a coding system. */
2106 coding->character_unification_table = Qnil;
2108 Vlast_coding_system_used = coding->symbol = coding_system;
2109 eol_type = Qnil;
2110 /* Get value of property `coding-system' until we get a vector.
2111 While doing that, also get values of properties
2112 `post-read-conversion', `pre-write-conversion', and `eol-type'. */
2113 while (!NILP (coding_system) && SYMBOLP (coding_system))
2115 if (NILP (coding->post_read_conversion))
2116 coding->post_read_conversion = Fget (coding_system,
2117 Qpost_read_conversion);
2118 if (NILP (coding->pre_write_conversion))
2119 coding->pre_write_conversion = Fget (coding_system,
2120 Qpre_write_conversion);
2121 if (NILP (eol_type))
2122 eol_type = Fget (coding_system, Qeol_type);
2123 coding_system = Fget (coding_system, Qcoding_system);
2125 if (!VECTORP (coding_system)
2126 || XVECTOR (coding_system)->size != 5)
2127 goto label_invalid_coding_system;
2129 if (VECTORP (eol_type))
2130 coding->eol_type = CODING_EOL_AUTOMATIC;
2131 else if (XFASTINT (eol_type) == 1)
2132 coding->eol_type = CODING_EOL_CRLF;
2133 else if (XFASTINT (eol_type) == 2)
2134 coding->eol_type = CODING_EOL_CR;
2135 else
2136 coding->eol_type = CODING_EOL_LF;
2138 type = XVECTOR (coding_system)->contents[0];
2139 switch (XFASTINT (type))
2141 case 0:
2142 coding->type = coding_type_internal;
2143 break;
2145 case 1:
2146 coding->type = coding_type_sjis;
2147 break;
2149 case 2:
2150 coding->type = coding_type_iso2022;
2152 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2153 Lisp_Object *flags;
2154 int i, charset, default_reg_bits = 0;
2156 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2157 goto label_invalid_coding_system;
2159 flags = XVECTOR (val)->contents;
2160 coding->flags
2161 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2162 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2163 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2164 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2165 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2166 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2167 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2168 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2169 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2170 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2171 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2173 /* Invoke graphic register 0 to plane 0. */
2174 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2175 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2176 CODING_SPEC_ISO_INVOCATION (coding, 1)
2177 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2178 /* Not single shifting at first. */
2179 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2180 /* Beginning of buffer should also be regarded as bol. */
2181 CODING_SPEC_ISO_BOL(coding) = 1;
2183 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2184 FLAGS[REG] can be one of below:
2185 integer CHARSET: CHARSET occupies register I,
2186 t: designate nothing to REG initially, but can be used
2187 by any charsets,
2188 list of integer, nil, or t: designate the first
2189 element (if integer) to REG initially, the remaining
2190 elements (if integer) is designated to REG on request,
2191 if an element is t, REG can be used by any charset,
2192 nil: REG is never used. */
2193 for (charset = 0; charset <= MAX_CHARSET; charset++)
2194 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = -1;
2195 for (i = 0; i < 4; i++)
2197 if (INTEGERP (flags[i])
2198 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2199 || (charset = get_charset_id (flags[i])) >= 0)
2201 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2202 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2204 else if (EQ (flags[i], Qt))
2206 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2207 default_reg_bits |= 1 << i;
2209 else if (CONSP (flags[i]))
2211 Lisp_Object tail = flags[i];
2213 if (INTEGERP (XCONS (tail)->car)
2214 && (charset = XINT (XCONS (tail)->car),
2215 CHARSET_VALID_P (charset))
2216 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2218 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2219 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2221 else
2222 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2223 tail = XCONS (tail)->cdr;
2224 while (CONSP (tail))
2226 if (INTEGERP (XCONS (tail)->car)
2227 && (charset = XINT (XCONS (tail)->car),
2228 CHARSET_VALID_P (charset))
2229 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2230 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2231 = i;
2232 else if (EQ (XCONS (tail)->car, Qt))
2233 default_reg_bits |= 1 << i;
2234 tail = XCONS (tail)->cdr;
2237 else
2238 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2240 CODING_SPEC_ISO_DESIGNATION (coding, i)
2241 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2244 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2246 /* REG 1 can be used only by locking shift in 7-bit env. */
2247 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2248 default_reg_bits &= ~2;
2249 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2250 /* Without any shifting, only REG 0 and 1 can be used. */
2251 default_reg_bits &= 3;
2254 for (charset = 0; charset <= MAX_CHARSET; charset++)
2255 if (CHARSET_VALID_P (charset)
2256 && CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) < 0)
2258 /* We have not yet decided where to designate CHARSET. */
2259 int reg_bits = default_reg_bits;
2261 if (CHARSET_CHARS (charset) == 96)
2262 /* A charset of CHARS96 can't be designated to REG 0. */
2263 reg_bits &= ~1;
2265 if (reg_bits)
2266 /* There exist some default graphic register. */
2267 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2268 = (reg_bits & 1
2269 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2270 else
2271 /* We anyway have to designate CHARSET to somewhere. */
2272 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2273 = (CHARSET_CHARS (charset) == 94
2275 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2276 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2278 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2279 ? 2 : 0)));
2282 coding->require_flushing = 1;
2283 break;
2285 case 3:
2286 coding->type = coding_type_big5;
2287 coding->flags
2288 = (NILP (XVECTOR (coding_system)->contents[4])
2289 ? CODING_FLAG_BIG5_HKU
2290 : CODING_FLAG_BIG5_ETEN);
2291 break;
2293 case 4:
2294 coding->type = coding_type_ccl;
2296 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2297 if (CONSP (val)
2298 && VECTORP (XCONS (val)->car)
2299 && VECTORP (XCONS (val)->cdr))
2301 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2302 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2304 else
2305 goto label_invalid_coding_system;
2307 coding->require_flushing = 1;
2308 break;
2310 default:
2311 if (EQ (type, Qt))
2312 coding->type = coding_type_automatic;
2313 else
2314 coding->type = coding_type_no_conversion;
2315 break;
2317 return 0;
2319 label_invalid_coding_system:
2320 coding->type = coding_type_no_conversion;
2321 coding->eol_type = CODING_EOL_LF;
2322 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2323 = Qnil;
2324 return -1;
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are categorized into 8, which are:

   o coding-category-internal

	The category for a coding system which has the same code range
	as Emacs' internal format.  Assigned the coding-system (Lisp
	symbol) `internal' by default.

   o coding-category-sjis

	The category for a coding system which has the same code range
	as SJIS.  Assigned the coding-system (Lisp
	symbol) `shift-jis' by default.

   o coding-category-iso-7

	The category for a coding system which has the same code range
	as ISO2022 of 7-bit environment.  Assigned the coding-system
	(Lisp symbol) `iso-2022-7' by default.

   o coding-category-iso-8-1

	The category for a coding system which has the same code range
	as ISO2022 of 8-bit environment and graphic plane 1 used only
	for DIMENSION1 charset.  Assigned the coding-system (Lisp
	symbol) `iso-8859-1' by default.

   o coding-category-iso-8-2

	The category for a coding system which has the same code range
	as ISO2022 of 8-bit environment and graphic plane 1 used only
	for DIMENSION2 charset.  Assigned the coding-system (Lisp
	symbol) `euc-japan' by default.

   o coding-category-iso-else

	The category for a coding system which has the same code range
	as ISO2022 but does not belong to any of the above three
	categories.  Assigned the coding-system (Lisp symbol)
	`iso-2022-ss2-7' by default.

   o coding-category-big5

	The category for a coding system which has the same code range
	as BIG5.  Assigned the coding-system (Lisp symbol)
	`cn-big5' by default.

   o coding-category-binary

	The category for a coding system not categorized in any of the
	above.  Assigned the coding-system (Lisp symbol)
	`no-conversion' by default.

   Each of them is a Lisp symbol whose value is an actual
   `coding-system' (this is also a Lisp symbol) assigned by a user.
   What Emacs actually does is to detect a category of coding system.
   Then, it uses the `coding-system' assigned to that category.  If
   Emacs can't narrow the text down to a single category, it selects
   the category of the highest priority.  Priorities of categories are
   also specified by a user in a Lisp variable `coding-category-list'.

*/
2394 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2395 If it detects possible coding systems, return an integer in which
2396 appropriate flag bits are set. Flag bits are defined by macros
2397 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2400 detect_coding_mask (src, src_bytes)
2401 unsigned char *src;
2402 int src_bytes;
2404 register unsigned char c;
2405 unsigned char *src_end = src + src_bytes;
2406 int mask;
2408 /* At first, skip all ASCII characters and control characters except
2409 for three ISO2022 specific control characters. */
2410 label_loop_detect_coding:
2411 while (src < src_end)
2413 c = *src;
2414 if (c >= 0x80
2415 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2416 break;
2417 src++;
2420 if (src >= src_end)
2421 /* We found nothing other than ASCII. There's nothing to do. */
2422 return CODING_CATEGORY_MASK_ANY;
2424 /* The text seems to be encoded in some multilingual coding system.
2425 Now, try to find in which coding system the text is encoded. */
2426 if (c < 0x80)
2428 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2429 /* C is an ISO2022 specific control code of C0. */
2430 mask = detect_coding_iso2022 (src, src_end);
2431 src++;
2432 if (mask == CODING_CATEGORY_MASK_ANY)
2433 /* No valid ISO2022 code follows C. Try again. */
2434 goto label_loop_detect_coding;
2436 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2437 /* C is an ISO2022 specific control code of C1,
2438 or the first byte of SJIS's 2-byte character code,
2439 or a leading code of Emacs. */
2440 mask = (detect_coding_iso2022 (src, src_end)
2441 | detect_coding_sjis (src, src_end)
2442 | detect_coding_internal (src, src_end));
2444 else if (c < 0xA0)
2445 /* C is the first byte of SJIS character code,
2446 or a leading-code of Emacs. */
2447 mask = (detect_coding_sjis (src, src_end)
2448 | detect_coding_internal (src, src_end));
2450 else
2451 /* C is a character of ISO2022 in graphic plane right,
2452 or a SJIS's 1-byte character code (i.e. JISX0201),
2453 or the first byte of BIG5's 2-byte code. */
2454 mask = (detect_coding_iso2022 (src, src_end)
2455 | detect_coding_sjis (src, src_end)
2456 | detect_coding_big5 (src, src_end));
2458 return mask;
2461 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2462 The information of the detected coding system is set in CODING. */
2464 void
2465 detect_coding (coding, src, src_bytes)
2466 struct coding_system *coding;
2467 unsigned char *src;
2468 int src_bytes;
2470 int mask = detect_coding_mask (src, src_bytes);
2471 int idx;
2473 if (mask == CODING_CATEGORY_MASK_ANY)
2474 /* We found nothing other than ASCII. There's nothing to do. */
2475 return;
2477 if (!mask)
2478 /* The source text seems to be encoded in unknown coding system.
2479 Emacs regards the category of such a kind of coding system as
2480 `coding-category-binary'. We assume that a user has assigned
2481 an appropriate coding system for a `coding-category-binary'. */
2482 idx = CODING_CATEGORY_IDX_BINARY;
2483 else
2485 /* We found some plausible coding systems. Let's use a coding
2486 system of the highest priority. */
2487 Lisp_Object val = Vcoding_category_list;
2489 if (CONSP (val))
2490 while (!NILP (val))
2492 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2493 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2494 break;
2495 val = XCONS (val)->cdr;
2497 else
2498 val = Qnil;
2500 if (NILP (val))
2502 /* For unknown reason, `Vcoding_category_list' contains none
2503 of found categories. Let's use any of them. */
2504 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2505 if (mask & (1 << idx))
2506 break;
2509 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2512 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2513 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2514 CODING_EOL_CR, and CODING_EOL_AUTOMATIC. */
2517 detect_eol_type (src, src_bytes)
2518 unsigned char *src;
2519 int src_bytes;
2521 unsigned char *src_end = src + src_bytes;
2522 unsigned char c;
2524 while (src < src_end)
2526 c = *src++;
2527 if (c == '\n')
2528 return CODING_EOL_LF;
2529 else if (c == '\r')
2531 if (src < src_end && *src == '\n')
2532 return CODING_EOL_CRLF;
2533 else
2534 return CODING_EOL_CR;
2537 return CODING_EOL_AUTOMATIC;
2540 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2541 is encoded. If it detects an appropriate format of end-of-line, it
2542 sets the information in *CODING. */
2544 void
2545 detect_eol (coding, src, src_bytes)
2546 struct coding_system *coding;
2547 unsigned char *src;
2548 int src_bytes;
2550 Lisp_Object val;
2551 int eol_type = detect_eol_type (src, src_bytes);
2553 if (eol_type == CODING_EOL_AUTOMATIC)
2554 /* We found no end-of-line in the source text. */
2555 return;
2557 val = Fget (coding->symbol, Qeol_type);
2558 if (VECTORP (val) && XVECTOR (val)->size == 3)
2559 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2562 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2563 decoding, it may detect coding system and format of end-of-line if
2564 those are not yet decided. */
2567 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2568 struct coding_system *coding;
2569 unsigned char *source, *destination;
2570 int src_bytes, dst_bytes;
2571 int *consumed;
2573 int produced;
2575 if (src_bytes <= 0)
2577 *consumed = 0;
2578 return 0;
2581 if (coding->type == coding_type_automatic)
2582 detect_coding (coding, source, src_bytes);
2584 if (coding->eol_type == CODING_EOL_AUTOMATIC)
2585 detect_eol (coding, source, src_bytes);
2587 coding->carryover_size = 0;
2588 switch (coding->type)
2590 case coding_type_no_conversion:
2591 label_no_conversion:
2592 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2593 bcopy (source, destination, produced);
2594 *consumed = produced;
2595 break;
2597 case coding_type_internal:
2598 case coding_type_automatic:
2599 if (coding->eol_type == CODING_EOL_LF
2600 || coding->eol_type == CODING_EOL_AUTOMATIC)
2601 goto label_no_conversion;
2602 produced = decode_eol (coding, source, destination,
2603 src_bytes, dst_bytes, consumed);
2604 break;
2606 case coding_type_sjis:
2607 produced = decode_coding_sjis_big5 (coding, source, destination,
2608 src_bytes, dst_bytes, consumed,
2610 break;
2612 case coding_type_iso2022:
2613 produced = decode_coding_iso2022 (coding, source, destination,
2614 src_bytes, dst_bytes, consumed);
2615 break;
2617 case coding_type_big5:
2618 produced = decode_coding_sjis_big5 (coding, source, destination,
2619 src_bytes, dst_bytes, consumed,
2621 break;
2623 case coding_type_ccl:
2624 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2625 src_bytes, dst_bytes, consumed);
2626 break;
2629 return produced;
2632 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2635 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2636 struct coding_system *coding;
2637 unsigned char *source, *destination;
2638 int src_bytes, dst_bytes;
2639 int *consumed;
2641 int produced;
2643 coding->carryover_size = 0;
2644 switch (coding->type)
2646 case coding_type_no_conversion:
2647 label_no_conversion:
2648 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2649 if (produced > 0)
2651 bcopy (source, destination, produced);
2652 if (coding->selective)
2654 unsigned char *p = destination, *pend = destination + produced;
2655 while (p < pend)
2656 if (*p++ == '\015') p[-1] = '\n';
2659 *consumed = produced;
2660 break;
2662 case coding_type_internal:
2663 case coding_type_automatic:
2664 if (coding->eol_type == CODING_EOL_LF
2665 || coding->eol_type == CODING_EOL_AUTOMATIC)
2666 goto label_no_conversion;
2667 produced = encode_eol (coding, source, destination,
2668 src_bytes, dst_bytes, consumed);
2669 break;
2671 case coding_type_sjis:
2672 produced = encode_coding_sjis_big5 (coding, source, destination,
2673 src_bytes, dst_bytes, consumed,
2675 break;
2677 case coding_type_iso2022:
2678 produced = encode_coding_iso2022 (coding, source, destination,
2679 src_bytes, dst_bytes, consumed);
2680 break;
2682 case coding_type_big5:
2683 produced = encode_coding_sjis_big5 (coding, source, destination,
2684 src_bytes, dst_bytes, consumed,
2686 break;
2688 case coding_type_ccl:
2689 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2690 src_bytes, dst_bytes, consumed);
2691 break;
2694 return produced;
2697 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2699 /* Return maximum size (bytes) of a buffer enough for decoding
2700 SRC_BYTES of text encoded in CODING. */
2703 decoding_buffer_size (coding, src_bytes)
2704 struct coding_system *coding;
2705 int src_bytes;
2707 int magnification;
2709 if (coding->type == coding_type_iso2022)
2710 magnification = 3;
2711 else if (coding->type == coding_type_ccl)
2712 magnification = coding->spec.ccl.decoder.buf_magnification;
2713 else
2714 magnification = 2;
2716 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2719 /* Return maximum size (bytes) of a buffer enough for encoding
2720 SRC_BYTES of text to CODING. */
2723 encoding_buffer_size (coding, src_bytes)
2724 struct coding_system *coding;
2725 int src_bytes;
2727 int magnification;
2729 if (coding->type == coding_type_ccl)
2730 magnification = coding->spec.ccl.encoder.buf_magnification;
2731 else
2732 magnification = 3;
2734 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer for encoding/decoding and its current size
   in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.  The shared buffer is reused when it is already large
   enough; otherwise it is replaced by a new one whose size is the
   old size doubled until it reaches SIZE.  The contents of the old
   buffer are not preserved.

   NOTE(review): the result of xmalloc is not checked here, so this
   function itself never returns NULL on allocation failure;
   presumably xmalloc does not return in that case -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* If the buffer has never been set up, doubling zero would
         loop forever below; start from the minimum size instead.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size) real_size *= 2;
      /* Allocate the new buffer before releasing the old one.  */
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
2767 #ifdef emacs
2768 /*** 7. Emacs Lisp library functions ***/
2770 DEFUN ("coding-system-vector", Fcoding_system_vector, Scoding_system_vector,
2771 1, 1, 0,
2772 "Return coding-vector of CODING-SYSTEM.\n\
2773 If CODING-SYSTEM is not a valid coding-system, return nil.")
2774 (obj)
2775 Lisp_Object obj;
2777 while (SYMBOLP (obj) && !NILP (obj))
2778 obj = Fget (obj, Qcoding_system);
2779 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2780 ? Qnil : obj);
2783 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2784 "Return t if OBJECT is nil or a coding-system.\n\
2785 See document of make-coding-system for coding-system object.")
2786 (obj)
2787 Lisp_Object obj;
2789 return ((NILP (obj) || !NILP (Fcoding_system_vector (obj))) ? Qt : Qnil);
2792 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2793 Sread_non_nil_coding_system, 1, 1, 0,
2794 "Read a coding system from the minibuffer, prompting with string PROMPT.")
2795 (prompt)
2796 Lisp_Object prompt;
2798 Lisp_Object val;
2801 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_vector,
2802 Qt, Qnil, Qnil, Qnil);
2804 while (XSTRING (val)->size == 0);
2805 return (Fintern (val, Qnil));
2808 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2809 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2810 (prompt)
2811 Lisp_Object prompt;
2813 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2814 Qt, Qnil, Qnil, Qnil);
2815 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
2818 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2819 1, 1, 0,
2820 "Check validity of CODING-SYSTEM.\n\
2821 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2822 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2823 The value of property should be a vector of length 5.")
2824 (coding_system)
2825 Lisp_Object coding_system;
2827 CHECK_SYMBOL (coding_system, 0);
2828 if (!NILP (Fcoding_system_p (coding_system)))
2829 return coding_system;
2830 while (1)
2831 Fsignal (Qcoding_system_error, coding_system);
2834 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2835 2, 2, 0,
2836 "Detect coding-system of the text in the region between START and END.\n\
2837 Return a list of possible coding-systems ordered by priority.\n\
2838 If only ASCII characters are found, it returns `automatic-conversion'\n\
2839 or its subsidiary coding-system according to a detected end-of-line format.")
2840 (b, e)
2841 Lisp_Object b, e;
2843 int coding_mask, eol_type;
2844 Lisp_Object val;
2845 int beg, end;
2847 validate_region (&b, &e);
2848 beg = XINT (b), end = XINT (e);
2849 if (beg < GPT && end >= GPT) move_gap (end);
2851 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2852 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
2854 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2856 val = intern ("automatic-conversion");
2857 if (eol_type != CODING_EOL_AUTOMATIC)
2859 Lisp_Object val2 = Fget (val, Qeol_type);
2860 if (VECTORP (val2))
2861 val = XVECTOR (val2)->contents[eol_type];
2864 else
2866 Lisp_Object val2;
2868 /* At first, gather possible coding-systems in VAL in a reverse
2869 order. */
2870 val = Qnil;
2871 for (val2 = Vcoding_category_list;
2872 !NILP (val2);
2873 val2 = XCONS (val2)->cdr)
2875 int idx
2876 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2877 if (coding_mask & (1 << idx))
2878 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2881 /* Then, change the order of the list, while getting subsidiary
2882 coding-systems. */
2883 val2 = val;
2884 val = Qnil;
2885 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2887 if (eol_type == CODING_EOL_AUTOMATIC)
2888 val = Fcons (XCONS (val2)->car, val);
2889 else
2891 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2892 if (VECTORP (val3))
2893 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2894 else
2895 val = Fcons (XCONS (val2)->car, val);
2900 return val;
2903 /* Scan text in the region between *BEGP and *ENDP, skip characters
2904 which we never have to encode to (iff ENCODEP is 1) or decode from
2905 coding system CODING at the head and tail, then set BEGP and ENDP
2906 to the addresses of start and end of the text we actually convert. */
2908 void
2909 shrink_conversion_area (begp, endp, coding, encodep)
2910 unsigned char **begp, **endp;
2911 struct coding_system *coding;
2912 int encodep;
2914 register unsigned char *beg_addr = *begp, *end_addr = *endp;
2916 if (coding->eol_type != CODING_EOL_LF
2917 && coding->eol_type != CODING_EOL_AUTOMATIC)
2918 /* Since we anyway have to convert end-of-line format, it is not
2919 worth skipping at most 100 bytes or so. */
2920 return;
2922 if (encodep) /* for encoding */
2924 switch (coding->type)
2926 case coding_type_no_conversion:
2927 case coding_type_internal:
2928 case coding_type_automatic:
2929 /* We need no conversion. */
2930 *begp = *endp;
2931 return;
2932 case coding_type_ccl:
2933 /* We can't skip any data. */
2934 return;
2935 case coding_type_iso2022:
2936 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2938 unsigned char *bol = beg_addr;
2939 while (beg_addr < end_addr && *beg_addr < 0x80)
2941 beg_addr++;
2942 if (*(beg_addr - 1) == '\n')
2943 bol = beg_addr;
2945 beg_addr = bol;
2946 goto label_skip_tail;
2948 /* fall down ... */
2949 default:
2950 /* We can skip all ASCII characters at the head and tail. */
2951 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2952 label_skip_tail:
2953 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2954 break;
2957 else /* for decoding */
2959 switch (coding->type)
2961 case coding_type_no_conversion:
2962 /* We need no conversion. */
2963 *begp = *endp;
2964 return;
2965 case coding_type_internal:
2966 if (coding->eol_type == CODING_EOL_LF)
2968 /* We need no conversion. */
2969 *begp = *endp;
2970 return;
2972 /* We can skip all but carriage-return. */
2973 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
2974 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
2975 break;
2976 case coding_type_sjis:
2977 case coding_type_big5:
2978 /* We can skip all ASCII characters at the head. */
2979 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2980 /* We can skip all ASCII characters at the tail except for
2981 the second byte of SJIS or BIG5 code. */
2982 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2983 if (end_addr != *endp)
2984 end_addr++;
2985 break;
2986 case coding_type_ccl:
2987 /* We can't skip any data. */
2988 return;
2989 default: /* i.e. case coding_type_iso2022: */
2991 unsigned char c;
2993 /* We can skip all ASCII characters except for a few
2994 control codes at the head. */
2995 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
2996 && c != ISO_CODE_CR && c != ISO_CODE_SO
2997 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
2998 beg_addr++;
3000 break;
3003 *begp = beg_addr;
3004 *endp = end_addr;
3005 return;
3008 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3009 text between B and E. B and E are buffer position. */
3011 Lisp_Object
3012 code_convert_region (b, e, coding, encodep)
3013 Lisp_Object b, e;
3014 struct coding_system *coding;
3015 int encodep;
3017 int beg, end, len, consumed, produced;
3018 char *buf;
3019 unsigned char *begp, *endp;
3020 int pos = PT;
3022 validate_region (&b, &e);
3023 beg = XINT (b), end = XINT (e);
3024 if (beg < GPT && end >= GPT)
3025 move_gap (end);
3027 if (encodep && !NILP (coding->pre_write_conversion))
3029 /* We must call a pre-conversion function which may put a new
3030 text to be converted in a new buffer. */
3031 struct buffer *old = current_buffer, *new;
3033 TEMP_SET_PT (beg);
3034 call2 (coding->pre_write_conversion, b, e);
3035 if (old != current_buffer)
3037 /* Replace the original text by the text just generated. */
3038 len = ZV - BEGV;
3039 new = current_buffer;
3040 set_buffer_internal (old);
3041 del_range (beg, end);
3042 insert_from_buffer (new, 1, len, 0);
3043 end = beg + len;
3047 /* We may be able to shrink the conversion region. */
3048 begp = POS_ADDR (beg); endp = begp + (end - beg);
3049 shrink_conversion_area (&begp, &endp, coding, encodep);
3051 if (begp == endp)
3052 /* We need no conversion. */
3053 len = end - beg;
3054 else
3056 beg += begp - POS_ADDR (beg);
3057 end = beg + (endp - begp);
3059 if (encodep)
3060 len = encoding_buffer_size (coding, end - beg);
3061 else
3062 len = decoding_buffer_size (coding, end - beg);
3063 buf = get_conversion_buffer (len);
3065 coding->last_block = 1;
3066 produced = (encodep
3067 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3068 &consumed)
3069 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3070 &consumed));
3072 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3074 TEMP_SET_PT (beg);
3075 insert (buf, produced);
3076 del_range (PT, PT + end - beg);
3077 if (pos >= end)
3078 pos = PT + (pos - end);
3079 else if (pos > beg)
3080 pos = beg;
3081 TEMP_SET_PT (pos);
3084 if (!encodep && !NILP (coding->post_read_conversion))
3086 /* We must call a post-conversion function which may alter
3087 the text just converted. */
3088 Lisp_Object insval;
3090 beg = XINT (b);
3091 TEMP_SET_PT (beg);
3092 insval = call1 (coding->post_read_conversion, make_number (len));
3093 CHECK_NUMBER (insval, 0);
3094 len = XINT (insval);
3097 return make_number (len);
3100 Lisp_Object
3101 code_convert_string (str, coding, encodep, nocopy)
3102 Lisp_Object str, nocopy;
3103 struct coding_system *coding;
3104 int encodep;
3106 int len, consumed, produced;
3107 char *buf;
3108 unsigned char *begp, *endp;
3109 int head_skip, tail_skip;
3110 struct gcpro gcpro1;
3112 if (encodep && !NILP (coding->pre_write_conversion)
3113 || !encodep && !NILP (coding->post_read_conversion))
3115 /* Since we have to call Lisp functions which assume target text
3116 is in a buffer, after setting a temporary buffer, call
3117 code_convert_region. */
3118 int count = specpdl_ptr - specpdl;
3119 int len = XSTRING (str)->size;
3120 Lisp_Object result;
3121 struct buffer *old = current_buffer;
3123 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3124 temp_output_buffer_setup (" *code-converting-work*");
3125 set_buffer_internal (XBUFFER (Vstandard_output));
3126 insert_from_string (str, 0, len, 0);
3127 code_convert_region (make_number (BEGV), make_number (ZV),
3128 coding, encodep);
3129 result = make_buffer_string (BEGV, ZV, 0);
3130 set_buffer_internal (old);
3131 return unbind_to (count, result);
3134 /* We may be able to shrink the conversion region. */
3135 begp = XSTRING (str)->data;
3136 endp = begp + XSTRING (str)->size;
3137 shrink_conversion_area (&begp, &endp, coding, encodep);
3139 if (begp == endp)
3140 /* We need no conversion. */
3141 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3143 head_skip = begp - XSTRING (str)->data;
3144 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3146 GCPRO1 (str);
3148 if (encodep)
3149 len = encoding_buffer_size (coding, endp - begp);
3150 else
3151 len = decoding_buffer_size (coding, endp - begp);
3152 buf = get_conversion_buffer (len + head_skip + tail_skip);
3154 bcopy (XSTRING (str)->data, buf, head_skip);
3155 coding->last_block = 1;
3156 produced = (encodep
3157 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3158 buf + head_skip, endp - begp, len, &consumed)
3159 : decode_coding (coding, XSTRING (str)->data + head_skip,
3160 buf + head_skip, endp - begp, len, &consumed));
3161 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3162 buf + head_skip + produced,
3163 tail_skip);
3165 UNGCPRO;
3167 return make_string (buf, head_skip + produced + tail_skip);
3170 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3171 3, 3, "r\nzCoding system: ",
3172 "Decode current region by specified coding system.\n\
3173 When called from a program, takes three arguments:\n\
3174 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3175 Return length of decoded text.")
3176 (b, e, coding_system)
3177 Lisp_Object b, e, coding_system;
3179 struct coding_system coding;
3181 CHECK_NUMBER_COERCE_MARKER (b, 0);
3182 CHECK_NUMBER_COERCE_MARKER (e, 1);
3183 CHECK_SYMBOL (coding_system, 2);
3185 if (NILP (coding_system))
3186 return make_number (XFASTINT (e) - XFASTINT (b));
3187 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3188 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3190 return code_convert_region (b, e, &coding, 0);
3193 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3194 3, 3, "r\nzCoding system: ",
3195 "Encode current region by specified coding system.\n\
3196 When called from a program, takes three arguments:\n\
3197 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3198 Return length of encoded text.")
3199 (b, e, coding_system)
3200 Lisp_Object b, e, coding_system;
3202 struct coding_system coding;
3204 CHECK_NUMBER_COERCE_MARKER (b, 0);
3205 CHECK_NUMBER_COERCE_MARKER (e, 1);
3206 CHECK_SYMBOL (coding_system, 2);
3208 if (NILP (coding_system))
3209 return make_number (XFASTINT (e) - XFASTINT (b));
3210 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3211 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3213 return code_convert_region (b, e, &coding, 1);
3216 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3217 2, 3, 0,
3218 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3219 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3220 of decoding.")
3221 (string, coding_system, nocopy)
3222 Lisp_Object string, coding_system, nocopy;
3224 struct coding_system coding;
3226 CHECK_STRING (string, 0);
3227 CHECK_SYMBOL (coding_system, 1);
3229 if (NILP (coding_system))
3230 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3231 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3232 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3234 return code_convert_string (string, &coding, 0, nocopy);
3237 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3238 2, 3, 0,
3239 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3240 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3241 of encoding.")
3242 (string, coding_system, nocopy)
3243 Lisp_Object string, coding_system, nocopy;
3245 struct coding_system coding;
3247 CHECK_STRING (string, 0);
3248 CHECK_SYMBOL (coding_system, 1);
3250 if (NILP (coding_system))
3251 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3252 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3253 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3255 return code_convert_string (string, &coding, 1, nocopy);
3258 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3259 "Decode a JISX0208 character of shift-jis encoding.\n\
3260 CODE is the character code in SJIS.\n\
3261 Return the corresponding character.")
3262 (code)
3263 Lisp_Object code;
3265 unsigned char c1, c2, s1, s2;
3266 Lisp_Object val;
3268 CHECK_NUMBER (code, 0);
3269 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3270 DECODE_SJIS (s1, s2, c1, c2);
3271 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3272 return val;
3275 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3276 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3277 Return the corresponding character code in SJIS.")
3278 (ch)
3279 Lisp_Object ch;
3281 int charset, c1, c2, s1, s2;
3282 Lisp_Object val;
3284 CHECK_NUMBER (ch, 0);
3285 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3286 if (charset == charset_jisx0208)
3288 ENCODE_SJIS (c1, c2, s1, s2);
3289 XSETFASTINT (val, (s1 << 8) | s2);
3291 else
3292 XSETFASTINT (val, 0);
3293 return val;
3296 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3297 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3298 CODE is the character code in BIG5.\n\
3299 Return the corresponding character.")
3300 (code)
3301 Lisp_Object code;
3303 int charset;
3304 unsigned char b1, b2, c1, c2;
3305 Lisp_Object val;
3307 CHECK_NUMBER (code, 0);
3308 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3309 DECODE_BIG5 (b1, b2, charset, c1, c2);
3310 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3311 return val;
3314 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3315 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3316 Return the corresponding character code in Big5.")
3317 (ch)
3318 Lisp_Object ch;
3320 int charset, c1, c2, b1, b2;
3321 Lisp_Object val;
3323 CHECK_NUMBER (ch, 0);
3324 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3325 if (charset == charset_big5_1 || charset == charset_big5_2)
3327 ENCODE_BIG5 (charset, c1, c2, b1, b2);
3328 XSETFASTINT (val, (b1 << 8) | b2);
3330 else
3331 XSETFASTINT (val, 0);
3332 return val;
3335 DEFUN ("set-terminal-coding-system",
3336 Fset_terminal_coding_system, Sset_terminal_coding_system, 1, 1,
3337 "zCoding-system for terminal display: ",
3338 "Set coding-system of your terminal to CODING-SYSTEM.\n\
3339 All outputs to terminal are encoded to this coding-system.")
3340 (coding_system)
3341 Lisp_Object coding_system;
3343 CHECK_SYMBOL (coding_system, 0);
3344 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3345 update_mode_lines++;
3346 if (!NILP (Finteractive_p ()))
3347 Fredraw_display ();
3348 return Qnil;
3351 DEFUN ("terminal-coding-system",
3352 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3353 "Return coding-system of your terminal.")
3356 return terminal_coding.symbol;
3359 DEFUN ("set-keyboard-coding-system",
3360 Fset_keyboard_coding_system, Sset_keyboard_coding_system, 1, 1, 0,
3361 "Set coding-system of codes sent from terminal keyboard to CODING-SYSTEM.\n\
3362 In Encoded-kbd minor mode, user inputs are decoded\n\
3363 accoding to CODING-SYSTEM.\n\
3364 Do not call this function directly, but use the command\n\
3365 encoded-kbd-set-coding-system to activate Encoded-kbd mode\n\
3366 with a specific coding system.")
3367 (coding_system)
3368 Lisp_Object coding_system;
3370 CHECK_SYMBOL (coding_system, 0);
3371 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3372 return Qnil;
3375 DEFUN ("keyboard-coding-system",
3376 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3377 "Return coding-system of what is sent from terminal keyboard.")
3380 return keyboard_coding.symbol;
3384 DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
3385 1, MANY, 0,
3386 "Choose a coding system for a file operation based on file name.\n\
3387 The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3388 ENCODING-SYSTEM is the coding system to use for encoding\n\
3389 \(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3390 for decoding (in case OPERATION does decoding).\n\
3392 The first argument OPERATION specifies an I/O primitive:\n\
3393 For file I/O, `insert-file-contents' or `write-region'.\n\
3394 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3395 For network I/O, `open-network-stream'.\n\
3397 The remaining arguments should be the same arguments that were passed\n\
3398 to the primitive. Depending on which primitive, one of those arguments\n\
3399 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3400 whichever argument specifies the file name is TARGET.\n\
3402 TARGET has a meaning which depends on OPERATION:\n\
3403 For file I/O, TARGET is a file name.\n\
3404 For process I/O, TARGET is a process name.\n\
3405 For network I/O, TARGET is a service name or a port number\n\
3407 This function looks up what `coding-system-alist' specifies for\n\
3408 OPERATION and TARGET. It may specify a cons cell which represents\n\
3409 a particular coding system or it may have a function to call.\n\
3410 In the latter case, we call the function with one argument,\n\
3411 which is a list of all the arguments given to `find-coding-system'.")
3412 (nargs, args)
3413 int nargs;
3414 Lisp_Object *args;
3416 Lisp_Object operation, target_idx, target, val;
3417 register Lisp_Object chain;
3419 if (nargs < 2)
3420 error ("Too few arguments");
3421 operation = args[0];
3422 if (!SYMBOLP (operation)
3423 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3424 error ("Invalid first arguement");
3425 if (nargs < 1 + XINT (target_idx))
3426 error ("Too few arguments for operation: %s",
3427 XSYMBOL (operation)->name->data);
3428 target = args[XINT (target_idx) + 1];
3429 if (!(STRINGP (target)
3430 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3431 error ("Invalid %dth argument", XINT (target_idx) + 1);
3433 chain = Fassq (operation, Vcoding_system_alist);
3434 if (NILP (chain))
3435 return Qnil;
3437 for (chain = XCONS (chain)->cdr; CONSP (chain); chain = XCONS (chain)->cdr)
3439 Lisp_Object elt = XCONS (chain)->car;
3441 if (CONSP (elt)
3442 && ((STRINGP (target)
3443 && STRINGP (XCONS (elt)->car)
3444 && fast_string_match (XCONS (elt)->car, target) >= 0)
3445 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3446 return (val = XCONS (elt)->cdr, CONSP (val)
3447 ? val
3448 : ((SYMBOLP (val) && !NILP (Fboundp (val))
3449 ? call2 (val, Flist (nargs, args))
3450 : Qnil)));
3452 return Qnil;
3455 #endif /* emacs */
3458 /*** 8. Post-amble ***/
3460 init_coding_once ()
3462 int i;
3464 /* Emacs internal format specific initialize routine. */
3465 for (i = 0; i <= 0x20; i++)
3466 emacs_code_class[i] = EMACS_control_code;
3467 emacs_code_class[0x0A] = EMACS_linefeed_code;
3468 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3469 for (i = 0x21 ; i < 0x7F; i++)
3470 emacs_code_class[i] = EMACS_ascii_code;
3471 emacs_code_class[0x7F] = EMACS_control_code;
3472 emacs_code_class[0x80] = EMACS_leading_code_composition;
3473 for (i = 0x81; i < 0xFF; i++)
3474 emacs_code_class[i] = EMACS_invalid_code;
3475 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3476 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3477 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3478 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3480 /* ISO2022 specific initialize routine. */
3481 for (i = 0; i < 0x20; i++)
3482 iso_code_class[i] = ISO_control_code;
3483 for (i = 0x21; i < 0x7F; i++)
3484 iso_code_class[i] = ISO_graphic_plane_0;
3485 for (i = 0x80; i < 0xA0; i++)
3486 iso_code_class[i] = ISO_control_code;
3487 for (i = 0xA1; i < 0xFF; i++)
3488 iso_code_class[i] = ISO_graphic_plane_1;
3489 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3490 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3491 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3492 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3493 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3494 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3495 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3496 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3497 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3498 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3500 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3501 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3503 setup_coding_system (Qnil, &keyboard_coding);
3504 setup_coding_system (Qnil, &terminal_coding);
3507 #ifdef emacs
3509 syms_of_coding ()
3511 Qtarget_idx = intern ("target-idx");
3512 staticpro (&Qtarget_idx);
3514 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3515 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3517 Qcall_process = intern ("call-process");
3518 staticpro (&Qcall_process);
3519 Fput (Qcall_process, Qtarget_idx, make_number (0));
3521 Qcall_process_region = intern ("call-process-region");
3522 staticpro (&Qcall_process_region);
3523 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3525 Qstart_process = intern ("start-process");
3526 staticpro (&Qstart_process);
3527 Fput (Qstart_process, Qtarget_idx, make_number (2));
3529 Qopen_network_stream = intern ("open-network-stream");
3530 staticpro (&Qopen_network_stream);
3531 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3533 Qcoding_system = intern ("coding-system");
3534 staticpro (&Qcoding_system);
3536 Qeol_type = intern ("eol-type");
3537 staticpro (&Qeol_type);
3539 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3540 staticpro (&Qbuffer_file_coding_system);
3542 Qpost_read_conversion = intern ("post-read-conversion");
3543 staticpro (&Qpost_read_conversion);
3545 Qpre_write_conversion = intern ("pre-write-conversion");
3546 staticpro (&Qpre_write_conversion);
3548 Qcoding_system_vector = intern ("coding-system-vector");
3549 staticpro (&Qcoding_system_vector);
3551 Qcoding_system_p = intern ("coding-system-p");
3552 staticpro (&Qcoding_system_p);
3554 Qcoding_system_error = intern ("coding-system-error");
3555 staticpro (&Qcoding_system_error);
3557 Fput (Qcoding_system_error, Qerror_conditions,
3558 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3559 Fput (Qcoding_system_error, Qerror_message,
3560 build_string ("Coding-system error"));
3562 Qcoding_category_index = intern ("coding-category-index");
3563 staticpro (&Qcoding_category_index);
3566 int i;
3567 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3569 coding_category_table[i] = intern (coding_category_name[i]);
3570 staticpro (&coding_category_table[i]);
3571 Fput (coding_category_table[i], Qcoding_category_index,
3572 make_number (i));
3576 Qcharacter_unification_table = intern ("character-unification-table");
3577 staticpro (&Qcharacter_unification_table);
3578 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3579 make_number (0));
3581 defsubr (&Scoding_system_vector);
3582 defsubr (&Scoding_system_p);
3583 defsubr (&Sread_coding_system);
3584 defsubr (&Sread_non_nil_coding_system);
3585 defsubr (&Scheck_coding_system);
3586 defsubr (&Sdetect_coding_region);
3587 defsubr (&Sdecode_coding_region);
3588 defsubr (&Sencode_coding_region);
3589 defsubr (&Sdecode_coding_string);
3590 defsubr (&Sencode_coding_string);
3591 defsubr (&Sdecode_sjis_char);
3592 defsubr (&Sencode_sjis_char);
3593 defsubr (&Sdecode_big5_char);
3594 defsubr (&Sencode_big5_char);
3595 defsubr (&Sset_terminal_coding_system);
3596 defsubr (&Sterminal_coding_system);
3597 defsubr (&Sset_keyboard_coding_system);
3598 defsubr (&Skeyboard_coding_system);
3599 defsubr (&Sfind_coding_system);
3601 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3602 "List of coding-categories (symbols) ordered by priority.");
3604 int i;
3606 Vcoding_category_list = Qnil;
3607 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3608 Vcoding_category_list
3609 = Fcons (coding_category_table[i], Vcoding_category_list);
3612 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3613 "A variable of internal use only.\n\
3614 If the value is a coding system, it is used for decoding on read operation.\n\
3615 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3616 Vcoding_system_for_read = Qnil;
3618 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3619 "A variable of internal use only.\n\
3620 If the value is a coding system, it is used for encoding on write operation.\n\
3621 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3622 Vcoding_system_for_write = Qnil;
3624 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3625 "Coding-system used in the latest file or process I/O.");
3626 Vlast_coding_system_used = Qnil;
3628 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
3629 "Nested alist to decide a coding system for a specific I/O operation.\n\
3630 The format is ((OPERATION . ((REGEXP . CODING-SYSTEMS) ...)) ...).\n\
3632 OPERATION is one of the following Emacs I/O primitives:\n\
3633 For file I/O, insert-file-contents and write-region.\n\
3634 For process I/O, call-process, call-process-region, and start-process.\n\
3635 For network I/O, open-network-stream.\n\
3636 In addition, for process I/O, `process-argument' can be specified for\n\
3637 encoding arguments of the process.\n\
3639 REGEXP is a regular expression matching a target of OPERATION, where\n\
3640 target is a file name for file I/O operations, a process name for\n\
3641 process I/O operations, or a service name for network I/O\n\
3642 operations. REGEXP might be a port number for network I/O operation.\n\
3644 CODING-SYSTEMS is a cons of coding systems to encode and decode\n\
3645 character code on OPERATION, or a function symbol returning the cons.\n\
3646 See the documentation of `find-coding-system' for more detail.");
3647 Vcoding_system_alist = Qnil;
3649 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3650 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3651 eol_mnemonic_unix = '.';
3653 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3654 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3655 eol_mnemonic_dos = ':';
3657 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3658 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3659 eol_mnemonic_mac = '\'';
3661 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3662 "Mnemonic character indicating end-of-line format is not yet decided.");
3663 eol_mnemonic_undecided = '-';
3665 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3666 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3667 Venable_character_unification = Qt;
3669 DEFVAR_LISP ("standard-character-unification-table-for-read",
3670 &Vstandard_character_unification_table_for_read,
3671 "Table for unifying characters when reading.");
3672 Vstandard_character_unification_table_for_read = Qnil;
3674 DEFVAR_LISP ("standard-character-unification-table-for-write",
3675 &Vstandard_character_unification_table_for_write,
3676 "Table for unifying characters when writing.");
3677 Vstandard_character_unification_table_for_write = Qnil;
3679 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3680 "Alist of charsets vs revision numbers.\n\
3681 While encoding, if a charset (car part of an element) is found,\n\
3682 designate it with the escape sequence identifing revision (cdr part of the element).");
3683 Vcharset_revision_alist = Qnil;
3686 #endif /* emacs */