(Ffind_coding_system): Doc fix.
[emacs.git] / src / coding.c
blobaf3e33668728e4bda7496aedf6f8dd5614540e98
1 /* Coding system handler (conversion, detection, and etc).
2 Ver.1.0.
3 Copyright (C) 1995 Free Software Foundation, Inc.
4 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
6 This file is part of GNU Emacs.
8 GNU Emacs is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
13 GNU Emacs is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GNU Emacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /*** TABLE OF CONTENTS ***
25 1. Preamble
26 2. Emacs' internal format handlers
27 3. ISO2022 handlers
28 4. Shift-JIS and BIG5 handlers
29 5. End-of-line handlers
30 6. C library functions
31 7. Emacs Lisp library functions
32 8. Post-amble
36 /*** GENERAL NOTE on CODING SYSTEM ***
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
41 Emacs' internal format, and when we say "encode", it means
42 converting Emacs' internal format to some other coding system.
44 0. Emacs' internal format
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in the section 2.
49 1. ISO2022
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and such coding
53 systems used in Internet communication as ISO-2022-JP are all
54 variants of ISO2022. Details are described in the section 3.
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 the section 4.
62 3. BIG5
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in the section 4. In this file, when written as "BIG5"
67 (all uppercase), it means the coding system, and when written as
68 "Big5" (capitalized), it means the character set.
70 4. Else
72 If a user want to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
77 Emacs represent a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See the section 6 for more
81 detail.
85 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
87 How end-of-line of a text is encoded depends on a system. For
88 instance, Unix's format is just one byte of `line-feed' code,
89 whereas DOS's format is two bytes sequence of `carriage-return' and
90 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
92 Since how characters in a text is encoded and how end-of-line is
93 encoded is independent, any coding system described above can take
94 any format of end-of-line. So, Emacs has information of format of
95 end-of-line in each coding-system. See the section 6 for more
96 detail.
100 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
102 These functions check if a text between SRC and SRC_END is encoded
103 in the coding system category XXX. Each returns an integer value in
104 which appropriate flag bits for the category XXX is set. The flag
105 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
106 template of these functions. */
107 #if 0
109 detect_coding_internal (src, src_end)
110 unsigned char *src, *src_end;
114 #endif
116 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
118 These functions decode SRC_BYTES length text at SOURCE encoded in
119 CODING to Emacs' internal format. The resulting text goes to a
120 place pointed by DESTINATION, the length of which should not exceed
121 DST_BYTES. The bytes actually processed is returned as *CONSUMED.
122 The return value is the length of the decoded text. Below is a
123 template of these functions. */
124 #if 0
125 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
126 struct coding_system *coding;
127 unsigned char *source, *destination;
128 int src_bytes, dst_bytes;
129 int *consumed;
133 #endif
135 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
137 These functions encode SRC_BYTES length text at SOURCE of Emacs
138 internal format to CODING. The resulting text goes to a place
139 pointed by DESTINATION, the length of which should not exceed
140 DST_BYTES. The bytes actually processed is returned as *CONSUMED.
141 The return value is the length of the encoded text. Below is a
142 template of these functions. */
143 #if 0
144 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
148 int *consumed;
152 #endif
154 /*** COMMONLY USED MACROS ***/
156 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
157 THREE_MORE_BYTES safely get one, two, and three bytes from the
158 source text respectively. If there are not enough bytes in the
159 source, they jump to `label_end_of_loop'. The caller should set
160 variables `src' and `src_end' to appropriate areas in advance. */
162 #define ONE_MORE_BYTE(c1) \
163 do { \
164 if (src < src_end) \
165 c1 = *src++; \
166 else \
167 goto label_end_of_loop; \
168 } while (0)
170 #define TWO_MORE_BYTES(c1, c2) \
171 do { \
172 if (src + 1 < src_end) \
173 c1 = *src++, c2 = *src++; \
174 else \
175 goto label_end_of_loop; \
176 } while (0)
178 #define THREE_MORE_BYTES(c1, c2, c3) \
179 do { \
180 if (src + 2 < src_end) \
181 c1 = *src++, c2 = *src++, c3 = *src++; \
182 else \
183 goto label_end_of_loop; \
184 } while (0)
186 /* The following three macros DECODE_CHARACTER_ASCII,
187 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
188 the multi-byte form of a character of each class at the place
189 pointed by `dst'. The caller should set the variable `dst' to
190 point to an appropriate area and the variable `coding' to point to
191 the coding-system of the currently decoding text in advance. */
193 /* Decode one ASCII character C. */
195 #define DECODE_CHARACTER_ASCII(c) \
196 do { \
197 if (COMPOSING_P (coding->composing)) \
198 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
199 else \
200 *dst++ = (c); \
201 } while (0)
203 /* Decode one DIMENSION1 character of which charset is CHARSET and
204 position-code is C. */
206 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
207 do { \
208 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
209 if (COMPOSING_P (coding->composing)) \
210 *dst++ = leading_code + 0x20; \
211 else \
212 *dst++ = leading_code; \
213 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
214 *dst++ = leading_code; \
215 *dst++ = (c) | 0x80; \
216 } while (0)
218 /* Decode one DIMENSION2 character of which charset is CHARSET and
219 position-codes are C1 and C2. */
221 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
222 do { \
223 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
224 *dst++ = (c2) | 0x80; \
225 } while (0)
228 /*** 1. Preamble ***/
230 #include <stdio.h>
232 #ifdef emacs
234 #include <config.h>
235 #include "lisp.h"
236 #include "buffer.h"
237 #include "charset.h"
238 #include "ccl.h"
239 #include "coding.h"
240 #include "window.h"
242 #else /* not emacs */
244 #include "mulelib.h"
246 #endif /* not emacs */
248 Lisp_Object Qcoding_system, Qeol_type;
249 Lisp_Object Qbuffer_file_coding_system;
250 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
252 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
253 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
254 Lisp_Object Qstart_process, Qopen_network_stream;
255 Lisp_Object Qtarget_idx;
257 /* Mnemonic character of each format of end-of-line. */
258 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
259 /* Mnemonic character to indicate format of end-of-line is not yet
260 decided. */
261 int eol_mnemonic_undecided;
263 #ifdef emacs
265 Lisp_Object Qcoding_system_vector, Qcoding_system_p, Qcoding_system_error;
267 /* Coding-systems are handed between Emacs Lisp programs and C internal
268 routines by the following three variables. */
269 /* Coding-system for reading files and receiving data from process. */
270 Lisp_Object Vcoding_system_for_read;
271 /* Coding-system for writing files and sending data to process. */
272 Lisp_Object Vcoding_system_for_write;
273 /* Coding-system actually used in the latest I/O. */
274 Lisp_Object Vlast_coding_system_used;
276 /* Coding-system of what terminal accept for displaying. */
277 struct coding_system terminal_coding;
279 /* Coding-system of what is sent from terminal keyboard. */
280 struct coding_system keyboard_coding;
282 Lisp_Object Vcoding_system_alist;
284 #endif /* emacs */
286 Lisp_Object Qcoding_category_index;
288 /* List of symbols `coding-category-xxx' ordered by priority. */
289 Lisp_Object Vcoding_category_list;
291 /* Table of coding-systems currently assigned to each coding-category. */
292 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
294 /* Table of names of symbol for each coding-category. */
295 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
296 "coding-category-internal",
297 "coding-category-sjis",
298 "coding-category-iso-7",
299 "coding-category-iso-8-1",
300 "coding-category-iso-8-2",
301 "coding-category-iso-else",
302 "coding-category-big5",
303 "coding-category-binary"
306 /* Alist of charsets vs the alternate charsets. */
307 Lisp_Object Valternate_charset_table;
309 /* Alist of charsets vs revision number. */
310 Lisp_Object Vcharset_revision_alist;
313 /*** 2. Emacs internal format handlers ***/
315 /* Emacs' internal format for encoding multiple character sets is a
316 kind of multi-byte encoding, i.e. encoding a character by a sequence
317 of one-byte codes of variable length. ASCII characters and control
318 characters (e.g. `tab', `newline') are represented by one-byte as
319 is. It takes the range 0x00 through 0x7F. The other characters
320 are represented by a sequence of `base leading-code', optional
321 `extended leading-code', and one or two `position-code's. Length
322 of the sequence is decided by the base leading-code. Leading-code
323 takes the range 0x80 through 0x9F, whereas extended leading-code
324 and position-code take the range 0xA0 through 0xFF. See the
325 document of `charset.h' for more detail about leading-code and
326 position-code.
328 There's one exception in this rule. Special leading-code
329 `leading-code-composition' denotes that the following several
330 characters should be composed into one character. Leading-codes of
331 components (except for ASCII) are added 0x20. An ASCII character
332 component is represented by a 2-byte sequence of `0xA0' and
333 `ASCII-code + 0x80'. See also the document in `charset.h' for the
334 detail of composite character. Hence, we can summarize the code
335 range as follows:
337 --- CODE RANGE of Emacs' internal format ---
338 (character set) (range)
339 ASCII 0x00 .. 0x7F
340 ELSE (1st byte) 0x80 .. 0x9F
341 (rest bytes) 0xA0 .. 0xFF
342 ---------------------------------------------
346 enum emacs_code_class_type emacs_code_class[256];
348 /* Go to the next statement only if *SRC is accessible and the code is
349 greater than 0xA0. */
350 #define CHECK_CODE_RANGE_A0_FF \
351 do { \
352 if (src >= src_end) \
353 goto label_end_of_switch; \
354 else if (*src++ < 0xA0) \
355 return 0; \
356 } while (0)
358 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
359 Check if a text is encoded in Emacs' internal format. If it is,
360 return CODING_CATEGORY_MASK_INTERNAL, else return 0. */
363 detect_coding_internal (src, src_end)
364 unsigned char *src, *src_end;
366 unsigned char c;
367 int composing = 0;
369 while (src < src_end)
371 c = *src++;
373 if (composing)
375 if (c < 0xA0)
376 composing = 0;
377 else
378 c -= 0x20;
381 switch (emacs_code_class[c])
383 case EMACS_ascii_code:
384 case EMACS_linefeed_code:
385 break;
387 case EMACS_control_code:
388 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
389 return 0;
390 break;
392 case EMACS_invalid_code:
393 return 0;
395 case EMACS_leading_code_composition: /* c == 0x80 */
396 if (composing)
397 CHECK_CODE_RANGE_A0_FF;
398 else
399 composing = 1;
400 break;
402 case EMACS_leading_code_4:
403 CHECK_CODE_RANGE_A0_FF;
404 /* fall down to check it two more times ... */
406 case EMACS_leading_code_3:
407 CHECK_CODE_RANGE_A0_FF;
408 /* fall down to check it one more time ... */
410 case EMACS_leading_code_2:
411 CHECK_CODE_RANGE_A0_FF;
412 break;
414 default:
415 label_end_of_switch:
416 break;
419 return CODING_CATEGORY_MASK_INTERNAL;
423 /*** 3. ISO2022 handlers ***/
425 /* The following note describes the coding system ISO2022 briefly.
426 Since the intension of this note is to help understanding of the
427 programs in this file, some parts are NOT ACCURATE or OVERLY
428 SIMPLIFIED. For the thorough understanding, please refer to the
429 original document of ISO2022.
431 ISO2022 provides many mechanisms to encode several character sets
432 in 7-bit and 8-bit environment. If one choose 7-bite environment,
433 all text is encoded by codes of less than 128. This may make the
434 encoded text a little bit longer, but the text get more stability
435 to pass through several gateways (some of them split MSB off).
437 There are two kind of character set: control character set and
438 graphic character set. The former contains control characters such
439 as `newline' and `escape' to provide control functions (control
440 functions are provided also by escape sequence). The latter
441 contains graphic characters such as ' A' and '-'. Emacs recognizes
442 two control character sets and many graphic character sets.
444 Graphic character sets are classified into one of the following
445 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
446 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
447 bytes (DIMENSION) and the number of characters in one dimension
448 (CHARS) of the set. In addition, each character set is assigned an
449 identification tag (called "final character" and denoted as <F>
450 here after) which is unique in each class. <F> of each character
451 set is decided by ECMA(*) when it is registered in ISO. Code range
452 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
454 Note (*): ECMA = European Computer Manufacturers Association
456 Here are examples of graphic character set [NAME(<F>)]:
457 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
458 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
459 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
460 o DIMENSION2_CHARS96 -- none for the moment
462 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
463 C0 [0x00..0x1F] -- control character plane 0
464 GL [0x20..0x7F] -- graphic character plane 0
465 C1 [0x80..0x9F] -- control character plane 1
466 GR [0xA0..0xFF] -- graphic character plane 1
468 A control character set is directly designated and invoked to C0 or
469 C1 by an escape sequence. The most common case is that ISO646's
470 control character set is designated/invoked to C0 and ISO6429's
471 control character set is designated/invoked to C1, and usually
472 these designations/invocations are omitted in a coded text. With
473 7-bit environment, only C0 can be used, and a control character for
474 C1 is encoded by an appropriate escape sequence to fit in the
475 environment. All control characters for C1 are defined the
476 corresponding escape sequences.
478 A graphic character set is at first designated to one of four
479 graphic registers (G0 through G3), then these graphic registers are
480 invoked to GL or GR. These designations and invocations can be
481 done independently. The most common case is that G0 is invoked to
482 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
483 these invocations and designations are omitted in a coded text.
484 With 7-bit environment, only GL can be used.
486 When a graphic character set of CHARS94 is invoked to GL, code 0x20
487 and 0x7F of GL area work as control characters SPACE and DEL
488 respectively, and code 0xA0 and 0xFF of GR area should not be used.
490 There are two ways of invocation: locking-shift and single-shift.
491 With locking-shift, the invocation lasts until the next different
492 invocation, whereas with single-shift, the invocation works only
493 for the following character and doesn't affect locking-shift.
494 Invocations are done by the following control characters or escape
495 sequences.
497 ----------------------------------------------------------------------
498 function control char escape sequence description
499 ----------------------------------------------------------------------
500 SI (shift-in) 0x0F none invoke G0 to GL
501 SI (shift-out) 0x0E none invoke G1 to GL
502 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
503 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
504 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
505 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
506 ----------------------------------------------------------------------
507 The first four are for locking-shift. Control characters for these
508 functions are defined by macros ISO_CODE_XXX in `coding.h'.
510 Designations are done by the following escape sequences.
511 ----------------------------------------------------------------------
512 escape sequence description
513 ----------------------------------------------------------------------
514 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
515 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
516 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
517 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
518 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
519 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
520 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
521 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
522 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
523 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
524 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
525 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
526 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
527 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
528 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
529 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
530 ----------------------------------------------------------------------
532 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
533 of dimension 1, chars 94, and final character <F>, and etc.
535 Note (*): Although these designations are not allowed in ISO2022,
536 Emacs accepts them on decoding, and produces them on encoding
537 CHARS96 character set in a coding system which is characterized as
538 7-bit environment, non-locking-shift, and non-single-shift.
540 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
541 '(' can be omitted. We call this as "short-form" here after.
543 Now you may notice that there are a lot of ways for encoding the
544 same multilingual text in ISO2022. Actually, there exist many
545 coding systems such as Compound Text (used in X's inter client
546 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
547 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
548 localized platforms), and all of these are variants of ISO2022.
550 In addition to the above, Emacs handles two more kinds of escape
551 sequences: ISO6429's direction specification and Emacs' private
552 sequence for specifying character composition.
554 ISO6429's direction specification takes the following format:
555 o CSI ']' -- end of the current direction
556 o CSI '0' ']' -- end of the current direction
557 o CSI '1' ']' -- start of left-to-right text
558 o CSI '2' ']' -- start of right-to-left text
559 The control character CSI (0x9B: control sequence introducer) is
560 abbreviated to the escape sequence ESC '[' in 7-bit environment.
562 Character composition specification takes the following format:
563 o ESC '0' -- start character composition
564 o ESC '1' -- end character composition
565 Since these are not standard escape sequences of any ISO, the use
566 of them for these meaning is restricted to Emacs only. */
568 enum iso_code_class_type iso_code_class[256];
570 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
571 Check if a text is encoded in ISO2022. If it is, returns an
572 integer in which appropriate flag bits any of:
573 CODING_CATEGORY_MASK_ISO_7
574 CODING_CATEGORY_MASK_ISO_8_1
575 CODING_CATEGORY_MASK_ISO_8_2
576 CODING_CATEGORY_MASK_ISO_ELSE
577 are set. If a code which should never appear in ISO2022 is found,
578 returns 0. */
581 detect_coding_iso2022 (src, src_end)
582 unsigned char *src, *src_end;
584 unsigned char c, g1 = 0;
585 int mask = (CODING_CATEGORY_MASK_ISO_7
586 | CODING_CATEGORY_MASK_ISO_8_1
587 | CODING_CATEGORY_MASK_ISO_8_2);
588 /* We may look ahead at most 4 bytes. */
589 unsigned char *adjusted_src_end = src_end - 4;
590 int i;
592 while (src < src_end)
594 c = *src++;
595 switch (c)
597 case ISO_CODE_ESC:
598 if (src >= src_end)
599 break;
600 c = *src++;
601 if (src + 2 >= src_end
602 && ((c >= '(' && c <= '/')
603 || c == '$' && ((*src >= '(' && *src <= '/')
604 || (*src >= '@' && *src <= 'B'))))
606 /* Valid designation sequence. */
607 if (c == ')' || (c == '$' && *src == ')'))
608 g1 = 1;
609 src++;
610 break;
612 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
613 return CODING_CATEGORY_MASK_ISO_ELSE;
614 break;
616 case ISO_CODE_SO:
617 if (g1)
618 return CODING_CATEGORY_MASK_ISO_ELSE;
619 break;
621 case ISO_CODE_CSI:
622 case ISO_CODE_SS2:
623 case ISO_CODE_SS3:
624 mask &= ~CODING_CATEGORY_MASK_ISO_7;
625 break;
627 default:
628 if (c < 0x80)
629 break;
630 else if (c < 0xA0)
631 return 0;
632 else
634 int count = 1;
636 mask &= ~CODING_CATEGORY_MASK_ISO_7;
637 while (src < src_end && *src >= 0xA0)
638 count++, src++;
639 if (count & 1 && src < src_end)
640 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
642 break;
646 return mask;
649 /* Decode a character of which charset is CHARSET and the 1st position
650 code is C1. If dimension of CHARSET 2, the 2nd position code is
651 fetched from SRC and set to C2. If CHARSET is negative, it means
652 that we are decoding ill formed text, and what we can do is just to
653 read C1 as is. */
655 #define DECODE_ISO_CHARACTER(charset, c1) \
656 do { \
657 if ((charset) >= 0 && CHARSET_DIMENSION (charset) == 2) \
658 ONE_MORE_BYTE (c2); \
659 if (COMPOSING_HEAD_P (coding->composing)) \
661 *dst++ = LEADING_CODE_COMPOSITION; \
662 if (COMPOSING_WITH_RULE_P (coding->composing)) \
663 /* To tell composition rules are embeded. */ \
664 *dst++ = 0xFF; \
665 coding->composing += 2; \
667 if ((charset) < 0) \
668 *dst++ = c1; \
669 else if ((charset) == CHARSET_ASCII) \
670 DECODE_CHARACTER_ASCII (c1); \
671 else if (CHARSET_DIMENSION (charset) == 1) \
672 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
673 else \
674 DECODE_CHARACTER_DIMENSION2 (charset, c1, c2); \
675 if (COMPOSING_WITH_RULE_P (coding->composing)) \
676 /* To tell a composition rule follows. */ \
677 coding->composing = COMPOSING_WITH_RULE_RULE; \
678 } while (0)
680 /* Set designation state into CODING. */
681 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
682 do { \
683 int charset = ISO_CHARSET_TABLE (dimension, chars, final_char); \
684 Lisp_Object temp \
685 = Fassq (CHARSET_SYMBOL (charset), Valternate_charset_table); \
686 if (! NILP (temp)) \
687 charset = get_charset_id (XCONS (temp)->cdr); \
688 if (charset >= 0) \
690 if (coding->direction == 1 \
691 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
692 charset = CHARSET_REVERSE_CHARSET (charset); \
693 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
695 } while (0)
697 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
700 decode_coding_iso2022 (coding, source, destination,
701 src_bytes, dst_bytes, consumed)
702 struct coding_system *coding;
703 unsigned char *source, *destination;
704 int src_bytes, dst_bytes;
705 int *consumed;
707 unsigned char *src = source;
708 unsigned char *src_end = source + src_bytes;
709 unsigned char *dst = destination;
710 unsigned char *dst_end = destination + dst_bytes;
711 /* Since the maximum bytes produced by each loop is 7, we subtract 6
712 from DST_END to assure that overflow checking is necessary only
713 at the head of loop. */
714 unsigned char *adjusted_dst_end = dst_end - 6;
715 int charset;
716 /* Charsets invoked to graphic plane 0 and 1 respectively. */
717 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
718 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
720 while (src < src_end && dst < adjusted_dst_end)
722 /* SRC_BASE remembers the start position in source in each loop.
723 The loop will be exited when there's not enough source text
724 to analyze long escape sequence or 2-byte code (within macros
725 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
726 to SRC_BASE before exiting. */
727 unsigned char *src_base = src;
728 unsigned char c1 = *src++, c2, cmprule;
730 switch (iso_code_class [c1])
732 case ISO_0x20_or_0x7F:
733 if (!coding->composing
734 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
736 /* This is SPACE or DEL. */
737 *dst++ = c1;
738 break;
740 /* This is a graphic character, we fall down ... */
742 case ISO_graphic_plane_0:
743 if (coding->composing == COMPOSING_WITH_RULE_RULE)
745 /* This is a composition rule. */
746 *dst++ = c1 | 0x80;
747 coding->composing = COMPOSING_WITH_RULE_TAIL;
749 else
750 DECODE_ISO_CHARACTER (charset0, c1);
751 break;
753 case ISO_0xA0_or_0xFF:
754 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
756 /* Invalid code. */
757 *dst++ = c1;
758 break;
760 /* This is a graphic character, we fall down ... */
762 case ISO_graphic_plane_1:
763 DECODE_ISO_CHARACTER (charset1, c1);
764 break;
766 case ISO_control_code:
767 /* All ISO2022 control characters in this class have the
768 same representation in Emacs internal format. */
769 *dst++ = c1;
770 break;
772 case ISO_carriage_return:
773 if (coding->eol_type == CODING_EOL_CR)
775 *dst++ = '\n';
777 else if (coding->eol_type == CODING_EOL_CRLF)
779 ONE_MORE_BYTE (c1);
780 if (c1 == ISO_CODE_LF)
781 *dst++ = '\n';
782 else
784 src--;
785 *dst++ = c1;
788 else
790 *dst++ = c1;
792 break;
794 case ISO_shift_out:
795 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
796 goto label_invalid_escape_sequence;
797 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
798 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
799 break;
801 case ISO_shift_in:
802 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
803 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
804 break;
806 case ISO_single_shift_2_7:
807 case ISO_single_shift_2:
808 /* SS2 is handled as an escape sequence of ESC 'N' */
809 c1 = 'N';
810 goto label_escape_sequence;
812 case ISO_single_shift_3:
813 /* SS2 is handled as an escape sequence of ESC 'O' */
814 c1 = 'O';
815 goto label_escape_sequence;
817 case ISO_control_sequence_introducer:
818 /* CSI is handled as an escape sequence of ESC '[' ... */
819 c1 = '[';
820 goto label_escape_sequence;
822 case ISO_escape:
823 ONE_MORE_BYTE (c1);
824 label_escape_sequence:
825 /* Escape sequences handled by Emacs are invocation,
826 designation, direction specification, and character
827 composition specification. */
828 switch (c1)
830 case '&': /* revision of following character set */
831 ONE_MORE_BYTE (c1);
832 if (!(c1 >= '@' && c1 <= '~'))
833 goto label_invalid_escape_sequence;
834 ONE_MORE_BYTE (c1);
835 if (c1 != ISO_CODE_ESC)
836 goto label_invalid_escape_sequence;
837 ONE_MORE_BYTE (c1);
838 goto label_escape_sequence;
840 case '$': /* designation of 2-byte character set */
841 ONE_MORE_BYTE (c1);
842 if (c1 >= '@' && c1 <= 'B')
843 { /* designation of JISX0208.1978, GB2312.1980,
844 or JISX0208.1980 */
845 DECODE_DESIGNATION (0, 2, 94, c1);
847 else if (c1 >= 0x28 && c1 <= 0x2B)
848 { /* designation of DIMENSION2_CHARS94 character set */
849 ONE_MORE_BYTE (c2);
850 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
852 else if (c1 >= 0x2C && c1 <= 0x2F)
853 { /* designation of DIMENSION2_CHARS96 character set */
854 ONE_MORE_BYTE (c2);
855 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
857 else
858 goto label_invalid_escape_sequence;
859 break;
861 case 'n': /* invocation of locking-shift-2 */
862 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
863 goto label_invalid_escape_sequence;
864 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
865 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
866 break;
868 case 'o': /* invocation of locking-shift-3 */
869 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
870 goto label_invalid_escape_sequence;
871 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
872 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
873 break;
875 case 'N': /* invocation of single-shift-2 */
876 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
877 goto label_invalid_escape_sequence;
878 ONE_MORE_BYTE (c1);
879 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
880 DECODE_ISO_CHARACTER (charset, c1);
881 break;
883 case 'O': /* invocation of single-shift-3 */
884 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
885 goto label_invalid_escape_sequence;
886 ONE_MORE_BYTE (c1);
887 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
888 DECODE_ISO_CHARACTER (charset, c1);
889 break;
891 case '0': /* start composing without embeded rules */
892 coding->composing = COMPOSING_NO_RULE_HEAD;
893 break;
895 case '1': /* end composing */
896 coding->composing = COMPOSING_NO;
897 break;
899 case '2': /* start composing with embeded rules */
900 coding->composing = COMPOSING_WITH_RULE_HEAD;
901 break;
903 case '[': /* specification of direction */
904 /* For the moment, nested direction is not supported.
905 So, the value of `coding->direction' is 0 or 1: 0
906 means left-to-right, 1 means right-to-left. */
907 ONE_MORE_BYTE (c1);
908 switch (c1)
910 case ']': /* end of the current direction */
911 coding->direction = 0;
913 case '0': /* end of the current direction */
914 case '1': /* start of left-to-right direction */
915 ONE_MORE_BYTE (c1);
916 if (c1 == ']')
917 coding->direction = 0;
918 else
919 goto label_invalid_escape_sequence;
920 break;
922 case '2': /* start of right-to-left direction */
923 ONE_MORE_BYTE (c1);
924 if (c1 == ']')
925 coding->direction= 1;
926 else
927 goto label_invalid_escape_sequence;
928 break;
930 default:
931 goto label_invalid_escape_sequence;
933 break;
935 default:
936 if (c1 >= 0x28 && c1 <= 0x2B)
937 { /* designation of DIMENSION1_CHARS94 character set */
938 ONE_MORE_BYTE (c2);
939 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
941 else if (c1 >= 0x2C && c1 <= 0x2F)
942 { /* designation of DIMENSION1_CHARS96 character set */
943 ONE_MORE_BYTE (c2);
944 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
946 else
948 goto label_invalid_escape_sequence;
951 /* We must update these variables now. */
952 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
953 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
954 break;
956 label_invalid_escape_sequence:
958 int length = src - src_base;
960 bcopy (src_base, dst, length);
961 dst += length;
964 continue;
966 label_end_of_loop:
967 coding->carryover_size = src - src_base;
968 bcopy (src_base, coding->carryover, coding->carryover_size);
969 src = src_base;
970 break;
973 /* If this is the last block of the text to be decoded, we had
974 better just flush out all remaining codes in the text although
975 they are not valid characters. */
976 if (coding->last_block)
978 bcopy (src, dst, src_end - src);
979 dst += (src_end - src);
980 src = src_end;
982 *consumed = src - source;
983 return dst - destination;
986 /* ISO2022 encoding staffs. */
989 It is not enough to say just "ISO2022" on encoding, but we have to
990 specify more details. In Emacs, each coding-system of ISO2022
991 variant has the following specifications:
992 1. Initial designation to G0 thru G3.
993 2. Allows short-form designation?
994 3. ASCII should be designated to G0 before control characters?
995 4. ASCII should be designated to G0 at end of line?
996 5. 7-bit environment or 8-bit environment?
997 6. Use locking-shift?
998 7. Use Single-shift?
999 And the following two are only for Japanese:
1000 8. Use ASCII in place of JIS0201-1976-Roman?
1001 9. Use JISX0208-1983 in place of JISX0208-1978?
1002 These specifications are encoded in `coding->flags' as flag bits
1003 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1004 detail.
1007 /* Produce codes (escape sequence) for designating CHARSET to graphic
1008 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1009 the coding system CODING allows, produce designation sequence of
1010 short-form. */
1012 #define ENCODE_DESIGNATION(charset, reg, coding) \
1013 do { \
1014 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1015 char *intermediate_char_94 = "()*+"; \
1016 char *intermediate_char_96 = ",-./"; \
1017 Lisp_Object temp \
1018 = Fassq (make_number (charset), Vcharset_revision_alist); \
1019 if (! NILP (temp)) \
1021 *dst++ = ISO_CODE_ESC; \
1022 *dst++ = '&'; \
1023 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1025 *dst++ = ISO_CODE_ESC; \
1026 if (CHARSET_DIMENSION (charset) == 1) \
1028 if (CHARSET_CHARS (charset) == 94) \
1029 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1030 else \
1031 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1033 else \
1035 *dst++ = '$'; \
1036 if (CHARSET_CHARS (charset) == 94) \
1038 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1039 || reg != 0 \
1040 || final_char < '@' || final_char > 'B') \
1041 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1043 else \
1044 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1046 *dst++ = final_char; \
1047 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1048 } while (0)
1050 /* The following two macros produce codes (control character or escape
1051 sequence) for ISO2022 single-shift functions (single-shift-2 and
1052 single-shift-3). */
1054 #define ENCODE_SINGLE_SHIFT_2 \
1055 do { \
1056 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1057 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1058 else \
1059 *dst++ = ISO_CODE_SS2; \
1060 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1061 } while (0)
1063 #define ENCODE_SINGLE_SHIFT_3 \
1064 do { \
1065 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1066 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1067 else \
1068 *dst++ = ISO_CODE_SS3; \
1069 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1070 } while (0)
1072 /* The following four macros produce codes (control character or
1073 escape sequence) for ISO2022 locking-shift functions (shift-in,
1074 shift-out, locking-shift-2, and locking-shift-3). */
1076 #define ENCODE_SHIFT_IN \
1077 do { \
1078 *dst++ = ISO_CODE_SI; \
1079 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1080 } while (0)
1082 #define ENCODE_SHIFT_OUT \
1083 do { \
1084 *dst++ = ISO_CODE_SO; \
1085 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1086 } while (0)
1088 #define ENCODE_LOCKING_SHIFT_2 \
1089 do { \
1090 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1091 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1092 } while (0)
1094 #define ENCODE_LOCKING_SHIFT_3 \
1095 do { \
1096 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1097 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1098 } while (0)
1100 /* Produce codes for a DIMENSION1 character of which character set is
1101 CHARSET and position-code is C1. Designation and invocation
1102 sequences are also produced in advance if necessary. */
1105 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1106 do { \
1107 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1109 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1110 *dst++ = c1 & 0x7F; \
1111 else \
1112 *dst++ = c1 | 0x80; \
1113 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1114 break; \
1116 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1118 *dst++ = c1 & 0x7F; \
1119 break; \
1121 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1123 *dst++ = c1 | 0x80; \
1124 break; \
1126 else \
1127 /* Since CHARSET is not yet invoked to any graphic planes, we \
1128 must invoke it, or, at first, designate it to some graphic \
1129 register. Then repeat the loop to actually produce the \
1130 character. */ \
1131 dst = encode_invocation_designation (charset, coding, dst); \
1132 } while (1)
1134 /* Produce codes for a DIMENSION2 character of which character set is
1135 CHARSET and position-codes are C1 and C2. Designation and
1136 invocation codes are also produced in advance if necessary. */
1138 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1139 do { \
1140 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1142 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1143 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1144 else \
1145 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1146 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1147 break; \
1149 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1151 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1152 break; \
1154 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1156 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1157 break; \
1159 else \
1160 /* Since CHARSET is not yet invoked to any graphic planes, we \
1161 must invoke it, or, at first, designate it to some graphic \
1162 register. Then repeat the loop to actually produce the \
1163 character. */ \
1164 dst = encode_invocation_designation (charset, coding, dst); \
1165 } while (1)
1167 /* Produce designation and invocation codes at a place pointed by DST
1168 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1169 Return new DST. */
1171 unsigned char *
1172 encode_invocation_designation (charset, coding, dst)
1173 int charset;
1174 struct coding_system *coding;
1175 unsigned char *dst;
1177 int reg; /* graphic register number */
1179 /* At first, check designations. */
1180 for (reg = 0; reg < 4; reg++)
1181 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1182 break;
1184 if (reg >= 4)
1186 /* CHARSET is not yet designated to any graphic registers. */
1187 /* At first check the requested designation. */
1188 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1189 if (reg < 0)
1190 /* Since CHARSET requests no special designation, designate to
1191 graphic register 0. */
1192 reg = 0;
1194 ENCODE_DESIGNATION (charset, reg, coding);
1197 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1198 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1200 /* Since the graphic register REG is not invoked to any graphic
1201 planes, invoke it to graphic plane 0. */
1202 switch (reg)
1204 case 0: /* graphic register 0 */
1205 ENCODE_SHIFT_IN;
1206 break;
1208 case 1: /* graphic register 1 */
1209 ENCODE_SHIFT_OUT;
1210 break;
1212 case 2: /* graphic register 2 */
1213 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1214 ENCODE_SINGLE_SHIFT_2;
1215 else
1216 ENCODE_LOCKING_SHIFT_2;
1217 break;
1219 case 3: /* graphic register 3 */
1220 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1221 ENCODE_SINGLE_SHIFT_3;
1222 else
1223 ENCODE_LOCKING_SHIFT_3;
1224 break;
1227 return dst;
1230 /* The following two macros produce codes for indicating composition. */
1231 #define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1232 #define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1233 #define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1235 /* The following three macros produce codes for indicating direction
1236 of text. */
1237 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1238 do { \
1239 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1240 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1241 else \
1242 *dst++ = ISO_CODE_CSI; \
1243 } while (0)
1245 #define ENCODE_DIRECTION_R2L \
1246 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1248 #define ENCODE_DIRECTION_L2R \
1249 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1251 /* Produce codes for designation and invocation to reset the graphic
1252 planes and registers to initial state. */
1253 #define ENCODE_RESET_PLANE_AND_REGISTER \
1254 do { \
1255 int reg; \
1256 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1257 ENCODE_SHIFT_IN; \
1258 for (reg = 0; reg < 4; reg++) \
1259 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1260 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1261 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1262 ENCODE_DESIGNATION \
1263 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1264 } while (0)
1267 encode_designation_at_bol (coding, src, src_end, dstp)
1268 struct coding_system *coding;
1269 unsigned char *src, *src_end, **dstp;
1271 int charset, reg, r[4];
1272 unsigned char *dst = *dstp, c;
1273 for (reg = 0; reg < 4; reg++) r[reg] = -1;
1274 while (src < src_end && (c = *src++) != '\n')
1276 switch (emacs_code_class[c])
1278 case EMACS_ascii_code:
1279 charset = CHARSET_ASCII;
1280 break;
1281 case EMACS_leading_code_2:
1282 if (++src >= src_end) continue;
1283 charset = c;
1284 break;
1285 case EMACS_leading_code_3:
1286 if ((src += 2) >= src_end) continue;
1287 charset = (c < LEADING_CODE_PRIVATE_11 ? c : *(src - 2));
1288 break;
1289 case EMACS_leading_code_4:
1290 if ((src += 3) >= src_end) continue;
1291 charset = *(src - 3);
1292 break;
1293 default:
1294 continue;
1296 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1297 if (r[reg] < 0
1298 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != charset)
1299 r[reg] = charset;
1301 if (c != '\n' && !coding->last_block)
1302 return -1;
1303 for (reg = 0; reg < 4; reg++)
1304 if (r[reg] >= 0)
1305 ENCODE_DESIGNATION (r[reg], reg, coding);
1306 *dstp = dst;
1307 return 0;
1310 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1313 encode_coding_iso2022 (coding, source, destination,
1314 src_bytes, dst_bytes, consumed)
1315 struct coding_system *coding;
1316 unsigned char *source, *destination;
1317 int src_bytes, dst_bytes;
1318 int *consumed;
1320 unsigned char *src = source;
1321 unsigned char *src_end = source + src_bytes;
1322 unsigned char *dst = destination;
1323 unsigned char *dst_end = destination + dst_bytes;
1324 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1325 from DST_END to assure overflow checking is necessary only at the
1326 head of loop. */
1327 unsigned char *adjusted_dst_end = dst_end - 19;
1329 while (src < src_end && dst < adjusted_dst_end)
1331 /* SRC_BASE remembers the start position in source in each loop.
1332 The loop will be exited when there's not enough source text
1333 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1334 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1335 reset to SRC_BASE before exiting. */
1336 unsigned char *src_base = src;
1337 unsigned char c1, c2, c3, c4;
1338 int charset;
1340 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1341 && CODING_SPEC_ISO_BOL (coding))
1343 /* We have to produce destination sequences now. */
1344 if (encode_designation_at_bol (coding, src, src_end, &dst) < 0)
1345 /* We can't find end of line in the current block. Let's
1346 repeat encoding starting from the current position
1347 pointed by SRC. */
1348 break;
1349 CODING_SPEC_ISO_BOL (coding) = 0;
1352 c1 = *src++;
1353 /* If we are seeing a component of a composite character, we are
1354 seeing a leading-code specially encoded for composition, or a
1355 composition rule if composing with rule. We must set C1
1356 to a normal leading-code or an ASCII code. If we are not at
1357 a composed character, we must reset the composition state. */
1358 if (COMPOSING_P (coding->composing))
1360 if (c1 < 0xA0)
1362 /* We are not in a composite character any longer. */
1363 coding->composing = COMPOSING_NO;
1364 ENCODE_COMPOSITION_END;
1366 else
1368 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1370 *dst++ = c1 & 0x7F;
1371 coding->composing = COMPOSING_WITH_RULE_HEAD;
1372 continue;
1374 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1375 coding->composing = COMPOSING_WITH_RULE_RULE;
1376 if (c1 == 0xA0)
1378 /* This is an ASCII component. */
1379 ONE_MORE_BYTE (c1);
1380 c1 &= 0x7F;
1382 else
1383 /* This is a leading-code of non ASCII component. */
1384 c1 -= 0x20;
1388 /* Now encode one character. C1 is a control character, an
1389 ASCII character, or a leading-code of multi-byte character. */
1390 switch (emacs_code_class[c1])
1392 case EMACS_ascii_code:
1393 ENCODE_ISO_CHARACTER_DIMENSION1 (CHARSET_ASCII, c1);
1394 break;
1396 case EMACS_control_code:
1397 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1398 ENCODE_RESET_PLANE_AND_REGISTER;
1399 *dst++ = c1;
1400 break;
1402 case EMACS_carriage_return_code:
1403 if (!coding->selective)
1405 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1406 ENCODE_RESET_PLANE_AND_REGISTER;
1407 *dst++ = c1;
1408 break;
1410 /* fall down to treat '\r' as '\n' ... */
1412 case EMACS_linefeed_code:
1413 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1414 ENCODE_RESET_PLANE_AND_REGISTER;
1415 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1416 bcopy (coding->spec.iso2022.initial_designation,
1417 coding->spec.iso2022.current_designation,
1418 sizeof coding->spec.iso2022.initial_designation);
1419 if (coding->eol_type == CODING_EOL_LF
1420 || coding->eol_type == CODING_EOL_AUTOMATIC)
1421 *dst++ = ISO_CODE_LF;
1422 else if (coding->eol_type == CODING_EOL_CRLF)
1423 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1424 else
1425 *dst++ = ISO_CODE_CR;
1426 CODING_SPEC_ISO_BOL (coding) = 1;
1427 break;
1429 case EMACS_leading_code_2:
1430 ONE_MORE_BYTE (c2);
1431 ENCODE_ISO_CHARACTER_DIMENSION1 (c1, c2);
1432 break;
1434 case EMACS_leading_code_3:
1435 TWO_MORE_BYTES (c2, c3);
1436 if (c1 < LEADING_CODE_PRIVATE_11)
1437 ENCODE_ISO_CHARACTER_DIMENSION2 (c1, c2, c3);
1438 else
1439 ENCODE_ISO_CHARACTER_DIMENSION1 (c2, c3);
1440 break;
1442 case EMACS_leading_code_4:
1443 THREE_MORE_BYTES (c2, c3, c4);
1444 ENCODE_ISO_CHARACTER_DIMENSION2 (c2, c3, c4);
1445 break;
1447 case EMACS_leading_code_composition:
1448 ONE_MORE_BYTE (c1);
1449 if (c1 == 0xFF)
1451 coding->composing = COMPOSING_WITH_RULE_HEAD;
1452 ENCODE_COMPOSITION_WITH_RULE_START;
1454 else
1456 /* Rewind one byte because it is a character code of
1457 composition elements. */
1458 src--;
1459 coding->composing = COMPOSING_NO_RULE_HEAD;
1460 ENCODE_COMPOSITION_NO_RULE_START;
1462 break;
1464 case EMACS_invalid_code:
1465 *dst++ = c1;
1466 break;
1468 continue;
1469 label_end_of_loop:
1470 coding->carryover_size = src - src_base;
1471 bcopy (src_base, coding->carryover, coding->carryover_size);
1472 src = src_base;
1473 break;
1476 /* If this is the last block of the text to be encoded, we must
1477 reset the state of graphic planes and registers to initial one.
1478 In addition, we had better just flush out all remaining codes in
1479 the text although they are not valid characters. */
1480 if (coding->last_block)
1482 ENCODE_RESET_PLANE_AND_REGISTER;
1483 bcopy(src, dst, src_end - src);
1484 dst += (src_end - src);
1485 src = src_end;
1487 *consumed = src - source;
1488 return dst - destination;
1492 /*** 4. SJIS and BIG5 handlers ***/
1494 /* Although SJIS and BIG5 are not ISO's coding system, They are used
1495 quite widely. So, for the moment, Emacs supports them in the bare
1496 C code. But, in the future, they may be supported only by CCL. */
1498 /* SJIS is a coding system encoding three character sets: ASCII, right
1499 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1500 as is. A character of charset katakana-jisx0201 is encoded by
1501 "position-code + 0x80". A character of charset japanese-jisx0208
1502 is encoded in 2-byte but two position-codes are divided and shifted
1503 so that it fit in the range below.
1505 --- CODE RANGE of SJIS ---
1506 (character set) (range)
1507 ASCII 0x00 .. 0x7F
1508 KATAKANA-JISX0201 0xA0 .. 0xDF
1509 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1510 (2nd byte) 0x40 .. 0xFF
1511 -------------------------------
1515 /* BIG5 is a coding system encoding two character sets: ASCII and
1516 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1517 character set and is encoded in two-byte.
1519 --- CODE RANGE of BIG5 ---
1520 (character set) (range)
1521 ASCII 0x00 .. 0x7F
1522 Big5 (1st byte) 0xA1 .. 0xFE
1523 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1524 --------------------------
1526 Since the number of characters in Big5 is larger than maximum
1527 characters in Emacs' charset (96x96), it can't be handled as one
1528 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1529 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1530 contains frequently used characters and the latter contains less
1531 frequently used characters. */
1533 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1534 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1535 C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1536 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1538 /* Number of Big5 characters which have the same code in 1st byte. */
1539 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1541 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
1542 do { \
1543 unsigned int temp \
1544 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1545 if (b1 < 0xC9) \
1546 charset = charset_big5_1; \
1547 else \
1549 charset = charset_big5_2; \
1550 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1552 c1 = temp / (0xFF - 0xA1) + 0x21; \
1553 c2 = temp % (0xFF - 0xA1) + 0x21; \
1554 } while (0)
1556 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1557 do { \
1558 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1559 if (charset == charset_big5_2) \
1560 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1561 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1562 b2 = temp % BIG5_SAME_ROW; \
1563 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1564 } while (0)
1566 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1567 Check if a text is encoded in SJIS. If it is, return
1568 CODING_CATEGORY_MASK_SJIS, else return 0. */
1571 detect_coding_sjis (src, src_end)
1572 unsigned char *src, *src_end;
1574 unsigned char c;
1576 while (src < src_end)
1578 c = *src++;
1579 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1580 return 0;
1581 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1583 if (src < src_end && *src++ < 0x40)
1584 return 0;
1587 return CODING_CATEGORY_MASK_SJIS;
1590 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1591 Check if a text is encoded in BIG5. If it is, return
1592 CODING_CATEGORY_MASK_BIG5, else return 0. */
1595 detect_coding_big5 (src, src_end)
1596 unsigned char *src, *src_end;
1598 unsigned char c;
1600 while (src < src_end)
1602 c = *src++;
1603 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1604 return 0;
1605 if (c >= 0xA1)
1607 if (src >= src_end)
1608 break;
1609 c = *src++;
1610 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1611 return 0;
1614 return CODING_CATEGORY_MASK_BIG5;
1617 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1618 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1621 decode_coding_sjis_big5 (coding, source, destination,
1622 src_bytes, dst_bytes, consumed, sjis_p)
1623 struct coding_system *coding;
1624 unsigned char *source, *destination;
1625 int src_bytes, dst_bytes;
1626 int *consumed;
1627 int sjis_p;
1629 unsigned char *src = source;
1630 unsigned char *src_end = source + src_bytes;
1631 unsigned char *dst = destination;
1632 unsigned char *dst_end = destination + dst_bytes;
1633 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1634 from DST_END to assure overflow checking is necessary only at the
1635 head of loop. */
1636 unsigned char *adjusted_dst_end = dst_end - 3;
1638 while (src < src_end && dst < adjusted_dst_end)
1640 /* SRC_BASE remembers the start position in source in each loop.
1641 The loop will be exited when there's not enough source text
1642 to analyze two-byte character (within macro ONE_MORE_BYTE).
1643 In that case, SRC is reset to SRC_BASE before exiting. */
1644 unsigned char *src_base = src;
1645 unsigned char c1 = *src++, c2, c3, c4;
1647 if (c1 == '\r')
1649 if (coding->eol_type == CODING_EOL_CRLF)
1651 ONE_MORE_BYTE (c2);
1652 if (c2 == '\n')
1653 *dst++ = c2;
1654 else
1655 /* To process C2 again, SRC is subtracted by 1. */
1656 *dst++ = c1, src--;
1658 else
1659 *dst++ = c1;
1661 else if (c1 < 0x80)
1662 *dst++ = c1;
1663 else if (c1 < 0xA0 || c1 >= 0xE0)
1665 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1666 if (sjis_p)
1668 ONE_MORE_BYTE (c2);
1669 DECODE_SJIS (c1, c2, c3, c4);
1670 DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
1672 else if (c1 >= 0xE0 && c1 < 0xFF)
1674 int charset;
1676 ONE_MORE_BYTE (c2);
1677 DECODE_BIG5 (c1, c2, charset, c3, c4);
1678 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1680 else /* Invalid code */
1681 *dst++ = c1;
1683 else
1685 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1686 if (sjis_p)
1687 DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
1688 else
1690 int charset;
1692 ONE_MORE_BYTE (c2);
1693 DECODE_BIG5 (c1, c2, charset, c3, c4);
1694 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1697 continue;
1699 label_end_of_loop:
1700 coding->carryover_size = src - src_base;
1701 bcopy (src_base, coding->carryover, coding->carryover_size);
1702 src = src_base;
1703 break;
1706 *consumed = src - source;
1707 return dst - destination;
1710 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1711 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1712 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1713 sure that all these charsets are registered as official charset
1714 (i.e. do not have extended leading-codes). Characters of other
1715 charsets are produced without any encoding. If SJIS_P is 1, encode
1716 SJIS text, else encode BIG5 text. */
1719 encode_coding_sjis_big5 (coding, source, destination,
1720 src_bytes, dst_bytes, consumed, sjis_p)
1721 struct coding_system *coding;
1722 unsigned char *source, *destination;
1723 int src_bytes, dst_bytes;
1724 int *consumed;
1725 int sjis_p;
1727 unsigned char *src = source;
1728 unsigned char *src_end = source + src_bytes;
1729 unsigned char *dst = destination;
1730 unsigned char *dst_end = destination + dst_bytes;
1731 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1732 from DST_END to assure overflow checking is necessary only at the
1733 head of loop. */
1734 unsigned char *adjusted_dst_end = dst_end - 1;
1736 while (src < src_end && dst < adjusted_dst_end)
1738 /* SRC_BASE remembers the start position in source in each loop.
1739 The loop will be exited when there's not enough source text
1740 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1741 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1742 before exiting. */
1743 unsigned char *src_base = src;
1744 unsigned char c1 = *src++, c2, c3, c4;
1746 if (coding->composing)
1748 if (c1 == 0xA0)
1750 ONE_MORE_BYTE (c1);
1751 c1 &= 0x7F;
1753 else if (c1 >= 0xA0)
1754 c1 -= 0x20;
1755 else
1756 coding->composing = 0;
1759 switch (emacs_code_class[c1])
1761 case EMACS_ascii_code:
1762 case EMACS_control_code:
1763 *dst++ = c1;
1764 break;
1766 case EMACS_carriage_return_code:
1767 if (!coding->selective)
1769 *dst++ = c1;
1770 break;
1772 /* fall down to treat '\r' as '\n' ... */
1774 case EMACS_linefeed_code:
1775 if (coding->eol_type == CODING_EOL_LF
1776 || coding->eol_type == CODING_EOL_AUTOMATIC)
1777 *dst++ = '\n';
1778 else if (coding->eol_type == CODING_EOL_CRLF)
1779 *dst++ = '\r', *dst++ = '\n';
1780 else
1781 *dst++ = '\r';
1782 break;
1784 case EMACS_leading_code_2:
1785 ONE_MORE_BYTE (c2);
1786 if (sjis_p && c1 == charset_katakana_jisx0201)
1787 *dst++ = c2;
1788 else
1789 *dst++ = c1, *dst++ = c2;
1790 break;
1792 case EMACS_leading_code_3:
1793 TWO_MORE_BYTES (c2, c3);
1794 c2 &= 0x7F, c3 &= 0x7F;
1795 if (sjis_p && c1 == charset_jisx0208)
1797 unsigned char s1, s2;
1799 ENCODE_SJIS (c2, c3, s1, s2);
1800 *dst++ = s1, *dst++ = s2;
1802 else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
1804 unsigned char b1, b2;
1806 ENCODE_BIG5 (c1, c2, c3, b1, b2);
1807 *dst++ = b1, *dst++ = b2;
1809 else
1810 *dst++ = c1, *dst++ = c2, *dst++ = c3;
1811 break;
1813 case EMACS_leading_code_4:
1814 THREE_MORE_BYTES (c2, c3, c4);
1815 *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
1816 break;
1818 case EMACS_leading_code_composition:
1819 coding->composing = 1;
1820 break;
1822 default: /* i.e. case EMACS_invalid_code: */
1823 *dst++ = c1;
1825 continue;
1827 label_end_of_loop:
1828 coding->carryover_size = src - src_base;
1829 bcopy (src_base, coding->carryover, coding->carryover_size);
1830 src = src_base;
1831 break;
1834 *consumed = src - source;
1835 return dst - destination;
1839 /*** 5. End-of-line handlers ***/
1841 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1842 This function is called only when `coding->eol_type' is
1843 CODING_EOL_CRLF or CODING_EOL_CR. */
1845 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1846 struct coding_system *coding;
1847 unsigned char *source, *destination;
1848 int src_bytes, dst_bytes;
1849 int *consumed;
1851 unsigned char *src = source;
1852 unsigned char *src_end = source + src_bytes;
1853 unsigned char *dst = destination;
1854 unsigned char *dst_end = destination + dst_bytes;
1855 int produced;
1857 switch (coding->eol_type)
1859 case CODING_EOL_CRLF:
1861 /* Since the maximum bytes produced by each loop is 2, we
1862 subtract 1 from DST_END to assure overflow checking is
1863 necessary only at the head of loop. */
1864 unsigned char *adjusted_dst_end = dst_end - 1;
1866 while (src < src_end && dst < adjusted_dst_end)
1868 unsigned char *src_base = src;
1869 unsigned char c = *src++;
1870 if (c == '\r')
1872 ONE_MORE_BYTE (c);
1873 if (c != '\n')
1874 *dst++ = '\r';
1875 *dst++ = c;
1877 else
1878 *dst++ = c;
1879 continue;
1881 label_end_of_loop:
1882 coding->carryover_size = src - src_base;
1883 bcopy (src_base, coding->carryover, coding->carryover_size);
1884 src = src_base;
1885 break;
1887 *consumed = src - source;
1888 produced = dst - destination;
1889 break;
1892 case CODING_EOL_CR:
1893 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1894 bcopy (source, destination, produced);
1895 dst_end = destination + produced;
1896 while (dst < dst_end)
1897 if (*dst++ == '\r') dst[-1] = '\n';
1898 *consumed = produced;
1899 break;
1901 default: /* i.e. case: CODING_EOL_LF */
1902 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1903 bcopy (source, destination, produced);
1904 *consumed = produced;
1905 break;
1908 return produced;
1911 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
1912 format of end-of-line according to `coding->eol_type'. If
1913 `coding->selective' is 1, code '\r' in source text also means
1914 end-of-line. */
1916 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1917 struct coding_system *coding;
1918 unsigned char *source, *destination;
1919 int src_bytes, dst_bytes;
1920 int *consumed;
1922 unsigned char *src = source;
1923 unsigned char *dst = destination;
1924 int produced;
1926 if (src_bytes <= 0)
1927 return 0;
1929 switch (coding->eol_type)
1931 case CODING_EOL_LF:
1932 case CODING_EOL_AUTOMATIC:
1933 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1934 bcopy (source, destination, produced);
1935 if (coding->selective)
1937 int i = produced;
1938 while (i--)
1939 if (*dst++ == '\r') dst[-1] = '\n';
1941 *consumed = produced;
1943 case CODING_EOL_CRLF:
1945 unsigned char c;
1946 unsigned char *src_end = source + src_bytes;
1947 unsigned char *dst_end = destination + dst_bytes;
1948 /* Since the maximum bytes produced by each loop is 2, we
1949 subtract 1 from DST_END to assure overflow checking is
1950 necessary only at the head of loop. */
1951 unsigned char *adjusted_dst_end = dst_end - 1;
1953 while (src < src_end && dst < adjusted_dst_end)
1955 c = *src++;
1956 if (c == '\n' || (c == '\r' && coding->selective))
1957 *dst++ = '\r', *dst++ = '\n';
1958 else
1959 *dst++ = c;
1961 produced = dst - destination;
1962 *consumed = src - source;
1963 break;
1966 default: /* i.e. case CODING_EOL_CR: */
1967 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1968 bcopy (source, destination, produced);
1970 int i = produced;
1971 while (i--)
1972 if (*dst++ == '\n') dst[-1] = '\r';
1974 *consumed = produced;
1977 return produced;
1981 /*** 6. C library functions ***/
1983 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
1984 has a property `coding-system'. The value of this property is a
1985 vector of length 5 (called as coding-vector). Among elements of
1986 this vector, the first (element[0]) and the fifth (element[4])
1987 carry important information for decoding/encoding. Before
1988 decoding/encoding, this information should be set in fields of a
1989 structure of type `coding_system'.
1991 A value of property `coding-system' can be a symbol of another
1992 subsidiary coding-system. In that case, Emacs gets coding-vector
1993 from that symbol.
1995 `element[0]' contains information to be set in `coding->type'. The
1996 value and its meaning is as follows:
1998 0 -- coding_system_internal
1999 1 -- coding_system_sjis
2000 2 -- coding_system_iso2022
2001 3 -- coding_system_big5
2002 4 -- coding_system_ccl
2003 nil -- coding_system_no_conversion
2004 t -- coding_system_automatic
2006 `element[4]' contains information to be set in `coding->flags' and
2007 `coding->spec'. The meaning varies by `coding->type'.
2009 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2010 of length 32 (of which the first 13 sub-elements are used now).
2011 Meanings of these sub-elements are:
2013 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2014 If the value is an integer of valid charset, the charset is
2015 assumed to be designated to graphic register N initially.
2017 If the value is minus, it is a minus value of charset which
2018 reserves graphic register N, which means that the charset is
2019 not designated initially but should be designated to graphic
2020 register N just before encoding a character in that charset.
2022 If the value is nil, graphic register N is never used on
2023 encoding.
2025 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2026 Each value takes t or nil. See the section ISO2022 of
2027 `coding.h' for more information.
2029 If `coding->type' is `coding_type_big5', element[4] is t to denote
2030 BIG5-ETen or nil to denote BIG5-HKU.
2032 If `coding->type' takes the other value, element[4] is ignored.
2034 Emacs Lisp's coding system also carries information about format of
2035 end-of-line in a value of property `eol-type'. If the value is
2036 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2037 means CODING_EOL_CR. If it is not integer, it should be a vector
2038 of subsidiary coding systems of which property `eol-type' has one
2039 of above values.
2043 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2044 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2045 is setup so that no conversion is necessary and return -1, else
2046 return 0. */
2049 setup_coding_system (coding_system, coding)
2050 Lisp_Object coding_system;
2051 struct coding_system *coding;
2053 Lisp_Object type, eol_type;
2055 /* At first, set several fields default values. */
2056 coding->require_flushing = 0;
2057 coding->last_block = 0;
2058 coding->selective = 0;
2059 coding->composing = 0;
2060 coding->direction = 0;
2061 coding->carryover_size = 0;
2062 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2064 Vlast_coding_system_used = coding->symbol = coding_system;
2065 eol_type = Qnil;
2066 /* Get value of property `coding-system' until we get a vector.
2067 While doing that, also get values of properties
2068 `post-read-conversion', `pre-write-conversion', and `eol-type'. */
2069 while (!NILP (coding_system) && SYMBOLP (coding_system))
2071 if (NILP (coding->post_read_conversion))
2072 coding->post_read_conversion = Fget (coding_system,
2073 Qpost_read_conversion);
2074 if (NILP (coding->pre_write_conversion))
2075 coding->pre_write_conversion = Fget (coding_system,
2076 Qpre_write_conversion);
2077 if (NILP (eol_type))
2078 eol_type = Fget (coding_system, Qeol_type);
2079 coding_system = Fget (coding_system, Qcoding_system);
2081 if (!VECTORP (coding_system)
2082 || XVECTOR (coding_system)->size != 5)
2083 goto label_invalid_coding_system;
2085 if (VECTORP (eol_type))
2086 coding->eol_type = CODING_EOL_AUTOMATIC;
2087 else if (XFASTINT (eol_type) == 1)
2088 coding->eol_type = CODING_EOL_CRLF;
2089 else if (XFASTINT (eol_type) == 2)
2090 coding->eol_type = CODING_EOL_CR;
2091 else
2092 coding->eol_type = CODING_EOL_LF;
2094 type = XVECTOR (coding_system)->contents[0];
2095 switch (XFASTINT (type))
2097 case 0:
2098 coding->type = coding_type_internal;
2099 break;
2101 case 1:
2102 coding->type = coding_type_sjis;
2103 break;
2105 case 2:
2106 coding->type = coding_type_iso2022;
2108 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2109 Lisp_Object *flags;
2110 int i, charset, default_reg_bits = 0;
2112 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2113 goto label_invalid_coding_system;
2115 flags = XVECTOR (val)->contents;
2116 coding->flags
2117 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2118 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2119 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2120 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2121 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2122 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2123 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2124 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2125 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2126 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2127 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2129 /* Invoke graphic register 0 to plane 0. */
2130 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2131 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2132 CODING_SPEC_ISO_INVOCATION (coding, 1)
2133 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2134 /* Not single shifting at first. */
2135 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2136 /* Beginning of buffer should also be regarded as bol. */
2137 CODING_SPEC_ISO_BOL(coding) = 1;
2139 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2140 FLAGS[REG] can be one of below:
2141 integer CHARSET: CHARSET occupies register I,
2142 t: designate nothing to REG initially, but can be used
2143 by any charsets,
2144 list of integer, nil, or t: designate the first
2145 element (if integer) to REG initially, the remaining
2146 elements (if integer) is designated to REG on request,
2147 if an element is t, REG can be used by any charset,
2148 nil: REG is never used. */
2149 for (charset = 0; charset <= MAX_CHARSET; charset++)
2150 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = -1;
2151 for (i = 0; i < 4; i++)
2153 if (INTEGERP (flags[i])
2154 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2155 || (charset = get_charset_id (flags[i])) >= 0)
2157 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2158 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2160 else if (EQ (flags[i], Qt))
2162 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2163 default_reg_bits |= 1 << i;
2165 else if (CONSP (flags[i]))
2167 Lisp_Object tail = flags[i];
2169 if (INTEGERP (XCONS (tail)->car)
2170 && (charset = XINT (XCONS (tail)->car),
2171 CHARSET_VALID_P (charset))
2172 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2174 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2175 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2177 else
2178 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2179 tail = XCONS (tail)->cdr;
2180 while (CONSP (tail))
2182 if (INTEGERP (XCONS (tail)->car)
2183 && (charset = XINT (XCONS (tail)->car),
2184 CHARSET_VALID_P (charset))
2185 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2186 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2187 = i;
2188 else if (EQ (XCONS (tail)->car, Qt))
2189 default_reg_bits |= 1 << i;
2190 tail = XCONS (tail)->cdr;
2193 else
2194 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2196 CODING_SPEC_ISO_DESIGNATION (coding, i)
2197 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2200 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2202 /* REG 1 can be used only by locking shift in 7-bit env. */
2203 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2204 default_reg_bits &= ~2;
2205 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2206 /* Without any shifting, only REG 0 and 1 can be used. */
2207 default_reg_bits &= 3;
2210 for (charset = 0; charset <= MAX_CHARSET; charset++)
2211 if (CHARSET_VALID_P (charset)
2212 && CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) < 0)
2214 /* We have not yet decided where to designate CHARSET. */
2215 int reg_bits = default_reg_bits;
2217 if (CHARSET_CHARS (charset) == 96)
2218 /* A charset of CHARS96 can't be designated to REG 0. */
2219 reg_bits &= ~1;
2221 if (reg_bits)
2222 /* There exist some default graphic register. */
2223 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2224 = (reg_bits & 1
2225 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2226 else
2227 /* We anyway have to designate CHARSET to somewhere. */
2228 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2229 = (CHARSET_CHARS (charset) == 94
2231 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2232 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2234 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2235 ? 2 : 0)));
2238 coding->require_flushing = 1;
2239 break;
2241 case 3:
2242 coding->type = coding_type_big5;
2243 coding->flags
2244 = (NILP (XVECTOR (coding_system)->contents[4])
2245 ? CODING_FLAG_BIG5_HKU
2246 : CODING_FLAG_BIG5_ETEN);
2247 break;
2249 case 4:
2250 coding->type = coding_type_ccl;
2252 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2253 if (CONSP (val)
2254 && VECTORP (XCONS (val)->car)
2255 && VECTORP (XCONS (val)->cdr))
2257 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2258 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2260 else
2261 goto label_invalid_coding_system;
2263 coding->require_flushing = 1;
2264 break;
2266 default:
2267 if (EQ (type, Qt))
2268 coding->type = coding_type_automatic;
2269 else
2270 coding->type = coding_type_no_conversion;
2271 break;
2273 return 0;
2275 label_invalid_coding_system:
2276 coding->type = coding_type_no_conversion;
2277 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2278 = Qnil;
2279 return -1;
2282 /* Emacs has a mechanism to automatically detect a coding system if it
2283 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2284 it's impossible to distinguish some coding systems accurately
2285 because they use the same range of codes. So, at first, coding
2286 systems are categorized into 7, those are:
2288 o coding-category-internal
2290 The category for a coding system which has the same code range
2291 as Emacs' internal format. Assigned the coding-system (Lisp
2292 symbol) `internal' by default.
2294 o coding-category-sjis
2296 The category for a coding system which has the same code range
2297 as SJIS. Assigned the coding-system (Lisp
2298 symbol) `shift-jis' by default.
2300 o coding-category-iso-7
2302 The category for a coding system which has the same code range
2303 as ISO2022 of 7-bit environment. Assigned the coding-system
2304 (Lisp symbol) `iso-2022-7' by default.
2306 o coding-category-iso-8-1
2308 The category for a coding system which has the same code range
2309 as ISO2022 of 8-bit environment and graphic plane 1 used only
2310 for DIMENSION1 charset. Assigned the coding-system (Lisp
2311 symbol) `iso-8859-1' by default.
2313 o coding-category-iso-8-2
2315 The category for a coding system which has the same code range
2316 as ISO2022 of 8-bit environment and graphic plane 1 used only
2317 for DIMENSION2 charset. Assigned the coding-system (Lisp
2318 symbol) `euc-japan' by default.
2320 o coding-category-iso-else
2322 The category for a coding system which has the same code range
2323 as ISO2022 but not belongs to any of the above three
2324 categories. Assigned the coding-system (Lisp symbol)
2325 `iso-2022-ss2-7' by default.
2327 o coding-category-big5
2329 The category for a coding system which has the same code range
2330 as BIG5. Assigned the coding-system (Lisp symbol)
2331 `cn-big5' by default.
2333 o coding-category-binary
2335 The category for a coding system not categorized in any of the
2336 above. Assigned the coding-system (Lisp symbol)
2337 `no-conversion' by default.
2339 Each of them is a Lisp symbol and the value is an actual
2340 `coding-system's (this is also a Lisp symbol) assigned by a user.
2341 What Emacs does actually is to detect a category of coding system.
2342 Then, it uses a `coding-system' assigned to it. If Emacs can't
2343 decide only one possible category, it selects a category of the
2344 highest priority. Priorities of categories are also specified by a
2345 user in a Lisp variable `coding-category-list'.
2349 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2350 If it detects possible coding systems, return an integer in which
2351 appropriate flag bits are set. Flag bits are defined by macros
2352 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2355 detect_coding_mask (src, src_bytes)
2356 unsigned char *src;
2357 int src_bytes;
2359 register unsigned char c;
2360 unsigned char *src_end = src + src_bytes;
2361 int mask;
2363 /* At first, skip all ASCII characters and control characters except
2364 for three ISO2022 specific control characters. */
2365 while (src < src_end)
2367 c = *src;
2368 if (c >= 0x80
2369 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2370 break;
2371 src++;
2374 if (src >= src_end)
2375 /* We found nothing other than ASCII. There's nothing to do. */
2376 return CODING_CATEGORY_MASK_ANY;
2378 /* The text seems to be encoded in some multilingual coding system.
2379 Now, try to find in which coding system the text is encoded. */
2380 if (c < 0x80)
2381 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2382 /* C is an ISO2022 specific control code of C0. */
2383 mask = detect_coding_iso2022 (src, src_end);
2385 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2386 /* C is an ISO2022 specific control code of C1,
2387 or the first byte of SJIS's 2-byte character code,
2388 or a leading code of Emacs. */
2389 mask = (detect_coding_iso2022 (src, src_end)
2390 | detect_coding_sjis (src, src_end)
2391 | detect_coding_internal (src, src_end));
2393 else if (c < 0xA0)
2394 /* C is the first byte of SJIS character code,
2395 or a leading-code of Emacs. */
2396 mask = (detect_coding_sjis (src, src_end)
2397 | detect_coding_internal (src, src_end));
2399 else
2400 /* C is a character of ISO2022 in graphic plane right,
2401 or a SJIS's 1-byte character code (i.e. JISX0201),
2402 or the first byte of BIG5's 2-byte code. */
2403 mask = (detect_coding_iso2022 (src, src_end)
2404 | detect_coding_sjis (src, src_end)
2405 | detect_coding_big5 (src, src_end));
2407 return mask;
2410 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2411 The information of the detected coding system is set in CODING. */
2413 void
2414 detect_coding (coding, src, src_bytes)
2415 struct coding_system *coding;
2416 unsigned char *src;
2417 int src_bytes;
2419 int mask = detect_coding_mask (src, src_bytes);
2420 int idx;
2422 if (mask == CODING_CATEGORY_MASK_ANY)
2423 /* We found nothing other than ASCII. There's nothing to do. */
2424 return;
2426 if (!mask)
2427 /* The source text seems to be encoded in unknown coding system.
2428 Emacs regards the category of such a kind of coding system as
2429 `coding-category-binary'. We assume that a user has assigned
2430 an appropriate coding system for a `coding-category-binary'. */
2431 idx = CODING_CATEGORY_IDX_BINARY;
2432 else
2434 /* We found some plausible coding systems. Let's use a coding
2435 system of the highest priority. */
2436 Lisp_Object val = Vcoding_category_list;
2438 if (CONSP (val))
2439 while (!NILP (val))
2441 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2442 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2443 break;
2444 val = XCONS (val)->cdr;
2446 else
2447 val = Qnil;
2449 if (NILP (val))
2451 /* For unknown reason, `Vcoding_category_list' contains none
2452 of found categories. Let's use any of them. */
2453 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2454 if (mask & (1 << idx))
2455 break;
2458 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2461 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2462 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2463 CODING_EOL_CR, and CODING_EOL_AUTOMATIC. */
2466 detect_eol_type (src, src_bytes)
2467 unsigned char *src;
2468 int src_bytes;
2470 unsigned char *src_end = src + src_bytes;
2471 unsigned char c;
2473 while (src < src_end)
2475 c = *src++;
2476 if (c == '\n')
2477 return CODING_EOL_LF;
2478 else if (c == '\r')
2480 if (src < src_end && *src == '\n')
2481 return CODING_EOL_CRLF;
2482 else
2483 return CODING_EOL_CR;
2486 return CODING_EOL_AUTOMATIC;
2489 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2490 is encoded. If it detects an appropriate format of end-of-line, it
2491 sets the information in *CODING. */
2493 void
2494 detect_eol (coding, src, src_bytes)
2495 struct coding_system *coding;
2496 unsigned char *src;
2497 int src_bytes;
2499 Lisp_Object val;
2500 int eol_type = detect_eol_type (src, src_bytes);
2502 if (eol_type == CODING_EOL_AUTOMATIC)
2503 /* We found no end-of-line in the source text. */
2504 return;
2506 val = Fget (coding->symbol, Qeol_type);
2507 if (VECTORP (val) && XVECTOR (val)->size == 3)
2508 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2511 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2512 decoding, it may detect coding system and format of end-of-line if
2513 those are not yet decided. */
2516 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2517 struct coding_system *coding;
2518 unsigned char *source, *destination;
2519 int src_bytes, dst_bytes;
2520 int *consumed;
2522 int produced;
2524 if (src_bytes <= 0)
2526 *consumed = 0;
2527 return 0;
2530 if (coding->type == coding_type_automatic)
2531 detect_coding (coding, source, src_bytes);
2533 if (coding->eol_type == CODING_EOL_AUTOMATIC)
2534 detect_eol (coding, source, src_bytes);
2536 coding->carryover_size = 0;
2537 switch (coding->type)
2539 case coding_type_no_conversion:
2540 label_no_conversion:
2541 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2542 bcopy (source, destination, produced);
2543 *consumed = produced;
2544 break;
2546 case coding_type_internal:
2547 case coding_type_automatic:
2548 if (coding->eol_type == CODING_EOL_LF
2549 || coding->eol_type == CODING_EOL_AUTOMATIC)
2550 goto label_no_conversion;
2551 produced = decode_eol (coding, source, destination,
2552 src_bytes, dst_bytes, consumed);
2553 break;
2555 case coding_type_sjis:
2556 produced = decode_coding_sjis_big5 (coding, source, destination,
2557 src_bytes, dst_bytes, consumed,
2559 break;
2561 case coding_type_iso2022:
2562 produced = decode_coding_iso2022 (coding, source, destination,
2563 src_bytes, dst_bytes, consumed);
2564 break;
2566 case coding_type_big5:
2567 produced = decode_coding_sjis_big5 (coding, source, destination,
2568 src_bytes, dst_bytes, consumed,
2570 break;
2572 case coding_type_ccl:
2573 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2574 src_bytes, dst_bytes, consumed);
2575 break;
2578 return produced;
2581 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2584 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2585 struct coding_system *coding;
2586 unsigned char *source, *destination;
2587 int src_bytes, dst_bytes;
2588 int *consumed;
2590 int produced;
2592 coding->carryover_size = 0;
2593 switch (coding->type)
2595 case coding_type_no_conversion:
2596 label_no_conversion:
2597 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2598 if (produced > 0)
2600 bcopy (source, destination, produced);
2601 if (coding->selective)
2603 unsigned char *p = destination, *pend = destination + produced;
2604 while (p < pend)
2605 if (*p++ == '\015') p[-1] = '\n';
2608 *consumed = produced;
2609 break;
2611 case coding_type_internal:
2612 case coding_type_automatic:
2613 if (coding->eol_type == CODING_EOL_LF
2614 || coding->eol_type == CODING_EOL_AUTOMATIC)
2615 goto label_no_conversion;
2616 produced = encode_eol (coding, source, destination,
2617 src_bytes, dst_bytes, consumed);
2618 break;
2620 case coding_type_sjis:
2621 produced = encode_coding_sjis_big5 (coding, source, destination,
2622 src_bytes, dst_bytes, consumed,
2624 break;
2626 case coding_type_iso2022:
2627 produced = encode_coding_iso2022 (coding, source, destination,
2628 src_bytes, dst_bytes, consumed);
2629 break;
2631 case coding_type_big5:
2632 produced = encode_coding_sjis_big5 (coding, source, destination,
2633 src_bytes, dst_bytes, consumed,
2635 break;
2637 case coding_type_ccl:
2638 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2639 src_bytes, dst_bytes, consumed);
2640 break;
2643 return produced;
2646 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2648 /* Return maximum size (bytes) of a buffer enough for decoding
2649 SRC_BYTES of text encoded in CODING. */
2652 decoding_buffer_size (coding, src_bytes)
2653 struct coding_system *coding;
2654 int src_bytes;
2656 int magnification;
2658 if (coding->type == coding_type_iso2022)
2659 magnification = 3;
2660 else if (coding->type == coding_type_ccl)
2661 magnification = coding->spec.ccl.decoder.buf_magnification;
2662 else
2663 magnification = 2;
2665 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2668 /* Return maximum size (bytes) of a buffer enough for encoding
2669 SRC_BYTES of text to CODING. */
2672 encoding_buffer_size (coding, src_bytes)
2673 struct coding_system *coding;
2674 int src_bytes;
2676 int magnification;
2678 if (coding->type == coding_type_ccl)
2679 magnification = coding->spec.ccl.encoder.buf_magnification;
2680 else
2681 magnification = 3;
2683 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2686 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2687 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2688 #endif
2690 char *conversion_buffer;
2691 int conversion_buffer_size;
2693 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2694 or decoding. Sufficient memory is allocated automatically. If we
2695 run out of memory, return NULL. */
2697 char *
2698 get_conversion_buffer (size)
2699 int size;
2701 if (size > conversion_buffer_size)
2703 char *buf;
2704 int real_size = conversion_buffer_size * 2;
2706 while (real_size < size) real_size *= 2;
2707 buf = (char *) xmalloc (real_size);
2708 xfree (conversion_buffer);
2709 conversion_buffer = buf;
2710 conversion_buffer_size = real_size;
2712 return conversion_buffer;
2716 #ifdef emacs
2717 /*** 7. Emacs Lisp library functions ***/
2719 DEFUN ("coding-system-vector", Fcoding_system_vector, Scoding_system_vector,
2720 1, 1, 0,
2721 "Return coding-vector of CODING-SYSTEM.\n\
2722 If CODING-SYSTEM is not a valid coding-system, return nil.")
2723 (obj)
2724 Lisp_Object obj;
2726 while (SYMBOLP (obj) && !NILP (obj))
2727 obj = Fget (obj, Qcoding_system);
2728 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2729 ? Qnil : obj);
2732 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2733 "Return t if OBJECT is nil or a coding-system.\n\
2734 See document of make-coding-system for coding-system object.")
2735 (obj)
2736 Lisp_Object obj;
2738 return ((NILP (obj) || !NILP (Fcoding_system_vector (obj))) ? Qt : Qnil);
2741 DEFUN ("read-non-nil-coding-system",
2742 Fread_non_nil_coding_system, Sread_non_nil_coding_system, 1, 1, 0,
2743 "Read a coding system from the minibuffer, prompting with string PROMPT.")
2744 (prompt)
2745 Lisp_Object prompt;
2747 Lisp_Object val;
2748 do {
2749 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_vector,
2750 Qt, Qnil, Qnil);
2751 } while (XSTRING (val)->size == 0);
2752 return (Fintern (val, Qnil));
2755 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2756 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2757 (prompt)
2758 Lisp_Object prompt;
2760 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2761 Qt, Qnil, Qnil);
2762 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
2765 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2766 1, 1, 0,
2767 "Check validity of CODING-SYSTEM.\n\
2768 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2769 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2770 The value of property should be a vector of length 5.")
2771 (coding_system)
2772 Lisp_Object coding_system;
2774 CHECK_SYMBOL (coding_system, 0);
2775 if (!NILP (Fcoding_system_p (coding_system)))
2776 return coding_system;
2777 while (1)
2778 Fsignal (Qcoding_system_error, coding_system);
2781 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2782 2, 2, 0,
2783 "Detect coding-system of the text in the region between START and END.\n\
2784 Return a list of possible coding-systems ordered by priority.\n\
2785 If only ASCII characters are found, it returns `automatic-conversion'\n\
2786 or its subsidiary coding-system according to a detected end-of-line format.")
2787 (b, e)
2788 Lisp_Object b, e;
2790 int coding_mask, eol_type;
2791 Lisp_Object val;
2792 int beg, end;
2794 validate_region (&b, &e);
2795 beg = XINT (b), end = XINT (e);
2796 if (beg < GPT && end >= GPT) move_gap (end);
2798 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2799 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
2801 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2803 val = intern ("automatic-conversion");
2804 if (eol_type != CODING_EOL_AUTOMATIC)
2806 Lisp_Object val2 = Fget (val, Qeol_type);
2807 if (VECTORP (val2))
2808 val = XVECTOR (val2)->contents[eol_type];
2811 else
2813 Lisp_Object val2;
2815 /* At first, gather possible coding-systems in VAL in a reverse
2816 order. */
2817 val = Qnil;
2818 for (val2 = Vcoding_category_list;
2819 !NILP (val2);
2820 val2 = XCONS (val2)->cdr)
2822 int idx
2823 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2824 if (coding_mask & (1 << idx))
2825 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2828 /* Then, change the order of the list, while getting subsidiary
2829 coding-systems. */
2830 val2 = val;
2831 val = Qnil;
2832 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2834 if (eol_type == CODING_EOL_AUTOMATIC)
2835 val = Fcons (XCONS (val2)->car, val);
2836 else
2838 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2839 if (VECTORP (val3))
2840 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2841 else
2842 val = Fcons (XCONS (val2)->car, val);
2847 return val;
2850 /* Scan text in the region between *BEGP and *ENDP, skip characters
2851 which we never have to encode to (iff ENCODEP is 1) or decode from
2852 coding system CODING at the head and tail, then set BEGP and ENDP
2853 to the addresses of start and end of the text we actually convert. */
2855 void
2856 shrink_conversion_area (begp, endp, coding, encodep)
2857 unsigned char **begp, **endp;
2858 struct coding_system *coding;
2859 int encodep;
2861 register unsigned char *beg_addr = *begp, *end_addr = *endp;
2863 if (coding->eol_type != CODING_EOL_LF
2864 && coding->eol_type != CODING_EOL_AUTOMATIC)
2865 /* Since we anyway have to convert end-of-line format, it is not
2866 worth skipping at most 100 bytes or so. */
2867 return;
2869 if (encodep) /* for encoding */
2871 switch (coding->type)
2873 case coding_type_no_conversion:
2874 case coding_type_internal:
2875 case coding_type_automatic:
2876 /* We need no conversion. */
2877 *begp = *endp;
2878 return;
2879 case coding_type_ccl:
2880 /* We can't skip any data. */
2881 return;
2882 case coding_type_iso2022:
2883 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2885 unsigned char *bol = beg_addr;
2886 while (beg_addr < end_addr && *beg_addr < 0x80)
2888 beg_addr++;
2889 if (*(beg_addr - 1) == '\n')
2890 bol = beg_addr;
2892 beg_addr = bol;
2893 goto label_skip_tail;
2895 /* fall down ... */
2896 default:
2897 /* We can skip all ASCII characters at the head and tail. */
2898 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2899 label_skip_tail:
2900 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2901 break;
2904 else /* for decoding */
2906 switch (coding->type)
2908 case coding_type_no_conversion:
2909 /* We need no conversion. */
2910 *begp = *endp;
2911 return;
2912 case coding_type_internal:
2913 if (coding->eol_type == CODING_EOL_LF)
2915 /* We need no conversion. */
2916 *begp = *endp;
2917 return;
2919 /* We can skip all but carriage-return. */
2920 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
2921 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
2922 break;
2923 case coding_type_sjis:
2924 case coding_type_big5:
2925 /* We can skip all ASCII characters at the head. */
2926 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2927 /* We can skip all ASCII characters at the tail except for
2928 the second byte of SJIS or BIG5 code. */
2929 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2930 if (end_addr != *endp)
2931 end_addr++;
2932 break;
2933 case coding_type_ccl:
2934 /* We can't skip any data. */
2935 return;
2936 default: /* i.e. case coding_type_iso2022: */
2938 unsigned char c;
2940 /* We can skip all ASCII characters except for a few
2941 control codes at the head. */
2942 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
2943 && c != ISO_CODE_CR && c != ISO_CODE_SO
2944 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
2945 beg_addr++;
2947 break;
2950 *begp = beg_addr;
2951 *endp = end_addr;
2952 return;
2955 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
2956 text between B and E. B and E are buffer position. */
2958 Lisp_Object
2959 code_convert_region (b, e, coding, encodep)
2960 Lisp_Object b, e;
2961 struct coding_system *coding;
2962 int encodep;
2964 int beg, end, len, consumed, produced;
2965 char *buf;
2966 unsigned char *begp, *endp;
2967 int pos = PT;
2969 validate_region (&b, &e);
2970 beg = XINT (b), end = XINT (e);
2971 if (beg < GPT && end >= GPT)
2972 move_gap (end);
2974 if (encodep && !NILP (coding->pre_write_conversion))
2976 /* We must call a pre-conversion function which may put a new
2977 text to be converted in a new buffer. */
2978 struct buffer *old = current_buffer, *new;
2980 TEMP_SET_PT (beg);
2981 call2 (coding->pre_write_conversion, b, e);
2982 if (old != current_buffer)
2984 /* Replace the original text by the text just generated. */
2985 len = ZV - BEGV;
2986 new = current_buffer;
2987 set_buffer_internal (old);
2988 del_range (beg, end);
2989 insert_from_buffer (new, 1, len, 0);
2990 end = beg + len;
2994 /* We may be able to shrink the conversion region. */
2995 begp = POS_ADDR (beg); endp = begp + (end - beg);
2996 shrink_conversion_area (&begp, &endp, coding, encodep);
2998 if (begp == endp)
2999 /* We need no conversion. */
3000 len = end - beg;
3001 else
3003 beg += begp - POS_ADDR (beg);
3004 end = beg + (endp - begp);
3006 if (encodep)
3007 len = encoding_buffer_size (coding, end - beg);
3008 else
3009 len = decoding_buffer_size (coding, end - beg);
3010 buf = get_conversion_buffer (len);
3012 coding->last_block = 1;
3013 produced = (encodep
3014 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3015 &consumed)
3016 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3017 &consumed));
3019 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3021 TEMP_SET_PT (beg);
3022 insert (buf, produced);
3023 del_range (PT, PT + end - beg);
3024 if (pos >= end)
3025 pos = PT + (pos - end);
3026 else if (pos > beg)
3027 pos = beg;
3028 TEMP_SET_PT (pos);
3031 if (!encodep && !NILP (coding->post_read_conversion))
3033 /* We must call a post-conversion function which may alter
3034 the text just converted. */
3035 Lisp_Object insval;
3037 beg = XINT (b);
3038 TEMP_SET_PT (beg);
3039 insval = call1 (coding->post_read_conversion, make_number (len));
3040 CHECK_NUMBER (insval, 0);
3041 len = XINT (insval);
3044 return make_number (len);
3047 Lisp_Object
3048 code_convert_string (str, coding, encodep, nocopy)
3049 Lisp_Object str, nocopy;
3050 struct coding_system *coding;
3051 int encodep;
3053 int len, consumed, produced;
3054 char *buf;
3055 unsigned char *begp, *endp;
3056 int head_skip, tail_skip;
3057 struct gcpro gcpro1;
3059 if (encodep && !NILP (coding->pre_write_conversion)
3060 || !encodep && !NILP (coding->post_read_conversion))
3062 /* Since we have to call Lisp functions which assume target text
3063 is in a buffer, after setting a temporary buffer, call
3064 code_convert_region. */
3065 int count = specpdl_ptr - specpdl;
3066 int len = XSTRING (str)->size;
3067 Lisp_Object result;
3068 struct buffer *old = current_buffer;
3070 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3071 temp_output_buffer_setup (" *code-converting-work*");
3072 set_buffer_internal (XBUFFER (Vstandard_output));
3073 insert_from_string (str, 0, len, 0);
3074 code_convert_region (make_number (BEGV), make_number (ZV),
3075 coding, encodep);
3076 result = make_buffer_string (BEGV, ZV, 0);
3077 set_buffer_internal (old);
3078 return unbind_to (count, result);
3081 /* We may be able to shrink the conversion region. */
3082 begp = XSTRING (str)->data;
3083 endp = begp + XSTRING (str)->size;
3084 shrink_conversion_area (&begp, &endp, coding, encodep);
3086 if (begp == endp)
3087 /* We need no conversion. */
3088 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3090 head_skip = begp - XSTRING (str)->data;
3091 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3093 GCPRO1 (str);
3095 if (encodep)
3096 len = encoding_buffer_size (coding, endp - begp);
3097 else
3098 len = decoding_buffer_size (coding, endp - begp);
3099 buf = get_conversion_buffer (len + head_skip + tail_skip);
3101 bcopy (XSTRING (str)->data, buf, head_skip);
3102 coding->last_block = 1;
3103 produced = (encodep
3104 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3105 buf + head_skip, endp - begp, len, &consumed)
3106 : decode_coding (coding, XSTRING (str)->data + head_skip,
3107 buf + head_skip, endp - begp, len, &consumed));
3108 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3109 buf + head_skip + produced,
3110 tail_skip);
3112 UNGCPRO;
3114 return make_string (buf, head_skip + produced + tail_skip);
3117 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3118 3, 3, "r\nzCoding system: ",
3119 "Decode current region by specified coding system.\n\
3120 When called from a program, takes three arguments:\n\
3121 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3122 Return length of decoded text.")
3123 (b, e, coding_system)
3124 Lisp_Object b, e, coding_system;
3126 struct coding_system coding;
3128 CHECK_NUMBER_COERCE_MARKER (b, 0);
3129 CHECK_NUMBER_COERCE_MARKER (e, 1);
3130 CHECK_SYMBOL (coding_system, 2);
3132 if (NILP (coding_system))
3133 return make_number (XFASTINT (e) - XFASTINT (b));
3134 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3135 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3137 return code_convert_region (b, e, &coding, 0);
3140 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3141 3, 3, "r\nzCoding system: ",
3142 "Encode current region by specified coding system.\n\
3143 When called from a program, takes three arguments:\n\
3144 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3145 Return length of encoded text.")
3146 (b, e, coding_system)
3147 Lisp_Object b, e, coding_system;
3149 struct coding_system coding;
3151 CHECK_NUMBER_COERCE_MARKER (b, 0);
3152 CHECK_NUMBER_COERCE_MARKER (e, 1);
3153 CHECK_SYMBOL (coding_system, 2);
3155 if (NILP (coding_system))
3156 return make_number (XFASTINT (e) - XFASTINT (b));
3157 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3158 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3160 return code_convert_region (b, e, &coding, 1);
3163 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3164 2, 3, 0,
3165 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3166 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3167 of decoding.")
3168 (string, coding_system, nocopy)
3169 Lisp_Object string, coding_system, nocopy;
3171 struct coding_system coding;
3173 CHECK_STRING (string, 0);
3174 CHECK_SYMBOL (coding_system, 1);
3176 if (NILP (coding_system))
3177 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3178 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3179 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3181 return code_convert_string (string, &coding, 0, nocopy);
3184 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3185 2, 3, 0,
3186 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3187 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3188 of encoding.")
3189 (string, coding_system, nocopy)
3190 Lisp_Object string, coding_system, nocopy;
3192 struct coding_system coding;
3194 CHECK_STRING (string, 0);
3195 CHECK_SYMBOL (coding_system, 1);
3197 if (NILP (coding_system))
3198 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3199 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3200 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3202 return code_convert_string (string, &coding, 1, nocopy);
3205 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3206 "Decode a JISX0208 character of shift-jis encoding.\n\
3207 CODE is the character code in SJIS.\n\
3208 Return the corresponding character.")
3209 (code)
3210 Lisp_Object code;
3212 unsigned char c1, c2, s1, s2;
3213 Lisp_Object val;
3215 CHECK_NUMBER (code, 0);
3216 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3217 DECODE_SJIS (s1, s2, c1, c2);
3218 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3219 return val;
3222 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3223 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3224 Return the corresponding character code in SJIS.")
3225 (ch)
3226 Lisp_Object ch;
3228 int charset;
3229 unsigned char c1, c2, s1, s2;
3230 Lisp_Object val;
3232 CHECK_NUMBER (ch, 0);
3233 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3234 if (charset == charset_jisx0208)
3236 ENCODE_SJIS (c1, c2, s1, s2);
3237 XSETFASTINT (val, ((int)s1 << 8) | s2);
3239 else
3240 XSETFASTINT (val, 0);
3241 return val;
3244 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3245 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3246 CODE is the character code in BIG5.\n\
3247 Return the corresponding character.")
3248 (code)
3249 Lisp_Object code;
3251 int charset;
3252 unsigned char b1, b2, c1, c2;
3253 Lisp_Object val;
3255 CHECK_NUMBER (code, 0);
3256 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3257 DECODE_BIG5 (b1, b2, charset, c1, c2);
3258 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3259 return val;
3262 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3263 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3264 Return the corresponding character code in Big5.")
3265 (ch)
3266 Lisp_Object ch;
3268 int charset;
3269 unsigned char c1, c2, b1, b2;
3270 Lisp_Object val;
3272 CHECK_NUMBER (ch, 0);
3273 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3274 if (charset == charset_big5_1 || charset == charset_big5_2)
3276 ENCODE_BIG5 (charset, c1, c2, b1, b2);
3277 XSETFASTINT (val, ((int)b1 << 8) | b2);
3279 else
3280 XSETFASTINT (val, 0);
3281 return val;
3284 DEFUN ("set-terminal-coding-system",
3285 Fset_terminal_coding_system, Sset_terminal_coding_system, 1, 1,
3286 "zCoding-system for terminal display: ",
3287 "Set coding-system of your terminal to CODING-SYSTEM.\n\
3288 All outputs to terminal are encoded to this coding-system.")
3289 (coding_system)
3290 Lisp_Object coding_system;
3292 CHECK_SYMBOL (coding_system, 0);
3293 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3294 update_mode_lines++;
3295 if (!NILP (Finteractive_p ()))
3296 Fredraw_display ();
3297 return Qnil;
3300 DEFUN ("terminal-coding-system",
3301 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3302 "Return coding-system of your terminal.")
3305 return terminal_coding.symbol;
3308 DEFUN ("set-keyboard-coding-system",
3309 Fset_keyboard_coding_system, Sset_keyboard_coding_system, 1, 1,
3310 "zCoding-system for keyboard input: ",
3311 "Set coding-system of what is sent from terminal keyboard to CODING-SYSTEM.\n\
3312 All inputs from terminal are decoded from this coding-system.")
3313 (coding_system)
3314 Lisp_Object coding_system;
3316 CHECK_SYMBOL (coding_system, 0);
3317 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3318 return Qnil;
3321 DEFUN ("keyboard-coding-system",
3322 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3323 "Return coding-system of what is sent from terminal keyboard.")
3326 return keyboard_coding.symbol;
3330 DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
3331 1, MANY, 0,
3332 "Choose a coding system for a file operation based on file name.\n\
3333 The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3334 ENCODING-SYSTEM is the coding system to use for encoding\n\
3335 \(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3336 for decoding (in case OPERATION does decoding).\n\
3338 The first argument OPERATION specifies an I/O primitive:\n\
3339 For file I/O, `insert-file-contents' or `write-region'.\n\
3340 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3341 For network I/O, `open-network-stream'.\n\
3343 The remaining arguments should be the same arguments that were passed\n\
3344 to the primitive. Depending on which primitive, one of those arguments\n\
3345 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3346 whichever argument specifies the file name is TARGET.\n\
3348 TARGET has a meaning which depends on OPERATION:\n\
3349 For file I/O, TARGET is a file name.\n\
3350 For process I/O, TARGET is a process name.\n\
3351 For network I/O, TARGET is a service name or a port number\n\
3353 This function looks up what `coding-system-alist' specifies for\n\
3354 OPERATION and TARGET. It may specify a cons cell which represents\n\
3355 a particular coding system or it may have a function to call.\n\
3356 In the latter case, we call the function with one argument,\n\
3357 which is a list of all the arguments given to `find-coding-system'.")
3358 (nargs, args)
3359 int nargs;
3360 Lisp_Object *args;
3362 Lisp_Object operation, target_idx, target, val;
3363 register Lisp_Object chain;
3365 if (nargs < 2)
3366 error ("Too few arguments");
3367 operation = args[0];
3368 if (!SYMBOLP (operation)
3369 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3370 error ("Invalid first arguement");
3371 if (nargs < 1 + XINT (target_idx))
3372 error ("Too few arguments for operation: %s",
3373 XSYMBOL (operation)->name->data);
3374 target = args[XINT (target_idx) + 1];
3375 if (!(STRINGP (target)
3376 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3377 error ("Invalid %dth argument", XINT (target_idx) + 1);
3379 chain = Fassq (operation, Vcoding_system_alist);
3380 if (NILP (chain))
3381 return Qnil;
3383 for (chain = XCONS (chain)->cdr; CONSP (chain); chain = XCONS (chain)->cdr)
3385 Lisp_Object elt = XCONS (chain)->car;
3387 if (CONSP (elt)
3388 && ((STRINGP (target)
3389 && STRINGP (XCONS (elt)->car)
3390 && fast_string_match (XCONS (elt)->car, target) >= 0)
3391 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3392 return (CONSP (val = XCONS (elt)->cdr)
3393 ? val
3394 : ((SYMBOLP (val) && Fboundp (val)
3395 ? call2 (val, Flist (nargs, args))
3396 : Qnil)));
3398 return Qnil;
3401 #endif /* emacs */
3404 /*** 8. Post-amble ***/
3406 init_coding_once ()
3408 int i;
3410 /* Emacs internal format specific initialize routine. */
3411 for (i = 0; i <= 0x20; i++)
3412 emacs_code_class[i] = EMACS_control_code;
3413 emacs_code_class[0x0A] = EMACS_linefeed_code;
3414 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3415 for (i = 0x21 ; i < 0x7F; i++)
3416 emacs_code_class[i] = EMACS_ascii_code;
3417 emacs_code_class[0x7F] = EMACS_control_code;
3418 emacs_code_class[0x80] = EMACS_leading_code_composition;
3419 for (i = 0x81; i < 0xFF; i++)
3420 emacs_code_class[i] = EMACS_invalid_code;
3421 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3422 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3423 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3424 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3426 /* ISO2022 specific initialize routine. */
3427 for (i = 0; i < 0x20; i++)
3428 iso_code_class[i] = ISO_control_code;
3429 for (i = 0x21; i < 0x7F; i++)
3430 iso_code_class[i] = ISO_graphic_plane_0;
3431 for (i = 0x80; i < 0xA0; i++)
3432 iso_code_class[i] = ISO_control_code;
3433 for (i = 0xA1; i < 0xFF; i++)
3434 iso_code_class[i] = ISO_graphic_plane_1;
3435 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3436 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3437 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3438 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3439 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3440 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3441 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3442 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3443 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3444 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3446 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3447 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3449 setup_coding_system (Qnil, &keyboard_coding);
3450 setup_coding_system (Qnil, &terminal_coding);
3453 #ifdef emacs
3455 syms_of_coding ()
3457 Qtarget_idx = intern ("target-idx");
3458 staticpro (&Qtarget_idx);
3460 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3461 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3463 Qcall_process = intern ("call-process");
3464 staticpro (&Qcall_process);
3465 Fput (Qcall_process, Qtarget_idx, make_number (0));
3467 Qcall_process_region = intern ("call-process-region");
3468 staticpro (&Qcall_process_region);
3469 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3471 Qstart_process = intern ("start-process");
3472 staticpro (&Qstart_process);
3473 Fput (Qstart_process, Qtarget_idx, make_number (2));
3475 Qopen_network_stream = intern ("open-network-stream");
3476 staticpro (&Qopen_network_stream);
3477 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3479 Qcoding_system = intern ("coding-system");
3480 staticpro (&Qcoding_system);
3482 Qeol_type = intern ("eol-type");
3483 staticpro (&Qeol_type);
3485 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3486 staticpro (&Qbuffer_file_coding_system);
3488 Qpost_read_conversion = intern ("post-read-conversion");
3489 staticpro (&Qpost_read_conversion);
3491 Qpre_write_conversion = intern ("pre-write-conversion");
3492 staticpro (&Qpre_write_conversion);
3494 Qcoding_system_vector = intern ("coding-system-vector");
3495 staticpro (&Qcoding_system_vector);
3497 Qcoding_system_p = intern ("coding-system-p");
3498 staticpro (&Qcoding_system_p);
3500 Qcoding_system_error = intern ("coding-system-error");
3501 staticpro (&Qcoding_system_error);
3503 Fput (Qcoding_system_error, Qerror_conditions,
3504 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3505 Fput (Qcoding_system_error, Qerror_message,
3506 build_string ("Coding-system error"));
3508 Qcoding_category_index = intern ("coding-category-index");
3509 staticpro (&Qcoding_category_index);
3512 int i;
3513 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3515 coding_category_table[i] = intern (coding_category_name[i]);
3516 staticpro (&coding_category_table[i]);
3517 Fput (coding_category_table[i], Qcoding_category_index,
3518 make_number (i));
3522 defsubr (&Scoding_system_vector);
3523 defsubr (&Scoding_system_p);
3524 defsubr (&Sread_coding_system);
3525 defsubr (&Sread_non_nil_coding_system);
3526 defsubr (&Scheck_coding_system);
3527 defsubr (&Sdetect_coding_region);
3528 defsubr (&Sdecode_coding_region);
3529 defsubr (&Sencode_coding_region);
3530 defsubr (&Sdecode_coding_string);
3531 defsubr (&Sencode_coding_string);
3532 defsubr (&Sdecode_sjis_char);
3533 defsubr (&Sencode_sjis_char);
3534 defsubr (&Sdecode_big5_char);
3535 defsubr (&Sencode_big5_char);
3536 defsubr (&Sset_terminal_coding_system);
3537 defsubr (&Sterminal_coding_system);
3538 defsubr (&Sset_keyboard_coding_system);
3539 defsubr (&Skeyboard_coding_system);
3540 defsubr (&Sfind_coding_system);
3542 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3543 "List of coding-categories (symbols) ordered by priority.");
3545 int i;
3547 Vcoding_category_list = Qnil;
3548 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3549 Vcoding_category_list
3550 = Fcons (coding_category_table[i], Vcoding_category_list);
3553 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3554 "A variable of internal use only.\n\
3555 If the value is a coding system, it is used for decoding on read operation.\n\
3556 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3557 Vcoding_system_for_read = Qnil;
3559 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3560 "A variable of internal use only.\n\
3561 If the value is a coding system, it is used for encoding on write operation.\n\
3562 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3563 Vcoding_system_for_write = Qnil;
3565 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3566 "Coding-system used in the latest file or process I/O.");
3567 Vlast_coding_system_used = Qnil;
3569 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
3570 "Nested alist to decide a coding system for a specific I/O operation.\n\
3571 The format is ((OPERATION . ((REGEXP . CODING-SYSTEMS) ...)) ...).\n\
3573 OPERATION is one of the following Emacs I/O primitives:\n\
3574 For file I/O, insert-file-contents and write-region.\n\
3575 For process I/O, call-process, call-process-region, and start-process.\n\
3576 For network I/O, open-network-stream.\n\
3577 In addition, for process I/O, `process-argument' can be specified for\n\
3578 encoding arguments of the process.\n\
3580 REGEXP is a regular expression matching a target of OPERATION, where\n\
3581 target is a file name for file I/O operations, a process name for\n\
3582 process I/O operations, or a service name for network I/O\n\
3583 operations. REGEXP might be a port number for network I/O operation.\n\
3585 CODING-SYSTEMS is a cons of coding systems to encode and decode\n\
3586 character code on OPERATION, or a function symbol returning the cons.\n\
3587 See the documentation of `find-coding-system' for more detail.");
3588 Vcoding_system_alist = Qnil;
3590 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3591 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3592 eol_mnemonic_unix = '.';
3594 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3595 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3596 eol_mnemonic_dos = ':';
3598 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3599 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3600 eol_mnemonic_mac = '\'';
3602 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3603 "Mnemonic character indicating end-of-line format is not yet decided.");
3604 eol_mnemonic_undecided = '-';
3606 DEFVAR_LISP ("alternate-charset-table", &Valternate_charset_table,
3607 "Alist of charsets vs the alternate charsets.\n\
3608 While decoding, if a charset (car part of an element) is found,\n\
3609 decode it as the alternate charset (cdr part of the element).");
3610 Valternate_charset_table = Qnil;
3612 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3613 "Alist of charsets vs revision numbers.\n\
3614 While encoding, if a charset (car part of an element) is found,\n\
3615 designate it with the escape sequence identifing revision (cdr part of the element).");
3616 Vcharset_revision_alist = Qnil;
3619 #endif /* emacs */