(make_lispy_event): Distinguish S-SPC from SPC.
[emacs.git] / src / coding.c
blobd3093a58960184680afea64be67c6240346f955f
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
35 /*** GENERAL NOTE on CODING SYSTEM ***
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
44 0. Emacs' internal format (emacs-mule)
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
49 1. ISO2022
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 section 4.
62 3. BIG5
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
70 4. Other
72 If a user wants to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
77 Emacs represents a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See section 6 for more details.
84 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
86 How end-of-line of a text is encoded depends on a system. For
87 instance, Unix's format is just one byte of `line-feed' code,
88 whereas DOS's format is two-byte sequence of `carriage-return' and
89 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
91 Since text characters encoding and end-of-line encoding are
92 independent, any coding system described above can take
93 any format of end-of-line. So, Emacs has information of format of
94 end-of-line in each coding-system. See section 6 for more details.
98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
100 These functions check if a text between SRC and SRC_END is encoded
101 in the coding system category XXX. Each returns an integer value in
102 which appropriate flag bits for the category XXX is set. The flag
103 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
104 template of these functions. */
105 #if 0
107 detect_coding_emacs_mule (src, src_end)
108 unsigned char *src, *src_end;
112 #endif
114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
116 These functions decode SRC_BYTES length text at SOURCE encoded in
117 CODING to Emacs' internal format (emacs-mule). The resulting text
118 goes to a place pointed to by DESTINATION, the length of which should
119 not exceed DST_BYTES. The number of bytes actually processed is
120 returned as *CONSUMED. The return value is the length of the decoded
121 text. Below is a template of these functions. */
122 #if 0
123 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
124 struct coding_system *coding;
125 unsigned char *source, *destination;
126 int src_bytes, dst_bytes;
127 int *consumed;
131 #endif
133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
135 These functions encode SRC_BYTES length text at SOURCE of Emacs'
136 internal format (emacs-mule) to CODING. The resulting text goes to
137 a place pointed to by DESTINATION, the length of which should not
138 exceed DST_BYTES. The number of bytes actually processed is
139 returned as *CONSUMED. The return value is the length of the
140 encoded text. Below is a template of these functions. */
141 #if 0
142 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
143 struct coding_system *coding;
144 unsigned char *source, *destination;
145 int src_bytes, dst_bytes;
146 int *consumed;
150 #endif
152 /*** COMMONLY USED MACROS ***/
154 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
155 THREE_MORE_BYTES safely get one, two, and three bytes from the
156 source text respectively. If there are not enough bytes in the
157 source, they jump to `label_end_of_loop'. The caller should set
158 variables `src' and `src_end' to appropriate areas in advance. */
/* Fetch the next source byte into C1 and advance `src' past it.  If
   `src' has already reached `src_end', nothing is read and control
   jumps to `label_end_of_loop'.  */
#define ONE_MORE_BYTE(c1)               \
  do {                                  \
    if (src >= src_end)                 \
      goto label_end_of_loop;           \
    c1 = *src++;                        \
  } while (0)
/* Fetch the next two source bytes into C1 and C2 and advance `src'
   past them.  If fewer than two bytes remain before `src_end', `src'
   is left untouched and control jumps to `label_end_of_loop'.  The
   remaining length is measured as `src_end - src' so that no pointer
   beyond `src_end' is ever formed.  */
#define TWO_MORE_BYTES(c1, c2)          \
  do {                                  \
    if (src_end - src >= 2)             \
      c1 = *src++, c2 = *src++;         \
    else                                \
      goto label_end_of_loop;           \
  } while (0)
/* Fetch the next three source bytes into C1, C2 and C3 and advance
   `src' past them.  If fewer than three bytes remain before
   `src_end', `src' is left untouched and control jumps to
   `label_end_of_loop'.  The remaining length is measured as
   `src_end - src' so that no pointer beyond `src_end' is ever
   formed.  */
#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src_end - src >= 3)                     \
      c1 = *src++, c2 = *src++, c3 = *src++;    \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)
184 /* The following three macros DECODE_CHARACTER_ASCII,
185 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
186 the multi-byte form of a character of each class at the place
187 pointed by `dst'. The caller should set the variable `dst' to
188 point to an appropriate area and the variable `coding' to point to
189 the coding-system of the currently decoding text in advance. */
/* Decode one ASCII character C.  Outside of a composition the byte C
   itself is stored at `dst'; while composing, the two bytes 0xA0 and
   `C | 0x80' are stored instead.  `dst' is advanced past whatever
   was written.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (!COMPOSING_P (coding->composing))       \
      *dst++ = (c);                             \
    else                                        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
  } while (0)
/* Decode one DIMENSION1 character whose charset is CHARSET and whose
   position-code is C.  The bytes stored at `dst' are, in order: the
   base leading-code of CHARSET (plus 0x20 while composing), the
   extended leading-code of CHARSET if it is non-zero, and
   `C | 0x80'.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char lead_byte = CHARSET_LEADING_CODE_BASE (charset);      \
    *dst++ = (COMPOSING_P (coding->composing)                           \
              ? lead_byte + 0x20                                        \
              : lead_byte);                                             \
    lead_byte = CHARSET_LEADING_CODE_EXT (charset);                     \
    if (lead_byte != 0)                                                 \
      *dst++ = lead_byte;                                               \
    *dst++ = (c) | 0x80;                                                \
  } while (0)
/* Decode one DIMENSION2 character whose charset is CHARSET and whose
   position-codes are C1 and C2.  Everything up to and including the
   first position-code is emitted by DECODE_CHARACTER_DIMENSION1; the
   second position-code is then appended as `C2 | 0x80'.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst = (c2) | 0x80;                                 \
    dst++;                                              \
  } while (0)
226 /*** 1. Preamble ***/
228 #include <stdio.h>
230 #ifdef emacs
232 #include <config.h>
233 #include "lisp.h"
234 #include "buffer.h"
235 #include "charset.h"
236 #include "ccl.h"
237 #include "coding.h"
238 #include "window.h"
240 #else /* not emacs */
242 #include "mulelib.h"
244 #endif /* not emacs */
246 Lisp_Object Qcoding_system, Qeol_type;
247 Lisp_Object Qbuffer_file_coding_system;
248 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
250 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
251 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
252 Lisp_Object Qstart_process, Qopen_network_stream;
253 Lisp_Object Qtarget_idx;
255 /* Mnemonic character of each format of end-of-line. */
256 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
257 /* Mnemonic character to indicate format of end-of-line is not yet
258 decided. */
259 int eol_mnemonic_undecided;
261 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
262 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
263 int system_eol_type;
265 #ifdef emacs
267 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
269 /* Coding system emacs-mule is for converting only end-of-line format. */
270 Lisp_Object Qemacs_mule;
272 /* Coding-systems are handed between Emacs Lisp programs and C internal
273 routines by the following three variables. */
274 /* Coding-system for reading files and receiving data from process. */
275 Lisp_Object Vcoding_system_for_read;
276 /* Coding-system for writing files and sending data to process. */
277 Lisp_Object Vcoding_system_for_write;
278 /* Coding-system actually used in the latest I/O. */
279 Lisp_Object Vlast_coding_system_used;
281 /* Flag to inhibit code conversion of end-of-line format. */
282 int inhibit_eol_conversion;
284 /* Coding-system of what terminal accept for displaying. */
285 struct coding_system terminal_coding;
287 /* Coding-system of what is sent from terminal keyboard. */
288 struct coding_system keyboard_coding;
290 Lisp_Object Vfile_coding_system_alist;
291 Lisp_Object Vprocess_coding_system_alist;
292 Lisp_Object Vnetwork_coding_system_alist;
294 #endif /* emacs */
296 Lisp_Object Qcoding_category_index;
298 /* List of symbols `coding-category-xxx' ordered by priority. */
299 Lisp_Object Vcoding_category_list;
301 /* Table of coding-systems currently assigned to each coding-category. */
302 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
304 /* Table of names of symbol for each coding-category. */
305 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
306 "coding-category-emacs-mule",
307 "coding-category-sjis",
308 "coding-category-iso-7",
309 "coding-category-iso-8-1",
310 "coding-category-iso-8-2",
311 "coding-category-iso-else",
312 "coding-category-big5",
313 "coding-category-binary"
316 /* Flag to tell if we look up unification table on character code
317 conversion. */
318 Lisp_Object Venable_character_unification;
319 /* Standard unification table to look up on decoding (reading). */
320 Lisp_Object Vstandard_character_unification_table_for_decode;
321 /* Standard unification table to look up on encoding (writing). */
322 Lisp_Object Vstandard_character_unification_table_for_encode;
324 Lisp_Object Qcharacter_unification_table;
325 Lisp_Object Qcharacter_unification_table_for_decode;
326 Lisp_Object Qcharacter_unification_table_for_encode;
328 /* Alist of charsets vs revision number. */
329 Lisp_Object Vcharset_revision_alist;
331 /* Default coding systems used for process I/O. */
332 Lisp_Object Vdefault_process_coding_system;
335 /*** 2. Emacs internal format (emacs-mule) handlers ***/
337 /* Emacs' internal format for encoding multiple character sets is a
338 kind of multi-byte encoding, i.e. characters are encoded by
339 variable-length sequences of one-byte codes. ASCII characters
340 and control characters (e.g. `tab', `newline') are represented by
341 one-byte sequences which are their ASCII codes, in the range 0x00
342 through 0x7F. The other characters are represented by a sequence
343 of `base leading-code', optional `extended leading-code', and one
344 or two `position-code's. The length of the sequence is determined
345 by the base leading-code. Leading-code takes the range 0x80
346 through 0x9F, whereas extended leading-code and position-code take
347 the range 0xA0 through 0xFF. See `charset.h' for more details
348 about leading-code and position-code.
350 There's one exception to this rule. Special leading-code
351 `leading-code-composition' denotes that the following several
352 characters should be composed into one character. Leading-codes of
353 components (except for ASCII) are added 0x20. An ASCII character
354 component is represented by a 2-byte sequence of `0xA0' and
355 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
356 details of composite character. Hence, we can summarize the code
357 range as follows:
359 --- CODE RANGE of Emacs' internal format ---
360 (character set) (range)
361 ASCII 0x00 .. 0x7F
362 ELSE (1st byte) 0x80 .. 0x9F
363 (rest bytes) 0xA0 .. 0xFF
364 ---------------------------------------------
368 enum emacs_code_class_type emacs_code_class[256];
/* Consume one byte from SRC and fall through to the next statement
   only if that byte is not less than 0xA0.  If no byte is available
   (SRC has reached SRC_END), jump to `label_end_of_switch'; if the
   byte is less than 0xA0, make the enclosing function return 0.  */
#define CHECK_CODE_RANGE_A0_FF          \
  do {                                  \
    if (src < src_end)                  \
      {                                 \
        if (*src++ < 0xA0)              \
          return 0;                     \
      }                                 \
    else                                \
      goto label_end_of_switch;         \
  } while (0)
380 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
381 Check if a text is encoded in Emacs' internal format. If it is,
382 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
385 detect_coding_emacs_mule (src, src_end)
386 unsigned char *src, *src_end;
388 unsigned char c;
389 int composing = 0;
391 while (src < src_end)
393 c = *src++;
395 if (composing)
397 if (c < 0xA0)
398 composing = 0;
399 else
400 c -= 0x20;
403 switch (emacs_code_class[c])
405 case EMACS_ascii_code:
406 case EMACS_linefeed_code:
407 break;
409 case EMACS_control_code:
410 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
411 return 0;
412 break;
414 case EMACS_invalid_code:
415 return 0;
417 case EMACS_leading_code_composition: /* c == 0x80 */
418 if (composing)
419 CHECK_CODE_RANGE_A0_FF;
420 else
421 composing = 1;
422 break;
424 case EMACS_leading_code_4:
425 CHECK_CODE_RANGE_A0_FF;
426 /* fall down to check it two more times ... */
428 case EMACS_leading_code_3:
429 CHECK_CODE_RANGE_A0_FF;
430 /* fall down to check it one more time ... */
432 case EMACS_leading_code_2:
433 CHECK_CODE_RANGE_A0_FF;
434 break;
436 default:
437 label_end_of_switch:
438 break;
441 return CODING_CATEGORY_MASK_EMACS_MULE;
445 /*** 3. ISO2022 handlers ***/
447 /* The following note describes the coding system ISO2022 briefly.
448 Since the intention of this note is to help in understanding of
449 the programs in this file, some parts are NOT ACCURATE or OVERLY
450 SIMPLIFIED. For the thorough understanding, please refer to the
451 original document of ISO2022.
453 ISO2022 provides many mechanisms to encode several character sets
454 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
455 all text is encoded by codes of less than 128. This may make the
456 encoded text a little bit longer, but the text gets more stability
457 to pass through several gateways (some of them strip off the MSB).
459 There are two kinds of character set: control character set and
460 graphic character set. The former contains control characters such
461 as `newline' and `escape' to provide control functions (control
462 functions are provided also by escape sequences). The latter
463 contains graphic characters such as 'A' and '-'. Emacs recognizes
464 two control character sets and many graphic character sets.
466 Graphic character sets are classified into one of the following
467 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
468 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
469 bytes (DIMENSION) and the number of characters in one dimension
470 (CHARS) of the set. In addition, each character set is assigned an
471 identification tag (called "final character" and denoted as <F>
472 here after) which is unique in each class. <F> of each character
473 set is decided by ECMA(*) when it is registered in ISO. Code range
474 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
476 Note (*): ECMA = European Computer Manufacturers Association
478 Here are examples of graphic character set [NAME(<F>)]:
479 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
480 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
481 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
482 o DIMENSION2_CHARS96 -- none for the moment
484 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
485 C0 [0x00..0x1F] -- control character plane 0
486 GL [0x20..0x7F] -- graphic character plane 0
487 C1 [0x80..0x9F] -- control character plane 1
488 GR [0xA0..0xFF] -- graphic character plane 1
490 A control character set is directly designated and invoked to C0 or
491 C1 by an escape sequence. The most common case is that ISO646's
492 control character set is designated/invoked to C0 and ISO6429's
493 control character set is designated/invoked to C1, and usually
494 these designations/invocations are omitted in a coded text. With
495 7-bit environment, only C0 can be used, and a control character for
496 C1 is encoded by an appropriate escape sequence to fit in the
497 environment. All control characters for C1 have
498 corresponding escape sequences defined.
500 A graphic character set is at first designated to one of four
501 graphic registers (G0 through G3), then these graphic registers are
502 invoked to GL or GR. These designations and invocations can be
503 done independently. The most common case is that G0 is invoked to
504 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
505 these invocations and designations are omitted in a coded text.
506 With 7-bit environment, only GL can be used.
508 When a graphic character set of CHARS94 is invoked to GL, code 0x20
509 and 0x7F of GL area work as control characters SPACE and DEL
510 respectively, and code 0xA0 and 0xFF of GR area should not be used.
512 There are two ways of invocation: locking-shift and single-shift.
513 With locking-shift, the invocation lasts until the next different
514 invocation, whereas with single-shift, the invocation works only
515 for the following character and doesn't affect locking-shift.
516 Invocations are done by the following control characters or escape
517 sequences.
519 ----------------------------------------------------------------------
520 function control char escape sequence description
521 ----------------------------------------------------------------------
522 SI (shift-in) 0x0F none invoke G0 to GL
523 SO (shift-out) 0x0E none invoke G1 to GL
524 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
525 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
526 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
527 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
528 ----------------------------------------------------------------------
529 The first four are for locking-shift. Control characters for these
530 functions are defined by macros ISO_CODE_XXX in `coding.h'.
532 Designations are done by the following escape sequences.
533 ----------------------------------------------------------------------
534 escape sequence description
535 ----------------------------------------------------------------------
536 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
537 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
538 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
539 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
540 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
541 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
542 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
543 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
544 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
545 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
546 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
547 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
548 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
549 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
550 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
551 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
552 ----------------------------------------------------------------------
554 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
555 of dimension 1, chars 94, and final character <F>, and etc.
557 Note (*): Although these designations are not allowed in ISO2022,
558 Emacs accepts them on decoding, and produces them on encoding
559 CHARS96 character set in a coding system which is characterized as
560 7-bit environment, non-locking-shift, and non-single-shift.
562 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
563 '(' can be omitted. We call this as "short-form" here after.
565 Now you may notice that there are a lot of ways for encoding the
566 same multilingual text in ISO2022. Actually, there exists many
567 coding systems such as Compound Text (used in X's inter client
568 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
569 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
570 localized platforms), and all of these are variants of ISO2022.
572 In addition to the above, Emacs handles two more kinds of escape
573 sequences: ISO6429's direction specification and Emacs' private
574 sequence for specifying character composition.
576 ISO6429's direction specification takes the following format:
577 o CSI ']' -- end of the current direction
578 o CSI '0' ']' -- end of the current direction
579 o CSI '1' ']' -- start of left-to-right text
580 o CSI '2' ']' -- start of right-to-left text
581 The control character CSI (0x9B: control sequence introducer) is
582 abbreviated to the escape sequence ESC '[' in 7-bit environment.
584 Character composition specification takes the following format:
585 o ESC '0' -- start character composition
586 o ESC '1' -- end character composition
587 Since these are not standard escape sequences of any ISO, the use
588 of them for these meaning is restricted to Emacs only. */
590 enum iso_code_class_type iso_code_class[256];
592 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
593 Check if a text is encoded in ISO2022. If it is, returns an
594 integer in which appropriate flag bits any of:
595 CODING_CATEGORY_MASK_ISO_7
596 CODING_CATEGORY_MASK_ISO_8_1
597 CODING_CATEGORY_MASK_ISO_8_2
598 CODING_CATEGORY_MASK_ISO_ELSE
599 are set. If a code which should never appear in ISO2022 is found,
600 returns 0. */
603 detect_coding_iso2022 (src, src_end)
604 unsigned char *src, *src_end;
606 int mask = (CODING_CATEGORY_MASK_ISO_7
607 | CODING_CATEGORY_MASK_ISO_8_1
608 | CODING_CATEGORY_MASK_ISO_8_2
609 | CODING_CATEGORY_MASK_ISO_ELSE);
610 int g1 = 0; /* 1 iff designating to G1. */
611 int c, i;
613 while (src < src_end)
615 c = *src++;
616 switch (c)
618 case ISO_CODE_ESC:
619 if (src >= src_end)
620 break;
621 c = *src++;
622 if (src < src_end
623 && ((c >= '(' && c <= '/')
624 || c == '$' && ((*src >= '(' && *src <= '/')
625 || (*src >= '@' && *src <= 'B'))))
627 /* Valid designation sequence. */
628 if (c == ')' || (c == '$' && *src == ')'))
630 g1 = 1;
631 mask &= ~CODING_CATEGORY_MASK_ISO_7;
633 src++;
634 break;
636 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
637 return CODING_CATEGORY_MASK_ISO_ELSE;
638 break;
640 case ISO_CODE_SO:
641 if (g1)
642 return CODING_CATEGORY_MASK_ISO_ELSE;
643 break;
645 case ISO_CODE_CSI:
646 case ISO_CODE_SS2:
647 case ISO_CODE_SS3:
648 mask &= ~CODING_CATEGORY_MASK_ISO_7;
649 break;
651 default:
652 if (c < 0x80)
653 break;
654 else if (c < 0xA0)
655 return 0;
656 else
658 int count = 1;
660 mask &= ~CODING_CATEGORY_MASK_ISO_7;
661 while (src < src_end && *src >= 0xA0)
662 count++, src++;
663 if (count & 1 && src < src_end)
664 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
666 break;
670 return mask;
/* Decode a character of which charset is CHARSET and the 1st position
   code is C1.  If dimension of CHARSET is 2, the 2nd position code is
   fetched from SRC and set to C2.  If CHARSET is negative, it means
   that we are decoding ill formed text, and what we can do is just to
   read C1 as is.

   Besides its arguments, this macro uses the variables `coding',
   `dst', `c2' and `unification_table' of the expansion site, and
   through ONE_MORE_BYTE also `src', `src_end' and the label
   `label_end_of_loop'.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int c_alt, charset_alt = (charset);                                 \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          /* To tell composition rules are embedded.  */                \
          *dst++ = 0xFF;                                                \
        coding->composing += 2;                                         \
      }                                                                 \
    if ((charset) >= 0)                                                 \
      {                                                                 \
        if (CHARSET_DIMENSION (charset) == 2)                           \
          ONE_MORE_BYTE (c2);                                           \
        if (!NILP (unification_table)                                   \
            && ((c_alt = unify_char (unification_table,                 \
                                     -1, (charset), c1, c2)) >= 0))     \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      /* To tell a composition rule follows.  */                        \
      coding->composing = COMPOSING_WITH_RULE_RULE;                     \
  } while (0)
/* Set designation state into CODING.  The charset identified by
   DIMENSION, CHARS and FINAL_CHAR is looked up with
   ISO_CHARSET_TABLE.  If the lookup yields a negative value, nothing
   is changed.  Otherwise the charset is recorded as the designation
   of graphic register REG; when `coding->direction' is 1 and the
   charset has a non-negative reverse charset, that reverse charset
   is recorded instead.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
  do {                                                                  \
    int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
                                     make_number (chars),               \
                                     make_number (final_char));         \
    if (charset >= 0)                                                   \
      {                                                                 \
        if (coding->direction == 1                                      \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
          charset = CHARSET_REVERSE_CHARSET (charset);                  \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
      }                                                                 \
  } while (0)
725 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
728 decode_coding_iso2022 (coding, source, destination,
729 src_bytes, dst_bytes, consumed)
730 struct coding_system *coding;
731 unsigned char *source, *destination;
732 int src_bytes, dst_bytes;
733 int *consumed;
735 unsigned char *src = source;
736 unsigned char *src_end = source + src_bytes;
737 unsigned char *dst = destination;
738 unsigned char *dst_end = destination + dst_bytes;
739 /* Since the maximum bytes produced by each loop is 7, we subtract 6
740 from DST_END to assure that overflow checking is necessary only
741 at the head of loop. */
742 unsigned char *adjusted_dst_end = dst_end - 6;
743 int charset;
744 /* Charsets invoked to graphic plane 0 and 1 respectively. */
745 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
746 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
747 Lisp_Object unification_table
748 = coding->character_unification_table_for_decode;
750 if (!NILP (Venable_character_unification) && NILP (unification_table))
751 unification_table = Vstandard_character_unification_table_for_decode;
753 while (src < src_end && dst < adjusted_dst_end)
755 /* SRC_BASE remembers the start position in source in each loop.
756 The loop will be exited when there's not enough source text
757 to analyze long escape sequence or 2-byte code (within macros
758 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
759 to SRC_BASE before exiting. */
760 unsigned char *src_base = src;
761 int c1 = *src++, c2;
763 switch (iso_code_class [c1])
765 case ISO_0x20_or_0x7F:
766 if (!coding->composing
767 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
769 /* This is SPACE or DEL. */
770 *dst++ = c1;
771 break;
773 /* This is a graphic character, we fall down ... */
775 case ISO_graphic_plane_0:
776 if (coding->composing == COMPOSING_WITH_RULE_RULE)
778 /* This is a composition rule. */
779 *dst++ = c1 | 0x80;
780 coding->composing = COMPOSING_WITH_RULE_TAIL;
782 else
783 DECODE_ISO_CHARACTER (charset0, c1);
784 break;
786 case ISO_0xA0_or_0xFF:
787 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
789 /* Invalid code. */
790 *dst++ = c1;
791 break;
793 /* This is a graphic character, we fall down ... */
795 case ISO_graphic_plane_1:
796 DECODE_ISO_CHARACTER (charset1, c1);
797 break;
799 case ISO_control_code:
800 /* All ISO2022 control characters in this class have the
801 same representation in Emacs internal format. */
802 *dst++ = c1;
803 break;
805 case ISO_carriage_return:
806 if (coding->eol_type == CODING_EOL_CR)
808 *dst++ = '\n';
810 else if (coding->eol_type == CODING_EOL_CRLF)
812 ONE_MORE_BYTE (c1);
813 if (c1 == ISO_CODE_LF)
814 *dst++ = '\n';
815 else
817 src--;
818 *dst++ = c1;
821 else
823 *dst++ = c1;
825 break;
827 case ISO_shift_out:
828 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
829 goto label_invalid_escape_sequence;
830 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
831 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
832 break;
834 case ISO_shift_in:
835 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
836 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
837 break;
839 case ISO_single_shift_2_7:
840 case ISO_single_shift_2:
841 /* SS2 is handled as an escape sequence of ESC 'N' */
842 c1 = 'N';
843 goto label_escape_sequence;
845 case ISO_single_shift_3:
846 /* SS3 is handled as an escape sequence of ESC 'O' */
847 c1 = 'O';
848 goto label_escape_sequence;
850 case ISO_control_sequence_introducer:
851 /* CSI is handled as an escape sequence of ESC '[' ... */
852 c1 = '[';
853 goto label_escape_sequence;
855 case ISO_escape:
856 ONE_MORE_BYTE (c1);
857 label_escape_sequence:
858 /* Escape sequences handled by Emacs are invocation,
859 designation, direction specification, and character
860 composition specification. */
861 switch (c1)
863 case '&': /* revision of following character set */
864 ONE_MORE_BYTE (c1);
865 if (!(c1 >= '@' && c1 <= '~'))
866 goto label_invalid_escape_sequence;
867 ONE_MORE_BYTE (c1);
868 if (c1 != ISO_CODE_ESC)
869 goto label_invalid_escape_sequence;
870 ONE_MORE_BYTE (c1);
871 goto label_escape_sequence;
873 case '$': /* designation of 2-byte character set */
874 ONE_MORE_BYTE (c1);
875 if (c1 >= '@' && c1 <= 'B')
876 { /* designation of JISX0208.1978, GB2312.1980,
877 or JISX0208.1980 */
878 DECODE_DESIGNATION (0, 2, 94, c1);
880 else if (c1 >= 0x28 && c1 <= 0x2B)
881 { /* designation of DIMENSION2_CHARS94 character set */
882 ONE_MORE_BYTE (c2);
883 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
885 else if (c1 >= 0x2C && c1 <= 0x2F)
886 { /* designation of DIMENSION2_CHARS96 character set */
887 ONE_MORE_BYTE (c2);
888 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
890 else
891 goto label_invalid_escape_sequence;
892 break;
894 case 'n': /* invocation of locking-shift-2 */
895 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
896 goto label_invalid_escape_sequence;
897 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
898 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
899 break;
901 case 'o': /* invocation of locking-shift-3 */
902 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
903 goto label_invalid_escape_sequence;
904 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
905 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
906 break;
908 case 'N': /* invocation of single-shift-2 */
909 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
910 goto label_invalid_escape_sequence;
911 ONE_MORE_BYTE (c1);
912 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
913 DECODE_ISO_CHARACTER (charset, c1);
914 break;
916 case 'O': /* invocation of single-shift-3 */
917 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
918 goto label_invalid_escape_sequence;
919 ONE_MORE_BYTE (c1);
920 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
921 DECODE_ISO_CHARACTER (charset, c1);
922 break;
924 case '0': /* start composing without embeded rules */
925 coding->composing = COMPOSING_NO_RULE_HEAD;
926 break;
928 case '1': /* end composing */
929 coding->composing = COMPOSING_NO;
930 break;
932 case '2': /* start composing with embeded rules */
933 coding->composing = COMPOSING_WITH_RULE_HEAD;
934 break;
936 case '[': /* specification of direction */
937 /* For the moment, nested direction is not supported.
938 So, the value of `coding->direction' is 0 or 1: 0
939 means left-to-right, 1 means right-to-left. */
940 ONE_MORE_BYTE (c1);
941 switch (c1)
943 case ']': /* end of the current direction */
944 coding->direction = 0;
946 case '0': /* end of the current direction */
947 case '1': /* start of left-to-right direction */
948 ONE_MORE_BYTE (c1);
949 if (c1 == ']')
950 coding->direction = 0;
951 else
952 goto label_invalid_escape_sequence;
953 break;
955 case '2': /* start of right-to-left direction */
956 ONE_MORE_BYTE (c1);
957 if (c1 == ']')
958 coding->direction= 1;
959 else
960 goto label_invalid_escape_sequence;
961 break;
963 default:
964 goto label_invalid_escape_sequence;
966 break;
968 default:
969 if (c1 >= 0x28 && c1 <= 0x2B)
970 { /* designation of DIMENSION1_CHARS94 character set */
971 ONE_MORE_BYTE (c2);
972 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
974 else if (c1 >= 0x2C && c1 <= 0x2F)
975 { /* designation of DIMENSION1_CHARS96 character set */
976 ONE_MORE_BYTE (c2);
977 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
979 else
981 goto label_invalid_escape_sequence;
984 /* We must update these variables now. */
985 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
986 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
987 break;
989 label_invalid_escape_sequence:
991 int length = src - src_base;
993 bcopy (src_base, dst, length);
994 dst += length;
997 continue;
999 label_end_of_loop:
1000 coding->carryover_size = src - src_base;
1001 bcopy (src_base, coding->carryover, coding->carryover_size);
1002 src = src_base;
1003 break;
1006 /* If this is the last block of the text to be decoded, we had
1007 better just flush out all remaining codes in the text although
1008 they are not valid characters. */
1009 if (coding->last_block)
1011 bcopy (src, dst, src_end - src);
1012 dst += (src_end - src);
1013 src = src_end;
1015 *consumed = src - source;
1016 return dst - destination;
1019 /* ISO2022 encoding stuff. */
1022 It is not enough to say just "ISO2022" on encoding, we have to
1023 specify more details. In Emacs, each coding-system of ISO2022
1024 variant has the following specifications:
1025 1. Initial designation to G0 thru G3.
1026 2. Allows short-form designation?
1027 3. ASCII should be designated to G0 before control characters?
1028 4. ASCII should be designated to G0 at end of line?
1029 5. 7-bit environment or 8-bit environment?
1030 6. Use locking-shift?
1031 7. Use Single-shift?
1032 And the following two are only for Japanese:
1033 8. Use ASCII in place of JIS0201-1976-Roman?
1034 9. Use JISX0208-1983 in place of JISX0208-1978?
1035 These specifications are encoded in `coding->flags' as flag bits
1036 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1037 details.
/* Write to DST the escape sequence that designates CHARSET to graphic
   register REG, then record that designation in CODING.  If CHARSET
   has an entry in Vcharset_revision_alist, the revision sequence
   ESC '&' <revision + '@'> is written first.  The intermediate
   character is omitted (short form) only for a 2-byte 94-charset whose
   <final-char> is '@', 'A', or 'B', designated to register 0, and
   only when CODING has CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char designation_final = CHARSET_ISO_FINAL_CHAR (charset); \
    /* Intermediate characters for G0..G3, chosen by charset size.  */ \
    char *designation_intermediates                                     \
      = (CHARSET_CHARS (charset) == 94) ? "()*+" : ",-./";              \
    Lisp_Object designation_revision                                    \
      = Fassq (make_number (charset), Vcharset_revision_alist);         \
    int designation_short_form                                          \
      = (CHARSET_DIMENSION (charset) != 1                               \
         && CHARSET_CHARS (charset) == 94                               \
         && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)                \
         && reg == 0                                                    \
         && designation_final >= '@' && designation_final <= 'B');      \
                                                                        \
    if (! NILP (designation_revision))                                  \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = XINT (XCONS (designation_revision)->cdr) + '@';        \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      *dst++ = '$';                                                     \
    if (! designation_short_form)                                       \
      *dst++ = (unsigned char) (designation_intermediates[reg]);        \
    *dst++ = designation_final;                                         \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
/* The two macros below write the ISO2022 single-shift functions
   (single-shift-2 and single-shift-3) at DST and mark CODING as being
   in the single-shifting state.  When CODING has
   CODING_FLAG_ISO_SEVEN_BITS set the two-byte escape sequence is
   written, otherwise the one-byte control code.  */

#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS2;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'N';                                   \
      }                                                 \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS3;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'O';                                   \
      }                                                 \
  } while (0)
/* The four macros below write the ISO2022 locking-shift functions
   (shift-in, shift-out, locking-shift-2, and locking-shift-3) at DST
   and record in CODING which graphic register (0..3) is now invoked
   to graphic plane 0.  */

#define ENCODE_SHIFT_IN                         \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI;                       \
  } while (0)

#define ENCODE_SHIFT_OUT                        \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO;                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'n';                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'o';                               \
  } while (0)
/* Write at DST the single byte for a DIMENSION1 character of CHARSET
   whose position-code is C1.  While CHARSET is neither single-shifted
   nor invoked to graphic plane 0 or 1, the required designation and
   invocation sequences are produced first by
   encode_invocation_designation and the check is repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int iso_use_high_bit;                                               \
                                                                        \
    for (;;)                                                            \
      {                                                                 \
        if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                   \
          {                                                             \
            /* A single shift covers exactly this character.  */       \
            iso_use_high_bit                                            \
              = ! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS);         \
            CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;               \
            break;                                                      \
          }                                                             \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))       \
          {                                                             \
            iso_use_high_bit = 0;                                       \
            break;                                                      \
          }                                                             \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))       \
          {                                                             \
            iso_use_high_bit = 1;                                       \
            break;                                                      \
          }                                                             \
        /* CHARSET is not yet invoked to any graphic plane: invoke     \
           it (designating it first if needed) and look again.  */     \
        dst = encode_invocation_designation (charset, coding, dst);     \
      }                                                                 \
    *dst++ = iso_use_high_bit ? (c1 | 0x80) : (c1 & 0x7F);              \
  } while (0)
/* Write at DST the two bytes for a DIMENSION2 character of CHARSET
   whose position-codes are C1 and C2.  While CHARSET is neither
   single-shifted nor invoked to graphic plane 0 or 1, the required
   designation and invocation codes are produced first by
   encode_invocation_designation and the check is repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int iso_use_high_bit;                                               \
                                                                        \
    for (;;)                                                            \
      {                                                                 \
        if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                   \
          {                                                             \
            /* A single shift covers exactly this character.  */       \
            iso_use_high_bit                                            \
              = ! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS);         \
            CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;               \
            break;                                                      \
          }                                                             \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))       \
          {                                                             \
            iso_use_high_bit = 0;                                       \
            break;                                                      \
          }                                                             \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))       \
          {                                                             \
            iso_use_high_bit = 1;                                       \
            break;                                                      \
          }                                                             \
        /* CHARSET is not yet invoked to any graphic plane: invoke     \
           it (designating it first if needed) and look again.  */     \
        dst = encode_invocation_designation (charset, coding, dst);     \
      }                                                                 \
    if (iso_use_high_bit)                                               \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
      }                                                                 \
  } while (0)
/* Encode one character of CHARSET with position-codes C1 and C2.  If
   `unification_table' is non-nil and unify_char maps the character to
   an alternative one, C1, C2 and the charset are replaced by those of
   the alternative (via SPLIT_CHAR) before encoding.  The dimension of
   the resulting charset selects the one-byte or two-byte encoder.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    int alt_charset = charset;                                          \
    int alt_char                                                        \
      = (NILP (unification_table)                                       \
         ? -1                                                           \
         : unify_char (unification_table, -1, charset, c1, c2));        \
                                                                        \
    if (alt_char >= 0)                                                  \
      {                                                                 \
        SPLIT_CHAR (alt_char, alt_charset, c1, c2);                     \
      }                                                                 \
    if (CHARSET_DIMENSION (alt_charset) != 1)                           \
      {                                                                 \
        ENCODE_ISO_CHARACTER_DIMENSION2 (alt_charset, c1, c2);          \
      }                                                                 \
    else                                                                \
      {                                                                 \
        ENCODE_ISO_CHARACTER_DIMENSION1 (alt_charset, c1);              \
      }                                                                 \
  } while (0)
1215 /* Produce designation and invocation codes at a place pointed by DST
1216 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1217 Return new DST. */
1219 unsigned char *
1220 encode_invocation_designation (charset, coding, dst)
1221 int charset;
1222 struct coding_system *coding;
1223 unsigned char *dst;
1225 int reg; /* graphic register number */
1227 /* At first, check designations. */
1228 for (reg = 0; reg < 4; reg++)
1229 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1230 break;
1232 if (reg >= 4)
1234 /* CHARSET is not yet designated to any graphic registers. */
1235 /* At first check the requested designation. */
1236 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1237 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1238 /* Since CHARSET requests no special designation, designate it
1239 to graphic register 0. */
1240 reg = 0;
1242 ENCODE_DESIGNATION (charset, reg, coding);
1245 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1246 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1248 /* Since the graphic register REG is not invoked to any graphic
1249 planes, invoke it to graphic plane 0. */
1250 switch (reg)
1252 case 0: /* graphic register 0 */
1253 ENCODE_SHIFT_IN;
1254 break;
1256 case 1: /* graphic register 1 */
1257 ENCODE_SHIFT_OUT;
1258 break;
1260 case 2: /* graphic register 2 */
1261 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1262 ENCODE_SINGLE_SHIFT_2;
1263 else
1264 ENCODE_LOCKING_SHIFT_2;
1265 break;
1267 case 3: /* graphic register 3 */
1268 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1269 ENCODE_SINGLE_SHIFT_3;
1270 else
1271 ENCODE_LOCKING_SHIFT_3;
1272 break;
1275 return dst;
/* The following three macros produce codes for indicating composition:
   start of a composition without embedded rules (ESC '0'), start of a
   composition with embedded rules (ESC '2'), and end of a composition
   (ESC '1').  Each one writes two bytes at DST and advances it.  The
   expansions are parenthesized so that each macro behaves as a single
   expression wherever it is used.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes the control sequence
   introducer at DST: the two bytes ESC '[' when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, the single byte ISO_CODE_CSI
   otherwise.  The flag is tested with `&' because `coding->flags'
   is a set of flag bits and may have other bits set as well.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R write the introducer
   followed by "2]" and "0]" respectively.  They are statement macros
   (do ... while (0)); a statement macro cannot be followed by a comma
   operator, so the introducer is expanded as a statement of its own.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2', *dst++ = ']';                 \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0', *dst++ = ']';                 \
  } while (0)
/* Write at DST the invocation and designation codes that bring the
   graphic planes and registers back to their initial state: shift-in
   if graphic plane 0 does not currently invoke register 0, then a
   designation for every register whose initial charset is set (>= 0)
   and differs from the charset currently designated to it.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                 \
  do {                                                                  \
    int reset_reg = 0;                                                  \
                                                                        \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      {                                                                 \
        ENCODE_SHIFT_IN;                                                \
      }                                                                 \
    while (reset_reg < 4)                                               \
      {                                                                 \
        int reset_initial                                               \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg);    \
                                                                        \
        if (reset_initial >= 0                                          \
            && (reset_initial                                           \
                != CODING_SPEC_ISO_DESIGNATION (coding, reset_reg)))    \
          {                                                             \
            ENCODE_DESIGNATION (reset_initial, reset_reg, coding);      \
          }                                                             \
        reset_reg++;                                                    \
      }                                                                 \
  } while (0)
1314 /* Produce designation sequences of charsets in the line started from
1315 *SRC to a place pointed by DSTP.
1317 If the current block ends before any end-of-line, we may fail to
1318 find all the necessary *designations. */
1319 encode_designation_at_bol (coding, table, src, src_end, dstp)
1320 struct coding_system *coding;
1321 Lisp_Object table;
1322 unsigned char *src, *src_end, **dstp;
1324 int charset, c, found = 0, reg;
1325 /* Table of charsets to be designated to each graphic register. */
1326 int r[4];
1327 unsigned char *dst = *dstp;
1329 for (reg = 0; reg < 4; reg++)
1330 r[reg] = -1;
1332 while (src < src_end && *src != '\n' && found < 4)
1334 int bytes = BYTES_BY_CHAR_HEAD (*src);
1336 if (NILP (table))
1337 charset = CHARSET_AT (src);
1338 else
1340 int c_alt, c1, c2;
1342 SPLIT_STRING(src, bytes, charset, c1, c2);
1343 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1344 charset = CHAR_CHARSET (c_alt);
1347 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1348 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1350 found++;
1351 r[reg] = charset;
1354 src += bytes;
1357 if (found)
1359 for (reg = 0; reg < 4; reg++)
1360 if (r[reg] >= 0
1361 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1362 ENCODE_DESIGNATION (r[reg], reg, coding);
1363 *dstp = dst;
1367 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
/* Encode the SRC_BYTES bytes at SOURCE according to the ISO2022
   variant described by CODING and write the result at DESTINATION,
   whose size is DST_BYTES.  The number of source bytes processed is
   stored in *CONSUMED and the number of bytes written is returned.
   When a multi-byte sequence is cut off at the end of SOURCE, its
   bytes are saved in CODING->carryover.  */
1370 encode_coding_iso2022 (coding, source, destination,
1371 src_bytes, dst_bytes, consumed)
1372 struct coding_system *coding;
1373 unsigned char *source, *destination;
1374 int src_bytes, dst_bytes;
1375 int *consumed;
1377 unsigned char *src = source;
1378 unsigned char *src_end = source + src_bytes;
1379 unsigned char *dst = destination;
1380 unsigned char *dst_end = destination + dst_bytes;
1381 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1382 from DST_END to assure overflow checking is necessary only at the
1383 head of loop. */
1384 unsigned char *adjusted_dst_end = dst_end - 19;
1385 Lisp_Object unification_table
1386 = coding->character_unification_table_for_encode;
/* Fall back to the standard table when unification is enabled but
   CODING does not supply a table of its own.  */
1388 if (!NILP (Venable_character_unification) && NILP (unification_table))
1389 unification_table = Vstandard_character_unification_table_for_encode;
1391 while (src < src_end && dst < adjusted_dst_end)
1393 /* SRC_BASE remembers the start position in source in each loop.
1394 The loop will be exited when there's not enough source text
1395 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1396 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1397 reset to SRC_BASE before exiting. */
1398 unsigned char *src_base = src;
1399 int charset, c1, c2, c3, c4;
1401 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1402 && CODING_SPEC_ISO_BOL (coding))
1404 /* We have to produce designation sequences if any now. */
1405 encode_designation_at_bol (coding, unification_table,
1406 src, src_end, &dst);
1407 CODING_SPEC_ISO_BOL (coding) = 0;
1410 c1 = *src++;
1411 /* If we are seeing a component of a composite character, we are
1412 seeing a leading-code specially encoded for composition, or a
1413 composition rule if composing with rule. We must set C1
1414 to a normal leading-code or an ASCII code. If we are not at
1415 a composed character, we must reset the composition state. */
1416 if (COMPOSING_P (coding->composing))
1418 if (c1 < 0xA0)
1420 /* We are not in a composite character any longer. */
1421 coding->composing = COMPOSING_NO;
1422 ENCODE_COMPOSITION_END;
1424 else
1426 if (coding->composing == COMPOSING_WITH_RULE_RULE)
/* C1 is a composition rule: write it with the high bit cleared and
   expect a component next.  */
1428 *dst++ = c1 & 0x7F;
1429 coding->composing = COMPOSING_WITH_RULE_HEAD;
1430 continue;
1432 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1433 coding->composing = COMPOSING_WITH_RULE_RULE;
1434 if (c1 == 0xA0)
1436 /* This is an ASCII component. */
1437 ONE_MORE_BYTE (c1);
1438 c1 &= 0x7F;
1440 else
1441 /* This is a leading-code of non ASCII component. */
1442 c1 -= 0x20;
1446 /* Now encode one character. C1 is a control character, an
1447 ASCII character, or a leading-code of multi-byte character. */
1448 switch (emacs_code_class[c1])
1450 case EMACS_ascii_code:
1451 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1452 break;
1454 case EMACS_control_code:
1455 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1456 ENCODE_RESET_PLANE_AND_REGISTER;
1457 *dst++ = c1;
1458 break;
1460 case EMACS_carriage_return_code:
1461 if (!coding->selective)
1463 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1464 ENCODE_RESET_PLANE_AND_REGISTER;
1465 *dst++ = c1;
1466 break;
1468 /* fall down to treat '\r' as '\n' ... */
1470 case EMACS_linefeed_code:
1471 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1472 ENCODE_RESET_PLANE_AND_REGISTER;
/* With CODING_FLAG_ISO_INIT_AT_BOL the recorded current designations
   are overwritten with the initial ones; nothing is written to DST
   for this.  */
1473 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1474 bcopy (coding->spec.iso2022.initial_designation,
1475 coding->spec.iso2022.current_designation,
1476 sizeof coding->spec.iso2022.initial_designation);
/* Write the end-of-line in the format selected by CODING->eol_type:
   LF, CR LF, or CR.  */
1477 if (coding->eol_type == CODING_EOL_LF
1478 || coding->eol_type == CODING_EOL_UNDECIDED)
1479 *dst++ = ISO_CODE_LF;
1480 else if (coding->eol_type == CODING_EOL_CRLF)
1481 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1482 else
1483 *dst++ = ISO_CODE_CR;
1484 CODING_SPEC_ISO_BOL (coding) = 1;
1485 break;
1487 case EMACS_leading_code_2:
1488 ONE_MORE_BYTE (c2);
1489 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1490 break;
1492 case EMACS_leading_code_3:
1493 TWO_MORE_BYTES (c2, c3);
/* Below LEADING_CODE_PRIVATE_11 the leading code C1 is itself passed
   as the charset; otherwise the following byte C2 is.  */
1494 if (c1 < LEADING_CODE_PRIVATE_11)
1495 ENCODE_ISO_CHARACTER (c1, c2, c3);
1496 else
1497 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1498 break;
1500 case EMACS_leading_code_4:
1501 THREE_MORE_BYTES (c2, c3, c4);
1502 ENCODE_ISO_CHARACTER (c2, c3, c4);
1503 break;
1505 case EMACS_leading_code_composition:
1506 ONE_MORE_BYTE (c1);
1507 if (c1 == 0xFF)
1509 coding->composing = COMPOSING_WITH_RULE_HEAD;
1510 ENCODE_COMPOSITION_WITH_RULE_START;
1512 else
1514 /* Rewind one byte because it is a character code of
1515 composition elements. */
1516 src--;
1517 coding->composing = COMPOSING_NO_RULE_HEAD;
1518 ENCODE_COMPOSITION_NO_RULE_START;
1520 break;
1522 case EMACS_invalid_code:
1523 *dst++ = c1;
1524 break;
1526 continue;
/* NOTE(review): the comment at the head of the loop says SRC is reset
   to SRC_BASE before exiting, but no such assignment appears here, so
   *CONSUMED will include the bytes copied to CODING->carryover.
   Confirm that this is intended.  */
1527 label_end_of_loop:
1528 coding->carryover_size = src - src_base;
1529 bcopy (src_base, coding->carryover, coding->carryover_size);
1530 break;
1533 /* If this is the last block of the text to be encoded, we must
1534 reset graphic planes and registers to the initial state. */
1535 if (src >= src_end && coding->last_block)
1537 ENCODE_RESET_PLANE_AND_REGISTER;
/* Flush any carried-over bytes unchanged if they fit in what is left
   of DESTINATION.  */
1538 if (coding->carryover_size > 0
1539 && coding->carryover_size < (dst_end - dst))
1541 bcopy (coding->carryover, dst, coding->carryover_size);
1542 dst += coding->carryover_size;
1543 coding->carryover_size = 0;
1546 *consumed = src - source;
1547 return dst - destination;
1551 /*** 4. SJIS and BIG5 handlers ***/
1553 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1554 quite widely. So, for the moment, Emacs supports them in the bare
1555 C code. But, in the future, they may be supported only by CCL. */
1557 /* SJIS is a coding system encoding three character sets: ASCII, right
1558 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1559 as is. A character of charset katakana-jisx0201 is encoded by
1560 "position-code + 0x80". A character of charset japanese-jisx0208
1561 is encoded in 2-byte but two position-codes are divided and shifted
1562 so that it fits in the range below.
1564 --- CODE RANGE of SJIS ---
1565 (character set) (range)
1566 ASCII 0x00 .. 0x7F
1567 KATAKANA-JISX0201 0xA0 .. 0xDF
1568 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1569 (2nd byte) 0x40 .. 0xFF
1570 -------------------------------
1574 /* BIG5 is a coding system encoding two character sets: ASCII and
1575 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1576 character set and is encoded in two-byte.
1578 --- CODE RANGE of BIG5 ---
1579 (character set) (range)
1580 ASCII 0x00 .. 0x7F
1581 Big5 (1st byte) 0xA1 .. 0xFE
1582 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1583 --------------------------
1585 Since the number of characters in Big5 is larger than maximum
1586 characters in Emacs' charset (96x96), it can't be handled as one
1587 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1588 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1589 contains frequently used characters and the latter contains less
1590 frequently used characters. */
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   DECODE_BIG5 reads B1 and B2 and stores into CHARSET, C1 and C2;
   ENCODE_BIG5 reads CHARSET, C1 and C2 and stores into B1 and B2.
   Every argument is parenthesized in the expansions, so arbitrary
   expressions may be passed for the input arguments; the output
   arguments must be lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    /* Linear index of the character among all Big5 codes; the         \
       second byte skips the unused range 0x7F..0xA0.  */              \
    unsigned int big5_index_                                            \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        /* The second charset is indexed from row 0xC9.  */            \
        big5_index_ -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                   \
      }                                                                 \
    (c1) = big5_index_ / (0xFF - 0xA1) + 0x21;                          \
    (c2) = big5_index_ % (0xFF - 0xA1) + 0x21;                          \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_index_                                            \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      big5_index_ += BIG5_SAME_ROW * (0xC9 - 0xA1);                     \
    (b1) = big5_index_ / BIG5_SAME_ROW + 0xA1;                          \
    (b2) = big5_index_ % BIG5_SAME_ROW;                                 \
    /* Map the in-row offset back onto 0x40..0x7E / 0xA1..0xFE.  */     \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
/* Produce at DST the decoded form of the character of CHARSET whose
   position-codes are C1 and C2.  If `unification_table' is non-nil
   and unify_char maps the character to an alternative one, C1, C2
   and the charset are replaced by those of the alternative (via
   SPLIT_CHAR) first.  An ASCII or negative resulting charset is
   produced as ASCII; otherwise the dimension of the resulting
   charset selects the one-byte or two-byte form.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int alt_charset = (charset);                                        \
    int alt_char                                                        \
      = (NILP (unification_table)                                       \
         ? -1                                                           \
         : unify_char (unification_table, -1, (charset), c1, c2));      \
                                                                        \
    if (alt_char >= 0)                                                  \
      {                                                                 \
        SPLIT_CHAR (alt_char, alt_charset, c1, c2);                     \
      }                                                                 \
    if (alt_charset < 0 || alt_charset == CHARSET_ASCII)                \
      {                                                                 \
        DECODE_CHARACTER_ASCII (c1);                                    \
      }                                                                 \
    else if (CHARSET_DIMENSION (alt_charset) != 1)                      \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION2 (alt_charset, c1, c2);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION1 (alt_charset, c1);                  \
      }                                                                 \
  } while (0)
/* Write at DST the SJIS (if `sjis_p' is nonzero) or BIG5 (otherwise)
   encoding of the character of CHARSET whose position-codes are C1
   and C2.  If `unification_table' is non-nil and unify_char maps the
   character to an alternative one, C1, C2 and the charset are
   replaced by those of the alternative (via SPLIT_CHAR) first.

   ASCII, and katakana-jisx0201 under SJIS, are written as the single
   byte C1.  JISX0208 under SJIS and the two Big5 charsets under BIG5
   are converted by ENCODE_SJIS and ENCODE_BIG5.  Any other charset is
   written unencoded as the charset byte followed by its position
   codes, which takes up to three bytes.  C1 and C2 are modified in
   place for two-byte charsets (their high bits are cleared).

   The expansion deliberately has no trailing semicolon, so the macro
   can be used as the body of an `if' that has an `else' branch.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (unification_table)                                         \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                        \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = charset;                                              \
    if (charset_alt == charset_ascii)                                     \
      *dst++ = c1;                                                        \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
      {                                                                   \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
          *dst++ = c1;                                                    \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1;                              \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        c1 &= 0x7F, c2 &= 0x7F;                                           \
        if (sjis_p && charset_alt == charset_jisx0208)                    \
          {                                                               \
            unsigned char s1, s2;                                         \
                                                                          \
            ENCODE_SJIS (c1, c2, s1, s2);                                 \
            *dst++ = s1, *dst++ = s2;                                     \
          }                                                               \
        else if (!sjis_p                                                  \
                 && (charset_alt == charset_big5_1                        \
                     || charset_alt == charset_big5_2))                   \
          {                                                               \
            unsigned char b1, b2;                                         \
                                                                          \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
            *dst++ = b1, *dst++ = b2;                                     \
          }                                                               \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
      }                                                                   \
  } while (0)
1682 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1683 Check if a text is encoded in SJIS. If it is, return
1684 CODING_CATEGORY_MASK_SJIS, else return 0. */
1687 detect_coding_sjis (src, src_end)
1688 unsigned char *src, *src_end;
1690 unsigned char c;
1692 while (src < src_end)
1694 c = *src++;
1695 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1696 return 0;
1697 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1699 if (src < src_end && *src++ < 0x40)
1700 return 0;
1703 return CODING_CATEGORY_MASK_SJIS;
1706 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1707 Check if a text is encoded in BIG5. If it is, return
1708 CODING_CATEGORY_MASK_BIG5, else return 0. */
1711 detect_coding_big5 (src, src_end)
1712 unsigned char *src, *src_end;
1714 unsigned char c;
1716 while (src < src_end)
1718 c = *src++;
1719 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1720 return 0;
1721 if (c >= 0xA1)
1723 if (src >= src_end)
1724 break;
1725 c = *src++;
1726 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1727 return 0;
1730 return CODING_CATEGORY_MASK_BIG5;
1733 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1734 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
/* Decode the SRC_BYTES bytes at SOURCE and write the result at
   DESTINATION, whose size is DST_BYTES.  The number of source bytes
   processed is stored in *CONSUMED and the number of bytes written is
   returned.  When a two-byte sequence is cut off at the end of
   SOURCE, its bytes are saved in CODING->carryover and are not
   counted in *CONSUMED.  */
1737 decode_coding_sjis_big5 (coding, source, destination,
1738 src_bytes, dst_bytes, consumed, sjis_p)
1739 struct coding_system *coding;
1740 unsigned char *source, *destination;
1741 int src_bytes, dst_bytes;
1742 int *consumed;
1743 int sjis_p;
1745 unsigned char *src = source;
1746 unsigned char *src_end = source + src_bytes;
1747 unsigned char *dst = destination;
1748 unsigned char *dst_end = destination + dst_bytes;
1749 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1750 from DST_END to assure overflow checking is necessary only at the
1751 head of loop. */
1752 unsigned char *adjusted_dst_end = dst_end - 3;
1753 Lisp_Object unification_table
1754 = coding->character_unification_table_for_decode;
/* Fall back to the standard table when unification is enabled but
   CODING does not supply a table of its own.  */
1756 if (!NILP (Venable_character_unification) && NILP (unification_table))
1757 unification_table = Vstandard_character_unification_table_for_decode;
1759 while (src < src_end && dst < adjusted_dst_end)
1761 /* SRC_BASE remembers the start position in source in each loop.
1762 The loop will be exited when there's not enough source text
1763 to analyze two-byte character (within macro ONE_MORE_BYTE).
1764 In that case, SRC is reset to SRC_BASE before exiting. */
1765 unsigned char *src_base = src;
1766 unsigned char c1 = *src++, c2, c3, c4;
/* With CRLF end-of-line, "\r\n" is written as "\n" alone; a '\r'
   not followed by '\n' is kept as is.  */
1768 if (c1 == '\r')
1770 if (coding->eol_type == CODING_EOL_CRLF)
1772 ONE_MORE_BYTE (c2);
1773 if (c2 == '\n')
1774 *dst++ = c2;
1775 else
1776 /* To process C2 again, SRC is subtracted by 1. */
1777 *dst++ = c1, src--;
1779 else
1780 *dst++ = c1;
/* Other codes below 0x20 are copied unchanged.  */
1782 else if (c1 < 0x20)
1783 *dst++ = c1;
1784 else if (c1 < 0x80)
1785 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1786 else if (c1 < 0xA0 || c1 >= 0xE0)
1788 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1789 if (sjis_p)
1791 ONE_MORE_BYTE (c2);
1792 DECODE_SJIS (c1, c2, c3, c4);
1793 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1795 else if (c1 >= 0xE0 && c1 < 0xFF)
1797 int charset;
1799 ONE_MORE_BYTE (c2);
1800 DECODE_BIG5 (c1, c2, charset, c3, c4);
1801 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1803 else /* Invalid code */
1804 *dst++ = c1;
1806 else
1808 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
/* NOTE(review): this branch is reached for 0xA0 <= c1 < 0xE0.  For
   BIG5, c1 == 0xA0 is passed to DECODE_BIG5 although the code range
   table above gives 0xA1 as the lowest first byte -- confirm whether
   0xA0 should be treated as an invalid code instead.  */
1809 if (sjis_p)
1810 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1811 else
1813 int charset;
1815 ONE_MORE_BYTE (c2);
1816 DECODE_BIG5 (c1, c2, charset, c3, c4);
1817 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1820 continue;
/* Reached from ONE_MORE_BYTE when the source is exhausted in the
   middle of a sequence: save the partial sequence and stop.  */
1822 label_end_of_loop:
1823 coding->carryover_size = src - src_base;
1824 bcopy (src_base, coding->carryover, coding->carryover_size);
1825 src = src_base;
1826 break;
1829 *consumed = src - source;
1830 return dst - destination;
1833 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1834 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1835 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1836 sure that all these charsets are registered as official charset
1837 (i.e. do not have extended leading-codes). Characters of other
1838 charsets are produced without any encoding. If SJIS_P is 1, encode
1839 SJIS text, else encode BIG5 text. */
1842 encode_coding_sjis_big5 (coding, source, destination,
1843 src_bytes, dst_bytes, consumed, sjis_p)
1844 struct coding_system *coding;
1845 unsigned char *source, *destination;
1846 int src_bytes, dst_bytes;
1847 int *consumed;
1848 int sjis_p;
1850 unsigned char *src = source;
1851 unsigned char *src_end = source + src_bytes;
1852 unsigned char *dst = destination;
1853 unsigned char *dst_end = destination + dst_bytes;
1854 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1855 from DST_END to assure overflow checking is necessary only at the
1856 head of loop. */
1857 unsigned char *adjusted_dst_end = dst_end - 1;
1858 Lisp_Object unification_table
1859 = coding->character_unification_table_for_encode;
1861 if (!NILP (Venable_character_unification) && NILP (unification_table))
1862 unification_table = Vstandard_character_unification_table_for_encode;
1864 while (src < src_end && dst < adjusted_dst_end)
1866 /* SRC_BASE remembers the start position in source in each loop.
1867 The loop will be exited when there's not enough source text
1868 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1869 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1870 before exiting. */
1871 unsigned char *src_base = src;
1872 unsigned char c1 = *src++, c2, c3, c4;
1874 if (coding->composing)
1876 if (c1 == 0xA0)
1878 ONE_MORE_BYTE (c1);
1879 c1 &= 0x7F;
1881 else if (c1 >= 0xA0)
1882 c1 -= 0x20;
1883 else
1884 coding->composing = 0;
1887 switch (emacs_code_class[c1])
1889 case EMACS_ascii_code:
1890 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1891 break;
1893 case EMACS_control_code:
1894 *dst++ = c1;
1895 break;
1897 case EMACS_carriage_return_code:
1898 if (!coding->selective)
1900 *dst++ = c1;
1901 break;
1903 /* fall down to treat '\r' as '\n' ... */
1905 case EMACS_linefeed_code:
1906 if (coding->eol_type == CODING_EOL_LF
1907 || coding->eol_type == CODING_EOL_UNDECIDED)
1908 *dst++ = '\n';
1909 else if (coding->eol_type == CODING_EOL_CRLF)
1910 *dst++ = '\r', *dst++ = '\n';
1911 else
1912 *dst++ = '\r';
1913 break;
1915 case EMACS_leading_code_2:
1916 ONE_MORE_BYTE (c2);
1917 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
1918 break;
1920 case EMACS_leading_code_3:
1921 TWO_MORE_BYTES (c2, c3);
1922 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
1923 break;
1925 case EMACS_leading_code_4:
1926 THREE_MORE_BYTES (c2, c3, c4);
1927 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
1928 break;
1930 case EMACS_leading_code_composition:
1931 coding->composing = 1;
1932 break;
1934 default: /* i.e. case EMACS_invalid_code: */
1935 *dst++ = c1;
1937 continue;
1939 label_end_of_loop:
1940 coding->carryover_size = src - src_base;
1941 bcopy (src_base, coding->carryover, coding->carryover_size);
1942 src = src_base;
1943 break;
1946 *consumed = src - source;
1947 return dst - destination;
1951 /*** 5. End-of-line handlers ***/
1953 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1954 This function is called only when `coding->eol_type' is
1955 CODING_EOL_CRLF or CODING_EOL_CR. */
/* Copy the SRC_BYTES bytes at SOURCE to DESTINATION, whose size is
   DST_BYTES, converting the end-of-line format selected by
   CODING->eol_type to '\n'.  The number of source bytes processed is
   stored in *CONSUMED and the number of bytes written is returned.  */
1957 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1958 struct coding_system *coding;
1959 unsigned char *source, *destination;
1960 int src_bytes, dst_bytes;
1961 int *consumed;
1963 unsigned char *src = source;
1964 unsigned char *src_end = source + src_bytes;
1965 unsigned char *dst = destination;
1966 unsigned char *dst_end = destination + dst_bytes;
1967 int produced;
1969 switch (coding->eol_type)
1971 case CODING_EOL_CRLF:
1973 /* Since the maximum bytes produced by each loop is 2, we
1974 subtract 1 from DST_END to assure overflow checking is
1975 necessary only at the head of loop. */
1976 unsigned char *adjusted_dst_end = dst_end - 1;
1978 while (src < src_end && dst < adjusted_dst_end)
1980 unsigned char *src_base = src;
1981 unsigned char c = *src++;
/* "\r\n" is written as "\n" alone; a '\r' followed by any other byte
   is kept, followed by that byte.  */
1982 if (c == '\r')
1984 ONE_MORE_BYTE (c);
1985 if (c != '\n')
1986 *dst++ = '\r';
1987 *dst++ = c;
1989 else
1990 *dst++ = c;
1991 continue;
/* Reached from ONE_MORE_BYTE when '\r' is the last byte of the
   source: save it in CODING->carryover and leave it unconsumed.  */
1993 label_end_of_loop:
1994 coding->carryover_size = src - src_base;
1995 bcopy (src_base, coding->carryover, coding->carryover_size);
1996 src = src_base;
1997 break;
1999 *consumed = src - source;
2000 produced = dst - destination;
2001 break;
2004 case CODING_EOL_CR:
/* Copy as many bytes as fit, then replace every '\r' in the copy
   with '\n'.  */
2005 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2006 bcopy (source, destination, produced);
2007 dst_end = destination + produced;
2008 while (dst < dst_end)
2009 if (*dst++ == '\r') dst[-1] = '\n';
2010 *consumed = produced;
2011 break;
2013 default: /* i.e. case: CODING_EOL_LF */
/* No conversion: copy as many bytes as fit.  */
2014 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2015 bcopy (source, destination, produced);
2016 *consumed = produced;
2017 break;
2020 return produced;
2023 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2024 format of end-of-line according to `coding->eol_type'. If
2025 `coding->selective' is 1, code '\r' in source text also means
2026 end-of-line. */
2028 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2029 struct coding_system *coding;
2030 unsigned char *source, *destination;
2031 int src_bytes, dst_bytes;
2032 int *consumed;
2034 unsigned char *src = source;
2035 unsigned char *dst = destination;
2036 int produced;
2038 if (src_bytes <= 0)
2039 return 0;
2041 switch (coding->eol_type)
2043 case CODING_EOL_LF:
2044 case CODING_EOL_UNDECIDED:
2045 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2046 bcopy (source, destination, produced);
2047 if (coding->selective)
2049 int i = produced;
2050 while (i--)
2051 if (*dst++ == '\r') dst[-1] = '\n';
2053 *consumed = produced;
2055 case CODING_EOL_CRLF:
2057 unsigned char c;
2058 unsigned char *src_end = source + src_bytes;
2059 unsigned char *dst_end = destination + dst_bytes;
2060 /* Since the maximum bytes produced by each loop is 2, we
2061 subtract 1 from DST_END to assure overflow checking is
2062 necessary only at the head of loop. */
2063 unsigned char *adjusted_dst_end = dst_end - 1;
2065 while (src < src_end && dst < adjusted_dst_end)
2067 c = *src++;
2068 if (c == '\n' || (c == '\r' && coding->selective))
2069 *dst++ = '\r', *dst++ = '\n';
2070 else
2071 *dst++ = c;
2073 produced = dst - destination;
2074 *consumed = src - source;
2075 break;
2078 default: /* i.e. case CODING_EOL_CR: */
2079 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2080 bcopy (source, destination, produced);
2082 int i = produced;
2083 while (i--)
2084 if (*dst++ == '\n') dst[-1] = '\r';
2086 *consumed = produced;
2089 return produced;
2093 /*** 6. C library functions ***/
2095 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2096 has a property `coding-system'. The value of this property is a
2097 vector of length 5 (called as coding-vector). Among elements of
2098 this vector, the first (element[0]) and the fifth (element[4])
2099 carry important information for decoding/encoding. Before
2100 decoding/encoding, this information should be set in fields of a
2101 structure of type `coding_system'.
2103 A value of property `coding-system' can be a symbol of another
2104 subsidiary coding-system. In that case, Emacs gets coding-vector
2105 from that symbol.
2107 `element[0]' contains information to be set in `coding->type'. The
2108 value and its meaning is as follows:
2110 0 -- coding_type_emacs_mule
2111 1 -- coding_type_sjis
2112 2 -- coding_type_iso2022
2113 3 -- coding_type_big5
2114 4 -- coding_type_ccl encoder/decoder written in CCL
2115 nil -- coding_type_no_conversion
2116 t -- coding_type_undecided (automatic conversion on decoding,
2117 no-conversion on encoding)
2119 `element[4]' contains information to be set in `coding->flags' and
2120 `coding->spec'. The meaning varies by `coding->type'.
2122 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2123 of length 32 (of which the first 15 sub-elements are used now).
2124 Meanings of these sub-elements are:
2126 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2127 If the value is an integer of valid charset, the charset is
2128 assumed to be designated to graphic register N initially.
2130 If the value is minus, it is a minus value of charset which
2131 reserves graphic register N, which means that the charset is
2132 not designated initially but should be designated to graphic
2133 register N just before encoding a character in that charset.
2135 If the value is nil, graphic register N is never used on
2136 encoding.
2138 sub-element[N] where N is 4 through 14: to be set in `coding->flags'
2139 Each value takes t or nil. See the section ISO2022 of
2140 `coding.h' for more information.
2142 If `coding->type' is `coding_type_big5', element[4] is t to denote
2143 BIG5-ETen or nil to denote BIG5-HKU.
2145 If `coding->type' takes the other value, element[4] is ignored.
2147 Emacs Lisp's coding system also carries information about format of
2148 end-of-line in a value of property `eol-type'. If the value is
2149 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2150 means CODING_EOL_CR. If it is not an integer, it should be a vector
2151 of subsidiary coding systems whose property `eol-type' has one
2152 of the above values. */
2156 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2157 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2158 is setup so that no conversion is necessary and return -1, else
2159 return 0. */
2162 setup_coding_system (coding_system, coding)
2163 Lisp_Object coding_system;
2164 struct coding_system *coding;
2166 Lisp_Object type, eol_type;
2168 /* At first, set several fields to default values. */
2169 coding->require_flushing = 0;
2170 coding->last_block = 0;
2171 coding->selective = 0;
2172 coding->composing = 0;
2173 coding->direction = 0;
2174 coding->carryover_size = 0;
2175 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2176 coding->character_unification_table_for_decode = Qnil;
2177 coding->character_unification_table_for_encode = Qnil;
2179 Vlast_coding_system_used = coding->symbol = coding_system;
2180 eol_type = Qnil;
2181 /* Get value of property `coding-system' until we get a vector.
2182 While doing that, also get values of properties
2183 `post-read-conversion', `pre-write-conversion',
2184 `character-unification-table-for-decode',
2185 `character-unification-table-for-encode' and `eol-type'. */
2186 while (!NILP (coding_system) && SYMBOLP (coding_system))
2188 if (NILP (coding->post_read_conversion))
2189 coding->post_read_conversion = Fget (coding_system,
2190 Qpost_read_conversion);
2191 if (NILP (coding->pre_write_conversion))
2192 coding->pre_write_conversion = Fget (coding_system,
2193 Qpre_write_conversion);
2194 if (!inhibit_eol_conversion && NILP (eol_type))
2195 eol_type = Fget (coding_system, Qeol_type);
2197 if (NILP (coding->character_unification_table_for_decode))
2198 coding->character_unification_table_for_decode
2199 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2201 if (NILP (coding->character_unification_table_for_encode))
2202 coding->character_unification_table_for_encode
2203 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2205 coding_system = Fget (coding_system, Qcoding_system);
2208 while (!NILP (coding->character_unification_table_for_decode)
2209 && SYMBOLP (coding->character_unification_table_for_decode))
2210 coding->character_unification_table_for_decode
2211 = Fget (coding->character_unification_table_for_decode,
2212 Qcharacter_unification_table_for_decode);
2213 if (!NILP (coding->character_unification_table_for_decode)
2214 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2215 coding->character_unification_table_for_decode = Qnil;
2217 while (!NILP (coding->character_unification_table_for_encode)
2218 && SYMBOLP (coding->character_unification_table_for_encode))
2219 coding->character_unification_table_for_encode
2220 = Fget (coding->character_unification_table_for_encode,
2221 Qcharacter_unification_table_for_encode);
2222 if (!NILP (coding->character_unification_table_for_encode)
2223 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2224 coding->character_unification_table_for_encode = Qnil;
2226 if (!VECTORP (coding_system)
2227 || XVECTOR (coding_system)->size != 5)
2228 goto label_invalid_coding_system;
2230 if (VECTORP (eol_type))
2231 coding->eol_type = CODING_EOL_UNDECIDED;
2232 else if (XFASTINT (eol_type) == 1)
2233 coding->eol_type = CODING_EOL_CRLF;
2234 else if (XFASTINT (eol_type) == 2)
2235 coding->eol_type = CODING_EOL_CR;
2236 else
2237 coding->eol_type = CODING_EOL_LF;
2239 type = XVECTOR (coding_system)->contents[0];
2240 switch (XFASTINT (type))
2242 case 0:
2243 coding->type = coding_type_emacs_mule;
2244 break;
2246 case 1:
2247 coding->type = coding_type_sjis;
2248 break;
2250 case 2:
2251 coding->type = coding_type_iso2022;
2253 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2254 Lisp_Object *flags;
2255 int i, charset, default_reg_bits = 0;
2257 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2258 goto label_invalid_coding_system;
2260 flags = XVECTOR (val)->contents;
2261 coding->flags
2262 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2263 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2264 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2265 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2266 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2267 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2268 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2269 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2270 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2271 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2272 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2274 /* Invoke graphic register 0 to plane 0. */
2275 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2276 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2277 CODING_SPEC_ISO_INVOCATION (coding, 1)
2278 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2279 /* Not single shifting at first. */
2280 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2281 /* Beginning of buffer should also be regarded as bol. */
2282 CODING_SPEC_ISO_BOL(coding) = 1;
2284 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2285 FLAGS[REG] can be one of below:
2286 integer CHARSET: CHARSET occupies register I,
2287 t: designate nothing to REG initially, but can be used
2288 by any charsets,
2289 list of integer, nil, or t: designate the first
2290 element (if integer) to REG initially, the remaining
2291 elements (if integer) is designated to REG on request,
2292 if an element is t, REG can be used by any charset,
2293 nil: REG is never used. */
2294 for (charset = 0; charset <= MAX_CHARSET; charset++)
2295 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2296 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2297 for (i = 0; i < 4; i++)
2299 if (INTEGERP (flags[i])
2300 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2301 || (charset = get_charset_id (flags[i])) >= 0)
2303 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2304 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2306 else if (EQ (flags[i], Qt))
2308 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2309 default_reg_bits |= 1 << i;
2311 else if (CONSP (flags[i]))
2313 Lisp_Object tail = flags[i];
2315 if (INTEGERP (XCONS (tail)->car)
2316 && (charset = XINT (XCONS (tail)->car),
2317 CHARSET_VALID_P (charset))
2318 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2320 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2321 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2323 else
2324 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2325 tail = XCONS (tail)->cdr;
2326 while (CONSP (tail))
2328 if (INTEGERP (XCONS (tail)->car)
2329 && (charset = XINT (XCONS (tail)->car),
2330 CHARSET_VALID_P (charset))
2331 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2332 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2333 = i;
2334 else if (EQ (XCONS (tail)->car, Qt))
2335 default_reg_bits |= 1 << i;
2336 tail = XCONS (tail)->cdr;
2339 else
2340 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2342 CODING_SPEC_ISO_DESIGNATION (coding, i)
2343 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2346 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2348 /* REG 1 can be used only by locking shift in 7-bit env. */
2349 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2350 default_reg_bits &= ~2;
2351 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2352 /* Without any shifting, only REG 0 and 1 can be used. */
2353 default_reg_bits &= 3;
2356 for (charset = 0; charset <= MAX_CHARSET; charset++)
2357 if (CHARSET_VALID_P (charset)
2358 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2359 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2361 /* We have not yet decided where to designate CHARSET. */
2362 int reg_bits = default_reg_bits;
2364 if (CHARSET_CHARS (charset) == 96)
2365 /* A charset of CHARS96 can't be designated to REG 0. */
2366 reg_bits &= ~1;
2368 if (reg_bits)
2369 /* There exist some default graphic register. */
2370 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2371 = (reg_bits & 1
2372 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2373 else
2374 /* We anyway have to designate CHARSET to somewhere. */
2375 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2376 = (CHARSET_CHARS (charset) == 94
2378 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2379 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2381 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2382 ? 2 : 0)));
2385 coding->require_flushing = 1;
2386 break;
2388 case 3:
2389 coding->type = coding_type_big5;
2390 coding->flags
2391 = (NILP (XVECTOR (coding_system)->contents[4])
2392 ? CODING_FLAG_BIG5_HKU
2393 : CODING_FLAG_BIG5_ETEN);
2394 break;
2396 case 4:
2397 coding->type = coding_type_ccl;
2399 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2400 if (CONSP (val)
2401 && VECTORP (XCONS (val)->car)
2402 && VECTORP (XCONS (val)->cdr))
2404 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2405 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2407 else
2408 goto label_invalid_coding_system;
2410 coding->require_flushing = 1;
2411 break;
2413 default:
2414 if (EQ (type, Qt))
2415 coding->type = coding_type_undecided;
2416 else
2417 coding->type = coding_type_no_conversion;
2418 break;
2420 return 0;
2422 label_invalid_coding_system:
2423 coding->type = coding_type_no_conversion;
2424 coding->eol_type = CODING_EOL_LF;
2425 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2426 = Qnil;
2427 return -1;
2430 /* Emacs has a mechanism to automatically detect a coding system if it
2431 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2432 it's impossible to distinguish some coding systems accurately
2433 because they use the same range of codes. So, at first, coding
2434 systems are categorized into 8, those are:
2436 o coding-category-emacs-mule
2438 The category for a coding system which has the same code range
2439 as Emacs' internal format. Assigned the coding-system (Lisp
2440 symbol) `emacs-mule' by default.
2442 o coding-category-sjis
2444 The category for a coding system which has the same code range
2445 as SJIS. Assigned the coding-system (Lisp
2446 symbol) `shift-jis' by default.
2448 o coding-category-iso-7
2450 The category for a coding system which has the same code range
2451 as ISO2022 of 7-bit environment. Assigned the coding-system
2452 (Lisp symbol) `iso-2022-7' by default.
2454 o coding-category-iso-8-1
2456 The category for a coding system which has the same code range
2457 as ISO2022 of 8-bit environment and graphic plane 1 used only
2458 for DIMENSION1 charset. Assigned the coding-system (Lisp
2459 symbol) `iso-8859-1' by default.
2461 o coding-category-iso-8-2
2463 The category for a coding system which has the same code range
2464 as ISO2022 of 8-bit environment and graphic plane 1 used only
2465 for DIMENSION2 charset. Assigned the coding-system (Lisp
2466 symbol) `euc-japan' by default.
2468 o coding-category-iso-else
2470 The category for a coding system which has the same code range
2471 as ISO2022 but does not belong to any of the above three
2472 categories. Assigned the coding-system (Lisp symbol)
2473 `iso-2022-ss2-7' by default.
2475 o coding-category-big5
2477 The category for a coding system which has the same code range
2478 as BIG5. Assigned the coding-system (Lisp symbol)
2479 `cn-big5' by default.
2481 o coding-category-binary
2483 The category for a coding system not categorized in any of the
2484 above. Assigned the coding-system (Lisp symbol)
2485 `no-conversion' by default.
2487 Each of them is a Lisp symbol and the value is an actual
2488 `coding-system' (this is also a Lisp symbol) assigned by a user.
2489 What Emacs does actually is to detect a category of coding system.
2490 Then, it uses a `coding-system' assigned to it. If Emacs can't
2491 decide on only one possible category, it selects a category of the
2492 highest priority. Priorities of categories are also specified by a
2493 user in a Lisp variable `coding-category-list'. */
2497 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2498 If it detects possible coding systems, return an integer in which
2499 appropriate flag bits are set. Flag bits are defined by macros
2500 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2503 detect_coding_mask (src, src_bytes)
2504 unsigned char *src;
2505 int src_bytes;
2507 register unsigned char c;
2508 unsigned char *src_end = src + src_bytes;
2509 int mask;
2511 /* At first, skip all ASCII characters and control characters except
2512 for three ISO2022 specific control characters. */
2513 label_loop_detect_coding:
2514 while (src < src_end)
2516 c = *src;
2517 if (c >= 0x80
2518 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2519 break;
2520 src++;
2523 if (src >= src_end)
2524 /* We found nothing other than ASCII. There's nothing to do. */
2525 return CODING_CATEGORY_MASK_ANY;
2527 /* The text seems to be encoded in some multilingual coding system.
2528 Now, try to find in which coding system the text is encoded. */
2529 if (c < 0x80)
2531 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2532 /* C is an ISO2022 specific control code of C0. */
2533 mask = detect_coding_iso2022 (src, src_end);
2534 src++;
2535 if (mask == CODING_CATEGORY_MASK_ANY)
2536 /* No valid ISO2022 code follows C. Try again. */
2537 goto label_loop_detect_coding;
2539 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2540 /* C is an ISO2022 specific control code of C1,
2541 or the first byte of SJIS's 2-byte character code,
2542 or a leading code of Emacs. */
2543 mask = (detect_coding_iso2022 (src, src_end)
2544 | detect_coding_sjis (src, src_end)
2545 | detect_coding_emacs_mule (src, src_end));
2547 else if (c < 0xA0)
2548 /* C is the first byte of SJIS character code,
2549 or a leading-code of Emacs. */
2550 mask = (detect_coding_sjis (src, src_end)
2551 | detect_coding_emacs_mule (src, src_end));
2553 else
2554 /* C is a character of ISO2022 in graphic plane right,
2555 or a SJIS's 1-byte character code (i.e. JISX0201),
2556 or the first byte of BIG5's 2-byte code. */
2557 mask = (detect_coding_iso2022 (src, src_end)
2558 | detect_coding_sjis (src, src_end)
2559 | detect_coding_big5 (src, src_end));
2561 return mask;
2564 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2565 The information of the detected coding system is set in CODING. */
2567 void
2568 detect_coding (coding, src, src_bytes)
2569 struct coding_system *coding;
2570 unsigned char *src;
2571 int src_bytes;
2573 int mask = detect_coding_mask (src, src_bytes);
2574 int idx;
2576 if (mask == CODING_CATEGORY_MASK_ANY)
2577 /* We found nothing other than ASCII. There's nothing to do. */
2578 return;
2580 if (!mask)
2581 /* The source text seems to be encoded in unknown coding system.
2582 Emacs regards the category of such a kind of coding system as
2583 `coding-category-binary'. We assume that a user has assigned
2584 an appropriate coding system for a `coding-category-binary'. */
2585 idx = CODING_CATEGORY_IDX_BINARY;
2586 else
2588 /* We found some plausible coding systems. Let's use a coding
2589 system of the highest priority. */
2590 Lisp_Object val = Vcoding_category_list;
2592 if (CONSP (val))
2593 while (!NILP (val))
2595 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2596 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2597 break;
2598 val = XCONS (val)->cdr;
2600 else
2601 val = Qnil;
2603 if (NILP (val))
2605 /* For unknown reason, `Vcoding_category_list' contains none
2606 of found categories. Let's use any of them. */
2607 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2608 if (mask & (1 << idx))
2609 break;
2612 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2615 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2616 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2617 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
2620 detect_eol_type (src, src_bytes)
2621 unsigned char *src;
2622 int src_bytes;
2624 unsigned char *src_end = src + src_bytes;
2625 unsigned char c;
2627 while (src < src_end)
2629 c = *src++;
2630 if (c == '\n')
2631 return CODING_EOL_LF;
2632 else if (c == '\r')
2634 if (src < src_end && *src == '\n')
2635 return CODING_EOL_CRLF;
2636 else
2637 return CODING_EOL_CR;
2640 return CODING_EOL_UNDECIDED;
2643 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2644 is encoded. If it detects an appropriate format of end-of-line, it
2645 sets the information in *CODING. */
2647 void
2648 detect_eol (coding, src, src_bytes)
2649 struct coding_system *coding;
2650 unsigned char *src;
2651 int src_bytes;
2653 Lisp_Object val;
2654 int eol_type = detect_eol_type (src, src_bytes);
2656 if (eol_type == CODING_EOL_UNDECIDED)
2657 /* We found no end-of-line in the source text. */
2658 return;
2660 val = Fget (coding->symbol, Qeol_type);
2661 if (VECTORP (val) && XVECTOR (val)->size == 3)
2662 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2665 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2666 decoding, it may detect coding system and format of end-of-line if
2667 those are not yet decided. */
/* Top-level decoding entry point: decode SRC_BYTES bytes at SOURCE
   into DESTINATION (room for DST_BYTES) by dispatching on
   `coding->type'.  The number of source bytes used is stored in
   *CONSUMED and the number of bytes produced is returned.  */
2670 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2671 struct coding_system *coding;
2672 unsigned char *source, *destination;
2673 int src_bytes, dst_bytes;
2674 int *consumed;
2676 int produced;
/* Empty input: nothing consumed, nothing produced.  */
2678 if (src_bytes <= 0)
2680 *consumed = 0;
2681 return 0;
/* Both detectors may replace the contents of *CODING (they end up
   calling setup_coding_system), so they run before the dispatch.  */
2684 if (coding->type == coding_type_undecided)
2685 detect_coding (coding, source, src_bytes);
2687 if (coding->eol_type == CODING_EOL_UNDECIDED)
2688 detect_eol (coding, source, src_bytes);
/* Start with no carried-over bytes; a decoder that stops in the
   middle of a sequence (e.g. decode_eol) records them again.  */
2690 coding->carryover_size = 0;
2691 switch (coding->type)
/* No conversion: bounded byte copy.  Also the target of the `goto'
   in the emacs-mule/undecided case below.  */
2693 case coding_type_no_conversion:
2694 label_no_conversion:
2695 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2696 bcopy (source, destination, produced);
2697 *consumed = produced;
2698 break;
/* Internal format (or still undecided after detection): only the
   end-of-line format may need converting.  */
2700 case coding_type_emacs_mule:
2701 case coding_type_undecided:
2702 if (coding->eol_type == CODING_EOL_LF
2703 || coding->eol_type == CODING_EOL_UNDECIDED)
2704 goto label_no_conversion;
2705 produced = decode_eol (coding, source, destination,
2706 src_bytes, dst_bytes, consumed);
2707 break;
/* SJIS and BIG5 go through the same decoder.  NOTE(review): its
   trailing argument presumably selects the variant -- confirm
   against the definition of decode_coding_sjis_big5.  */
2709 case coding_type_sjis:
2710 produced = decode_coding_sjis_big5 (coding, source, destination,
2711 src_bytes, dst_bytes, consumed,
2713 break;
2715 case coding_type_iso2022:
2716 produced = decode_coding_iso2022 (coding, source, destination,
2717 src_bytes, dst_bytes, consumed);
2718 break;
2720 case coding_type_big5:
2721 produced = decode_coding_sjis_big5 (coding, source, destination,
2722 src_bytes, dst_bytes, consumed,
2724 break;
/* CCL: run the decoder program stored in `coding->spec.ccl'.  */
2726 case coding_type_ccl:
2727 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2728 src_bytes, dst_bytes, consumed);
2729 break;
2732 return produced;
2735 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
/* Top-level encoding entry point: encode SRC_BYTES bytes at SOURCE
   into DESTINATION (room for DST_BYTES) by dispatching on
   `coding->type'.  The number of source bytes used is stored in
   *CONSUMED and the number of bytes produced is returned.  */
2738 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2739 struct coding_system *coding;
2740 unsigned char *source, *destination;
2741 int src_bytes, dst_bytes;
2742 int *consumed;
2744 int produced;
/* Start with no carried-over bytes.  */
2746 coding->carryover_size = 0;
2747 switch (coding->type)
/* No conversion: bounded byte copy.  In selective mode every CR
   ('\015') of the copy is rewritten to '\n'.  Also the target of the
   `goto' in the emacs-mule/undecided case below.  */
2749 case coding_type_no_conversion:
2750 label_no_conversion:
2751 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2752 if (produced > 0)
2754 bcopy (source, destination, produced);
2755 if (coding->selective)
2757 unsigned char *p = destination, *pend = destination + produced;
2758 while (p < pend)
2759 if (*p++ == '\015') p[-1] = '\n';
2762 *consumed = produced;
2763 break;
/* Internal format: only the end-of-line format may need converting;
   LF and undecided are handled as a plain copy above.  */
2765 case coding_type_emacs_mule:
2766 case coding_type_undecided:
2767 if (coding->eol_type == CODING_EOL_LF
2768 || coding->eol_type == CODING_EOL_UNDECIDED)
2769 goto label_no_conversion;
2770 produced = encode_eol (coding, source, destination,
2771 src_bytes, dst_bytes, consumed);
2772 break;
/* SJIS and BIG5 go through the same encoder.  NOTE(review): its
   trailing argument presumably selects the variant -- confirm
   against the definition of encode_coding_sjis_big5.  */
2774 case coding_type_sjis:
2775 produced = encode_coding_sjis_big5 (coding, source, destination,
2776 src_bytes, dst_bytes, consumed,
2778 break;
2780 case coding_type_iso2022:
2781 produced = encode_coding_iso2022 (coding, source, destination,
2782 src_bytes, dst_bytes, consumed);
2783 break;
2785 case coding_type_big5:
2786 produced = encode_coding_sjis_big5 (coding, source, destination,
2787 src_bytes, dst_bytes, consumed,
2789 break;
/* CCL: run the encoder program stored in `coding->spec.ccl'.  */
2791 case coding_type_ccl:
2792 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2793 src_bytes, dst_bytes, consumed);
2794 break;
2797 return produced;
2800 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2802 /* Return maximum size (bytes) of a buffer enough for decoding
2803 SRC_BYTES of text encoded in CODING. */
2806 decoding_buffer_size (coding, src_bytes)
2807 struct coding_system *coding;
2808 int src_bytes;
2810 int magnification;
2812 if (coding->type == coding_type_iso2022)
2813 magnification = 3;
2814 else if (coding->type == coding_type_ccl)
2815 magnification = coding->spec.ccl.decoder.buf_magnification;
2816 else
2817 magnification = 2;
2819 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2822 /* Return maximum size (bytes) of a buffer enough for encoding
2823 SRC_BYTES of text to CODING. */
2826 encoding_buffer_size (coding, src_bytes)
2827 struct coding_system *coding;
2828 int src_bytes;
2830 int magnification;
2832 if (coding->type == coding_type_ccl)
2833 magnification = coding->spec.ccl.encoder.buf_magnification;
2834 else
2835 magnification = 3;
2837 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2840 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2841 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2842 #endif
2844 char *conversion_buffer;
2845 int conversion_buffer_size;
2847 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2848 or decoding. Sufficient memory is allocated automatically. If we
2849 run out of memory, return NULL. */
2851 char *
2852 get_conversion_buffer (size)
2853 int size;
2855 if (size > conversion_buffer_size)
2857 char *buf;
2858 int real_size = conversion_buffer_size * 2;
2860 while (real_size < size) real_size *= 2;
2861 buf = (char *) xmalloc (real_size);
2862 xfree (conversion_buffer);
2863 conversion_buffer = buf;
2864 conversion_buffer_size = real_size;
2866 return conversion_buffer;
2870 #ifdef emacs
2871 /*** 7. Emacs Lisp library functions ***/
2873 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
2874 1, 1, 0,
2875 "Return coding-spec of CODING-SYSTEM.\n\
2876 If CODING-SYSTEM is not a valid coding-system, return nil.")
2877 (obj)
2878 Lisp_Object obj;
2880 while (SYMBOLP (obj) && !NILP (obj))
2881 obj = Fget (obj, Qcoding_system);
2882 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2883 ? Qnil : obj);
2886 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2887 "Return t if OBJECT is nil or a coding-system.\n\
2888 See document of make-coding-system for coding-system object.")
2889 (obj)
2890 Lisp_Object obj;
2892 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
2895 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2896 Sread_non_nil_coding_system, 1, 1, 0,
2897 "Read a coding system from the minibuffer, prompting with string PROMPT.")
2898 (prompt)
2899 Lisp_Object prompt;
2901 Lisp_Object val;
2904 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
2905 Qt, Qnil, Qnil, Qnil);
2907 while (XSTRING (val)->size == 0);
2908 return (Fintern (val, Qnil));
2911 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2912 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2913 (prompt)
2914 Lisp_Object prompt;
2916 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2917 Qt, Qnil, Qnil, Qnil);
2918 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
2921 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2922 1, 1, 0,
2923 "Check validity of CODING-SYSTEM.\n\
2924 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2925 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2926 The value of property should be a vector of length 5.")
2927 (coding_system)
2928 Lisp_Object coding_system;
2930 CHECK_SYMBOL (coding_system, 0);
2931 if (!NILP (Fcoding_system_p (coding_system)))
2932 return coding_system;
2933 while (1)
2934 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
2937 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2938 2, 2, 0,
2939 "Detect coding-system of the text in the region between START and END.\n\
2940 Return a list of possible coding-systems ordered by priority.\n\
2941 If only ASCII characters are found, it returns `undecided'\n\
2942 or its subsidiary coding-system according to a detected end-of-line format.")
2943 (b, e)
2944 Lisp_Object b, e;
2946 int coding_mask, eol_type;
2947 Lisp_Object val;
2948 int beg, end;
2950 validate_region (&b, &e);
2951 beg = XINT (b), end = XINT (e);
2952 if (beg < GPT && end >= GPT) move_gap (end);
2954 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2955 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
2957 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2959 val = intern ("undecided");
2960 if (eol_type != CODING_EOL_UNDECIDED)
2962 Lisp_Object val2 = Fget (val, Qeol_type);
2963 if (VECTORP (val2))
2964 val = XVECTOR (val2)->contents[eol_type];
2967 else
2969 Lisp_Object val2;
2971 /* At first, gather possible coding-systems in VAL in a reverse
2972 order. */
2973 val = Qnil;
2974 for (val2 = Vcoding_category_list;
2975 !NILP (val2);
2976 val2 = XCONS (val2)->cdr)
2978 int idx
2979 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2980 if (coding_mask & (1 << idx))
2981 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2984 /* Then, change the order of the list, while getting subsidiary
2985 coding-systems. */
2986 val2 = val;
2987 val = Qnil;
2988 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2990 if (eol_type == CODING_EOL_UNDECIDED)
2991 val = Fcons (XCONS (val2)->car, val);
2992 else
2994 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2995 if (VECTORP (val3))
2996 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2997 else
2998 val = Fcons (XCONS (val2)->car, val);
3003 return val;
3006 /* Scan text in the region between *BEGP and *ENDP, skip characters
3007 which we never have to encode to (iff ENCODEP is 1) or decode from
3008 coding system CODING at the head and tail, then set BEGP and ENDP
3009 to the addresses of start and end of the text we actually convert. */
3011 void
3012 shrink_conversion_area (begp, endp, coding, encodep)
3013 unsigned char **begp, **endp;
3014 struct coding_system *coding;
3015 int encodep;
3017 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3019 if (coding->eol_type != CODING_EOL_LF
3020 && coding->eol_type != CODING_EOL_UNDECIDED)
3021 /* Since we anyway have to convert end-of-line format, it is not
3022 worth skipping at most 100 bytes or so. */
3023 return;
3025 if (encodep) /* for encoding */
3027 switch (coding->type)
3029 case coding_type_no_conversion:
3030 case coding_type_emacs_mule:
3031 case coding_type_undecided:
3032 /* We need no conversion. */
3033 *begp = *endp;
3034 return;
3035 case coding_type_ccl:
3036 /* We can't skip any data. */
3037 return;
3038 case coding_type_iso2022:
3039 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3041 unsigned char *bol = beg_addr;
3042 while (beg_addr < end_addr && *beg_addr < 0x80)
3044 beg_addr++;
3045 if (*(beg_addr - 1) == '\n')
3046 bol = beg_addr;
3048 beg_addr = bol;
3049 goto label_skip_tail;
3051 /* fall down ... */
3052 default:
3053 /* We can skip all ASCII characters at the head and tail. */
3054 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3055 label_skip_tail:
3056 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3057 break;
3060 else /* for decoding */
3062 switch (coding->type)
3064 case coding_type_no_conversion:
3065 /* We need no conversion. */
3066 *begp = *endp;
3067 return;
3068 case coding_type_emacs_mule:
3069 if (coding->eol_type == CODING_EOL_LF)
3071 /* We need no conversion. */
3072 *begp = *endp;
3073 return;
3075 /* We can skip all but carriage-return. */
3076 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3077 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3078 break;
3079 case coding_type_sjis:
3080 case coding_type_big5:
3081 /* We can skip all ASCII characters at the head. */
3082 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3083 /* We can skip all ASCII characters at the tail except for
3084 the second byte of SJIS or BIG5 code. */
3085 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3086 if (end_addr != *endp)
3087 end_addr++;
3088 break;
3089 case coding_type_ccl:
3090 /* We can't skip any data. */
3091 return;
3092 default: /* i.e. case coding_type_iso2022: */
3094 unsigned char c;
3096 /* We can skip all ASCII characters except for a few
3097 control codes at the head. */
3098 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3099 && c != ISO_CODE_CR && c != ISO_CODE_SO
3100 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3101 beg_addr++;
3103 break;
3106 *begp = beg_addr;
3107 *endp = end_addr;
3108 return;
3111 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3112 text between B and E. B and E are buffer position. */
3114 Lisp_Object
3115 code_convert_region (b, e, coding, encodep)
3116 Lisp_Object b, e;
3117 struct coding_system *coding;
3118 int encodep;
3120 int beg, end, len, consumed, produced;
3121 char *buf;
3122 unsigned char *begp, *endp;
3123 int pos = PT;
3125 validate_region (&b, &e);
3126 beg = XINT (b), end = XINT (e);
3127 if (beg < GPT && end >= GPT)
3128 move_gap (end);
3130 if (encodep && !NILP (coding->pre_write_conversion))
3132 /* We must call a pre-conversion function which may put a new
3133 text to be converted in a new buffer. */
3134 struct buffer *old = current_buffer, *new;
3136 TEMP_SET_PT (beg);
3137 call2 (coding->pre_write_conversion, b, e);
3138 if (old != current_buffer)
3140 /* Replace the original text by the text just generated. */
3141 len = ZV - BEGV;
3142 new = current_buffer;
3143 set_buffer_internal (old);
3144 del_range (beg, end);
3145 insert_from_buffer (new, 1, len, 0);
3146 end = beg + len;
3150 /* We may be able to shrink the conversion region. */
3151 begp = POS_ADDR (beg); endp = begp + (end - beg);
3152 shrink_conversion_area (&begp, &endp, coding, encodep);
3154 if (begp == endp)
3155 /* We need no conversion. */
3156 len = end - beg;
3157 else
3159 beg += begp - POS_ADDR (beg);
3160 end = beg + (endp - begp);
3162 if (encodep)
3163 len = encoding_buffer_size (coding, end - beg);
3164 else
3165 len = decoding_buffer_size (coding, end - beg);
3166 buf = get_conversion_buffer (len);
3168 coding->last_block = 1;
3169 produced = (encodep
3170 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3171 &consumed)
3172 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3173 &consumed));
3175 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3177 TEMP_SET_PT (beg);
3178 insert (buf, produced);
3179 del_range (PT, PT + end - beg);
3180 if (pos >= end)
3181 pos = PT + (pos - end);
3182 else if (pos > beg)
3183 pos = beg;
3184 TEMP_SET_PT (pos);
3187 if (!encodep && !NILP (coding->post_read_conversion))
3189 /* We must call a post-conversion function which may alter
3190 the text just converted. */
3191 Lisp_Object insval;
3193 beg = XINT (b);
3194 TEMP_SET_PT (beg);
3195 insval = call1 (coding->post_read_conversion, make_number (len));
3196 CHECK_NUMBER (insval, 0);
3197 len = XINT (insval);
3200 return make_number (len);
3203 Lisp_Object
3204 code_convert_string (str, coding, encodep, nocopy)
3205 Lisp_Object str, nocopy;
3206 struct coding_system *coding;
3207 int encodep;
3209 int len, consumed, produced;
3210 char *buf;
3211 unsigned char *begp, *endp;
3212 int head_skip, tail_skip;
3213 struct gcpro gcpro1;
3215 if (encodep && !NILP (coding->pre_write_conversion)
3216 || !encodep && !NILP (coding->post_read_conversion))
3218 /* Since we have to call Lisp functions which assume target text
3219 is in a buffer, after setting a temporary buffer, call
3220 code_convert_region. */
3221 int count = specpdl_ptr - specpdl;
3222 int len = XSTRING (str)->size;
3223 Lisp_Object result;
3224 struct buffer *old = current_buffer;
3226 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3227 temp_output_buffer_setup (" *code-converting-work*");
3228 set_buffer_internal (XBUFFER (Vstandard_output));
3229 insert_from_string (str, 0, len, 0);
3230 code_convert_region (make_number (BEGV), make_number (ZV),
3231 coding, encodep);
3232 result = make_buffer_string (BEGV, ZV, 0);
3233 set_buffer_internal (old);
3234 return unbind_to (count, result);
3237 /* We may be able to shrink the conversion region. */
3238 begp = XSTRING (str)->data;
3239 endp = begp + XSTRING (str)->size;
3240 shrink_conversion_area (&begp, &endp, coding, encodep);
3242 if (begp == endp)
3243 /* We need no conversion. */
3244 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3246 head_skip = begp - XSTRING (str)->data;
3247 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3249 GCPRO1 (str);
3251 if (encodep)
3252 len = encoding_buffer_size (coding, endp - begp);
3253 else
3254 len = decoding_buffer_size (coding, endp - begp);
3255 buf = get_conversion_buffer (len + head_skip + tail_skip);
3257 bcopy (XSTRING (str)->data, buf, head_skip);
3258 coding->last_block = 1;
3259 produced = (encodep
3260 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3261 buf + head_skip, endp - begp, len, &consumed)
3262 : decode_coding (coding, XSTRING (str)->data + head_skip,
3263 buf + head_skip, endp - begp, len, &consumed));
3264 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3265 buf + head_skip + produced,
3266 tail_skip);
3268 UNGCPRO;
3270 return make_string (buf, head_skip + produced + tail_skip);
3273 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3274 3, 3, "r\nzCoding system: ",
3275 "Decode current region by specified coding system.\n\
3276 When called from a program, takes three arguments:\n\
3277 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3278 Return length of decoded text.")
3279 (b, e, coding_system)
3280 Lisp_Object b, e, coding_system;
3282 struct coding_system coding;
3284 CHECK_NUMBER_COERCE_MARKER (b, 0);
3285 CHECK_NUMBER_COERCE_MARKER (e, 1);
3286 CHECK_SYMBOL (coding_system, 2);
3288 if (NILP (coding_system))
3289 return make_number (XFASTINT (e) - XFASTINT (b));
3290 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3291 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3293 return code_convert_region (b, e, &coding, 0);
3296 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3297 3, 3, "r\nzCoding system: ",
3298 "Encode current region by specified coding system.\n\
3299 When called from a program, takes three arguments:\n\
3300 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3301 Return length of encoded text.")
3302 (b, e, coding_system)
3303 Lisp_Object b, e, coding_system;
3305 struct coding_system coding;
3307 CHECK_NUMBER_COERCE_MARKER (b, 0);
3308 CHECK_NUMBER_COERCE_MARKER (e, 1);
3309 CHECK_SYMBOL (coding_system, 2);
3311 if (NILP (coding_system))
3312 return make_number (XFASTINT (e) - XFASTINT (b));
3313 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3314 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3316 return code_convert_region (b, e, &coding, 1);
3319 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3320 2, 3, 0,
3321 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3322 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3323 of decoding.")
3324 (string, coding_system, nocopy)
3325 Lisp_Object string, coding_system, nocopy;
3327 struct coding_system coding;
3329 CHECK_STRING (string, 0);
3330 CHECK_SYMBOL (coding_system, 1);
3332 if (NILP (coding_system))
3333 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3334 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3335 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3337 return code_convert_string (string, &coding, 0, nocopy);
3340 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3341 2, 3, 0,
3342 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3343 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3344 of encoding.")
3345 (string, coding_system, nocopy)
3346 Lisp_Object string, coding_system, nocopy;
3348 struct coding_system coding;
3350 CHECK_STRING (string, 0);
3351 CHECK_SYMBOL (coding_system, 1);
3353 if (NILP (coding_system))
3354 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3355 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3356 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3358 return code_convert_string (string, &coding, 1, nocopy);
3361 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3362 "Decode a JISX0208 character of shift-jis encoding.\n\
3363 CODE is the character code in SJIS.\n\
3364 Return the corresponding character.")
3365 (code)
3366 Lisp_Object code;
3368 unsigned char c1, c2, s1, s2;
3369 Lisp_Object val;
3371 CHECK_NUMBER (code, 0);
3372 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3373 DECODE_SJIS (s1, s2, c1, c2);
3374 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3375 return val;
3378 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3379 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3380 Return the corresponding character code in SJIS.")
3381 (ch)
3382 Lisp_Object ch;
3384 int charset, c1, c2, s1, s2;
3385 Lisp_Object val;
3387 CHECK_NUMBER (ch, 0);
3388 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3389 if (charset == charset_jisx0208)
3391 ENCODE_SJIS (c1, c2, s1, s2);
3392 XSETFASTINT (val, (s1 << 8) | s2);
3394 else
3395 XSETFASTINT (val, 0);
3396 return val;
3399 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3400 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3401 CODE is the character code in BIG5.\n\
3402 Return the corresponding character.")
3403 (code)
3404 Lisp_Object code;
3406 int charset;
3407 unsigned char b1, b2, c1, c2;
3408 Lisp_Object val;
3410 CHECK_NUMBER (code, 0);
3411 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3412 DECODE_BIG5 (b1, b2, charset, c1, c2);
3413 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3414 return val;
3417 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3418 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3419 Return the corresponding character code in Big5.")
3420 (ch)
3421 Lisp_Object ch;
3423 int charset, c1, c2, b1, b2;
3424 Lisp_Object val;
3426 CHECK_NUMBER (ch, 0);
3427 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3428 if (charset == charset_big5_1 || charset == charset_big5_2)
3430 ENCODE_BIG5 (charset, c1, c2, b1, b2);
3431 XSETFASTINT (val, (b1 << 8) | b2);
3433 else
3434 XSETFASTINT (val, 0);
3435 return val;
3438 DEFUN ("set-terminal-coding-system-internal",
3439 Fset_terminal_coding_system_internal,
3440 Sset_terminal_coding_system_internal, 1, 1, 0, "")
3441 (coding_system)
3442 Lisp_Object coding_system;
3444 CHECK_SYMBOL (coding_system, 0);
3445 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3446 return Qnil;
3449 DEFUN ("terminal-coding-system",
3450 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3451 "Return coding-system of your terminal.")
3454 return terminal_coding.symbol;
3457 DEFUN ("set-keyboard-coding-system-internal",
3458 Fset_keyboard_coding_system_internal,
3459 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3460 (coding_system)
3461 Lisp_Object coding_system;
3463 CHECK_SYMBOL (coding_system, 0);
3464 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3465 return Qnil;
3468 DEFUN ("keyboard-coding-system",
3469 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3470 "Return coding-system of what is sent from terminal keyboard.")
3473 return keyboard_coding.symbol;
3477 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3478 Sfind_operation_coding_system, 1, MANY, 0,
3479 "Choose a coding system for an operation based on the target name.\n\
3480 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3481 DECODING-SYSTEM is the coding system to use for decoding\n\
3482 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3483 for encoding (in case OPERATION does encoding).\n\
3485 The first argument OPERATION specifies an I/O primitive:\n\
3486 For file I/O, `insert-file-contents' or `write-region'.\n\
3487 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3488 For network I/O, `open-network-stream'.\n\
3490 The remaining arguments should be the same arguments that were passed\n\
3491 to the primitive. Depending on which primitive, one of those arguments\n\
3492 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3493 whichever argument specifies the file name is TARGET.\n\
3495 TARGET has a meaning which depends on OPERATION:\n\
3496 For file I/O, TARGET is a file name.\n\
3497 For process I/O, TARGET is a process name.\n\
3498 For network I/O, TARGET is a service name or a port number\n\
3500 This function looks up what specified for TARGET in,\n\
3501 `file-coding-system-alist', `process-coding-system-alist',\n\
3502 or `network-coding-system-alist' depending on OPERATION.\n\
3503 They may specify a coding system, a cons of coding systems,\n\
3504 or a function symbol to call.\n\
3505 In the last case, we call the function with one argument,\n\
3506 which is a list of all the arguments given to this function.")
3507 (nargs, args)
3508 int nargs;
3509 Lisp_Object *args;
3511 Lisp_Object operation, target_idx, target, val;
3512 register Lisp_Object chain;
3514 if (nargs < 2)
3515 error ("Too few arguments");
3516 operation = args[0];
3517 if (!SYMBOLP (operation)
3518 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3519 error ("Invalid first arguement");
3520 if (nargs < 1 + XINT (target_idx))
3521 error ("Too few arguments for operation: %s",
3522 XSYMBOL (operation)->name->data);
3523 target = args[XINT (target_idx) + 1];
3524 if (!(STRINGP (target)
3525 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3526 error ("Invalid %dth argument", XINT (target_idx) + 1);
3528 chain = ((EQ (operation, Qinsert_file_contents)
3529 || EQ (operation, Qwrite_region))
3530 ? Vfile_coding_system_alist
3531 : (EQ (operation, Qopen_network_stream)
3532 ? Vnetwork_coding_system_alist
3533 : Vprocess_coding_system_alist));
3534 if (NILP (chain))
3535 return Qnil;
3537 for (; CONSP (chain); chain = XCONS (chain)->cdr)
3539 Lisp_Object elt = XCONS (chain)->car;
3541 if (CONSP (elt)
3542 && ((STRINGP (target)
3543 && STRINGP (XCONS (elt)->car)
3544 && fast_string_match (XCONS (elt)->car, target) >= 0)
3545 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3547 val = XCONS (elt)->cdr;
3548 if (CONSP (val))
3549 return val;
3550 if (! SYMBOLP (val))
3551 return Qnil;
3552 if (! NILP (Fcoding_system_p (val)))
3553 return Fcons (val, val);
3554 if (!NILP (Fboundp (val)))
3555 return call1 (val, Flist (nargs, args));
3556 return Qnil;
3559 return Qnil;
3562 #endif /* emacs */
3565 /*** 8. Post-amble ***/
3567 init_coding_once ()
3569 int i;
3571 /* Emacs' internal format specific initialize routine. */
3572 for (i = 0; i <= 0x20; i++)
3573 emacs_code_class[i] = EMACS_control_code;
3574 emacs_code_class[0x0A] = EMACS_linefeed_code;
3575 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3576 for (i = 0x21 ; i < 0x7F; i++)
3577 emacs_code_class[i] = EMACS_ascii_code;
3578 emacs_code_class[0x7F] = EMACS_control_code;
3579 emacs_code_class[0x80] = EMACS_leading_code_composition;
3580 for (i = 0x81; i < 0xFF; i++)
3581 emacs_code_class[i] = EMACS_invalid_code;
3582 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3583 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3584 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3585 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3587 /* ISO2022 specific initialize routine. */
3588 for (i = 0; i < 0x20; i++)
3589 iso_code_class[i] = ISO_control_code;
3590 for (i = 0x21; i < 0x7F; i++)
3591 iso_code_class[i] = ISO_graphic_plane_0;
3592 for (i = 0x80; i < 0xA0; i++)
3593 iso_code_class[i] = ISO_control_code;
3594 for (i = 0xA1; i < 0xFF; i++)
3595 iso_code_class[i] = ISO_graphic_plane_1;
3596 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3597 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3598 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3599 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3600 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3601 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3602 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3603 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3604 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3605 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3607 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3608 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3610 setup_coding_system (Qnil, &keyboard_coding);
3611 setup_coding_system (Qnil, &terminal_coding);
3613 #if defined (MSDOS) || defined (WINDOWSNT)
3614 system_eol_type = CODING_EOL_CRLF;
3615 #else
3616 system_eol_type = CODING_EOL_LF;
3617 #endif
3620 #ifdef emacs
3622 syms_of_coding ()
3624 Qtarget_idx = intern ("target-idx");
3625 staticpro (&Qtarget_idx);
3627 /* Target FILENAME is the first argument. */
3628 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3629 /* Target FILENAME is the third argument. */
3630 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3632 Qcall_process = intern ("call-process");
3633 staticpro (&Qcall_process);
3634 /* Target PROGRAM is the first argument. */
3635 Fput (Qcall_process, Qtarget_idx, make_number (0));
3637 Qcall_process_region = intern ("call-process-region");
3638 staticpro (&Qcall_process_region);
3639 /* Target PROGRAM is the third argument. */
3640 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3642 Qstart_process = intern ("start-process");
3643 staticpro (&Qstart_process);
3644 /* Target PROGRAM is the third argument. */
3645 Fput (Qstart_process, Qtarget_idx, make_number (2));
3647 Qopen_network_stream = intern ("open-network-stream");
3648 staticpro (&Qopen_network_stream);
3649 /* Target SERVICE is the fourth argument. */
3650 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3652 Qcoding_system = intern ("coding-system");
3653 staticpro (&Qcoding_system);
3655 Qeol_type = intern ("eol-type");
3656 staticpro (&Qeol_type);
3658 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3659 staticpro (&Qbuffer_file_coding_system);
3661 Qpost_read_conversion = intern ("post-read-conversion");
3662 staticpro (&Qpost_read_conversion);
3664 Qpre_write_conversion = intern ("pre-write-conversion");
3665 staticpro (&Qpre_write_conversion);
3667 Qcoding_system_spec = intern ("coding-system-spec");
3668 staticpro (&Qcoding_system_spec);
3670 Qcoding_system_p = intern ("coding-system-p");
3671 staticpro (&Qcoding_system_p);
3673 Qcoding_system_error = intern ("coding-system-error");
3674 staticpro (&Qcoding_system_error);
3676 Fput (Qcoding_system_error, Qerror_conditions,
3677 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3678 Fput (Qcoding_system_error, Qerror_message,
3679 build_string ("Invalid coding system"));
3681 Qcoding_category_index = intern ("coding-category-index");
3682 staticpro (&Qcoding_category_index);
3685 int i;
3686 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3688 coding_category_table[i] = intern (coding_category_name[i]);
3689 staticpro (&coding_category_table[i]);
3690 Fput (coding_category_table[i], Qcoding_category_index,
3691 make_number (i));
3695 Qcharacter_unification_table = intern ("character-unification-table");
3696 staticpro (&Qcharacter_unification_table);
3697 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3698 make_number (0));
3700 Qcharacter_unification_table_for_decode
3701 = intern ("character-unification-table-for-decode");
3702 staticpro (&Qcharacter_unification_table_for_decode);
3704 Qcharacter_unification_table_for_encode
3705 = intern ("character-unification-table-for-encode");
3706 staticpro (&Qcharacter_unification_table_for_encode);
3708 Qemacs_mule = intern ("emacs-mule");
3709 staticpro (&Qemacs_mule);
3711 defsubr (&Scoding_system_spec);
3712 defsubr (&Scoding_system_p);
3713 defsubr (&Sread_coding_system);
3714 defsubr (&Sread_non_nil_coding_system);
3715 defsubr (&Scheck_coding_system);
3716 defsubr (&Sdetect_coding_region);
3717 defsubr (&Sdecode_coding_region);
3718 defsubr (&Sencode_coding_region);
3719 defsubr (&Sdecode_coding_string);
3720 defsubr (&Sencode_coding_string);
3721 defsubr (&Sdecode_sjis_char);
3722 defsubr (&Sencode_sjis_char);
3723 defsubr (&Sdecode_big5_char);
3724 defsubr (&Sencode_big5_char);
3725 defsubr (&Sset_terminal_coding_system_internal);
3726 defsubr (&Sterminal_coding_system);
3727 defsubr (&Sset_keyboard_coding_system_internal);
3728 defsubr (&Skeyboard_coding_system);
3729 defsubr (&Sfind_operation_coding_system);
3731 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3732 "List of coding-categories (symbols) ordered by priority.");
3734 int i;
3736 Vcoding_category_list = Qnil;
3737 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3738 Vcoding_category_list
3739 = Fcons (coding_category_table[i], Vcoding_category_list);
3742 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3743 "A variable of internal use only.\n\
3744 If the value is a coding system, it is used for decoding on read operation.\n\
3745 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3746 Vcoding_system_for_read = Qnil;
3748 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3749 "A variable of internal use only.\n\
3750 If the value is a coding system, it is used for encoding on write operation.\n\
3751 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3752 Vcoding_system_for_write = Qnil;
3754 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3755 "Coding-system used in the latest file or process I/O.");
3756 Vlast_coding_system_used = Qnil;
3758 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3759 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3760 inhibit_eol_conversion = 0;
3762 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3763 "Alist to decide a coding system to use for a file I/O operation.\n\
3764 The format is ((PATTERN . VAL) ...),\n\
3765 where PATTERN is a regular expression matching a file name,\n\
3766 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3767 If VAL is a coding system, it is used for both decoding and encoding\n\
3768 the file contents.\n\
3769 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3770 and the cdr part is used for encoding.\n\
3771 If VAL is a function symbol, the function must return a coding system\n\
3772 or a cons of coding systems which are used as above.\n\
3774 See also the function `find-operation-coding-system'.");
3775 Vfile_coding_system_alist = Qnil;
3777 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3778 "Alist to decide a coding system to use for a process I/O operation.\n\
3779 The format is ((PATTERN . VAL) ...),\n\
3780 where PATTERN is a regular expression matching a program name,\n\
3781 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3782 If VAL is a coding system, it is used for both decoding what received\n\
3783 from the program and encoding what sent to the program.\n\
3784 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3785 and the cdr part is used for encoding.\n\
3786 If VAL is a function symbol, the function must return a coding system\n\
3787 or a cons of coding systems which are used as above.\n\
3789 See also the function `find-operation-coding-system'.");
3790 Vprocess_coding_system_alist = Qnil;
3792 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3793 "Alist to decide a coding system to use for a network I/O operation.\n\
3794 The format is ((PATTERN . VAL) ...),\n\
3795 where PATTERN is a regular expression matching a network service name\n\
3796 or is a port number to connect to,\n\
3797 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3798 If VAL is a coding system, it is used for both decoding what received\n\
3799 from the network stream and encoding what sent to the network stream.\n\
3800 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3801 and the cdr part is used for encoding.\n\
3802 If VAL is a function symbol, the function must return a coding system\n\
3803 or a cons of coding systems which are used as above.\n\
3805 See also the function `find-operation-coding-system'.");
3806 Vnetwork_coding_system_alist = Qnil;
3808 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3809 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3810 eol_mnemonic_unix = ':';
3812 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3813 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3814 eol_mnemonic_dos = '\\';
3816 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3817 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3818 eol_mnemonic_mac = '/';
3820 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3821 "Mnemonic character indicating end-of-line format is not yet decided.");
3822 eol_mnemonic_undecided = ':';
3824 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3825 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3826 Venable_character_unification = Qt;
3828 DEFVAR_LISP ("standard-character-unification-table-for-decode",
3829 &Vstandard_character_unification_table_for_decode,
3830 "Table for unifying characters when reading.");
3831 Vstandard_character_unification_table_for_decode = Qnil;
3833 DEFVAR_LISP ("standard-character-unification-table-for-encode",
3834 &Vstandard_character_unification_table_for_encode,
3835 "Table for unifying characters when writing.");
3836 Vstandard_character_unification_table_for_encode = Qnil;
3838 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3839 "Alist of charsets vs revision numbers.\n\
3840 While encoding, if a charset (car part of an element) is found,\n\
3841 designate it with the escape sequence identifing revision (cdr part of the element).");
3842 Vcharset_revision_alist = Qnil;
3844 DEFVAR_LISP ("default-process-coding-system",
3845 &Vdefault_process_coding_system,
3846 "Cons of coding systems used for process I/O by default.\n\
3847 The car part is used for decoding a process output,\n\
3848 the cdr part is used for encoding a text to be sent to a process.");
3849 Vdefault_process_coding_system = Qnil;
3852 #endif /* emacs */