Comment change.
[emacs.git] / src / coding.c
blob7cb5200c08de39fa2e7fd48d5da172eaf2280375
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
35 /*** GENERAL NOTE on CODING SYSTEM ***
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
44 0. Emacs' internal format (emacs-mule)
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
49 1. ISO2022
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 section 4.
62 3. BIG5
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
70 4. Raw text
72 A coding system for a text containing random 8-bit code. Emacs
73 does no code conversion on such a text except for end-of-line
74 format.
76 5. Other
78 If a user wants to read/write a text encoded in a coding system not
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
83 Emacs represents a coding-system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding-system, the
85 information about it is set in a structure of type `struct
86 coding_system' for rapid processing. See section 6 for more details.
90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
94 whereas DOS's format is two-byte sequence of `carriage-return' and
95 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
97 Since text characters encoding and end-of-line encoding are
98 independent, any coding system described above can take
99 any format of end-of-line. So, Emacs has information of format of
100 end-of-line in each coding-system. See section 6 for more details.
104 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
106 These functions check if a text between SRC and SRC_END is encoded
107 in the coding system category XXX. Each returns an integer value in
108 which appropriate flag bits for the category XXX are set. The flag
109 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
110 template of these functions. */
111 #if 0
113 detect_coding_emacs_mule (src, src_end)
114 unsigned char *src, *src_end;
118 #endif
120 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
122 These functions decode SRC_BYTES length text at SOURCE encoded in
123 CODING to Emacs' internal format (emacs-mule). The resulting text
124 goes to a place pointed to by DESTINATION, the length of which should
125 not exceed DST_BYTES. The number of bytes actually processed is
126 returned as *CONSUMED. The return value is the length of the decoded
127 text. Below is a template of these functions. */
128 #if 0
129 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
130 struct coding_system *coding;
131 unsigned char *source, *destination;
132 int src_bytes, dst_bytes;
133 int *consumed;
137 #endif
139 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
141 These functions encode SRC_BYTES length text at SOURCE of Emacs'
142 internal format (emacs-mule) to CODING. The resulting text goes to
143 a place pointed to by DESTINATION, the length of which should not
144 exceed DST_BYTES. The number of bytes actually processed is
145 returned as *CONSUMED. The return value is the length of the
146 encoded text. Below is a template of these functions. */
147 #if 0
148 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
149 struct coding_system *coding;
150 unsigned char *source, *destination;
151 int src_bytes, dst_bytes;
152 int *consumed;
156 #endif
158 /*** COMMONLY USED MACROS ***/
160 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
161 THREE_MORE_BYTES safely get one, two, and three bytes from the
162 source text respectively. If there are not enough bytes in the
163 source, they jump to `label_end_of_loop'. The caller should set
164 variables `src' and `src_end' to appropriate areas in advance. */
166 #define ONE_MORE_BYTE(c1) \
167 do { \
168 if (src < src_end) \
169 c1 = *src++; \
170 else \
171 goto label_end_of_loop; \
172 } while (0)
174 #define TWO_MORE_BYTES(c1, c2) \
175 do { \
176 if (src + 1 < src_end) \
177 c1 = *src++, c2 = *src++; \
178 else \
179 goto label_end_of_loop; \
180 } while (0)
182 #define THREE_MORE_BYTES(c1, c2, c3) \
183 do { \
184 if (src + 2 < src_end) \
185 c1 = *src++, c2 = *src++, c3 = *src++; \
186 else \
187 goto label_end_of_loop; \
188 } while (0)
190 /* The following three macros DECODE_CHARACTER_ASCII,
191 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
192 the multi-byte form of a character of each class at the place
193 pointed by `dst'. The caller should set the variable `dst' to
194 point to an appropriate area and the variable `coding' to point to
195 the coding-system of the currently decoding text in advance. */
197 /* Decode one ASCII character C. */
199 #define DECODE_CHARACTER_ASCII(c) \
200 do { \
201 if (COMPOSING_P (coding->composing)) \
202 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
203 else \
204 *dst++ = (c); \
205 } while (0)
207 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
208 position-code is C. CHARSET is evaluated more than once; the extended leading-code is stored only when it is nonzero. */
210 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
211 do { \
212 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
213 if (COMPOSING_P (coding->composing)) \
214 *dst++ = leading_code + 0x20; \
215 else \
216 *dst++ = leading_code; \
217 if ((leading_code = CHARSET_LEADING_CODE_EXT (charset)) != 0) \
218 *dst++ = leading_code; \
219 *dst++ = (c) | 0x80; \
220 } while (0)
222 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
223 position-codes are C1 and C2. */
225 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
226 do { \
227 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
228 *dst++ = (c2) | 0x80; \
229 } while (0)
232 /*** 1. Preamble ***/
234 #include <stdio.h>
236 #ifdef emacs
238 #include <config.h>
239 #include "lisp.h"
240 #include "buffer.h"
241 #include "charset.h"
242 #include "ccl.h"
243 #include "coding.h"
244 #include "window.h"
246 #else /* not emacs */
248 #include "mulelib.h"
250 #endif /* not emacs */
252 Lisp_Object Qcoding_system, Qeol_type;
253 Lisp_Object Qbuffer_file_coding_system;
254 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
255 Lisp_Object Qno_conversion, Qundecided;
257 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
258 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
259 Lisp_Object Qstart_process, Qopen_network_stream;
260 Lisp_Object Qtarget_idx;
262 /* Mnemonic character of each format of end-of-line. */
263 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
264 /* Mnemonic character to indicate format of end-of-line is not yet
265 decided. */
266 int eol_mnemonic_undecided;
268 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
269 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
270 int system_eol_type;
272 #ifdef emacs
274 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
276 /* Coding system emacs-mule is for converting only end-of-line format. */
277 Lisp_Object Qemacs_mule;
279 /* Coding-systems are handed between Emacs Lisp programs and C internal
280 routines by the following three variables. */
281 /* Coding-system for reading files and receiving data from process. */
282 Lisp_Object Vcoding_system_for_read;
283 /* Coding-system for writing files and sending data to process. */
284 Lisp_Object Vcoding_system_for_write;
285 /* Coding-system actually used in the latest I/O. */
286 Lisp_Object Vlast_coding_system_used;
288 /* A vector of length 256 which contains information about special
289 Latin codes (especially for dealing with Microsoft code). */
290 Lisp_Object Vlatin_extra_code_table;
292 /* Flag to inhibit code conversion of end-of-line format. */
293 int inhibit_eol_conversion;
295 /* Coding system to be used to encode text for terminal display. */
296 struct coding_system terminal_coding;
298 /* Coding system to be used to encode text for terminal display when
299 terminal coding system is nil. */
300 struct coding_system safe_terminal_coding;
302 /* Coding system of what is sent from terminal keyboard. */
303 struct coding_system keyboard_coding;
305 Lisp_Object Vfile_coding_system_alist;
306 Lisp_Object Vprocess_coding_system_alist;
307 Lisp_Object Vnetwork_coding_system_alist;
309 #endif /* emacs */
311 Lisp_Object Qcoding_category_index;
313 /* List of symbols `coding-category-xxx' ordered by priority. */
314 Lisp_Object Vcoding_category_list;
316 /* Table of coding-systems currently assigned to each coding-category. */
317 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
319 /* Table of names of symbol for each coding-category. */
320 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
321 "coding-category-emacs-mule",
322 "coding-category-sjis",
323 "coding-category-iso-7",
324 "coding-category-iso-8-1",
325 "coding-category-iso-8-2",
326 "coding-category-iso-7-else",
327 "coding-category-iso-8-else",
328 "coding-category-big5",
329 "coding-category-raw-text",
330 "coding-category-binary"
333 /* Flag to tell if we look up unification table on character code
334 conversion. */
335 Lisp_Object Venable_character_unification;
336 /* Standard unification table to look up on decoding (reading). */
337 Lisp_Object Vstandard_character_unification_table_for_decode;
338 /* Standard unification table to look up on encoding (writing). */
339 Lisp_Object Vstandard_character_unification_table_for_encode;
341 Lisp_Object Qcharacter_unification_table;
342 Lisp_Object Qcharacter_unification_table_for_decode;
343 Lisp_Object Qcharacter_unification_table_for_encode;
345 /* Alist of charsets vs revision number. */
346 Lisp_Object Vcharset_revision_alist;
348 /* Default coding systems used for process I/O. */
349 Lisp_Object Vdefault_process_coding_system;
352 /*** 2. Emacs internal format (emacs-mule) handlers ***/
354 /* Emacs' internal format for encoding multiple character sets is a
355 kind of multi-byte encoding, i.e. characters are encoded by
356 variable-length sequences of one-byte codes. ASCII characters
357 and control characters (e.g. `tab', `newline') are represented by
358 one-byte sequences which are their ASCII codes, in the range 0x00
359 through 0x7F. The other characters are represented by a sequence
360 of `base leading-code', optional `extended leading-code', and one
361 or two `position-code's. The length of the sequence is determined
362 by the base leading-code. Leading-code takes the range 0x80
363 through 0x9F, whereas extended leading-code and position-code take
364 the range 0xA0 through 0xFF. See `charset.h' for more details
365 about leading-code and position-code.
367 There's one exception to this rule. Special leading-code
368 `leading-code-composition' denotes that the following several
369 characters should be composed into one character. Leading-codes of
370 components (except for ASCII) are added 0x20. An ASCII character
371 component is represented by a 2-byte sequence of `0xA0' and
372 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
373 details of composite character. Hence, we can summarize the code
374 range as follows:
376 --- CODE RANGE of Emacs' internal format ---
377 (character set) (range)
378 ASCII 0x00 .. 0x7F
379 ELSE (1st byte) 0x80 .. 0x9F
380 (rest bytes) 0xA0 .. 0xFF
381 ---------------------------------------------
385 enum emacs_code_class_type emacs_code_class[256];
387 /* Go to the next statement only if *SRC is accessible and the code is
388 greater than or equal to 0xA0. */
389 #define CHECK_CODE_RANGE_A0_FF \
390 do { \
391 if (src >= src_end) \
392 goto label_end_of_switch; \
393 else if (*src++ < 0xA0) \
394 return 0; \
395 } while (0)
397 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
398 Check if a text is encoded in Emacs' internal format. If it is,
399 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
402 detect_coding_emacs_mule (src, src_end)
403 unsigned char *src, *src_end;
405 unsigned char c;
406 int composing = 0;
408 while (src < src_end)
410 c = *src++;
412 if (composing)
414 if (c < 0xA0)
415 composing = 0;
416 else
417 c -= 0x20;
420 switch (emacs_code_class[c])
422 case EMACS_ascii_code:
423 case EMACS_linefeed_code:
424 break;
426 case EMACS_control_code:
427 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
428 return 0;
429 break;
431 case EMACS_invalid_code:
432 return 0;
434 case EMACS_leading_code_composition: /* c == 0x80 */
435 if (composing)
436 CHECK_CODE_RANGE_A0_FF;
437 else
438 composing = 1;
439 break;
441 case EMACS_leading_code_4:
442 CHECK_CODE_RANGE_A0_FF;
443 /* fall down to check it two more times ... */
445 case EMACS_leading_code_3:
446 CHECK_CODE_RANGE_A0_FF;
447 /* fall down to check it one more time ... */
449 case EMACS_leading_code_2:
450 CHECK_CODE_RANGE_A0_FF;
451 break;
453 default:
454 label_end_of_switch:
455 break;
458 return CODING_CATEGORY_MASK_EMACS_MULE;
462 /*** 3. ISO2022 handlers ***/
464 /* The following note describes the coding system ISO2022 briefly.
465 Since the intention of this note is to help in understanding of
466 the programs in this file, some parts are NOT ACCURATE or OVERLY
467 SIMPLIFIED. For the thorough understanding, please refer to the
468 original document of ISO2022.
470 ISO2022 provides many mechanisms to encode several character sets
471 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
472 all text is encoded by codes of less than 128. This may make the
473 encoded text a little bit longer, but the text gets more stability
474 to pass through several gateways (some of them strip off the MSB).
476 There are two kinds of character set: control character set and
477 graphic character set. The former contains control characters such
478 as `newline' and `escape' to provide control functions (control
479 functions are provided also by escape sequences). The latter
480 contains graphic characters such as 'A' and '-'. Emacs recognizes
481 two control character sets and many graphic character sets.
483 Graphic character sets are classified into one of the following
484 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
485 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
486 bytes (DIMENSION) and the number of characters in one dimension
487 (CHARS) of the set. In addition, each character set is assigned an
488 identification tag (called "final character" and denoted as <F>
489 here after) which is unique in each class. <F> of each character
490 set is decided by ECMA(*) when it is registered in ISO. Code range
491 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
493 Note (*): ECMA = European Computer Manufacturers Association
495 Here are examples of graphic character set [NAME(<F>)]:
496 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
497 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
498 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
499 o DIMENSION2_CHARS96 -- none for the moment
501 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
502 C0 [0x00..0x1F] -- control character plane 0
503 GL [0x20..0x7F] -- graphic character plane 0
504 C1 [0x80..0x9F] -- control character plane 1
505 GR [0xA0..0xFF] -- graphic character plane 1
507 A control character set is directly designated and invoked to C0 or
508 C1 by an escape sequence. The most common case is that ISO646's
509 control character set is designated/invoked to C0 and ISO6429's
510 control character set is designated/invoked to C1, and usually
511 these designations/invocations are omitted in a coded text. With
512 7-bit environment, only C0 can be used, and a control character for
513 C1 is encoded by an appropriate escape sequence to fit in the
514 environment. All control characters for C1 are defined to have
515 corresponding escape sequences.
517 A graphic character set is at first designated to one of four
518 graphic registers (G0 through G3), then these graphic registers are
519 invoked to GL or GR. These designations and invocations can be
520 done independently. The most common case is that G0 is invoked to
521 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
522 these invocations and designations are omitted in a coded text.
523 With 7-bit environment, only GL can be used.
525 When a graphic character set of CHARS94 is invoked to GL, code 0x20
526 and 0x7F of GL area work as control characters SPACE and DEL
527 respectively, and code 0xA0 and 0xFF of GR area should not be used.
529 There are two ways of invocation: locking-shift and single-shift.
530 With locking-shift, the invocation lasts until the next different
531 invocation, whereas with single-shift, the invocation works only
532 for the following character and doesn't affect locking-shift.
533 Invocations are done by the following control characters or escape
534 sequences.
536 ----------------------------------------------------------------------
537 function control char escape sequence description
538 ----------------------------------------------------------------------
539 SI (shift-in) 0x0F none invoke G0 to GL
540 SO (shift-out) 0x0E none invoke G1 to GL
541 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
542 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
543 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
544 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
545 ----------------------------------------------------------------------
546 The first four are for locking-shift. Control characters for these
547 functions are defined by macros ISO_CODE_XXX in `coding.h'.
549 Designations are done by the following escape sequences.
550 ----------------------------------------------------------------------
551 escape sequence description
552 ----------------------------------------------------------------------
553 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
554 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
555 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
556 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
557 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
558 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
559 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
560 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
561 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
562 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
563 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
564 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
565 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
566 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
567 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
568 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
569 ----------------------------------------------------------------------
571 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
572 of dimension 1, chars 94, and final character <F>, and etc.
574 Note (*): Although these designations are not allowed in ISO2022,
575 Emacs accepts them on decoding, and produces them on encoding
576 CHARS96 character set in a coding system which is characterized as
577 7-bit environment, non-locking-shift, and non-single-shift.
579 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
580 '(' can be omitted. We call this "short-form" hereafter.
582 Now you may notice that there are a lot of ways for encoding the
583 same multilingual text in ISO2022. Actually, there exist many
584 coding systems such as Compound Text (used in X's inter-client
585 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
586 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
587 localized platforms), and all of these are variants of ISO2022.
589 In addition to the above, Emacs handles two more kinds of escape
590 sequences: ISO6429's direction specification and Emacs' private
591 sequence for specifying character composition.
593 ISO6429's direction specification takes the following format:
594 o CSI ']' -- end of the current direction
595 o CSI '0' ']' -- end of the current direction
596 o CSI '1' ']' -- start of left-to-right text
597 o CSI '2' ']' -- start of right-to-left text
598 The control character CSI (0x9B: control sequence introducer) is
599 abbreviated to the escape sequence ESC '[' in 7-bit environment.
601 Character composition specification takes the following format:
602 o ESC '0' -- start character composition
603 o ESC '1' -- end character composition
604 Since these are not standard escape sequences of any ISO, the use
605 of them for these meaning is restricted to Emacs only. */
607 enum iso_code_class_type iso_code_class[256];
609 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
610 Check if a text is encoded in ISO2022. If it is, returns an
611 integer in which the appropriate flag bits, any of:
612 CODING_CATEGORY_MASK_ISO_7
613 CODING_CATEGORY_MASK_ISO_8_1
614 CODING_CATEGORY_MASK_ISO_8_2
615 CODING_CATEGORY_MASK_ISO_7_ELSE
616 CODING_CATEGORY_MASK_ISO_8_ELSE
617 are set. If a code which should never appear in ISO2022 is found,
618 returns 0. */
621 detect_coding_iso2022 (src, src_end)
622 unsigned char *src, *src_end;
624 int mask = (CODING_CATEGORY_MASK_ISO_7
625 | CODING_CATEGORY_MASK_ISO_8_1
626 | CODING_CATEGORY_MASK_ISO_8_2
627 | CODING_CATEGORY_MASK_ISO_7_ELSE
628 | CODING_CATEGORY_MASK_ISO_8_ELSE
630 int g1 = 0; /* 1 iff designating to G1. */
631 int c, i;
632 struct coding_system coding_iso_8_1, coding_iso_8_2;
634 /* Coding systems of these categories may accept latin extra codes. */
635 setup_coding_system
636 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
637 &coding_iso_8_1);
638 setup_coding_system
639 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
640 &coding_iso_8_2);
642 while (mask && src < src_end)
644 c = *src++;
645 switch (c)
647 case ISO_CODE_ESC:
648 if (src >= src_end)
649 break;
650 c = *src++;
651 if ((c >= '(' && c <= '/'))
653 /* Designation sequence for a charset of dimension 1. */
654 if (src >= src_end)
655 break;
656 c = *src++;
657 if (c < ' ' || c >= 0x80)
658 /* Invalid designation sequence. */
659 return 0;
661 else if (c == '$')
663 /* Designation sequence for a charset of dimension 2. */
664 if (src >= src_end)
665 break;
666 c = *src++;
667 if (c >= '@' && c <= 'B')
668 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
670 else if (c >= '(' && c <= '/')
672 if (src >= src_end)
673 break;
674 c = *src++;
675 if (c < ' ' || c >= 0x80)
676 /* Invalid designation sequence. */
677 return 0;
679 else
680 /* Invalid designation sequence. */
681 return 0;
683 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
684 /* Single shift (ESC 'N', ESC 'O') or locking shift (ESC 'n', ESC 'o'). */
685 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
686 | CODING_CATEGORY_MASK_ISO_8_ELSE);
687 else if (c == '0' || c == '1' || c == '2')
688 /* Start/end composition. */
690 else
691 /* Invalid escape sequence. */
692 return 0;
693 break;
695 case ISO_CODE_SO:
696 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
697 | CODING_CATEGORY_MASK_ISO_8_ELSE);
698 break;
700 case ISO_CODE_CSI:
701 case ISO_CODE_SS2:
702 case ISO_CODE_SS3:
704 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
706 if (VECTORP (Vlatin_extra_code_table)
707 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
709 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
710 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
711 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
712 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
714 mask &= newmask;
716 break;
718 default:
719 if (c < 0x80)
720 break;
721 else if (c < 0xA0)
723 if (VECTORP (Vlatin_extra_code_table)
724 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
726 int newmask = 0;
728 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
729 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
730 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
731 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
732 mask &= newmask;
734 else
735 return 0;
737 else
739 unsigned char *src_begin = src;
741 mask &= ~(CODING_CATEGORY_MASK_ISO_7
742 | CODING_CATEGORY_MASK_ISO_7_ELSE);
743 while (src < src_end && *src >= 0xA0)
744 src++;
745 if ((src - src_begin - 1) & 1 && src < src_end)
746 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
748 break;
752 return mask;
755 /* Decode a character of which charset is CHARSET and the 1st position
756 code is C1. If dimension of CHARSET is 2, the 2nd position code is
757 fetched from SRC and set to C2. If CHARSET is negative, it means
758 that we are decoding ill formed text, and what we can do is just to
759 read C1 as is. */
761 #define DECODE_ISO_CHARACTER(charset, c1) \
762 do { \
763 int c_alt, charset_alt = (charset); \
764 if (COMPOSING_HEAD_P (coding->composing)) \
766 *dst++ = LEADING_CODE_COMPOSITION; \
767 if (COMPOSING_WITH_RULE_P (coding->composing)) \
768 /* To tell composition rules are embedded. */ \
769 *dst++ = 0xFF; \
770 coding->composing += 2; \
772 if ((charset) >= 0) \
774 if (CHARSET_DIMENSION (charset) == 2) \
775 ONE_MORE_BYTE (c2); \
776 if (!NILP (unification_table) \
777 && ((c_alt = unify_char (unification_table, \
778 -1, (charset), c1, c2)) >= 0)) \
779 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
781 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
782 DECODE_CHARACTER_ASCII (c1); \
783 else if (CHARSET_DIMENSION (charset_alt) == 1) \
784 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
785 else \
786 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
787 if (COMPOSING_WITH_RULE_P (coding->composing)) \
788 /* To tell a composition rule follows. */ \
789 coding->composing = COMPOSING_WITH_RULE_RULE; \
790 } while (0)
792 /* Set designation state into CODING. */
793 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
794 do { \
795 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
796 make_number (chars), \
797 make_number (final_char)); \
798 if (charset >= 0) \
800 if (coding->direction == 1 \
801 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
802 charset = CHARSET_REVERSE_CHARSET (charset); \
803 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
805 } while (0)
807 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
810 decode_coding_iso2022 (coding, source, destination,
811 src_bytes, dst_bytes, consumed)
812 struct coding_system *coding;
813 unsigned char *source, *destination;
814 int src_bytes, dst_bytes;
815 int *consumed;
817 unsigned char *src = source;
818 unsigned char *src_end = source + src_bytes;
819 unsigned char *dst = destination;
820 unsigned char *dst_end = destination + dst_bytes;
821 /* Since the maximum bytes produced by each loop is 7, we subtract 6
822 from DST_END to assure that overflow checking is necessary only
823 at the head of loop. */
824 unsigned char *adjusted_dst_end = dst_end - 6;
825 int charset;
826 /* Charsets invoked to graphic plane 0 and 1 respectively. */
827 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
828 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
829 Lisp_Object unification_table
830 = coding->character_unification_table_for_decode;
832 if (!NILP (Venable_character_unification) && NILP (unification_table))
833 unification_table = Vstandard_character_unification_table_for_decode;
835 while (src < src_end && dst < adjusted_dst_end)
837 /* SRC_BASE remembers the start position in source in each loop.
838 The loop will be exited when there's not enough source text
839 to analyze long escape sequence or 2-byte code (within macros
840 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
841 to SRC_BASE before exiting. */
842 unsigned char *src_base = src;
843 int c1 = *src++, c2;
845 switch (iso_code_class [c1])
847 case ISO_0x20_or_0x7F:
848 if (!coding->composing
849 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
851 /* This is SPACE or DEL. */
852 *dst++ = c1;
853 break;
855 /* This is a graphic character, we fall down ... */
857 case ISO_graphic_plane_0:
858 if (coding->composing == COMPOSING_WITH_RULE_RULE)
860 /* This is a composition rule. */
861 *dst++ = c1 | 0x80;
862 coding->composing = COMPOSING_WITH_RULE_TAIL;
864 else
865 DECODE_ISO_CHARACTER (charset0, c1);
866 break;
868 case ISO_0xA0_or_0xFF:
869 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
871 /* Invalid code. */
872 *dst++ = c1;
873 break;
875 /* This is a graphic character, we fall down ... */
877 case ISO_graphic_plane_1:
878 DECODE_ISO_CHARACTER (charset1, c1);
879 break;
881 case ISO_control_code:
882 /* All ISO2022 control characters in this class have the
883 same representation in Emacs internal format. */
884 *dst++ = c1;
885 break;
887 case ISO_carriage_return:
888 if (coding->eol_type == CODING_EOL_CR)
890 *dst++ = '\n';
892 else if (coding->eol_type == CODING_EOL_CRLF)
894 ONE_MORE_BYTE (c1);
895 if (c1 == ISO_CODE_LF)
896 *dst++ = '\n';
897 else
899 src--;
900 *dst++ = c1;
903 else
905 *dst++ = c1;
907 break;
909 case ISO_shift_out:
910 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
911 goto label_invalid_escape_sequence;
912 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
913 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
914 break;
916 case ISO_shift_in:
917 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
918 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
919 break;
921 case ISO_single_shift_2_7:
922 case ISO_single_shift_2:
923 /* SS2 is handled as an escape sequence of ESC 'N' */
924 c1 = 'N';
925 goto label_escape_sequence;
927 case ISO_single_shift_3:
928 /* SS3 is handled as an escape sequence of ESC 'O' */
929 c1 = 'O';
930 goto label_escape_sequence;
932 case ISO_control_sequence_introducer:
933 /* CSI is handled as an escape sequence of ESC '[' ... */
934 c1 = '[';
935 goto label_escape_sequence;
937 case ISO_escape:
938 ONE_MORE_BYTE (c1);
939 label_escape_sequence:
940 /* Escape sequences handled by Emacs are invocation,
941 designation, direction specification, and character
942 composition specification. */
943 switch (c1)
945 case '&': /* revision of following character set */
946 ONE_MORE_BYTE (c1);
947 if (!(c1 >= '@' && c1 <= '~'))
948 goto label_invalid_escape_sequence;
949 ONE_MORE_BYTE (c1);
950 if (c1 != ISO_CODE_ESC)
951 goto label_invalid_escape_sequence;
952 ONE_MORE_BYTE (c1);
953 goto label_escape_sequence;
955 case '$': /* designation of 2-byte character set */
956 ONE_MORE_BYTE (c1);
957 if (c1 >= '@' && c1 <= 'B')
958 { /* designation of JISX0208.1978, GB2312.1980,
959 or JISX0208.1980 */
960 DECODE_DESIGNATION (0, 2, 94, c1);
962 else if (c1 >= 0x28 && c1 <= 0x2B)
963 { /* designation of DIMENSION2_CHARS94 character set */
964 ONE_MORE_BYTE (c2);
965 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
967 else if (c1 >= 0x2C && c1 <= 0x2F)
968 { /* designation of DIMENSION2_CHARS96 character set */
969 ONE_MORE_BYTE (c2);
970 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
972 else
973 goto label_invalid_escape_sequence;
974 break;
976 case 'n': /* invocation of locking-shift-2 */
977 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
978 goto label_invalid_escape_sequence;
979 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
980 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
981 break;
983 case 'o': /* invocation of locking-shift-3 */
984 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
985 goto label_invalid_escape_sequence;
986 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
987 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
988 break;
990 case 'N': /* invocation of single-shift-2 */
991 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
992 goto label_invalid_escape_sequence;
993 ONE_MORE_BYTE (c1);
994 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
995 DECODE_ISO_CHARACTER (charset, c1);
996 break;
998 case 'O': /* invocation of single-shift-3 */
999 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1000 goto label_invalid_escape_sequence;
1001 ONE_MORE_BYTE (c1);
1002 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1003 DECODE_ISO_CHARACTER (charset, c1);
1004 break;
1006 case '0': /* start composing without embeded rules */
1007 coding->composing = COMPOSING_NO_RULE_HEAD;
1008 break;
1010 case '1': /* end composing */
1011 coding->composing = COMPOSING_NO;
1012 break;
1014 case '2': /* start composing with embeded rules */
1015 coding->composing = COMPOSING_WITH_RULE_HEAD;
1016 break;
1018 case '[': /* specification of direction */
1019 /* For the moment, nested direction is not supported.
1020 So, the value of `coding->direction' is 0 or 1: 0
1021 means left-to-right, 1 means right-to-left. */
1022 ONE_MORE_BYTE (c1);
1023 switch (c1)
1025 case ']': /* end of the current direction */
1026 coding->direction = 0;
1028 case '0': /* end of the current direction */
1029 case '1': /* start of left-to-right direction */
1030 ONE_MORE_BYTE (c1);
1031 if (c1 == ']')
1032 coding->direction = 0;
1033 else
1034 goto label_invalid_escape_sequence;
1035 break;
1037 case '2': /* start of right-to-left direction */
1038 ONE_MORE_BYTE (c1);
1039 if (c1 == ']')
1040 coding->direction= 1;
1041 else
1042 goto label_invalid_escape_sequence;
1043 break;
1045 default:
1046 goto label_invalid_escape_sequence;
1048 break;
1050 default:
1051 if (c1 >= 0x28 && c1 <= 0x2B)
1052 { /* designation of DIMENSION1_CHARS94 character set */
1053 ONE_MORE_BYTE (c2);
1054 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1056 else if (c1 >= 0x2C && c1 <= 0x2F)
1057 { /* designation of DIMENSION1_CHARS96 character set */
1058 ONE_MORE_BYTE (c2);
1059 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1061 else
1063 goto label_invalid_escape_sequence;
1066 /* We must update these variables now. */
1067 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1068 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1069 break;
1071 label_invalid_escape_sequence:
1073 int length = src - src_base;
1075 bcopy (src_base, dst, length);
1076 dst += length;
1079 continue;
1081 label_end_of_loop:
1082 coding->carryover_size = src - src_base;
1083 bcopy (src_base, coding->carryover, coding->carryover_size);
1084 src = src_base;
1085 break;
1088 /* If this is the last block of the text to be decoded, we had
1089 better just flush out all remaining codes in the text although
1090 they are not valid characters. */
1091 if (coding->last_block)
1093 bcopy (src, dst, src_end - src);
1094 dst += (src_end - src);
1095 src = src_end;
1097 *consumed = src - source;
1098 return dst - destination;
1101 /* ISO2022 encoding stuff. */
1104 It is not enough to say just "ISO2022" on encoding, we have to
1105 specify more details. In Emacs, each coding-system of ISO2022
1106 variant has the following specifications:
1107 1. Initial designation to G0 thru G3.
1108 2. Allows short-form designation?
1109 3. ASCII should be designated to G0 before control characters?
1110 4. ASCII should be designated to G0 at end of line?
1111 5. 7-bit environment or 8-bit environment?
1112 6. Use locking-shift?
1113 7. Use Single-shift?
1114 And the following two are only for Japanese:
1115 8. Use ASCII in place of JIS0201-1976-Roman?
1116 9. Use JISX0208-1983 in place of JISX0208-1978?
1117 These specifications are encoded in `coding->flags' as flag bits
1118 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1119 details.
/* Write at DST the escape sequence that designates CHARSET to graphic
   register REG, and record that designation in CODING.  A charset
   found in Vcharset_revision_alist is announced first with
   "ESC & <revision>".  The intermediate character is left out
   (short form) only for a two-byte 94-character set whose
   <final-char> is '@', 'A', or 'B', going to register 0, and only
   when CODING has CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset);        \
    char *inter_bytes                                                   \
      = CHARSET_CHARS (charset) == 94 ? "()*+" : ",-./";                \
    int two_byte_set = CHARSET_DIMENSION (charset) != 1;                \
    Lisp_Object revision                                                \
      = Fassq (make_number (charset), Vcharset_revision_alist);         \
    if (! NILP (revision))                                              \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = XINT (XCONS (revision)->cdr) + '@';                    \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (two_byte_set)                                                   \
      *dst++ = '$';                                                     \
    /* Everything but the short form carries an intermediate            \
       character selecting the register.  */                            \
    if (! (two_byte_set                                                 \
           && CHARSET_CHARS (charset) == 94                             \
           && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)              \
           && reg == 0                                                  \
           && final_byte >= '@' && final_byte <= 'B'))                  \
      *dst++ = (unsigned char) (inter_bytes[reg]);                      \
    *dst++ = final_byte;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
/* Single-shift functions.  ENCODE_SINGLE_SHIFT_2 and
   ENCODE_SINGLE_SHIFT_3 write the code for single-shift-2 and
   single-shift-3 at DST and set the single-shifting flag of CODING.
   With CODING_FLAG_ISO_SEVEN_BITS the escape sequence form ("ESC N",
   "ESC O") is written, otherwise the control code ISO_CODE_SS2 or
   ISO_CODE_SS3.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
/* Locking-shift functions.  Each of the four macros writes the code
   for one ISO2022 locking shift at DST and records the register now
   invoked to graphic plane 0 in CODING: shift-in (register 0),
   shift-out (register 1), locking-shift-2 ("ESC n", register 2) and
   locking-shift-3 ("ESC o", register 3).  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
/* Write at DST the code for one character of a DIMENSION1 charset;
   CHARSET is its character set and C1 its position-code.  The body
   is a loop: when CHARSET is on neither graphic plane,
   encode_invocation_designation writes the designation and/or
   invocation that is needed and the tests are made again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* The single-shifting flag is set; it applies to this one      \
           character and is cleared here.  */                           \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                          \
        && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])        \
      {                                                                 \
        /* This character should not be encoded; write                  \
           CODING_INHIBIT_CHARACTER_SUBSTITUTION once, or twice for     \
           a charset of width 2.  */                                    \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet.  Invoke it      \
       (designating it to a register first if need be), then go         \
       round the loop again to write the character itself.  */          \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
/* Write at DST the code for one character of a DIMENSION2 charset;
   CHARSET is its character set, C1 and C2 its position-codes.  The
   body is a loop: when CHARSET is on neither graphic plane,
   encode_invocation_designation writes the designation and/or
   invocation that is needed and the tests are made again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* The single-shifting flag is set; it applies to this one      \
           character and is cleared here.  */                           \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                          \
        && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])        \
      {                                                                 \
        /* This character should not be encoded; write                  \
           CODING_INHIBIT_CHARACTER_SUBSTITUTION once, or twice for     \
           a charset of width 2.  */                                    \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet.  Invoke it      \
       (designating it to a register first if need be), then go         \
       round the loop again to write the character itself.  */          \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
/* Encode one character given as CHARSET, C1, C2.  When a unification
   table is in effect and unify_char maps the character to another
   one, CHARSET's replacement and new C1/C2 are taken from that
   character.  The result is then written with the DIMENSION1 or
   DIMENSION2 macro according to the dimension of the charset.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    int alt_charset = charset;                                          \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        int alt_char                                                    \
          = unify_char (unification_table, -1, charset, c1, c2);        \
        if (alt_char >= 0)                                              \
          SPLIT_CHAR (alt_char, alt_charset, c1, c2);                   \
      }                                                                 \
    if (CHARSET_DIMENSION (alt_charset) != 1)                           \
      ENCODE_ISO_CHARACTER_DIMENSION2 (alt_charset, c1, c2);            \
    else                                                                \
      ENCODE_ISO_CHARACTER_DIMENSION1 (alt_charset, c1);                \
  } while (0)
1317 /* Produce designation and invocation codes at a place pointed by DST
1318 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1319 Return new DST. */
1321 unsigned char *
1322 encode_invocation_designation (charset, coding, dst)
1323 int charset;
1324 struct coding_system *coding;
1325 unsigned char *dst;
1327 int reg; /* graphic register number */
1329 /* At first, check designations. */
1330 for (reg = 0; reg < 4; reg++)
1331 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1332 break;
1334 if (reg >= 4)
1336 /* CHARSET is not yet designated to any graphic registers. */
1337 /* At first check the requested designation. */
1338 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1339 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1340 /* Since CHARSET requests no special designation, designate it
1341 to graphic register 0. */
1342 reg = 0;
1344 ENCODE_DESIGNATION (charset, reg, coding);
1347 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1348 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1350 /* Since the graphic register REG is not invoked to any graphic
1351 planes, invoke it to graphic plane 0. */
1352 switch (reg)
1354 case 0: /* graphic register 0 */
1355 ENCODE_SHIFT_IN;
1356 break;
1358 case 1: /* graphic register 1 */
1359 ENCODE_SHIFT_OUT;
1360 break;
1362 case 2: /* graphic register 2 */
1363 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1364 ENCODE_SINGLE_SHIFT_2;
1365 else
1366 ENCODE_LOCKING_SHIFT_2;
1367 break;
1369 case 3: /* graphic register 3 */
1370 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1371 ENCODE_SINGLE_SHIFT_3;
1372 else
1373 ENCODE_LOCKING_SHIFT_3;
1374 break;
1377 return dst;
/* The following three macros write at DST the escape sequences that
   mark a composition: "ESC 0" starts one without embedded rules,
   "ESC 2" starts one with embedded rules, and "ESC 1" ends it.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes CSI at DST: as "ESC ["
   when CODING has CODING_FLAG_ISO_SEVEN_BITS set, as the single code
   ISO_CODE_CSI otherwise.  The flag is tested as a bit; `flags'
   holds other flag bits as well, so comparing the whole word for
   equality would pick the 8-bit form for most 7-bit coding systems.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R write CSI followed
   by "2]" and "0]" respectively.  All three are statements and must
   be followed by a semicolon.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2', *dst++ = ']';                 \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0', *dst++ = ']';                 \
  } while (0)
/* Bring the graphic plane and registers back to their initial state:
   write shift-in if graphic plane 0 does not have register 0
   invoked, then, for each register that has an initial designation
   differing from its current one, write the designation sequence
   that restores it.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                 \
  do {                                                                  \
    int reg_no = 0;                                                     \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      ENCODE_SHIFT_IN;                                                  \
    while (reg_no < 4)                                                  \
      {                                                                 \
        int initial                                                     \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg_no);       \
        if (initial >= 0                                                \
            && initial != CODING_SPEC_ISO_DESIGNATION (coding, reg_no)) \
          ENCODE_DESIGNATION (initial, reg_no, coding);                 \
        reg_no++;                                                       \
      }                                                                 \
  } while (0)
1416 /* Produce designation sequences of charsets in the line started from
1417 *SRC to a place pointed by DSTP.
1419 If the current block ends before any end-of-line, we may fail to
1420 find all the necessary *designations. */
1421 encode_designation_at_bol (coding, table, src, src_end, dstp)
1422 struct coding_system *coding;
1423 Lisp_Object table;
1424 unsigned char *src, *src_end, **dstp;
1426 int charset, c, found = 0, reg;
1427 /* Table of charsets to be designated to each graphic register. */
1428 int r[4];
1429 unsigned char *dst = *dstp;
1431 for (reg = 0; reg < 4; reg++)
1432 r[reg] = -1;
1434 while (src < src_end && *src != '\n' && found < 4)
1436 int bytes = BYTES_BY_CHAR_HEAD (*src);
1438 if (NILP (table))
1439 charset = CHARSET_AT (src);
1440 else
1442 int c_alt, c1, c2;
1444 SPLIT_STRING(src, bytes, charset, c1, c2);
1445 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1446 charset = CHAR_CHARSET (c_alt);
1449 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1450 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1452 found++;
1453 r[reg] = charset;
1456 src += bytes;
1459 if (found)
1461 for (reg = 0; reg < 4; reg++)
1462 if (r[reg] >= 0
1463 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1464 ENCODE_DESIGNATION (r[reg], reg, coding);
1465 *dstp = dst;
1469 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
/* Encode the SRC_BYTES bytes at SOURCE according to the ISO2022
   variant described by CODING, writing the result at DESTINATION
   (capacity DST_BYTES).  The number of source bytes used is stored
   in *CONSUMED and the number of bytes written is returned.

   The main loop stops when the source is used up or when fewer than
   19 bytes of room remain before the end of the destination.  A
   multi-byte sequence cut off by the end of the source is saved in
   CODING->carryover and counted as consumed.  On the last block the
   graphic plane and registers are reset and any carried-over bytes
   are copied out unchanged if they fit.  */
1472 encode_coding_iso2022 (coding, source, destination,
1473 src_bytes, dst_bytes, consumed)
1474 struct coding_system *coding;
1475 unsigned char *source, *destination;
1476 int src_bytes, dst_bytes;
1477 int *consumed;
1479 unsigned char *src = source;
1480 unsigned char *src_end = source + src_bytes;
1481 unsigned char *dst = destination;
1482 unsigned char *dst_end = destination + dst_bytes;
1483 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1484 from DST_END to assure overflow checking is necessary only at the
1485 head of loop. */
1486 unsigned char *adjusted_dst_end = dst_end - 19;
1487 Lisp_Object unification_table
1488 = coding->character_unification_table_for_encode;
/* Fall back on the standard table when unification is enabled and
   CODING does not bring a table of its own.  */
1490 if (!NILP (Venable_character_unification) && NILP (unification_table))
1491 unification_table = Vstandard_character_unification_table_for_encode;
1493 while (src < src_end && dst < adjusted_dst_end)
1495 /* SRC_BASE remembers the start position in source in each loop.
1496 The loop will be exited when there's not enough source text
1497 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1498 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1499 reset to SRC_BASE before exiting. */
1500 unsigned char *src_base = src;
1501 int charset, c1, c2, c3, c4;
1503 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1504 && CODING_SPEC_ISO_BOL (coding))
1506 /* We have to produce designation sequences if any now. */
1507 encode_designation_at_bol (coding, unification_table,
1508 src, src_end, &dst);
1509 CODING_SPEC_ISO_BOL (coding) = 0;
1512 c1 = *src++;
1513 /* If we are seeing a component of a composite character, we are
1514 seeing a leading-code specially encoded for composition, or a
1515 composition rule if composing with rule. We must set C1
1516 to a normal leading-code or an ASCII code. If we are not at
1517 a composed character, we must reset the composition state. */
1518 if (COMPOSING_P (coding->composing))
1520 if (c1 < 0xA0)
1522 /* We are not in a composite character any longer. */
1523 coding->composing = COMPOSING_NO;
1524 ENCODE_COMPOSITION_END;
1526 else
1528 if (coding->composing == COMPOSING_WITH_RULE_RULE)
/* C1 is a composition rule: write it with the high bit cleared
   and expect a component next.  */
1530 *dst++ = c1 & 0x7F;
1531 coding->composing = COMPOSING_WITH_RULE_HEAD;
1532 continue;
1534 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1535 coding->composing = COMPOSING_WITH_RULE_RULE;
1536 if (c1 == 0xA0)
1538 /* This is an ASCII component. */
1539 ONE_MORE_BYTE (c1);
1540 c1 &= 0x7F;
1542 else
1543 /* This is a leading-code of non ASCII component. */
1544 c1 -= 0x20;
1548 /* Now encode one character. C1 is a control character, an
1549 ASCII character, or a leading-code of multi-byte character. */
1550 switch (emacs_code_class[c1])
1552 case EMACS_ascii_code:
1553 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1554 break;
1556 case EMACS_control_code:
1557 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1558 ENCODE_RESET_PLANE_AND_REGISTER;
1559 *dst++ = c1;
1560 break;
1562 case EMACS_carriage_return_code:
1563 if (!coding->selective)
1565 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1566 ENCODE_RESET_PLANE_AND_REGISTER;
1567 *dst++ = c1;
1568 break;
1570 /* fall down to treat '\r' as '\n' ... */
1572 case EMACS_linefeed_code:
1573 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1574 ENCODE_RESET_PLANE_AND_REGISTER;
/* With CODING_FLAG_ISO_INIT_AT_BOL only the recorded designations
   are put back to the initial ones; nothing is written for that.  */
1575 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1576 bcopy (coding->spec.iso2022.initial_designation,
1577 coding->spec.iso2022.current_designation,
1578 sizeof coding->spec.iso2022.initial_designation);
1579 if (coding->eol_type == CODING_EOL_LF
1580 || coding->eol_type == CODING_EOL_UNDECIDED)
1581 *dst++ = ISO_CODE_LF;
1582 else if (coding->eol_type == CODING_EOL_CRLF)
1583 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1584 else
1585 *dst++ = ISO_CODE_CR;
1586 CODING_SPEC_ISO_BOL (coding) = 1;
1587 break;
1589 case EMACS_leading_code_2:
1590 ONE_MORE_BYTE (c2);
1591 if (c2 < 0xA0)
1593 /* invalid sequence */
1594 *dst++ = c1;
1595 *dst++ = c2;
1597 else
1598 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1599 break;
1601 case EMACS_leading_code_3:
1602 TWO_MORE_BYTES (c2, c3);
1603 if (c2 < 0xA0 || c3 < 0xA0)
1605 /* invalid sequence */
1606 *dst++ = c1;
1607 *dst++ = c2;
1608 *dst++ = c3;
/* Below LEADING_CODE_PRIVATE_11, C1 itself is passed as the charset;
   otherwise the charset is taken from the following byte C2.  */
1610 else if (c1 < LEADING_CODE_PRIVATE_11)
1611 ENCODE_ISO_CHARACTER (c1, c2, c3);
1612 else
1613 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1614 break;
1616 case EMACS_leading_code_4:
1617 THREE_MORE_BYTES (c2, c3, c4);
1618 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1620 /* invalid sequence */
1621 *dst++ = c1;
1622 *dst++ = c2;
1623 *dst++ = c3;
1624 *dst++ = c4;
1626 else
1627 ENCODE_ISO_CHARACTER (c2, c3, c4);
1628 break;
1630 case EMACS_leading_code_composition:
1631 ONE_MORE_BYTE (c2);
1632 if (c2 < 0xA0)
1634 /* invalid sequence */
1635 *dst++ = c1;
1636 *dst++ = c2;
/* A second byte of 0xFF starts a composition with rules.  */
1638 else if (c2 == 0xFF)
1640 coding->composing = COMPOSING_WITH_RULE_HEAD;
1641 ENCODE_COMPOSITION_WITH_RULE_START;
1643 else
1645 /* Rewind one byte because it is a character code of
1646 composition elements. */
1647 src--;
1648 coding->composing = COMPOSING_NO_RULE_HEAD;
1649 ENCODE_COMPOSITION_NO_RULE_START;
1651 break;
1653 case EMACS_invalid_code:
1654 *dst++ = c1;
1655 break;
1657 continue;
1658 label_end_of_loop:
1659 /* We reach here because the source data ends not at character
1660 boundary. */
1661 coding->carryover_size = src_end - src_base;
1662 bcopy (src_base, coding->carryover, coding->carryover_size);
1663 src = src_end;
1664 break;
1667 /* If this is the last block of the text to be encoded, we must
1668 reset graphic planes and registers to the initial state. */
1669 if (src >= src_end && coding->last_block)
1671 ENCODE_RESET_PLANE_AND_REGISTER;
/* Bytes carried over can no longer be completed; copy them to the
   output as they are, provided there is room for them.  */
1672 if (coding->carryover_size > 0
1673 && coding->carryover_size < (dst_end - dst))
1675 bcopy (coding->carryover, dst, coding->carryover_size);
1676 dst += coding->carryover_size;
1677 coding->carryover_size = 0;
1680 *consumed = src - source;
1681 return dst - destination;
1685 /*** 4. SJIS and BIG5 handlers ***/
1687 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1688 quite widely. So, for the moment, Emacs supports them in the bare
1689 C code. But, in the future, they may be supported only by CCL. */
1691 /* SJIS is a coding system encoding three character sets: ASCII, right
1692 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1693 as is. A character of charset katakana-jisx0201 is encoded by
1694 "position-code + 0x80". A character of charset japanese-jisx0208
1695 is encoded in two bytes, with the two position-codes divided and shifted
1696 so that they fit in the ranges below.
1698 --- CODE RANGE of SJIS ---
1699 (character set) (range)
1700 ASCII 0x00 .. 0x7F
1701 KATAKANA-JISX0201 0xA0 .. 0xDF
1702 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1703 (2nd byte) 0x40 .. 0xFF
1704 -------------------------------
1708 /* BIG5 is a coding system encoding two character sets: ASCII and
1709 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1710 character set and is encoded in two-byte.
1712 --- CODE RANGE of BIG5 ---
1713 (character set) (range)
1714 ASCII 0x00 .. 0x7F
1715 Big5 (1st byte) 0xA1 .. 0xFE
1716 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1717 --------------------------
1719 Since the number of characters in Big5 is larger than maximum
1720 characters in Emacs' charset (96x96), it can't be handled as one
1721 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1722 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1723 contains frequently used characters and the latter contains less
1724 frequently used characters. */
/* Macros converting a Big5 character between the BIG5 coding system
   and Emacs' internal charsets.  B1 and B2 are the 1st and 2nd bytes
   of the character in BIG5.  C1 and C2 are the 1st and 2nd
   position-codes in Emacs' internal format.  CHARSET is
   `charset_big5_1' or `charset_big5_2'; BIG5 lead bytes below 0xC9
   belong to the former, the rest to the latter.  */

/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte takes 0x7F - 0x40 values in the low range and
   0xFF - 0xA1 values in the high range.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Both macros go through a linear index: rows of BIG5_SAME_ROW
   characters on the BIG5 side, rows of (0xFF - 0xA1) characters
   starting at 0x21 on the Emacs side.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int big5_index                                             \
      = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
    if (b1 >= 0xC9)                                                     \
      {                                                                 \
        big5_index -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                    \
        charset = charset_big5_2;                                       \
      }                                                                 \
    else                                                                \
      charset = charset_big5_1;                                         \
    c1 = big5_index / (0xFF - 0xA1) + 0x21;                             \
    c2 = big5_index % (0xFF - 0xA1) + 0x21;                             \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_index                                             \
      = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);                      \
    unsigned int trail;                                                 \
    if (charset == charset_big5_2)                                      \
      big5_index += BIG5_SAME_ROW * (0xC9 - 0xA1);                      \
    trail = big5_index % BIG5_SAME_ROW;                                 \
    b1 = big5_index / BIG5_SAME_ROW + 0xA1;                             \
    /* The first 0x3F trail values map to 0x40..0x7E, the others to     \
       0xA1 and up.  */                                                 \
    b2 = trail + (trail < 0x3F ? 0x40 : 0x62);                          \
  } while (0)
/* Decode one character given as CHARSET with position-codes C1 and
   C2.  When a unification table is in effect and unify_char maps the
   character to another one, the charset and C1/C2 of that character
   are used instead.  The character is then handed to
   DECODE_CHARACTER_ASCII (also used when the charset is negative),
   DECODE_CHARACTER_DIMENSION1 or DECODE_CHARACTER_DIMENSION2.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int alt_charset = (charset);                                        \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        int alt_char                                                    \
          = unify_char (unification_table, -1, (charset), c1, c2);      \
        if (alt_char >= 0)                                              \
          SPLIT_CHAR (alt_char, alt_charset, c1, c2);                   \
      }                                                                 \
    if (alt_charset < 0 || alt_charset == CHARSET_ASCII)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (alt_charset) != 1)                      \
      DECODE_CHARACTER_DIMENSION2 (alt_charset, c1, c2);                \
    else                                                                \
      DECODE_CHARACTER_DIMENSION1 (alt_charset, c1);                    \
  } while (0)
/* Encode one character given as CHARSET with position-codes C1 and
   C2 and write it at DST.  When a unification table is in effect and
   unify_char maps the character to another one, the charset and
   C1/C2 of that character are used instead.  SJIS_P, taken from the
   calling function, selects SJIS (non-zero) or BIG5 (zero).

   ASCII is written as is.  With SJIS_P, katakana-jisx0201 is written
   as the single byte C1 and jisx0208 is converted by ENCODE_SJIS.
   Without SJIS_P, the two Big5 charsets are converted by
   ENCODE_BIG5.  Any other charset is written as the charset byte
   followed by its position-codes; for two-byte charsets C1 and C2
   have been masked with 0x7F by then.

   The macro is a single statement; it must not end in a semicolon
   of its own, so that it can stand before an `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt;                                             \
    if (!NILP (unification_table)                                       \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                      \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
    else                                                                \
      charset_alt = charset;                                            \
    if (charset_alt == charset_ascii)                                   \
      *dst++ = c1;                                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)         \
          *dst++ = c1;                                                  \
        else                                                            \
          *dst++ = charset_alt, *dst++ = c1;                            \
      }                                                                 \
    else                                                                \
      {                                                                 \
        c1 &= 0x7F, c2 &= 0x7F;                                         \
        if (sjis_p && charset_alt == charset_jisx0208)                  \
          {                                                             \
            unsigned char s1, s2;                                       \
            ENCODE_SJIS (c1, c2, s1, s2);                               \
            *dst++ = s1, *dst++ = s2;                                   \
          }                                                             \
        else if (!sjis_p                                                \
                 && (charset_alt == charset_big5_1                      \
                     || charset_alt == charset_big5_2))                 \
          {                                                             \
            unsigned char b1, b2;                                       \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                  \
            *dst++ = b1, *dst++ = b2;                                   \
          }                                                             \
        else                                                            \
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;               \
      }                                                                 \
  } while (0)
1816 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1817 Check if a text is encoded in SJIS. If it is, return
1818 CODING_CATEGORY_MASK_SJIS, else return 0. */
1821 detect_coding_sjis (src, src_end)
1822 unsigned char *src, *src_end;
1824 unsigned char c;
1826 while (src < src_end)
1828 c = *src++;
1829 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1830 return 0;
1831 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1833 if (src < src_end && *src++ < 0x40)
1834 return 0;
1837 return CODING_CATEGORY_MASK_SJIS;
1840 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1841 Check if a text is encoded in BIG5. If it is, return
1842 CODING_CATEGORY_MASK_BIG5, else return 0. */
1845 detect_coding_big5 (src, src_end)
1846 unsigned char *src, *src_end;
1848 unsigned char c;
1850 while (src < src_end)
1852 c = *src++;
1853 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1854 return 0;
1855 if (c >= 0xA1)
1857 if (src >= src_end)
1858 break;
1859 c = *src++;
1860 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1861 return 0;
1864 return CODING_CATEGORY_MASK_BIG5;
1867 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1868 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
/* Decode the SRC_BYTES bytes at SOURCE, writing the result at
   DESTINATION (capacity DST_BYTES).  The number of source bytes
   used is stored in *CONSUMED and the number of bytes written is
   returned.  The loop stops when the source is used up or when
   fewer than 3 bytes of room remain.  A two-byte character cut off
   by the end of the source is saved in CODING->carryover and is
   not counted as consumed.  */
1871 decode_coding_sjis_big5 (coding, source, destination,
1872 src_bytes, dst_bytes, consumed, sjis_p)
1873 struct coding_system *coding;
1874 unsigned char *source, *destination;
1875 int src_bytes, dst_bytes;
1876 int *consumed;
1877 int sjis_p;
1879 unsigned char *src = source;
1880 unsigned char *src_end = source + src_bytes;
1881 unsigned char *dst = destination;
1882 unsigned char *dst_end = destination + dst_bytes;
1883 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1884 from DST_END to assure overflow checking is necessary only at the
1885 head of loop. */
1886 unsigned char *adjusted_dst_end = dst_end - 3;
1887 Lisp_Object unification_table
1888 = coding->character_unification_table_for_decode;
/* Fall back on the standard table when unification is enabled and
   CODING does not bring a table of its own.  */
1890 if (!NILP (Venable_character_unification) && NILP (unification_table))
1891 unification_table = Vstandard_character_unification_table_for_decode;
1893 while (src < src_end && dst < adjusted_dst_end)
1895 /* SRC_BASE remembers the start position in source in each loop.
1896 The loop will be exited when there's not enough source text
1897 to analyze two-byte character (within macro ONE_MORE_BYTE).
1898 In that case, SRC is reset to SRC_BASE before exiting. */
1899 unsigned char *src_base = src;
1900 unsigned char c1 = *src++, c2, c3, c4;
1902 if (c1 == '\r')
/* With CRLF end-of-line, "\r\n" becomes a single '\n'; a '\r' not
   followed by '\n' is kept.  With any other end-of-line type '\r'
   is copied as is.  */
1904 if (coding->eol_type == CODING_EOL_CRLF)
1906 ONE_MORE_BYTE (c2);
1907 if (c2 == '\n')
1908 *dst++ = c2;
1909 else
1910 /* To process C2 again, SRC is subtracted by 1. */
1911 *dst++ = c1, src--;
1913 else
1914 *dst++ = c1;
1916 else if (c1 < 0x20)
1917 *dst++ = c1;
1918 else if (c1 < 0x80)
1919 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1920 else if (c1 < 0xA0 || c1 >= 0xE0)
1922 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1923 if (sjis_p)
1925 ONE_MORE_BYTE (c2);
1926 DECODE_SJIS (c1, c2, c3, c4);
1927 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1929 else if (c1 >= 0xE0 && c1 < 0xFF)
1931 int charset;
1933 ONE_MORE_BYTE (c2);
1934 DECODE_BIG5 (c1, c2, charset, c3, c4);
1935 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1937 else /* Invalid code */
1938 *dst++ = c1;
1940 else
/* Here 0xA0 <= C1 < 0xE0.  */
1942 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1943 if (sjis_p)
1944 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1945 else
1947 int charset;
1949 ONE_MORE_BYTE (c2);
1950 DECODE_BIG5 (c1, c2, charset, c3, c4);
1951 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1954 continue;
1956 label_end_of_loop:
/* The source ended inside a character: keep the bytes seen so far
   in CODING->carryover and report them as not consumed.  */
1957 coding->carryover_size = src - src_base;
1958 bcopy (src_base, coding->carryover, coding->carryover_size);
1959 src = src_base;
1960 break;
1963 *consumed = src - source;
1964 return dst - destination;
1967 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1968 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1969 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1970 sure that all these charsets are registered as official charset
1971 (i.e. do not have extended leading-codes). Characters of other
1972 charsets are produced without any encoding. If SJIS_P is 1, encode
1973 SJIS text, else encode BIG5 text. */
1976 encode_coding_sjis_big5 (coding, source, destination,
1977 src_bytes, dst_bytes, consumed, sjis_p)
1978 struct coding_system *coding;
1979 unsigned char *source, *destination;
1980 int src_bytes, dst_bytes;
1981 int *consumed;
1982 int sjis_p;
1984 unsigned char *src = source;
1985 unsigned char *src_end = source + src_bytes;
1986 unsigned char *dst = destination;
1987 unsigned char *dst_end = destination + dst_bytes;
1988 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1989 from DST_END to assure overflow checking is necessary only at the
1990 head of loop. */
1991 unsigned char *adjusted_dst_end = dst_end - 1;
1992 Lisp_Object unification_table
1993 = coding->character_unification_table_for_encode;
1995 if (!NILP (Venable_character_unification) && NILP (unification_table))
1996 unification_table = Vstandard_character_unification_table_for_encode;
1998 while (src < src_end && dst < adjusted_dst_end)
2000 /* SRC_BASE remembers the start position in source in each loop.
2001 The loop will be exited when there's not enough source text
2002 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2003 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2004 before exiting. */
2005 unsigned char *src_base = src;
2006 unsigned char c1 = *src++, c2, c3, c4;
2008 if (coding->composing)
2010 if (c1 == 0xA0)
2012 ONE_MORE_BYTE (c1);
2013 c1 &= 0x7F;
2015 else if (c1 >= 0xA0)
2016 c1 -= 0x20;
2017 else
2018 coding->composing = 0;
2021 switch (emacs_code_class[c1])
2023 case EMACS_ascii_code:
2024 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2025 break;
2027 case EMACS_control_code:
2028 *dst++ = c1;
2029 break;
2031 case EMACS_carriage_return_code:
2032 if (!coding->selective)
2034 *dst++ = c1;
2035 break;
2037 /* fall down to treat '\r' as '\n' ... */
2039 case EMACS_linefeed_code:
2040 if (coding->eol_type == CODING_EOL_LF
2041 || coding->eol_type == CODING_EOL_UNDECIDED)
2042 *dst++ = '\n';
2043 else if (coding->eol_type == CODING_EOL_CRLF)
2044 *dst++ = '\r', *dst++ = '\n';
2045 else
2046 *dst++ = '\r';
2047 break;
2049 case EMACS_leading_code_2:
2050 ONE_MORE_BYTE (c2);
2051 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2052 break;
2054 case EMACS_leading_code_3:
2055 TWO_MORE_BYTES (c2, c3);
2056 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2057 break;
2059 case EMACS_leading_code_4:
2060 THREE_MORE_BYTES (c2, c3, c4);
2061 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2062 break;
2064 case EMACS_leading_code_composition:
2065 coding->composing = 1;
2066 break;
2068 default: /* i.e. case EMACS_invalid_code: */
2069 *dst++ = c1;
2071 continue;
2073 label_end_of_loop:
2074 coding->carryover_size = src_end - src_base;
2075 bcopy (src_base, coding->carryover, coding->carryover_size);
2076 src = src_end;
2077 break;
2080 *consumed = src - source;
2081 return dst - destination;
2085 /*** 5. End-of-line handlers ***/
2087 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2088 This function is called only when `coding->eol_type' is
2089 CODING_EOL_CRLF or CODING_EOL_CR. */
2091 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2092 struct coding_system *coding;
2093 unsigned char *source, *destination;
2094 int src_bytes, dst_bytes;
2095 int *consumed;
2097 unsigned char *src = source;
2098 unsigned char *src_end = source + src_bytes;
2099 unsigned char *dst = destination;
2100 unsigned char *dst_end = destination + dst_bytes;
2101 int produced;
2103 switch (coding->eol_type)
2105 case CODING_EOL_CRLF:
2107 /* Since the maximum bytes produced by each loop is 2, we
2108 subtract 1 from DST_END to assure overflow checking is
2109 necessary only at the head of loop. */
2110 unsigned char *adjusted_dst_end = dst_end - 1;
2112 while (src < src_end && dst < adjusted_dst_end)
2114 unsigned char *src_base = src;
2115 unsigned char c = *src++;
2116 if (c == '\r')
2118 ONE_MORE_BYTE (c);
2119 if (c != '\n')
2120 *dst++ = '\r';
2121 *dst++ = c;
2123 else
2124 *dst++ = c;
2125 continue;
2127 label_end_of_loop:
2128 coding->carryover_size = src - src_base;
2129 bcopy (src_base, coding->carryover, coding->carryover_size);
2130 src = src_base;
2131 break;
2133 *consumed = src - source;
2134 produced = dst - destination;
2135 break;
2138 case CODING_EOL_CR:
2139 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2140 bcopy (source, destination, produced);
2141 dst_end = destination + produced;
2142 while (dst < dst_end)
2143 if (*dst++ == '\r') dst[-1] = '\n';
2144 *consumed = produced;
2145 break;
2147 default: /* i.e. case: CODING_EOL_LF */
2148 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2149 bcopy (source, destination, produced);
2150 *consumed = produced;
2151 break;
2154 return produced;
2157 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2158 format of end-of-line according to `coding->eol_type'. If
2159 `coding->selective' is 1, code '\r' in source text also means
2160 end-of-line. */
2162 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2163 struct coding_system *coding;
2164 unsigned char *source, *destination;
2165 int src_bytes, dst_bytes;
2166 int *consumed;
2168 unsigned char *src = source;
2169 unsigned char *dst = destination;
2170 int produced;
2172 if (src_bytes <= 0)
2173 return 0;
2175 switch (coding->eol_type)
2177 case CODING_EOL_LF:
2178 case CODING_EOL_UNDECIDED:
2179 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2180 bcopy (source, destination, produced);
2181 if (coding->selective)
2183 int i = produced;
2184 while (i--)
2185 if (*dst++ == '\r') dst[-1] = '\n';
2187 *consumed = produced;
2189 case CODING_EOL_CRLF:
2191 unsigned char c;
2192 unsigned char *src_end = source + src_bytes;
2193 unsigned char *dst_end = destination + dst_bytes;
2194 /* Since the maximum bytes produced by each loop is 2, we
2195 subtract 1 from DST_END to assure overflow checking is
2196 necessary only at the head of loop. */
2197 unsigned char *adjusted_dst_end = dst_end - 1;
2199 while (src < src_end && dst < adjusted_dst_end)
2201 c = *src++;
2202 if (c == '\n' || (c == '\r' && coding->selective))
2203 *dst++ = '\r', *dst++ = '\n';
2204 else
2205 *dst++ = c;
2207 produced = dst - destination;
2208 *consumed = src - source;
2209 break;
2212 default: /* i.e. case CODING_EOL_CR: */
2213 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2214 bcopy (source, destination, produced);
2216 int i = produced;
2217 while (i--)
2218 if (*dst++ == '\n') dst[-1] = '\r';
2220 *consumed = produced;
2223 return produced;
2227 /*** 6. C library functions ***/
2229 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2230 has a property `coding-system'. The value of this property is a
2231 vector of length 5 (called as coding-vector). Among elements of
2232 this vector, the first (element[0]) and the fifth (element[4])
2233 carry important information for decoding/encoding. Before
2234 decoding/encoding, this information should be set in fields of a
2235 structure of type `coding_system'.
2237 A value of property `coding-system' can be a symbol of another
2238 subsidiary coding-system. In that case, Emacs gets coding-vector
2239 from that symbol.
2241 `element[0]' contains information to be set in `coding->type'. The
2242 value and its meaning is as follows:
2244 0 -- coding_type_emacs_mule
2245 1 -- coding_type_sjis
2246 2 -- coding_type_iso2022
2247 3 -- coding_type_big5
2248 4 -- coding_type_ccl encoder/decoder written in CCL
2249 nil -- coding_type_no_conversion
2250 t -- coding_type_undecided (automatic conversion on decoding,
2251 no-conversion on encoding)
2253 `element[4]' contains information to be set in `coding->flags' and
2254 `coding->spec'. The meaning varies by `coding->type'.
2256 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2257 of length 32 (of which the first 17 sub-elements are used now).
2258 Meanings of these sub-elements are:
2260 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2261 If the value is an integer of valid charset, the charset is
2262 assumed to be designated to graphic register N initially.
2264 If the value is minus, it is a minus value of charset which
2265 reserves graphic register N, which means that the charset is
2266 not designated initially but should be designated to graphic
2267 register N just before encoding a character in that charset.
2269 If the value is nil, graphic register N is never used on
2270 encoding.
2272 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2273 Each value takes t or nil. See the section ISO2022 of
2274 `coding.h' for more information.
2276 If `coding->type' is `coding_type_big5', element[4] is t to denote
2277 BIG5-ETen or nil to denote BIG5-HKU.
2279 If `coding->type' takes the other value, element[4] is ignored.
2281 Emacs Lisp's coding system also carries information about format of
2282 end-of-line in a value of property `eol-type'. If the value is
2283 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2284 means CODING_EOL_CR. If it is not integer, it should be a vector
2285 of subsidiary coding systems of which property `eol-type' has one
2286 of above values.
2290 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2291 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2292 is setup so that no conversion is necessary and return -1, else
2293 return 0. */
2296 setup_coding_system (coding_system, coding)
2297 Lisp_Object coding_system;
2298 struct coding_system *coding;
2300 Lisp_Object type, eol_type;
2302 /* At first, set several fields to default values. */
2303 coding->require_flushing = 0;
2304 coding->last_block = 0;
2305 coding->selective = 0;
2306 coding->composing = 0;
2307 coding->direction = 0;
2308 coding->carryover_size = 0;
2309 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2310 coding->character_unification_table_for_decode = Qnil;
2311 coding->character_unification_table_for_encode = Qnil;
2313 Vlast_coding_system_used = coding->symbol = coding_system;
2314 eol_type = Qnil;
2315 /* Get value of property `coding-system' until we get a vector.
2316 While doing that, also get values of properties
2317 `post-read-conversion', `pre-write-conversion',
2318 `character-unification-table-for-decode',
2319 `character-unification-table-for-encode' and `eol-type'. */
2320 while (!NILP (coding_system) && SYMBOLP (coding_system))
2322 if (NILP (coding->post_read_conversion))
2323 coding->post_read_conversion = Fget (coding_system,
2324 Qpost_read_conversion);
2325 if (NILP (coding->pre_write_conversion))
2326 coding->pre_write_conversion = Fget (coding_system,
2327 Qpre_write_conversion);
2328 if (!inhibit_eol_conversion && NILP (eol_type))
2329 eol_type = Fget (coding_system, Qeol_type);
2331 if (NILP (coding->character_unification_table_for_decode))
2332 coding->character_unification_table_for_decode
2333 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2335 if (NILP (coding->character_unification_table_for_encode))
2336 coding->character_unification_table_for_encode
2337 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2339 coding_system = Fget (coding_system, Qcoding_system);
2342 while (!NILP (coding->character_unification_table_for_decode)
2343 && SYMBOLP (coding->character_unification_table_for_decode))
2344 coding->character_unification_table_for_decode
2345 = Fget (coding->character_unification_table_for_decode,
2346 Qcharacter_unification_table_for_decode);
2347 if (!NILP (coding->character_unification_table_for_decode)
2348 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2349 coding->character_unification_table_for_decode = Qnil;
2351 while (!NILP (coding->character_unification_table_for_encode)
2352 && SYMBOLP (coding->character_unification_table_for_encode))
2353 coding->character_unification_table_for_encode
2354 = Fget (coding->character_unification_table_for_encode,
2355 Qcharacter_unification_table_for_encode);
2356 if (!NILP (coding->character_unification_table_for_encode)
2357 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2358 coding->character_unification_table_for_encode = Qnil;
2360 if (!VECTORP (coding_system)
2361 || XVECTOR (coding_system)->size != 5)
2362 goto label_invalid_coding_system;
2364 if (VECTORP (eol_type))
2365 coding->eol_type = CODING_EOL_UNDECIDED;
2366 else if (XFASTINT (eol_type) == 1)
2367 coding->eol_type = CODING_EOL_CRLF;
2368 else if (XFASTINT (eol_type) == 2)
2369 coding->eol_type = CODING_EOL_CR;
2370 else
2371 coding->eol_type = CODING_EOL_LF;
2373 type = XVECTOR (coding_system)->contents[0];
2374 switch (XFASTINT (type))
2376 case 0:
2377 coding->type = coding_type_emacs_mule;
2378 break;
2380 case 1:
2381 coding->type = coding_type_sjis;
2382 break;
2384 case 2:
2385 coding->type = coding_type_iso2022;
2387 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2388 Lisp_Object *flags;
2389 int i, charset, default_reg_bits = 0;
2391 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2392 goto label_invalid_coding_system;
2394 flags = XVECTOR (val)->contents;
2395 coding->flags
2396 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2397 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2398 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2399 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2400 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2401 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2402 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2403 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2404 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2405 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2406 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2407 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2408 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2411 /* Invoke graphic register 0 to plane 0. */
2412 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2413 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2414 CODING_SPEC_ISO_INVOCATION (coding, 1)
2415 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2416 /* Not single shifting at first. */
2417 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2418 /* Beginning of buffer should also be regarded as bol. */
2419 CODING_SPEC_ISO_BOL (coding) = 1;
2421 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2422 FLAGS[REG] can be one of below:
2423 integer CHARSET: CHARSET occupies register I,
2424 t: designate nothing to REG initially, but can be used
2425 by any charsets,
2426 list of integer, nil, or t: designate the first
2427 element (if integer) to REG initially, the remaining
2428 elements (if integer) is designated to REG on request,
2429 if an element is t, REG can be used by any charset,
2430 nil: REG is never used. */
2431 for (charset = 0; charset <= MAX_CHARSET; charset++)
2432 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2433 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2434 bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding), MAX_CHARSET + 1);
2435 for (i = 0; i < 4; i++)
2437 if (INTEGERP (flags[i])
2438 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2439 || (charset = get_charset_id (flags[i])) >= 0)
2441 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2442 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2443 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2445 else if (EQ (flags[i], Qt))
2447 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2448 default_reg_bits |= 1 << i;
2450 else if (CONSP (flags[i]))
2452 Lisp_Object tail = flags[i];
2454 if (INTEGERP (XCONS (tail)->car)
2455 && (charset = XINT (XCONS (tail)->car),
2456 CHARSET_VALID_P (charset))
2457 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2459 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2460 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2461 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2463 else
2464 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2465 tail = XCONS (tail)->cdr;
2466 while (CONSP (tail))
2468 if (INTEGERP (XCONS (tail)->car)
2469 && (charset = XINT (XCONS (tail)->car),
2470 CHARSET_VALID_P (charset))
2471 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2473 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2474 = i;
2475 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]
2476 = 1;
2478 else if (EQ (XCONS (tail)->car, Qt))
2479 default_reg_bits |= 1 << i;
2480 tail = XCONS (tail)->cdr;
2483 else
2484 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2486 CODING_SPEC_ISO_DESIGNATION (coding, i)
2487 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2490 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2492 /* REG 1 can be used only by locking shift in 7-bit env. */
2493 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2494 default_reg_bits &= ~2;
2495 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2496 /* Without any shifting, only REG 0 and 1 can be used. */
2497 default_reg_bits &= 3;
2500 for (charset = 0; charset <= MAX_CHARSET; charset++)
2501 if (CHARSET_VALID_P (charset)
2502 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2503 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2505 /* We have not yet decided where to designate CHARSET. */
2506 int reg_bits = default_reg_bits;
2508 if (CHARSET_CHARS (charset) == 96)
2509 /* A charset of CHARS96 can't be designated to REG 0. */
2510 reg_bits &= ~1;
2512 if (reg_bits)
2513 /* There exist some default graphic register. */
2514 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2515 = (reg_bits & 1
2516 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2517 else
2518 /* We anyway have to designate CHARSET to somewhere. */
2519 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2520 = (CHARSET_CHARS (charset) == 94
2522 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2523 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2525 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2526 ? 2 : 0)));
2529 coding->require_flushing = 1;
2530 break;
2532 case 3:
2533 coding->type = coding_type_big5;
2534 coding->flags
2535 = (NILP (XVECTOR (coding_system)->contents[4])
2536 ? CODING_FLAG_BIG5_HKU
2537 : CODING_FLAG_BIG5_ETEN);
2538 break;
2540 case 4:
2541 coding->type = coding_type_ccl;
2543 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2544 if (CONSP (val)
2545 && VECTORP (XCONS (val)->car)
2546 && VECTORP (XCONS (val)->cdr))
2548 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2549 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2551 else
2552 goto label_invalid_coding_system;
2554 coding->require_flushing = 1;
2555 break;
2557 case 5:
2558 coding->type = coding_type_raw_text;
2559 break;
2561 default:
2562 if (EQ (type, Qt))
2563 coding->type = coding_type_undecided;
2564 else
2565 coding->type = coding_type_no_conversion;
2566 break;
2568 return 0;
2570 label_invalid_coding_system:
2571 coding->type = coding_type_no_conversion;
2572 coding->eol_type = CODING_EOL_LF;
2573 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2574 = Qnil;
2575 return -1;
2578 /* Emacs has a mechanism to automatically detect a coding system if it
2579 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2580 it's impossible to distinguish some coding systems accurately
2581 because they use the same range of codes. So, at first, coding
2582 systems are categorized into the following categories:
2584 o coding-category-emacs-mule
2586 The category for a coding system which has the same code range
2587 as Emacs' internal format. Assigned the coding-system (Lisp
2588 symbol) `emacs-mule' by default.
2590 o coding-category-sjis
2592 The category for a coding system which has the same code range
2593 as SJIS. Assigned the coding-system (Lisp
2594 symbol) `japanese-shift-jis' by default.
2596 o coding-category-iso-7
2598 The category for a coding system which has the same code range
2599 as ISO2022 of 7-bit environment. This doesn't use any locking
2600 shift and single shift functions. Assigned the coding-system
2601 (Lisp symbol) `iso-2022-7bit' by default.
2603 o coding-category-iso-8-1
2605 The category for a coding system which has the same code range
2606 as ISO2022 of 8-bit environment and graphic plane 1 used only
2607 for DIMENSION1 charset. This doesn't use any locking shift
2608 and single shift functions. Assigned the coding-system (Lisp
2609 symbol) `iso-latin-1' by default.
2611 o coding-category-iso-8-2
2613 The category for a coding system which has the same code range
2614 as ISO2022 of 8-bit environment and graphic plane 1 used only
2615 for DIMENSION2 charset. This doesn't use any locking shift
2616 and single shift functions. Assigned the coding-system (Lisp
2617 symbol) `japanese-iso-8bit' by default.
2619 o coding-category-iso-7-else
2621 The category for a coding system which has the same code range
2622 as ISO2022 of 7-bit environment but uses locking shift or
2623 single shift functions. Assigned the coding-system (Lisp
2624 symbol) `iso-2022-7bit-lock' by default.
2626 o coding-category-iso-8-else
2628 The category for a coding system which has the same code range
2629 as ISO2022 of 8-bit environment but uses locking shift or
2630 single shift functions. Assigned the coding-system (Lisp
2631 symbol) `iso-2022-8bit-ss2' by default.
2633 o coding-category-big5
2635 The category for a coding system which has the same code range
2636 as BIG5. Assigned the coding-system (Lisp symbol)
2637 `cn-big5' by default.
2639 o coding-category-binary
2641 The category for a coding system not categorized in any of the
2642 above. Assigned the coding-system (Lisp symbol)
2643 `no-conversion' by default.
2645 Each of them is a Lisp symbol and the value is an actual
2646 `coding-system's (this is also a Lisp symbol) assigned by a user.
2647 What Emacs does actually is to detect a category of coding system.
2648 Then, it uses a `coding-system' assigned to it. If Emacs can't
2649 decide only one possible category, it selects a category of the
2650 highest priority. Priorities of categories are also specified by a
2651 user in a Lisp variable `coding-category-list'.
2655 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2656 If it detects possible coding systems, return an integer in which
2657 appropriate flag bits are set. Flag bits are defined by macros
2658 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2661 detect_coding_mask (src, src_bytes)
2662 unsigned char *src;
2663 int src_bytes;
2665 register unsigned char c;
2666 unsigned char *src_end = src + src_bytes;
2667 int mask;
2669 /* At first, skip all ASCII characters and control characters except
2670 for three ISO2022 specific control characters. */
2671 label_loop_detect_coding:
2672 while (src < src_end)
2674 c = *src;
2675 if (c >= 0x80
2676 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2677 break;
2678 src++;
2681 if (src >= src_end)
2682 /* We found nothing other than ASCII. There's nothing to do. */
2683 return CODING_CATEGORY_MASK_ANY;
2685 /* The text seems to be encoded in some multilingual coding system.
2686 Now, try to find in which coding system the text is encoded. */
2687 if (c < 0x80)
2689 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2690 /* C is an ISO2022 specific control code of C0. */
2691 mask = detect_coding_iso2022 (src, src_end);
2692 src++;
2693 if (mask == CODING_CATEGORY_MASK_ANY)
2694 /* No valid ISO2022 code follows C. Try again. */
2695 goto label_loop_detect_coding;
2697 else if (c < 0xA0)
2699 /* If C is a special latin extra code,
2700 or is an ISO2022 specific control code of C1 (SS2 or SS3),
2701 or is an ISO2022 control-sequence-introducer (CSI),
2702 we should also consider the possibility of ISO2022 codings. */
2703 if ((VECTORP (Vlatin_extra_code_table)
2704 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2705 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2706 || (c == ISO_CODE_CSI
2707 && (src < src_end
2708 && (*src == ']'
2709 || (src + 1 < src_end
2710 && src[1] == ']'
2711 && (*src == '0' || *src == '1' || *src == '2'))))))
2712 mask = (detect_coding_iso2022 (src, src_end)
2713 | detect_coding_sjis (src, src_end)
2714 | detect_coding_emacs_mule (src, src_end)
2715 | CODING_CATEGORY_MASK_RAW_TEXT);
2717 else
2718 /* C is the first byte of SJIS character code,
2719 or a leading-code of Emacs' internal format (emacs-mule). */
2720 mask = (detect_coding_sjis (src, src_end)
2721 | detect_coding_emacs_mule (src, src_end)
2722 | CODING_CATEGORY_MASK_RAW_TEXT);
2724 else
2725 /* C is a character of ISO2022 in graphic plane right,
2726 or a SJIS's 1-byte character code (i.e. JISX0201),
2727 or the first byte of BIG5's 2-byte code. */
2728 mask = (detect_coding_iso2022 (src, src_end)
2729 | detect_coding_sjis (src, src_end)
2730 | detect_coding_big5 (src, src_end)
2731 | CODING_CATEGORY_MASK_RAW_TEXT);
2733 return mask;
2736 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2737 The information of the detected coding system is set in CODING. */
2739 void
2740 detect_coding (coding, src, src_bytes)
2741 struct coding_system *coding;
2742 unsigned char *src;
2743 int src_bytes;
2745 int mask = detect_coding_mask (src, src_bytes);
2746 int idx;
2747 Lisp_Object val = Vcoding_category_list;
2749 if (mask == CODING_CATEGORY_MASK_ANY)
2750 /* We found nothing other than ASCII. There's nothing to do. */
2751 return;
2753 /* We found some plausible coding systems. Let's use a coding
2754 system of the highest priority. */
2756 if (CONSP (val))
2757 while (!NILP (val))
2759 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2760 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2761 break;
2762 val = XCONS (val)->cdr;
2764 else
2765 val = Qnil;
2767 if (NILP (val))
2769 /* For unknown reason, `Vcoding_category_list' contains none of
2770 found categories. Let's use any of them. */
2771 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2772 if (mask & (1 << idx))
2773 break;
2775 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2778 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2779 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2780 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
2782 #define MAX_EOL_CHECK_COUNT 3
2785 detect_eol_type (src, src_bytes)
2786 unsigned char *src;
2787 int src_bytes;
2789 unsigned char *src_end = src + src_bytes;
2790 unsigned char c;
2791 int total = 0; /* How many end-of-lines are found so far. */
2792 int eol_type = CODING_EOL_UNDECIDED;
2793 int this_eol_type;
2795 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2797 c = *src++;
2798 if (c == '\n' || c == '\r')
2800 total++;
2801 if (c == '\n')
2802 this_eol_type = CODING_EOL_LF;
2803 else if (src >= src_end || *src != '\n')
2804 this_eol_type = CODING_EOL_CR;
2805 else
2806 this_eol_type = CODING_EOL_CRLF, src++;
2808 if (eol_type == CODING_EOL_UNDECIDED)
2809 /* This is the first end-of-line. */
2810 eol_type = this_eol_type;
2811 else if (eol_type != this_eol_type)
2812 /* The found type is different from what found before.
2813 Let's notice the caller about this inconsistency. */
2814 return CODING_EOL_INCONSISTENT;
2818 return eol_type;
2821 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2822 is encoded. If it detects an appropriate format of end-of-line, it
2823 sets the information in *CODING. */
2825 void
2826 detect_eol (coding, src, src_bytes)
2827 struct coding_system *coding;
2828 unsigned char *src;
2829 int src_bytes;
2831 Lisp_Object val, coding_system;
2832 int eol_type = detect_eol_type (src, src_bytes);
2834 if (eol_type == CODING_EOL_UNDECIDED)
2835 /* We found no end-of-line in the source text. */
2836 return;
2838 if (eol_type == CODING_EOL_INCONSISTENT)
2840 #if 0
2841 /* This code is suppressed until we find a better way to
2842 distinguish raw text file and binary file. */
2844 /* If we have already detected that the coding is raw-text, the
2845 coding should actually be no-conversion. */
2846 if (coding->type == coding_type_raw_text)
2848 setup_coding_system (Qno_conversion, coding);
2849 return;
2851 /* Else, let's decode only text code anyway. */
2852 #endif /* 0 */
2853 eol_type == CODING_EOL_LF;
2856 coding_system = coding->symbol;
2857 while (!NILP (coding_system)
2858 && NILP (val = Fget (coding_system, Qeol_type)))
2859 coding_system = Fget (coding_system, Qcoding_system);
2860 if (VECTORP (val) && XVECTOR (val)->size == 3)
2861 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2864 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2865 decoding, it may detect coding system and format of end-of-line if
2866 those are not yet decided. */
2869 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2870 struct coding_system *coding;
2871 unsigned char *source, *destination;
2872 int src_bytes, dst_bytes;
2873 int *consumed;
2875 int produced;
2877 if (src_bytes <= 0)
2879 *consumed = 0;
2880 return 0;
2883 if (coding->type == coding_type_undecided)
2884 detect_coding (coding, source, src_bytes);
2886 if (coding->eol_type == CODING_EOL_UNDECIDED)
2887 detect_eol (coding, source, src_bytes);
2889 coding->carryover_size = 0;
2890 switch (coding->type)
2892 case coding_type_no_conversion:
2893 label_no_conversion:
2894 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2895 bcopy (source, destination, produced);
2896 *consumed = produced;
2897 break;
2899 case coding_type_emacs_mule:
2900 case coding_type_undecided:
2901 case coding_type_raw_text:
2902 if (coding->eol_type == CODING_EOL_LF
2903 || coding->eol_type == CODING_EOL_UNDECIDED)
2904 goto label_no_conversion;
2905 produced = decode_eol (coding, source, destination,
2906 src_bytes, dst_bytes, consumed);
2907 break;
2909 case coding_type_sjis:
2910 produced = decode_coding_sjis_big5 (coding, source, destination,
2911 src_bytes, dst_bytes, consumed,
2913 break;
2915 case coding_type_iso2022:
2916 produced = decode_coding_iso2022 (coding, source, destination,
2917 src_bytes, dst_bytes, consumed);
2918 break;
2920 case coding_type_big5:
2921 produced = decode_coding_sjis_big5 (coding, source, destination,
2922 src_bytes, dst_bytes, consumed,
2924 break;
2926 case coding_type_ccl:
2927 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2928 src_bytes, dst_bytes, consumed);
2929 break;
2932 return produced;
2935 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2938 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2939 struct coding_system *coding;
2940 unsigned char *source, *destination;
2941 int src_bytes, dst_bytes;
2942 int *consumed;
2944 int produced;
2946 switch (coding->type)
2948 case coding_type_no_conversion:
2949 label_no_conversion:
2950 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2951 if (produced > 0)
2953 bcopy (source, destination, produced);
2954 if (coding->selective)
2956 unsigned char *p = destination, *pend = destination + produced;
2957 while (p < pend)
2958 if (*p++ == '\015') p[-1] = '\n';
2961 *consumed = produced;
2962 break;
2964 case coding_type_emacs_mule:
2965 case coding_type_undecided:
2966 case coding_type_raw_text:
2967 if (coding->eol_type == CODING_EOL_LF
2968 || coding->eol_type == CODING_EOL_UNDECIDED)
2969 goto label_no_conversion;
2970 produced = encode_eol (coding, source, destination,
2971 src_bytes, dst_bytes, consumed);
2972 break;
2974 case coding_type_sjis:
2975 produced = encode_coding_sjis_big5 (coding, source, destination,
2976 src_bytes, dst_bytes, consumed,
2978 break;
2980 case coding_type_iso2022:
2981 produced = encode_coding_iso2022 (coding, source, destination,
2982 src_bytes, dst_bytes, consumed);
2983 break;
2985 case coding_type_big5:
2986 produced = encode_coding_sjis_big5 (coding, source, destination,
2987 src_bytes, dst_bytes, consumed,
2989 break;
2991 case coding_type_ccl:
2992 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2993 src_bytes, dst_bytes, consumed);
2994 break;
2997 return produced;
3000 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3002 /* Return maximum size (bytes) of a buffer enough for decoding
3003 SRC_BYTES of text encoded in CODING. */
3006 decoding_buffer_size (coding, src_bytes)
3007 struct coding_system *coding;
3008 int src_bytes;
3010 int magnification;
3012 if (coding->type == coding_type_iso2022)
3013 magnification = 3;
3014 else if (coding->type == coding_type_ccl)
3015 magnification = coding->spec.ccl.decoder.buf_magnification;
3016 else
3017 magnification = 2;
3019 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3022 /* Return maximum size (bytes) of a buffer enough for encoding
3023 SRC_BYTES of text to CODING. */
3026 encoding_buffer_size (coding, src_bytes)
3027 struct coding_system *coding;
3028 int src_bytes;
3030 int magnification;
3032 if (coding->type == coding_type_ccl)
3033 magnification = coding->spec.ccl.encoder.buf_magnification;
3034 else
3035 magnification = 3;
3037 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Work area shared by all conversions, and its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The buffer is the shared work area
   `conversion_buffer'; when it is too small it is replaced by a
   freshly allocated one whose size is obtained by repeated doubling,
   and the previous contents are NOT preserved.

   NOTE(review): the value of xmalloc is used unchecked, so this
   function does not itself report memory exhaustion -- confirm how
   xmalloc behaves when memory runs out.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      /* Doubling a non-positive size would never reach SIZE (the loop
         below would not terminate), so start from the minimum when the
         work area has not been set up yet.  */
      int real_size = (conversion_buffer_size > 0
                       ? conversion_buffer_size * 2
                       : MINIMUM_CONVERSION_BUFFER_SIZE);

      while (real_size < size)
        real_size *= 2;

      buf = (char *) xmalloc (real_size);
      if (conversion_buffer)
        xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3070 #ifdef emacs
3071 /*** 7. Emacs Lisp library functions ***/
3073 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
3074 1, 1, 0,
3075 "Return coding-spec of CODING-SYSTEM.\n\
3076 If CODING-SYSTEM is not a valid coding-system, return nil.")
3077 (obj)
3078 Lisp_Object obj;
3080 while (SYMBOLP (obj) && !NILP (obj))
3081 obj = Fget (obj, Qcoding_system);
3082 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3083 ? Qnil : obj);
3086 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3087 "Return t if OBJECT is nil or a coding-system.\n\
3088 See document of make-coding-system for coding-system object.")
3089 (obj)
3090 Lisp_Object obj;
3092 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
3095 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3096 Sread_non_nil_coding_system, 1, 1, 0,
3097 "Read a coding system from the minibuffer, prompting with string PROMPT.")
3098 (prompt)
3099 Lisp_Object prompt;
3101 Lisp_Object val;
3104 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
3105 Qt, Qnil, Qnil, Qnil, Qnil);
3107 while (XSTRING (val)->size == 0);
3108 return (Fintern (val, Qnil));
3111 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
3112 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
3113 (prompt)
3114 Lisp_Object prompt;
3116 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
3117 Qt, Qnil, Qnil, Qnil, Qnil);
3118 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3121 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3122 1, 1, 0,
3123 "Check validity of CODING-SYSTEM.\n\
3124 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3125 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3126 The value of property should be a vector of length 5.")
3127 (coding_system)
3128 Lisp_Object coding_system;
3130 CHECK_SYMBOL (coding_system, 0);
3131 if (!NILP (Fcoding_system_p (coding_system)))
3132 return coding_system;
3133 while (1)
3134 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3137 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3138 2, 2, 0,
3139 "Detect coding system of the text in the region between START and END.\n\
3140 Return a list of possible coding systems ordered by priority.\n\
3141 If only ASCII characters are found, it returns `undecided'\n\
3142 or its subsidiary coding system according to a detected end-of-line format.")
3143 (b, e)
3144 Lisp_Object b, e;
3146 int coding_mask, eol_type;
3147 Lisp_Object val;
3148 int beg, end;
3150 validate_region (&b, &e);
3151 beg = XINT (b), end = XINT (e);
3152 if (beg < GPT && end >= GPT) move_gap (end);
3154 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3155 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
3157 if (coding_mask == CODING_CATEGORY_MASK_ANY)
3159 val = Qundecided;
3160 if (eol_type != CODING_EOL_UNDECIDED
3161 && eol_type != CODING_EOL_INCONSISTENT)
3163 Lisp_Object val2 = Fget (Qundecided, Qeol_type);
3164 if (VECTORP (val2))
3165 val = XVECTOR (val2)->contents[eol_type];
3168 else
3170 Lisp_Object val2;
3172 /* At first, gather possible coding-systems in VAL in a reverse
3173 order. */
3174 val = Qnil;
3175 for (val2 = Vcoding_category_list;
3176 !NILP (val2);
3177 val2 = XCONS (val2)->cdr)
3179 int idx
3180 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3181 if (coding_mask & (1 << idx))
3183 #if 0
3184 /* This code is suppressed until we find a better way to
3185 distinguish raw text file and binary file. */
3187 if (idx == CODING_CATEGORY_IDX_RAW_TEXT
3188 && eol_type == CODING_EOL_INCONSISTENT)
3189 val = Fcons (Qno_conversion, val);
3190 else
3191 #endif /* 0 */
3192 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3196 /* Then, change the order of the list, while getting subsidiary
3197 coding-systems. */
3198 val2 = val;
3199 val = Qnil;
3200 if (eol_type == CODING_EOL_INCONSISTENT)
3201 eol_type == CODING_EOL_UNDECIDED;
3202 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3204 if (eol_type == CODING_EOL_UNDECIDED)
3205 val = Fcons (XCONS (val2)->car, val);
3206 else
3208 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
3209 if (VECTORP (val3))
3210 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3211 else
3212 val = Fcons (XCONS (val2)->car, val);
3217 return val;
3220 /* Scan text in the region between *BEGP and *ENDP, skip characters
3221 which we never have to encode to (iff ENCODEP is 1) or decode from
3222 coding system CODING at the head and tail, then set BEGP and ENDP
3223 to the addresses of start and end of the text we actually convert. */
3225 void
3226 shrink_conversion_area (begp, endp, coding, encodep)
3227 unsigned char **begp, **endp;
3228 struct coding_system *coding;
3229 int encodep;
3231 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3233 if (coding->eol_type != CODING_EOL_LF
3234 && coding->eol_type != CODING_EOL_UNDECIDED)
3235 /* Since we anyway have to convert end-of-line format, it is not
3236 worth skipping at most 100 bytes or so. */
3237 return;
3239 if (encodep) /* for encoding */
3241 switch (coding->type)
3243 case coding_type_no_conversion:
3244 case coding_type_emacs_mule:
3245 case coding_type_undecided:
3246 case coding_type_raw_text:
3247 /* We need no conversion. */
3248 *begp = *endp;
3249 return;
3250 case coding_type_ccl:
3251 /* We can't skip any data. */
3252 return;
3253 case coding_type_iso2022:
3254 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3256 unsigned char *bol = beg_addr;
3257 while (beg_addr < end_addr && *beg_addr < 0x80)
3259 beg_addr++;
3260 if (*(beg_addr - 1) == '\n')
3261 bol = beg_addr;
3263 beg_addr = bol;
3264 goto label_skip_tail;
3266 /* fall down ... */
3267 default:
3268 /* We can skip all ASCII characters at the head and tail. */
3269 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3270 label_skip_tail:
3271 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3272 break;
3275 else /* for decoding */
3277 switch (coding->type)
3279 case coding_type_no_conversion:
3280 /* We need no conversion. */
3281 *begp = *endp;
3282 return;
3283 case coding_type_emacs_mule:
3284 case coding_type_raw_text:
3285 if (coding->eol_type == CODING_EOL_LF)
3287 /* We need no conversion. */
3288 *begp = *endp;
3289 return;
3291 /* We can skip all but carriage-return. */
3292 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3293 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3294 break;
3295 case coding_type_sjis:
3296 case coding_type_big5:
3297 /* We can skip all ASCII characters at the head. */
3298 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3299 /* We can skip all ASCII characters at the tail except for
3300 the second byte of SJIS or BIG5 code. */
3301 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3302 if (end_addr != *endp)
3303 end_addr++;
3304 break;
3305 case coding_type_ccl:
3306 /* We can't skip any data. */
3307 return;
3308 default: /* i.e. case coding_type_iso2022: */
3310 unsigned char c;
3312 /* We can skip all ASCII characters except for a few
3313 control codes at the head. */
3314 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3315 && c != ISO_CODE_CR && c != ISO_CODE_SO
3316 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3317 beg_addr++;
3319 break;
3322 *begp = beg_addr;
3323 *endp = end_addr;
3324 return;
3327 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3328 text between B and E. B and E are buffer position. */
3330 Lisp_Object
3331 code_convert_region (b, e, coding, encodep)
3332 Lisp_Object b, e;
3333 struct coding_system *coding;
3334 int encodep;
3336 int beg, end, len, consumed, produced;
3337 char *buf;
3338 unsigned char *begp, *endp;
3339 int pos = PT;
3341 validate_region (&b, &e);
3342 beg = XINT (b), end = XINT (e);
3343 if (beg < GPT && end >= GPT)
3344 move_gap (end);
3346 if (encodep && !NILP (coding->pre_write_conversion))
3348 /* We must call a pre-conversion function which may put a new
3349 text to be converted in a new buffer. */
3350 struct buffer *old = current_buffer, *new;
3352 TEMP_SET_PT (beg);
3353 call2 (coding->pre_write_conversion, b, e);
3354 if (old != current_buffer)
3356 /* Replace the original text by the text just generated. */
3357 len = ZV - BEGV;
3358 new = current_buffer;
3359 set_buffer_internal (old);
3360 del_range (beg, end);
3361 insert_from_buffer (new, 1, len, 0);
3362 end = beg + len;
3366 /* We may be able to shrink the conversion region. */
3367 begp = POS_ADDR (beg); endp = begp + (end - beg);
3368 shrink_conversion_area (&begp, &endp, coding, encodep);
3370 if (begp == endp)
3371 /* We need no conversion. */
3372 len = end - beg;
3373 else
3375 beg += begp - POS_ADDR (beg);
3376 end = beg + (endp - begp);
3378 if (encodep)
3379 len = encoding_buffer_size (coding, end - beg);
3380 else
3381 len = decoding_buffer_size (coding, end - beg);
3382 buf = get_conversion_buffer (len);
3384 coding->last_block = 1;
3385 produced = (encodep
3386 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3387 &consumed)
3388 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3389 &consumed));
3391 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3393 TEMP_SET_PT (beg);
3394 insert (buf, produced);
3395 del_range (PT, PT + end - beg);
3396 if (pos >= end)
3397 pos = PT + (pos - end);
3398 else if (pos > beg)
3399 pos = beg;
3400 TEMP_SET_PT (pos);
3403 if (!encodep && !NILP (coding->post_read_conversion))
3405 /* We must call a post-conversion function which may alter
3406 the text just converted. */
3407 Lisp_Object insval;
3409 beg = XINT (b);
3410 TEMP_SET_PT (beg);
3411 insval = call1 (coding->post_read_conversion, make_number (len));
3412 CHECK_NUMBER (insval, 0);
3413 len = XINT (insval);
3416 return make_number (len);
3419 Lisp_Object
3420 code_convert_string (str, coding, encodep, nocopy)
3421 Lisp_Object str, nocopy;
3422 struct coding_system *coding;
3423 int encodep;
3425 int len, consumed, produced;
3426 char *buf;
3427 unsigned char *begp, *endp;
3428 int head_skip, tail_skip;
3429 struct gcpro gcpro1;
3431 if (encodep && !NILP (coding->pre_write_conversion)
3432 || !encodep && !NILP (coding->post_read_conversion))
3434 /* Since we have to call Lisp functions which assume target text
3435 is in a buffer, after setting a temporary buffer, call
3436 code_convert_region. */
3437 int count = specpdl_ptr - specpdl;
3438 int len = XSTRING (str)->size;
3439 Lisp_Object result;
3440 struct buffer *old = current_buffer;
3442 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3443 temp_output_buffer_setup (" *code-converting-work*");
3444 set_buffer_internal (XBUFFER (Vstandard_output));
3445 insert_from_string (str, 0, len, 0);
3446 code_convert_region (make_number (BEGV), make_number (ZV),
3447 coding, encodep);
3448 result = make_buffer_string (BEGV, ZV, 0);
3449 set_buffer_internal (old);
3450 return unbind_to (count, result);
3453 /* We may be able to shrink the conversion region. */
3454 begp = XSTRING (str)->data;
3455 endp = begp + XSTRING (str)->size;
3456 shrink_conversion_area (&begp, &endp, coding, encodep);
3458 if (begp == endp)
3459 /* We need no conversion. */
3460 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3462 head_skip = begp - XSTRING (str)->data;
3463 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3465 GCPRO1 (str);
3467 if (encodep)
3468 len = encoding_buffer_size (coding, endp - begp);
3469 else
3470 len = decoding_buffer_size (coding, endp - begp);
3471 buf = get_conversion_buffer (len + head_skip + tail_skip);
3473 bcopy (XSTRING (str)->data, buf, head_skip);
3474 coding->last_block = 1;
3475 produced = (encodep
3476 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3477 buf + head_skip, endp - begp, len, &consumed)
3478 : decode_coding (coding, XSTRING (str)->data + head_skip,
3479 buf + head_skip, endp - begp, len, &consumed));
3480 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3481 buf + head_skip + produced,
3482 tail_skip);
3484 UNGCPRO;
3486 return make_string (buf, head_skip + produced + tail_skip);
3489 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3490 3, 3, "r\nzCoding system: ",
3491 "Decode current region by specified coding system.\n\
3492 When called from a program, takes three arguments:\n\
3493 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3494 Return length of decoded text.")
3495 (b, e, coding_system)
3496 Lisp_Object b, e, coding_system;
3498 struct coding_system coding;
3500 CHECK_NUMBER_COERCE_MARKER (b, 0);
3501 CHECK_NUMBER_COERCE_MARKER (e, 1);
3502 CHECK_SYMBOL (coding_system, 2);
3504 if (NILP (coding_system))
3505 return make_number (XFASTINT (e) - XFASTINT (b));
3506 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3507 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3509 return code_convert_region (b, e, &coding, 0);
3512 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3513 3, 3, "r\nzCoding system: ",
3514 "Encode current region by specified coding system.\n\
3515 When called from a program, takes three arguments:\n\
3516 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3517 Return length of encoded text.")
3518 (b, e, coding_system)
3519 Lisp_Object b, e, coding_system;
3521 struct coding_system coding;
3523 CHECK_NUMBER_COERCE_MARKER (b, 0);
3524 CHECK_NUMBER_COERCE_MARKER (e, 1);
3525 CHECK_SYMBOL (coding_system, 2);
3527 if (NILP (coding_system))
3528 return make_number (XFASTINT (e) - XFASTINT (b));
3529 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3530 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3532 return code_convert_region (b, e, &coding, 1);
3535 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3536 2, 3, 0,
3537 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3538 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3539 of decoding.")
3540 (string, coding_system, nocopy)
3541 Lisp_Object string, coding_system, nocopy;
3543 struct coding_system coding;
3545 CHECK_STRING (string, 0);
3546 CHECK_SYMBOL (coding_system, 1);
3548 if (NILP (coding_system))
3549 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3550 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3551 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3553 return code_convert_string (string, &coding, 0, nocopy);
3556 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3557 2, 3, 0,
3558 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3559 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3560 of encoding.")
3561 (string, coding_system, nocopy)
3562 Lisp_Object string, coding_system, nocopy;
3564 struct coding_system coding;
3566 CHECK_STRING (string, 0);
3567 CHECK_SYMBOL (coding_system, 1);
3569 if (NILP (coding_system))
3570 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3571 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3572 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3574 return code_convert_string (string, &coding, 1, nocopy);
3577 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3578 "Decode a JISX0208 character of shift-jis encoding.\n\
3579 CODE is the character code in SJIS.\n\
3580 Return the corresponding character.")
3581 (code)
3582 Lisp_Object code;
3584 unsigned char c1, c2, s1, s2;
3585 Lisp_Object val;
3587 CHECK_NUMBER (code, 0);
3588 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3589 DECODE_SJIS (s1, s2, c1, c2);
3590 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3591 return val;
3594 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3595 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3596 Return the corresponding character code in SJIS.")
3597 (ch)
3598 Lisp_Object ch;
3600 int charset, c1, c2, s1, s2;
3601 Lisp_Object val;
3603 CHECK_NUMBER (ch, 0);
3604 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3605 if (charset == charset_jisx0208)
3607 ENCODE_SJIS (c1, c2, s1, s2);
3608 XSETFASTINT (val, (s1 << 8) | s2);
3610 else
3611 XSETFASTINT (val, 0);
3612 return val;
3615 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3616 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3617 CODE is the character code in BIG5.\n\
3618 Return the corresponding character.")
3619 (code)
3620 Lisp_Object code;
3622 int charset;
3623 unsigned char b1, b2, c1, c2;
3624 Lisp_Object val;
3626 CHECK_NUMBER (code, 0);
3627 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3628 DECODE_BIG5 (b1, b2, charset, c1, c2);
3629 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3630 return val;
3633 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3634 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3635 Return the corresponding character code in Big5.")
3636 (ch)
3637 Lisp_Object ch;
3639 int charset, c1, c2, b1, b2;
3640 Lisp_Object val;
3642 CHECK_NUMBER (ch, 0);
3643 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3644 if (charset == charset_big5_1 || charset == charset_big5_2)
3646 ENCODE_BIG5 (charset, c1, c2, b1, b2);
3647 XSETFASTINT (val, (b1 << 8) | b2);
3649 else
3650 XSETFASTINT (val, 0);
3651 return val;
3654 DEFUN ("set-terminal-coding-system-internal",
3655 Fset_terminal_coding_system_internal,
3656 Sset_terminal_coding_system_internal, 1, 1, 0, "")
3657 (coding_system)
3658 Lisp_Object coding_system;
3660 CHECK_SYMBOL (coding_system, 0);
3661 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3662 /* We had better not send unexpected characters to terminal. */
3663 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3665 return Qnil;
3668 DEFUN ("set-safe-terminal-coding-system-internal",
3669 Fset_safe_terminal_coding_system_internal,
3670 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3671 (coding_system)
3672 Lisp_Object coding_system;
3674 CHECK_SYMBOL (coding_system, 0);
3675 setup_coding_system (Fcheck_coding_system (coding_system),
3676 &safe_terminal_coding);
3677 return Qnil;
3680 DEFUN ("terminal-coding-system",
3681 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3682 "Return coding-system of your terminal.")
3685 return terminal_coding.symbol;
3688 DEFUN ("set-keyboard-coding-system-internal",
3689 Fset_keyboard_coding_system_internal,
3690 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3691 (coding_system)
3692 Lisp_Object coding_system;
3694 CHECK_SYMBOL (coding_system, 0);
3695 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3696 return Qnil;
3699 DEFUN ("keyboard-coding-system",
3700 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3701 "Return coding-system of what is sent from terminal keyboard.")
3704 return keyboard_coding.symbol;
3708 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3709 Sfind_operation_coding_system, 1, MANY, 0,
3710 "Choose a coding system for an operation based on the target name.\n\
3711 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3712 DECODING-SYSTEM is the coding system to use for decoding\n\
3713 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3714 for encoding (in case OPERATION does encoding).\n\
3716 The first argument OPERATION specifies an I/O primitive:\n\
3717 For file I/O, `insert-file-contents' or `write-region'.\n\
3718 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3719 For network I/O, `open-network-stream'.\n\
3721 The remaining arguments should be the same arguments that were passed\n\
3722 to the primitive. Depending on which primitive, one of those arguments\n\
3723 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3724 whichever argument specifies the file name is TARGET.\n\
3726 TARGET has a meaning which depends on OPERATION:\n\
3727 For file I/O, TARGET is a file name.\n\
3728 For process I/O, TARGET is a process name.\n\
3729 For network I/O, TARGET is a service name or a port number\n\
3731 This function looks up what specified for TARGET in,\n\
3732 `file-coding-system-alist', `process-coding-system-alist',\n\
3733 or `network-coding-system-alist' depending on OPERATION.\n\
3734 They may specify a coding system, a cons of coding systems,\n\
3735 or a function symbol to call.\n\
3736 In the last case, we call the function with one argument,\n\
3737 which is a list of all the arguments given to this function.")
3738 (nargs, args)
3739 int nargs;
3740 Lisp_Object *args;
3742 Lisp_Object operation, target_idx, target, val;
3743 register Lisp_Object chain;
3745 if (nargs < 2)
3746 error ("Too few arguments");
3747 operation = args[0];
3748 if (!SYMBOLP (operation)
3749 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3750 error ("Invalid first arguement");
3751 if (nargs < 1 + XINT (target_idx))
3752 error ("Too few arguments for operation: %s",
3753 XSYMBOL (operation)->name->data);
3754 target = args[XINT (target_idx) + 1];
3755 if (!(STRINGP (target)
3756 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3757 error ("Invalid %dth argument", XINT (target_idx) + 1);
3759 chain = ((EQ (operation, Qinsert_file_contents)
3760 || EQ (operation, Qwrite_region))
3761 ? Vfile_coding_system_alist
3762 : (EQ (operation, Qopen_network_stream)
3763 ? Vnetwork_coding_system_alist
3764 : Vprocess_coding_system_alist));
3765 if (NILP (chain))
3766 return Qnil;
3768 for (; CONSP (chain); chain = XCONS (chain)->cdr)
3770 Lisp_Object elt = XCONS (chain)->car;
3772 if (CONSP (elt)
3773 && ((STRINGP (target)
3774 && STRINGP (XCONS (elt)->car)
3775 && fast_string_match (XCONS (elt)->car, target) >= 0)
3776 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3778 val = XCONS (elt)->cdr;
3779 if (CONSP (val))
3780 return val;
3781 if (! SYMBOLP (val))
3782 return Qnil;
3783 if (! NILP (Fcoding_system_p (val)))
3784 return Fcons (val, val);
3785 if (!NILP (Ffboundp (val)))
3786 return call1 (val, Flist (nargs, args));
3787 return Qnil;
3790 return Qnil;
3793 #endif /* emacs */
3796 /*** 8. Post-amble ***/
3798 init_coding_once ()
3800 int i;
3802 /* Emacs' internal format specific initialize routine. */
3803 for (i = 0; i <= 0x20; i++)
3804 emacs_code_class[i] = EMACS_control_code;
3805 emacs_code_class[0x0A] = EMACS_linefeed_code;
3806 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3807 for (i = 0x21 ; i < 0x7F; i++)
3808 emacs_code_class[i] = EMACS_ascii_code;
3809 emacs_code_class[0x7F] = EMACS_control_code;
3810 emacs_code_class[0x80] = EMACS_leading_code_composition;
3811 for (i = 0x81; i < 0xFF; i++)
3812 emacs_code_class[i] = EMACS_invalid_code;
3813 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3814 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3815 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3816 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3818 /* ISO2022 specific initialize routine. */
3819 for (i = 0; i < 0x20; i++)
3820 iso_code_class[i] = ISO_control_code;
3821 for (i = 0x21; i < 0x7F; i++)
3822 iso_code_class[i] = ISO_graphic_plane_0;
3823 for (i = 0x80; i < 0xA0; i++)
3824 iso_code_class[i] = ISO_control_code;
3825 for (i = 0xA1; i < 0xFF; i++)
3826 iso_code_class[i] = ISO_graphic_plane_1;
3827 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3828 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3829 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3830 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3831 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3832 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3833 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3834 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3835 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3836 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3838 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3839 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3841 setup_coding_system (Qnil, &keyboard_coding);
3842 setup_coding_system (Qnil, &terminal_coding);
3843 setup_coding_system (Qnil, &safe_terminal_coding);
3845 #if defined (MSDOS) || defined (WINDOWSNT)
3846 system_eol_type = CODING_EOL_CRLF;
3847 #else
3848 system_eol_type = CODING_EOL_LF;
3849 #endif
3852 #ifdef emacs
3854 syms_of_coding ()
3856 Qtarget_idx = intern ("target-idx");
3857 staticpro (&Qtarget_idx);
3859 /* Target FILENAME is the first argument. */
3860 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3861 /* Target FILENAME is the third argument. */
3862 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3864 Qcall_process = intern ("call-process");
3865 staticpro (&Qcall_process);
3866 /* Target PROGRAM is the first argument. */
3867 Fput (Qcall_process, Qtarget_idx, make_number (0));
3869 Qcall_process_region = intern ("call-process-region");
3870 staticpro (&Qcall_process_region);
3871 /* Target PROGRAM is the third argument. */
3872 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3874 Qstart_process = intern ("start-process");
3875 staticpro (&Qstart_process);
3876 /* Target PROGRAM is the third argument. */
3877 Fput (Qstart_process, Qtarget_idx, make_number (2));
3879 Qopen_network_stream = intern ("open-network-stream");
3880 staticpro (&Qopen_network_stream);
3881 /* Target SERVICE is the fourth argument. */
3882 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3884 Qcoding_system = intern ("coding-system");
3885 staticpro (&Qcoding_system);
3887 Qeol_type = intern ("eol-type");
3888 staticpro (&Qeol_type);
3890 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3891 staticpro (&Qbuffer_file_coding_system);
3893 Qpost_read_conversion = intern ("post-read-conversion");
3894 staticpro (&Qpost_read_conversion);
3896 Qpre_write_conversion = intern ("pre-write-conversion");
3897 staticpro (&Qpre_write_conversion);
3899 Qno_conversion = intern ("no-conversion");
3900 staticpro (&Qno_conversion);
3902 Qundecided = intern ("undecided");
3903 staticpro (&Qundecided);
3905 Qcoding_system_spec = intern ("coding-system-spec");
3906 staticpro (&Qcoding_system_spec);
3908 Qcoding_system_p = intern ("coding-system-p");
3909 staticpro (&Qcoding_system_p);
3911 Qcoding_system_error = intern ("coding-system-error");
3912 staticpro (&Qcoding_system_error);
3914 Fput (Qcoding_system_error, Qerror_conditions,
3915 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3916 Fput (Qcoding_system_error, Qerror_message,
3917 build_string ("Invalid coding system"));
3919 Qcoding_category_index = intern ("coding-category-index");
3920 staticpro (&Qcoding_category_index);
3923 int i;
3924 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3926 coding_category_table[i] = intern (coding_category_name[i]);
3927 staticpro (&coding_category_table[i]);
3928 Fput (coding_category_table[i], Qcoding_category_index,
3929 make_number (i));
3933 Qcharacter_unification_table = intern ("character-unification-table");
3934 staticpro (&Qcharacter_unification_table);
3935 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3936 make_number (0));
3938 Qcharacter_unification_table_for_decode
3939 = intern ("character-unification-table-for-decode");
3940 staticpro (&Qcharacter_unification_table_for_decode);
3942 Qcharacter_unification_table_for_encode
3943 = intern ("character-unification-table-for-encode");
3944 staticpro (&Qcharacter_unification_table_for_encode);
3946 Qemacs_mule = intern ("emacs-mule");
3947 staticpro (&Qemacs_mule);
3949 defsubr (&Scoding_system_spec);
3950 defsubr (&Scoding_system_p);
3951 defsubr (&Sread_coding_system);
3952 defsubr (&Sread_non_nil_coding_system);
3953 defsubr (&Scheck_coding_system);
3954 defsubr (&Sdetect_coding_region);
3955 defsubr (&Sdecode_coding_region);
3956 defsubr (&Sencode_coding_region);
3957 defsubr (&Sdecode_coding_string);
3958 defsubr (&Sencode_coding_string);
3959 defsubr (&Sdecode_sjis_char);
3960 defsubr (&Sencode_sjis_char);
3961 defsubr (&Sdecode_big5_char);
3962 defsubr (&Sencode_big5_char);
3963 defsubr (&Sset_terminal_coding_system_internal);
3964 defsubr (&Sset_safe_terminal_coding_system_internal);
3965 defsubr (&Sterminal_coding_system);
3966 defsubr (&Sset_keyboard_coding_system_internal);
3967 defsubr (&Skeyboard_coding_system);
3968 defsubr (&Sfind_operation_coding_system);
3970 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3971 "List of coding-categories (symbols) ordered by priority.");
3973 int i;
3975 Vcoding_category_list = Qnil;
3976 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3977 Vcoding_category_list
3978 = Fcons (coding_category_table[i], Vcoding_category_list);
3981 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3982 "Specify the coding system for read operations.\n\
3983 It is useful to bind this variable with `let', but do not set it globally.\n\
3984 If the value is a coding system, it is used for decoding on read operation.\n\
3985 If not, an appropriate element is used from one of the coding system alists:\n\
3986 There are three such tables, `file-coding-system-alist',\n\
3987 `process-coding-system-alist', and `network-coding-system-alist'.");
3988 Vcoding_system_for_read = Qnil;
3990 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3991 "Specify the coding system for write operations.\n\
3992 It is useful to bind this variable with `let', but do not set it globally.\n\
3993 If the value is a coding system, it is used for encoding on write operation.\n\
3994 If not, an appropriate element is used from one of the coding system alists:\n\
3995 There are three such tables, `file-coding-system-alist',\n\
3996 `process-coding-system-alist', and `network-coding-system-alist'.");
3997 Vcoding_system_for_write = Qnil;
3999 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
4000 "Coding system used in the latest file or process I/O.");
4001 Vlast_coding_system_used = Qnil;
4003 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
4004 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
4005 inhibit_eol_conversion = 0;
4007 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
4008 "Alist to decide a coding system to use for a file I/O operation.\n\
4009 The format is ((PATTERN . VAL) ...),\n\
4010 where PATTERN is a regular expression matching a file name,\n\
4011 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4012 If VAL is a coding system, it is used for both decoding and encoding\n\
4013 the file contents.\n\
4014 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4015 and the cdr part is used for encoding.\n\
4016 If VAL is a function symbol, the function must return a coding system\n\
4017 or a cons of coding systems which are used as above.\n\
4019 See also the function `find-operation-coding-system'.");
4020 Vfile_coding_system_alist = Qnil;
4022 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
4023 "Alist to decide a coding system to use for a process I/O operation.\n\
4024 The format is ((PATTERN . VAL) ...),\n\
4025 where PATTERN is a regular expression matching a program name,\n\
4026 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4027 If VAL is a coding system, it is used for both decoding what received\n\
4028 from the program and encoding what sent to the program.\n\
4029 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4030 and the cdr part is used for encoding.\n\
4031 If VAL is a function symbol, the function must return a coding system\n\
4032 or a cons of coding systems which are used as above.\n\
4034 See also the function `find-operation-coding-system'.");
4035 Vprocess_coding_system_alist = Qnil;
4037 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
4038 "Alist to decide a coding system to use for a network I/O operation.\n\
4039 The format is ((PATTERN . VAL) ...),\n\
4040 where PATTERN is a regular expression matching a network service name\n\
4041 or is a port number to connect to,\n\
4042 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4043 If VAL is a coding system, it is used for both decoding what received\n\
4044 from the network stream and encoding what sent to the network stream.\n\
4045 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4046 and the cdr part is used for encoding.\n\
4047 If VAL is a function symbol, the function must return a coding system\n\
4048 or a cons of coding systems which are used as above.\n\
4050 See also the function `find-operation-coding-system'.");
4051 Vnetwork_coding_system_alist = Qnil;
4053 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4054 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
4055 eol_mnemonic_unix = ':';
4057 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4058 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
4059 eol_mnemonic_dos = '\\';
4061 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4062 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
4063 eol_mnemonic_mac = '/';
4065 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4066 "Mnemonic character indicating end-of-line format is not yet decided.");
4067 eol_mnemonic_undecided = ':';
4069 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4070 "Non-nil means ISO 2022 encoder/decoder do character unification.");
4071 Venable_character_unification = Qt;
4073 DEFVAR_LISP ("standard-character-unification-table-for-decode",
4074 &Vstandard_character_unification_table_for_decode,
4075 "Table for unifying characters when reading.");
4076 Vstandard_character_unification_table_for_decode = Qnil;
4078 DEFVAR_LISP ("standard-character-unification-table-for-encode",
4079 &Vstandard_character_unification_table_for_encode,
4080 "Table for unifying characters when writing.");
4081 Vstandard_character_unification_table_for_encode = Qnil;
4083 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4084 "Alist of charsets vs revision numbers.\n\
4085 While encoding, if a charset (car part of an element) is found,\n\
4086 designate it with the escape sequence identifing revision (cdr part of the element).");
4087 Vcharset_revision_alist = Qnil;
4089 DEFVAR_LISP ("default-process-coding-system",
4090 &Vdefault_process_coding_system,
4091 "Cons of coding systems used for process I/O by default.\n\
4092 The car part is used for decoding a process output,\n\
4093 the cdr part is used for encoding a text to be sent to a process.");
4094 Vdefault_process_coding_system = Qnil;
4096 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4097 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
4098 This is a vector of length 256.\n\
4099 If Nth element is non-nil, the existence of code N in a file\n\
4100 (or output of subprocess) doesn't prevent it to be detected as\n\
4101 a coding system of ISO 2022 variant which has a flag\n\
4102 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
4103 or reading output of a subprocess.\n\
4104 Only 128th through 159th elements has a meaning.");
4105 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4108 #endif /* emacs */