(xcar, xcdr): Print with /x.
[emacs.git] / src / coding.c
blob116a54e444f14e9b9cb1eccc8696708c42731b89
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
35 /*** GENERAL NOTE on CODING SYSTEM ***
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
44 0. Emacs' internal format (emacs-mule)
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
49 1. ISO2022
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 section 4.
62 3. BIG5
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
70 4. Other
72 If a user wants to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
77 Emacs represents a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See section 6 for more details.
84 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
86 How end-of-line of a text is encoded depends on a system. For
87 instance, Unix's format is just one byte of `line-feed' code,
88 whereas DOS's format is two-byte sequence of `carriage-return' and
89 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
91 Since text characters encoding and end-of-line encoding are
92 independent, any coding system described above can take
93 any format of end-of-line. So, Emacs has information of format of
94 end-of-line in each coding-system. See section 6 for more details.
98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
100 These functions check if a text between SRC and SRC_END is encoded
101 in the coding system category XXX. Each returns an integer value in
102 which appropriate flag bits for the category XXX is set. The flag
103 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
104 template of these functions. */
105 #if 0
107 detect_coding_emacs_mule (src, src_end)
108 unsigned char *src, *src_end;
112 #endif
114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
116 These functions decode SRC_BYTES length text at SOURCE encoded in
117 CODING to Emacs' internal format (emacs-mule). The resulting text
118 goes to a place pointed to by DESTINATION, the length of which should
119 not exceed DST_BYTES. The number of bytes actually processed is
120 returned as *CONSUMED. The return value is the length of the decoded
121 text. Below is a template of these functions. */
122 #if 0
123 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
124 struct coding_system *coding;
125 unsigned char *source, *destination;
126 int src_bytes, dst_bytes;
127 int *consumed;
131 #endif
133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
135 These functions encode SRC_BYTES length text at SOURCE of Emacs'
136 internal format (emacs-mule) to CODING. The resulting text goes to
137 a place pointed to by DESTINATION, the length of which should not
138 exceed DST_BYTES. The number of bytes actually processed is
139 returned as *CONSUMED. The return value is the length of the
140 encoded text. Below is a template of these functions. */
141 #if 0
142 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
143 struct coding_system *coding;
144 unsigned char *source, *destination;
145 int src_bytes, dst_bytes;
146 int *consumed;
150 #endif
152 /*** COMMONLY USED MACROS ***/
154 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
155 THREE_MORE_BYTES safely get one, two, and three bytes from the
156 source text respectively. If there are not enough bytes in the
157 source, they jump to `label_end_of_loop'. The caller should set
158 variables `src' and `src_end' to appropriate areas in advance. */
/* Fetch the next source byte into C1 and advance `src'.  When no byte
   is left (`src' has reached `src_end'), nothing is read and control
   jumps to `label_end_of_loop', which the caller must provide.  */
#define ONE_MORE_BYTE(c1)			\
  do {						\
    if (src >= src_end)				\
      goto label_end_of_loop;			\
    c1 = *src++;				\
  } while (0)
/* Fetch the next two source bytes into C1 and C2 (in that order) and
   advance `src' by two.  When fewer than two bytes remain, nothing is
   read and control jumps to `label_end_of_loop', which the caller must
   provide.

   The test compares the remaining length instead of forming
   `src + 1': when `src' already equals `src_end', that sum would point
   beyond one-past-the-end of the source, which C leaves undefined.  */
#define TWO_MORE_BYTES(c1, c2)			\
  do {						\
    if (src_end - src < 2)			\
      goto label_end_of_loop;			\
    c1 = *src++;				\
    c2 = *src++;				\
  } while (0)
/* Fetch the next three source bytes into C1, C2 and C3 (in that order)
   and advance `src' by three.  When fewer than three bytes remain,
   nothing is read and control jumps to `label_end_of_loop', which the
   caller must provide.

   The test compares the remaining length instead of forming
   `src + 2': near the end of the source that sum would point beyond
   one-past-the-end of the buffer, which C leaves undefined.  */
#define THREE_MORE_BYTES(c1, c2, c3)		\
  do {						\
    if (src_end - src < 3)			\
      goto label_end_of_loop;			\
    c1 = *src++;				\
    c2 = *src++;				\
    c3 = *src++;				\
  } while (0)
184 /* The following three macros DECODE_CHARACTER_ASCII,
185 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
186 the multi-byte form of a character of each class at the place
187 pointed by `dst'. The caller should set the variable `dst' to
188 point to an appropriate area and the variable `coding' to point to
189 the coding-system of the currently decoding text in advance. */
191 /* Decode one ASCII character C. */
/* Store the ASCII character C at `dst' and advance `dst'.  While
   composing, the character is written as the two bytes 0xA0 and
   C | 0x80; otherwise C is written as is.  The caller provides `dst'
   and `coding'.  */
#define DECODE_CHARACTER_ASCII(c)		\
  do {						\
    if (COMPOSING_P (coding->composing))	\
      {						\
	*dst++ = 0xA0;				\
	*dst++ = (c) | 0x80;			\
      }						\
    else					\
      {						\
	*dst++ = (c);				\
      }						\
  } while (0)
201 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
202 position-code is C. */
/* Store at `dst' the multi-byte form of a one-dimensional character of
   CHARSET whose position-code is C, advancing `dst'.  The bytes written
   are: the base leading-code (plus 0x20 while composing), the extended
   leading-code if CHARSET has a non-zero one, and C with the MSB set.
   The caller provides `dst' and `coding'.  */
#define DECODE_CHARACTER_DIMENSION1(charset, c)				\
  do {									\
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);	\
    *dst++ = (COMPOSING_P (coding->composing)				\
	      ? leading_code + 0x20					\
	      : leading_code);						\
    /* Emit the extended leading-code only when there is one.  */	\
    leading_code = CHARSET_LEADING_CODE_EXT (charset);			\
    if (leading_code != 0)						\
      *dst++ = leading_code;						\
    *dst++ = (c) | 0x80;						\
  } while (0)
216 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
217 position-codes are C1 and C2. */
/* Store at `dst' the multi-byte form of a two-dimensional character of
   CHARSET whose position-codes are C1 and C2, advancing `dst'.  */
#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)	\
  do {							\
    /* Leading code(s) and the first position-code.  */	\
    DECODE_CHARACTER_DIMENSION1 (charset, c1);		\
    /* Second position-code, with the MSB set.  */	\
    *dst = (c2) | 0x80;					\
    dst++;						\
  } while (0)
226 /*** 1. Preamble ***/
228 #include <stdio.h>
230 #ifdef emacs
232 #include <config.h>
233 #include "lisp.h"
234 #include "buffer.h"
235 #include "charset.h"
236 #include "ccl.h"
237 #include "coding.h"
238 #include "window.h"
240 #else /* not emacs */
242 #include "mulelib.h"
244 #endif /* not emacs */
246 Lisp_Object Qcoding_system, Qeol_type;
247 Lisp_Object Qbuffer_file_coding_system;
248 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
250 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
251 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
252 Lisp_Object Qstart_process, Qopen_network_stream;
253 Lisp_Object Qtarget_idx;
255 /* Mnemonic character of each format of end-of-line. */
256 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
257 /* Mnemonic character to indicate format of end-of-line is not yet
258 decided. */
259 int eol_mnemonic_undecided;
261 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
262 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
263 int system_eol_type;
265 #ifdef emacs
267 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
269 /* Coding system emacs-mule is for converting only end-of-line format. */
270 Lisp_Object Qemacs_mule;
272 /* Coding-systems are handed between Emacs Lisp programs and C internal
273 routines by the following three variables. */
274 /* Coding-system for reading files and receiving data from process. */
275 Lisp_Object Vcoding_system_for_read;
276 /* Coding-system for writing files and sending data to process. */
277 Lisp_Object Vcoding_system_for_write;
278 /* Coding-system actually used in the latest I/O. */
279 Lisp_Object Vlast_coding_system_used;
281 /* Flag to inhibit code conversion of end-of-line format. */
282 int inhibit_eol_conversion;
284 /* Coding-system of what terminal accept for displaying. */
285 struct coding_system terminal_coding;
287 /* Coding-system of what is sent from terminal keyboard. */
288 struct coding_system keyboard_coding;
290 Lisp_Object Vfile_coding_system_alist;
291 Lisp_Object Vprocess_coding_system_alist;
292 Lisp_Object Vnetwork_coding_system_alist;
294 #endif /* emacs */
296 Lisp_Object Qcoding_category_index;
298 /* List of symbols `coding-category-xxx' ordered by priority. */
299 Lisp_Object Vcoding_category_list;
301 /* Table of coding-systems currently assigned to each coding-category. */
302 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
304 /* Table of names of symbol for each coding-category. */
305 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
306 "coding-category-emacs-mule",
307 "coding-category-sjis",
308 "coding-category-iso-7",
309 "coding-category-iso-8-1",
310 "coding-category-iso-8-2",
311 "coding-category-iso-7-else",
312 "coding-category-iso-8-else",
313 "coding-category-big5",
314 "coding-category-binary"
317 /* Flag to tell if we look up unification table on character code
318 conversion. */
319 Lisp_Object Venable_character_unification;
320 /* Standard unification table to look up on decoding (reading). */
321 Lisp_Object Vstandard_character_unification_table_for_decode;
322 /* Standard unification table to look up on encoding (writing). */
323 Lisp_Object Vstandard_character_unification_table_for_encode;
325 Lisp_Object Qcharacter_unification_table;
326 Lisp_Object Qcharacter_unification_table_for_decode;
327 Lisp_Object Qcharacter_unification_table_for_encode;
329 /* Alist of charsets vs revision number. */
330 Lisp_Object Vcharset_revision_alist;
332 /* Default coding systems used for process I/O. */
333 Lisp_Object Vdefault_process_coding_system;
336 /*** 2. Emacs internal format (emacs-mule) handlers ***/
338 /* Emacs' internal format for encoding multiple character sets is a
339 kind of multi-byte encoding, i.e. characters are encoded by
340 variable-length sequences of one-byte codes. ASCII characters
341 and control characters (e.g. `tab', `newline') are represented by
342 one-byte sequences which are their ASCII codes, in the range 0x00
343 through 0x7F. The other characters are represented by a sequence
344 of `base leading-code', optional `extended leading-code', and one
345 or two `position-code's. The length of the sequence is determined
346 by the base leading-code. Leading-code takes the range 0x80
347 through 0x9F, whereas extended leading-code and position-code take
348 the range 0xA0 through 0xFF. See `charset.h' for more details
349 about leading-code and position-code.
351 There's one exception to this rule. Special leading-code
352 `leading-code-composition' denotes that the following several
353 characters should be composed into one character. Leading-codes of
354 components (except for ASCII) are added 0x20. An ASCII character
355 component is represented by a 2-byte sequence of `0xA0' and
356 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
357 details of composite character. Hence, we can summarize the code
358 range as follows:
360 --- CODE RANGE of Emacs' internal format ---
361 (character set) (range)
362 ASCII 0x00 .. 0x7F
363 ELSE (1st byte) 0x80 .. 0x9F
364 (rest bytes) 0xA0 .. 0xFF
365 ---------------------------------------------
/* Class of each of the 256 possible byte values.
   detect_coding_emacs_mule switches on emacs_code_class[C] for every
   byte C it examines.  How the table is filled is not visible in this
   part of the file.  */
369 enum emacs_code_class_type emacs_code_class[256];
371 /* Go to the next statement only if *SRC is accessible and the code is
372 0xA0 or greater. */
/* Consume one byte from `src' and check that it lies in 0xA0..0xFF.
   If no byte is left, jump to `label_end_of_switch'; if the byte is
   below 0xA0, make the enclosing function return 0.  Execution falls
   through to the next statement only when the byte was acceptable.  */
#define CHECK_CODE_RANGE_A0_FF			\
  do {						\
    if (src >= src_end)				\
      goto label_end_of_switch;			\
    if (*src++ < 0xA0)				\
      return 0;					\
  } while (0)
381 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
382 Check if a text is encoded in Emacs' internal format. If it is,
383 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
386 detect_coding_emacs_mule (src, src_end)
387 unsigned char *src, *src_end;
389 unsigned char c;
390 int composing = 0;
392 while (src < src_end)
394 c = *src++;
396 if (composing)
398 if (c < 0xA0)
399 composing = 0;
400 else
401 c -= 0x20;
404 switch (emacs_code_class[c])
406 case EMACS_ascii_code:
407 case EMACS_linefeed_code:
408 break;
410 case EMACS_control_code:
411 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
412 return 0;
413 break;
415 case EMACS_invalid_code:
416 return 0;
418 case EMACS_leading_code_composition: /* c == 0x80 */
419 if (composing)
420 CHECK_CODE_RANGE_A0_FF;
421 else
422 composing = 1;
423 break;
425 case EMACS_leading_code_4:
426 CHECK_CODE_RANGE_A0_FF;
427 /* fall down to check it two more times ... */
429 case EMACS_leading_code_3:
430 CHECK_CODE_RANGE_A0_FF;
431 /* fall down to check it one more time ... */
433 case EMACS_leading_code_2:
434 CHECK_CODE_RANGE_A0_FF;
435 break;
437 default:
438 label_end_of_switch:
439 break;
442 return CODING_CATEGORY_MASK_EMACS_MULE;
446 /*** 3. ISO2022 handlers ***/
448 /* The following note describes the coding system ISO2022 briefly.
449 Since the intention of this note is to help in understanding of
450 the programs in this file, some parts are NOT ACCURATE or OVERLY
451 SIMPLIFIED. For the thorough understanding, please refer to the
452 original document of ISO2022.
454 ISO2022 provides many mechanisms to encode several character sets
455 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
456 all text is encoded by codes of less than 128. This may make the
457 encoded text a little bit longer, but the text gets more stability
458 to pass through several gateways (some of them strip off the MSB).
460 There are two kinds of character set: control character set and
461 graphic character set. The former contains control characters such
462 as `newline' and `escape' to provide control functions (control
463 functions are provided also by escape sequences). The latter
464 contains graphic characters such as 'A' and '-'. Emacs recognizes
465 two control character sets and many graphic character sets.
467 Graphic character sets are classified into one of the following
468 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
469 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
470 bytes (DIMENSION) and the number of characters in one dimension
471 (CHARS) of the set. In addition, each character set is assigned an
472 identification tag (called "final character" and denoted as <F>
473 here after) which is unique in each class. <F> of each character
474 set is decided by ECMA(*) when it is registered in ISO. Code range
475 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
477 Note (*): ECMA = European Computer Manufacturers Association
479 Here are examples of graphic character set [NAME(<F>)]:
480 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
481 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
482 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
483 o DIMENSION2_CHARS96 -- none for the moment
485 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
486 C0 [0x00..0x1F] -- control character plane 0
487 GL [0x20..0x7F] -- graphic character plane 0
488 C1 [0x80..0x9F] -- control character plane 1
489 GR [0xA0..0xFF] -- graphic character plane 1
491 A control character set is directly designated and invoked to C0 or
492 C1 by an escape sequence. The most common case is that ISO646's
493 control character set is designated/invoked to C0 and ISO6429's
494 control character set is designated/invoked to C1, and usually
495 these designations/invocations are omitted in a coded text. With
496 7-bit environment, only C0 can be used, and a control character for
497 C1 is encoded by an appropriate escape sequence to fit in the
498 environment. All control characters for C1 have
499 corresponding escape sequences defined.
501 A graphic character set is at first designated to one of four
502 graphic registers (G0 through G3), then these graphic registers are
503 invoked to GL or GR. These designations and invocations can be
504 done independently. The most common case is that G0 is invoked to
505 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
506 these invocations and designations are omitted in a coded text.
507 With 7-bit environment, only GL can be used.
509 When a graphic character set of CHARS94 is invoked to GL, code 0x20
510 and 0x7F of GL area work as control characters SPACE and DEL
511 respectively, and code 0xA0 and 0xFF of GR area should not be used.
513 There are two ways of invocation: locking-shift and single-shift.
514 With locking-shift, the invocation lasts until the next different
515 invocation, whereas with single-shift, the invocation works only
516 for the following character and doesn't affect locking-shift.
517 Invocations are done by the following control characters or escape
518 sequences.
520 ----------------------------------------------------------------------
521 function control char escape sequence description
522 ----------------------------------------------------------------------
523 SI (shift-in) 0x0F none invoke G0 to GL
524 SO (shift-out) 0x0E none invoke G1 to GL
525 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
526 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
527 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
528 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
529 ----------------------------------------------------------------------
530 The first four are for locking-shift. Control characters for these
531 functions are defined by macros ISO_CODE_XXX in `coding.h'.
533 Designations are done by the following escape sequences.
534 ----------------------------------------------------------------------
535 escape sequence description
536 ----------------------------------------------------------------------
537 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
538 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
539 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
540 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
541 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
542 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
543 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
544 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
545 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
546 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
547 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
548 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
549 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
550 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
551 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
552 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
553 ----------------------------------------------------------------------
555 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
556 of dimension 1, chars 94, and final character <F>, and etc.
558 Note (*): Although these designations are not allowed in ISO2022,
559 Emacs accepts them on decoding, and produces them on encoding
560 CHARS96 character set in a coding system which is characterized as
561 7-bit environment, non-locking-shift, and non-single-shift.
563 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
564 '(' can be omitted. We call this as "short-form" here after.
566 Now you may notice that there are a lot of ways for encoding the
567 same multilingual text in ISO2022. Actually, there exist many
568 coding systems such as Compound Text (used in X's inter client
569 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
570 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
571 localized platforms), and all of these are variants of ISO2022.
573 In addition to the above, Emacs handles two more kinds of escape
574 sequences: ISO6429's direction specification and Emacs' private
575 sequence for specifying character composition.
577 ISO6429's direction specification takes the following format:
578 o CSI ']' -- end of the current direction
579 o CSI '0' ']' -- end of the current direction
580 o CSI '1' ']' -- start of left-to-right text
581 o CSI '2' ']' -- start of right-to-left text
582 The control character CSI (0x9B: control sequence introducer) is
583 abbreviated to the escape sequence ESC '[' in 7-bit environment.
585 Character composition specification takes the following format:
586 o ESC '0' -- start character composition
587 o ESC '1' -- end character composition
588 Since these are not standard escape sequences of any ISO, the use
589 of them for these meaning is restricted to Emacs only. */
/* Class of each of the 256 possible byte values under ISO2022.
   decode_coding_iso2022 switches on iso_code_class[C1] for each byte C1
   it reads.  How the table is filled is not visible in this part of the
   file.  */
591 enum iso_code_class_type iso_code_class[256];
593 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
594 Check if a text is encoded in ISO2022. If it is, returns an
595 integer in which appropriate flag bits any of:
596 CODING_CATEGORY_MASK_ISO_7
597 CODING_CATEGORY_MASK_ISO_8_1
598 CODING_CATEGORY_MASK_ISO_8_2
599 CODING_CATEGORY_MASK_ISO_7_ELSE
600 CODING_CATEGORY_MASK_ISO_8_ELSE
601 are set. If a code which should never appear in ISO2022 is found,
602 returns 0. */
605 detect_coding_iso2022 (src, src_end)
606 unsigned char *src, *src_end;
608 int mask = (CODING_CATEGORY_MASK_ISO_7
609 | CODING_CATEGORY_MASK_ISO_8_1
610 | CODING_CATEGORY_MASK_ISO_8_2
611 | CODING_CATEGORY_MASK_ISO_7_ELSE
612 | CODING_CATEGORY_MASK_ISO_8_ELSE
614 int g1 = 0; /* 1 iff designating to G1. */
615 int c, i;
617 while (src < src_end)
619 c = *src++;
620 switch (c)
622 case ISO_CODE_ESC:
623 if (src >= src_end)
624 break;
625 c = *src++;
626 if (src < src_end
627 && ((c >= '(' && c <= '/')
628 || c == '$' && ((*src >= '(' && *src <= '/')
629 || (*src >= '@' && *src <= 'B'))))
631 /* Valid designation sequence. */
632 if (c == ')' || (c == '$' && *src == ')'))
634 g1 = 1;
635 mask &= ~(CODING_CATEGORY_MASK_ISO_7
636 | CODING_CATEGORY_MASK_ISO_7_ELSE);
638 src++;
639 break;
641 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
642 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
643 | CODING_CATEGORY_MASK_ISO_8_ELSE);
644 break;
646 case ISO_CODE_SO:
647 if (g1)
648 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
649 | CODING_CATEGORY_MASK_ISO_8_ELSE);
650 break;
652 case ISO_CODE_CSI:
653 case ISO_CODE_SS2:
654 case ISO_CODE_SS3:
655 mask &= ~(CODING_CATEGORY_MASK_ISO_7
656 | CODING_CATEGORY_MASK_ISO_7_ELSE);
657 break;
659 default:
660 if (c < 0x80)
661 break;
662 else if (c < 0xA0)
663 return 0;
664 else
666 unsigned char *src_begin = src;
668 mask &= ~(CODING_CATEGORY_MASK_ISO_7
669 | CODING_CATEGORY_MASK_ISO_7_ELSE);
670 while (src < src_end && *src >= 0xA0)
671 src++;
672 if ((src - src_begin - 1) & 1 && src < src_end)
673 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
675 break;
679 return mask;
682 /* Decode a character of which charset is CHARSET and the 1st position
683 code is C1. If dimension of CHARSET is 2, the 2nd position code is
684 fetched from SRC and set to C2. If CHARSET is negative, it means
685 that we are decoding ill formed text, and what we can do is just to
686 read C1 as is. */
/* Decode one character of CHARSET whose first position-code is C1 and
   store its multi-byte form at `dst'.

   - At the head of a composition, LEADING_CODE_COMPOSITION is written
     first (followed by 0xFF when rules are embedded) and
     `coding->composing' is advanced by 2.
   - When CHARSET is non-negative, a second position-code is fetched
     into `c2' for a two-dimensional charset, and the character is
     replaced by its unification, if `unification_table' is non-nil and
     yields one.
   - A negative CHARSET means the character is emitted as ASCII.

   The caller provides `coding', `dst', `src', `src_end', `c2',
   `unification_table' and the label `label_end_of_loop' (used by
   ONE_MORE_BYTE).

   The braces around the composition-head block and the CHARSET >= 0
   block are written out explicitly; without them `coding->composing'
   would be advanced, and the unification attempted, unconditionally.  */
#define DECODE_ISO_CHARACTER(charset, c1)				\
  do {									\
    int c_alt, charset_alt = (charset);					\
    if (COMPOSING_HEAD_P (coding->composing))				\
      {									\
	*dst++ = LEADING_CODE_COMPOSITION;				\
	if (COMPOSING_WITH_RULE_P (coding->composing))			\
	  /* To tell composition rules are embedded.  */		\
	  *dst++ = 0xFF;						\
	coding->composing += 2;						\
      }									\
    if ((charset) >= 0)							\
      {									\
	if (CHARSET_DIMENSION (charset) == 2)				\
	  ONE_MORE_BYTE (c2);						\
	if (!NILP (unification_table)					\
	    && ((c_alt = unify_char (unification_table,			\
				     -1, (charset), c1, c2)) >= 0))	\
	  SPLIT_CHAR (c_alt, charset_alt, c1, c2);			\
      }									\
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)		\
      DECODE_CHARACTER_ASCII (c1);					\
    else if (CHARSET_DIMENSION (charset_alt) == 1)			\
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);			\
    else								\
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);		\
    if (COMPOSING_WITH_RULE_P (coding->composing))			\
      /* To tell a composition rule follows.  */			\
      coding->composing = COMPOSING_WITH_RULE_RULE;			\
  } while (0)
719 /* Set designation state into CODING. */
/* Record in CODING that the charset identified by DIMENSION, CHARS and
   FINAL_CHAR is designated to graphic register REG.

   The charset is looked up with ISO_CHARSET_TABLE.  If the lookup
   yields a negative value the designation is left untouched.  When
   `coding->direction' is 1 and the charset has a reverse charset, the
   reverse charset is designated instead.

   The braces around the `charset >= 0' block are written out
   explicitly; without them a failed lookup would overwrite the
   register with a negative charset.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)		\
  do {									\
    int charset = ISO_CHARSET_TABLE (make_number (dimension),		\
				     make_number (chars),		\
				     make_number (final_char));		\
    if (charset >= 0)							\
      {									\
	if (coding->direction == 1					\
	    && CHARSET_REVERSE_CHARSET (charset) >= 0)			\
	  charset = CHARSET_REVERSE_CHARSET (charset);			\
	CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;		\
      }									\
  } while (0)
734 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
737 decode_coding_iso2022 (coding, source, destination,
738 src_bytes, dst_bytes, consumed)
739 struct coding_system *coding;
740 unsigned char *source, *destination;
741 int src_bytes, dst_bytes;
742 int *consumed;
744 unsigned char *src = source;
745 unsigned char *src_end = source + src_bytes;
746 unsigned char *dst = destination;
747 unsigned char *dst_end = destination + dst_bytes;
748 /* Since the maximum bytes produced by each loop is 7, we subtract 6
749 from DST_END to assure that overflow checking is necessary only
750 at the head of loop. */
751 unsigned char *adjusted_dst_end = dst_end - 6;
752 int charset;
753 /* Charsets invoked to graphic plane 0 and 1 respectively. */
754 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
755 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
756 Lisp_Object unification_table
757 = coding->character_unification_table_for_decode;
759 if (!NILP (Venable_character_unification) && NILP (unification_table))
760 unification_table = Vstandard_character_unification_table_for_decode;
762 while (src < src_end && dst < adjusted_dst_end)
764 /* SRC_BASE remembers the start position in source in each loop.
765 The loop will be exited when there's not enough source text
766 to analyze long escape sequence or 2-byte code (within macros
767 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
768 to SRC_BASE before exiting. */
769 unsigned char *src_base = src;
770 int c1 = *src++, c2;
772 switch (iso_code_class [c1])
774 case ISO_0x20_or_0x7F:
775 if (!coding->composing
776 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
778 /* This is SPACE or DEL. */
779 *dst++ = c1;
780 break;
782 /* This is a graphic character, we fall down ... */
784 case ISO_graphic_plane_0:
785 if (coding->composing == COMPOSING_WITH_RULE_RULE)
787 /* This is a composition rule. */
788 *dst++ = c1 | 0x80;
789 coding->composing = COMPOSING_WITH_RULE_TAIL;
791 else
792 DECODE_ISO_CHARACTER (charset0, c1);
793 break;
795 case ISO_0xA0_or_0xFF:
796 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
798 /* Invalid code. */
799 *dst++ = c1;
800 break;
802 /* This is a graphic character, we fall down ... */
804 case ISO_graphic_plane_1:
805 DECODE_ISO_CHARACTER (charset1, c1);
806 break;
808 case ISO_control_code:
809 /* All ISO2022 control characters in this class have the
810 same representation in Emacs internal format. */
811 *dst++ = c1;
812 break;
814 case ISO_carriage_return:
815 if (coding->eol_type == CODING_EOL_CR)
817 *dst++ = '\n';
819 else if (coding->eol_type == CODING_EOL_CRLF)
821 ONE_MORE_BYTE (c1);
822 if (c1 == ISO_CODE_LF)
823 *dst++ = '\n';
824 else
826 src--;
827 *dst++ = c1;
830 else
832 *dst++ = c1;
834 break;
836 case ISO_shift_out:
837 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
838 goto label_invalid_escape_sequence;
839 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
840 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
841 break;
843 case ISO_shift_in:
844 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
845 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
846 break;
848 case ISO_single_shift_2_7:
849 case ISO_single_shift_2:
850 /* SS2 is handled as an escape sequence of ESC 'N' */
851 c1 = 'N';
852 goto label_escape_sequence;
854 case ISO_single_shift_3:
855 /* SS3 is handled as an escape sequence of ESC 'O' */
856 c1 = 'O';
857 goto label_escape_sequence;
859 case ISO_control_sequence_introducer:
860 /* CSI is handled as an escape sequence of ESC '[' ... */
861 c1 = '[';
862 goto label_escape_sequence;
864 case ISO_escape:
865 ONE_MORE_BYTE (c1);
866 label_escape_sequence:
867 /* Escape sequences handled by Emacs are invocation,
868 designation, direction specification, and character
869 composition specification. */
870 switch (c1)
872 case '&': /* revision of following character set */
873 ONE_MORE_BYTE (c1);
874 if (!(c1 >= '@' && c1 <= '~'))
875 goto label_invalid_escape_sequence;
876 ONE_MORE_BYTE (c1);
877 if (c1 != ISO_CODE_ESC)
878 goto label_invalid_escape_sequence;
879 ONE_MORE_BYTE (c1);
880 goto label_escape_sequence;
882 case '$': /* designation of 2-byte character set */
883 ONE_MORE_BYTE (c1);
884 if (c1 >= '@' && c1 <= 'B')
885 { /* designation of JISX0208.1978, GB2312.1980,
886 or JISX0208.1980 */
887 DECODE_DESIGNATION (0, 2, 94, c1);
889 else if (c1 >= 0x28 && c1 <= 0x2B)
890 { /* designation of DIMENSION2_CHARS94 character set */
891 ONE_MORE_BYTE (c2);
892 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
894 else if (c1 >= 0x2C && c1 <= 0x2F)
895 { /* designation of DIMENSION2_CHARS96 character set */
896 ONE_MORE_BYTE (c2);
897 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
899 else
900 goto label_invalid_escape_sequence;
901 break;
903 case 'n': /* invocation of locking-shift-2 */
904 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
905 goto label_invalid_escape_sequence;
906 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
907 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
908 break;
910 case 'o': /* invocation of locking-shift-3 */
911 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
912 goto label_invalid_escape_sequence;
913 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
914 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
915 break;
917 case 'N': /* invocation of single-shift-2 */
918 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
919 goto label_invalid_escape_sequence;
920 ONE_MORE_BYTE (c1);
921 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
922 DECODE_ISO_CHARACTER (charset, c1);
923 break;
925 case 'O': /* invocation of single-shift-3 */
926 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
927 goto label_invalid_escape_sequence;
928 ONE_MORE_BYTE (c1);
929 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
930 DECODE_ISO_CHARACTER (charset, c1);
931 break;
933 case '0': /* start composing without embeded rules */
934 coding->composing = COMPOSING_NO_RULE_HEAD;
935 break;
937 case '1': /* end composing */
938 coding->composing = COMPOSING_NO;
939 break;
941 case '2': /* start composing with embeded rules */
942 coding->composing = COMPOSING_WITH_RULE_HEAD;
943 break;
945 case '[': /* specification of direction */
946 /* For the moment, nested direction is not supported.
947 So, the value of `coding->direction' is 0 or 1: 0
948 means left-to-right, 1 means right-to-left. */
949 ONE_MORE_BYTE (c1);
950 switch (c1)
952 case ']': /* end of the current direction */
953 coding->direction = 0;
955 case '0': /* end of the current direction */
956 case '1': /* start of left-to-right direction */
957 ONE_MORE_BYTE (c1);
958 if (c1 == ']')
959 coding->direction = 0;
960 else
961 goto label_invalid_escape_sequence;
962 break;
964 case '2': /* start of right-to-left direction */
965 ONE_MORE_BYTE (c1);
966 if (c1 == ']')
967 coding->direction= 1;
968 else
969 goto label_invalid_escape_sequence;
970 break;
972 default:
973 goto label_invalid_escape_sequence;
975 break;
977 default:
978 if (c1 >= 0x28 && c1 <= 0x2B)
979 { /* designation of DIMENSION1_CHARS94 character set */
980 ONE_MORE_BYTE (c2);
981 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
983 else if (c1 >= 0x2C && c1 <= 0x2F)
984 { /* designation of DIMENSION1_CHARS96 character set */
985 ONE_MORE_BYTE (c2);
986 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
988 else
990 goto label_invalid_escape_sequence;
993 /* We must update these variables now. */
994 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
995 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
996 break;
998 label_invalid_escape_sequence:
1000 int length = src - src_base;
1002 bcopy (src_base, dst, length);
1003 dst += length;
1006 continue;
1008 label_end_of_loop:
1009 coding->carryover_size = src - src_base;
1010 bcopy (src_base, coding->carryover, coding->carryover_size);
1011 src = src_base;
1012 break;
1015 /* If this is the last block of the text to be decoded, we had
1016 better just flush out all remaining codes in the text although
1017 they are not valid characters. */
1018 if (coding->last_block)
1020 bcopy (src, dst, src_end - src);
1021 dst += (src_end - src);
1022 src = src_end;
1024 *consumed = src - source;
1025 return dst - destination;
1028 /* ISO2022 encoding stuff. */
1031 It is not enough to say just "ISO2022" on encoding, we have to
1032 specify more details. In Emacs, each coding-system of ISO2022
1033 variant has the following specifications:
1034 1. Initial designation to G0 thru G3.
1035 2. Allows short-form designation?
1036 3. ASCII should be designated to G0 before control characters?
1037 4. ASCII should be designated to G0 at end of line?
1038 5. 7-bit environment or 8-bit environment?
1039 6. Use locking-shift?
1040 7. Use Single-shift?
1041 And the following two are only for Japanese:
1042 8. Use ASCII in place of JIS0201-1976-Roman?
1043 9. Use JISX0208-1983 in place of JISX0208-1978?
1044 These specifications are encoded in `coding->flags' as flag bits
1045 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1046 details.
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, then record CHARSET as the current designation of REG
   in CODING.  When CHARSET has an entry in `Vcharset_revision_alist',
   a revision sequence (ESC '&' <revision>) is emitted first.  The
   intermediate character is omitted (short-form) only for a
   multi-byte 94-character set designated to register 0 whose
   <final-char> is '@', 'A', or 'B', and only if CODING has
   CODING_FLAG_ISO_SHORT_FORM set.

   The macro advances the variable `dst' of the expanding function and
   evaluates CHARSET and REG more than once.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset);  \
    const char *inter94 = "()*+";                                       \
    const char *inter96 = ",-./";                                       \
    Lisp_Object revision                                                \
      = Fassq (make_number (charset), Vcharset_revision_alist);         \
    int two_byte;                                                       \
                                                                        \
    if (! NILP (revision))                                              \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = XINT (XCONS (revision)->cdr) + '@';                    \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    two_byte = CHARSET_DIMENSION (charset) != 1;                        \
    if (two_byte)                                                       \
      *dst++ = '$';                                                     \
    if (CHARSET_CHARS (charset) != 94)                                  \
      *dst++ = (unsigned char) (inter96[reg]);                          \
    else if (! (two_byte                                                \
                && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)         \
                && reg == 0                                             \
                && final_byte >= '@' && final_byte <= 'B'))             \
      *dst++ = (unsigned char) (inter94[reg]);                          \
    *dst++ = final_byte;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
/* Emit single-shift-2 (resp. single-shift-3) at DST: the escape
   sequence ESC 'N' (resp. ESC 'O') when `coding' is flagged
   CODING_FLAG_ISO_SEVEN_BITS, otherwise the single control code
   ISO_CODE_SS2 (resp. ISO_CODE_SS3).  In both cases `coding' is then
   marked as being in the middle of a single shift.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
/* Locking-shift functions.  Each macro emits its code at DST (a
   control character for shift-in and shift-out, an escape sequence
   for locking-shift-2 and locking-shift-3) and records in `coding'
   which graphic register is now invoked to graphic plane 0:
   register 0 for shift-in, 1 for shift-out, 2 and 3 for the
   respective locking shifts.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
  } while (0)
/* Emit at DST the single position-code C1 of a DIMENSION1 character
   belonging to CHARSET.

   The body is a loop.  If a single shift is pending, or CHARSET is
   the charset currently invoked to graphic plane 0 or 1, the byte is
   written (with the high bit cleared or set as appropriate) and the
   loop is left.  Otherwise `encode_invocation_designation' is called
   to emit whatever designation/invocation is needed, and the loop
   runs again to emit the character itself.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? ((c1) & 0x7F) : ((c1) | 0x80));                     \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
/* Emit at DST the two position-codes C1 and C2 of a DIMENSION2
   character belonging to CHARSET.

   Works like ENCODE_ISO_CHARACTER_DIMENSION1: the two bytes are
   written as soon as a single shift is pending or CHARSET is invoked
   to graphic plane 0 or 1; until then
   `encode_invocation_designation' is called and the loop repeats.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = (c1) & 0x7F;                                       \
            *dst++ = (c2) & 0x7F;                                       \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = (c1) | 0x80;                                       \
            *dst++ = (c2) | 0x80;                                       \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        *dst++ = (c2) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        *dst++ = (c2) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
/* Encode one character of CHARSET whose position-codes are C1 and C2
   (C2 is a dummy for a DIMENSION1 charset).  If the variable
   `unification_table' of the expanding function is non-nil and
   `unify_char' yields a non-negative character for it, the character
   is first replaced by that one; C1 and C2 are overwritten by
   SPLIT_CHAR in that case.  The result is then emitted by the
   DIMENSION1 or DIMENSION2 macro according to its charset.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
  do {                                                                    \
    int target_charset = (charset);                                       \
    int unified = -1;                                                     \
                                                                          \
    if (!NILP (unification_table))                                        \
      unified = unify_char (unification_table, -1, charset, c1, c2);      \
    if (unified >= 0)                                                     \
      {                                                                   \
        SPLIT_CHAR (unified, target_charset, c1, c2);                     \
      }                                                                   \
    if (CHARSET_DIMENSION (target_charset) != 1)                          \
      {                                                                   \
        ENCODE_ISO_CHARACTER_DIMENSION2 (target_charset, c1, c2);         \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        ENCODE_ISO_CHARACTER_DIMENSION1 (target_charset, c1);             \
      }                                                                   \
  } while (0)
1224 /* Produce designation and invocation codes at a place pointed by DST
1225 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1226 Return new DST. */
1228 unsigned char *
1229 encode_invocation_designation (charset, coding, dst)
1230 int charset;
1231 struct coding_system *coding;
1232 unsigned char *dst;
1234 int reg; /* graphic register number */
1236 /* At first, check designations. */
1237 for (reg = 0; reg < 4; reg++)
1238 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1239 break;
1241 if (reg >= 4)
1243 /* CHARSET is not yet designated to any graphic registers. */
1244 /* At first check the requested designation. */
1245 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1246 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1247 /* Since CHARSET requests no special designation, designate it
1248 to graphic register 0. */
1249 reg = 0;
1251 ENCODE_DESIGNATION (charset, reg, coding);
1254 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1255 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1257 /* Since the graphic register REG is not invoked to any graphic
1258 planes, invoke it to graphic plane 0. */
1259 switch (reg)
1261 case 0: /* graphic register 0 */
1262 ENCODE_SHIFT_IN;
1263 break;
1265 case 1: /* graphic register 1 */
1266 ENCODE_SHIFT_OUT;
1267 break;
1269 case 2: /* graphic register 2 */
1270 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1271 ENCODE_SINGLE_SHIFT_2;
1272 else
1273 ENCODE_LOCKING_SHIFT_2;
1274 break;
1276 case 3: /* graphic register 3 */
1277 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1278 ENCODE_SINGLE_SHIFT_3;
1279 else
1280 ENCODE_LOCKING_SHIFT_3;
1281 break;
1284 return dst;
/* The following three macros produce the escape sequences that mark
   character composition: start without embedded rules (ESC '0'),
   start with embedded rules (ESC '2'), and end (ESC '1').  Each is a
   comma expression that advances `dst' by two bytes.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits the control sequence
   introducer at DST: ESC '[' when `coding' has the
   CODING_FLAG_ISO_SEVEN_BITS bit set, the single code ISO_CODE_CSI
   otherwise.  The bit is tested with `&', as in the single-shift
   macros; comparing the whole flag word with `==' would select the
   8-bit form whenever any other flag is set as well.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R emit the introducer
   followed by "2]" and "0]" respectively.  All three are statement
   macros to be used as `MACRO;'.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
/* Bring the graphic planes and registers back to their initial
   state: emit shift-in unless graphic register 0 is the one invoked
   to graphic plane 0, then, for each of the four graphic registers
   that has an initial designation (a non-negative value) different
   from its current one, emit the sequence designating the initial
   charset again.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int gr_;                                                                \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      {                                                                     \
        ENCODE_SHIFT_IN;                                                    \
      }                                                                     \
    for (gr_ = 0; gr_ < 4; gr_++)                                           \
      {                                                                     \
        const int initial_                                                  \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, gr_);              \
                                                                            \
        if (initial_ >= 0                                                   \
            && CODING_SPEC_ISO_DESIGNATION (coding, gr_) != initial_)       \
          {                                                                 \
            ENCODE_DESIGNATION (initial_, gr_, coding);                     \
          }                                                                 \
      }                                                                     \
  } while (0)
1323 /* Produce designation sequences of charsets in the line started from
1324 *SRC to a place pointed by DSTP.
1326 If the current block ends before any end-of-line, we may fail to
1327 find all the necessary *designations. */
1328 encode_designation_at_bol (coding, table, src, src_end, dstp)
1329 struct coding_system *coding;
1330 Lisp_Object table;
1331 unsigned char *src, *src_end, **dstp;
1333 int charset, c, found = 0, reg;
1334 /* Table of charsets to be designated to each graphic register. */
1335 int r[4];
1336 unsigned char *dst = *dstp;
1338 for (reg = 0; reg < 4; reg++)
1339 r[reg] = -1;
1341 while (src < src_end && *src != '\n' && found < 4)
1343 int bytes = BYTES_BY_CHAR_HEAD (*src);
1345 if (NILP (table))
1346 charset = CHARSET_AT (src);
1347 else
1349 int c_alt, c1, c2;
1351 SPLIT_STRING(src, bytes, charset, c1, c2);
1352 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1353 charset = CHAR_CHARSET (c_alt);
1356 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1357 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1359 found++;
1360 r[reg] = charset;
1363 src += bytes;
1366 if (found)
1368 for (reg = 0; reg < 4; reg++)
1369 if (r[reg] >= 0
1370 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1371 ENCODE_DESIGNATION (r[reg], reg, coding);
1372 *dstp = dst;
1376 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1379 encode_coding_iso2022 (coding, source, destination,
1380 src_bytes, dst_bytes, consumed)
1381 struct coding_system *coding;
1382 unsigned char *source, *destination;
1383 int src_bytes, dst_bytes;
1384 int *consumed;
1386 unsigned char *src = source;
1387 unsigned char *src_end = source + src_bytes;
1388 unsigned char *dst = destination;
1389 unsigned char *dst_end = destination + dst_bytes;
1390 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1391 from DST_END to assure overflow checking is necessary only at the
1392 head of loop. */
1393 unsigned char *adjusted_dst_end = dst_end - 19;
1394 Lisp_Object unification_table
1395 = coding->character_unification_table_for_encode;
1397 if (!NILP (Venable_character_unification) && NILP (unification_table))
1398 unification_table = Vstandard_character_unification_table_for_encode;
1400 while (src < src_end && dst < adjusted_dst_end)
1402 /* SRC_BASE remembers the start position in source in each loop.
1403 The loop will be exited when there's not enough source text
1404 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1405 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1406 reset to SRC_BASE before exiting. */
1407 unsigned char *src_base = src;
1408 int charset, c1, c2, c3, c4;
1410 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1411 && CODING_SPEC_ISO_BOL (coding))
1413 /* We have to produce designation sequences if any now. */
1414 encode_designation_at_bol (coding, unification_table,
1415 src, src_end, &dst);
1416 CODING_SPEC_ISO_BOL (coding) = 0;
1419 c1 = *src++;
1420 /* If we are seeing a component of a composite character, we are
1421 seeing a leading-code specially encoded for composition, or a
1422 composition rule if composing with rule. We must set C1
1423 to a normal leading-code or an ASCII code. If we are not at
1424 a composed character, we must reset the composition state. */
1425 if (COMPOSING_P (coding->composing))
1427 if (c1 < 0xA0)
1429 /* We are not in a composite character any longer. */
1430 coding->composing = COMPOSING_NO;
1431 ENCODE_COMPOSITION_END;
1433 else
1435 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1437 *dst++ = c1 & 0x7F;
1438 coding->composing = COMPOSING_WITH_RULE_HEAD;
1439 continue;
1441 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1442 coding->composing = COMPOSING_WITH_RULE_RULE;
1443 if (c1 == 0xA0)
1445 /* This is an ASCII component. */
1446 ONE_MORE_BYTE (c1);
1447 c1 &= 0x7F;
1449 else
1450 /* This is a leading-code of non ASCII component. */
1451 c1 -= 0x20;
1455 /* Now encode one character. C1 is a control character, an
1456 ASCII character, or a leading-code of multi-byte character. */
1457 switch (emacs_code_class[c1])
1459 case EMACS_ascii_code:
1460 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1461 break;
1463 case EMACS_control_code:
1464 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1465 ENCODE_RESET_PLANE_AND_REGISTER;
1466 *dst++ = c1;
1467 break;
1469 case EMACS_carriage_return_code:
1470 if (!coding->selective)
1472 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1473 ENCODE_RESET_PLANE_AND_REGISTER;
1474 *dst++ = c1;
1475 break;
1477 /* fall down to treat '\r' as '\n' ... */
1479 case EMACS_linefeed_code:
1480 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1481 ENCODE_RESET_PLANE_AND_REGISTER;
1482 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1483 bcopy (coding->spec.iso2022.initial_designation,
1484 coding->spec.iso2022.current_designation,
1485 sizeof coding->spec.iso2022.initial_designation);
1486 if (coding->eol_type == CODING_EOL_LF
1487 || coding->eol_type == CODING_EOL_UNDECIDED)
1488 *dst++ = ISO_CODE_LF;
1489 else if (coding->eol_type == CODING_EOL_CRLF)
1490 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1491 else
1492 *dst++ = ISO_CODE_CR;
1493 CODING_SPEC_ISO_BOL (coding) = 1;
1494 break;
1496 case EMACS_leading_code_2:
1497 ONE_MORE_BYTE (c2);
1498 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1499 break;
1501 case EMACS_leading_code_3:
1502 TWO_MORE_BYTES (c2, c3);
1503 if (c1 < LEADING_CODE_PRIVATE_11)
1504 ENCODE_ISO_CHARACTER (c1, c2, c3);
1505 else
1506 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1507 break;
1509 case EMACS_leading_code_4:
1510 THREE_MORE_BYTES (c2, c3, c4);
1511 ENCODE_ISO_CHARACTER (c2, c3, c4);
1512 break;
1514 case EMACS_leading_code_composition:
1515 ONE_MORE_BYTE (c1);
1516 if (c1 == 0xFF)
1518 coding->composing = COMPOSING_WITH_RULE_HEAD;
1519 ENCODE_COMPOSITION_WITH_RULE_START;
1521 else
1523 /* Rewind one byte because it is a character code of
1524 composition elements. */
1525 src--;
1526 coding->composing = COMPOSING_NO_RULE_HEAD;
1527 ENCODE_COMPOSITION_NO_RULE_START;
1529 break;
1531 case EMACS_invalid_code:
1532 *dst++ = c1;
1533 break;
1535 continue;
1536 label_end_of_loop:
1537 coding->carryover_size = src - src_base;
1538 bcopy (src_base, coding->carryover, coding->carryover_size);
1539 break;
1542 /* If this is the last block of the text to be encoded, we must
1543 reset graphic planes and registers to the initial state. */
1544 if (src >= src_end && coding->last_block)
1546 ENCODE_RESET_PLANE_AND_REGISTER;
1547 if (coding->carryover_size > 0
1548 && coding->carryover_size < (dst_end - dst))
1550 bcopy (coding->carryover, dst, coding->carryover_size);
1551 dst += coding->carryover_size;
1552 coding->carryover_size = 0;
1555 *consumed = src - source;
1556 return dst - destination;
1560 /*** 4. SJIS and BIG5 handlers ***/
1562 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1563 quite widely. So, for the moment, Emacs supports them in the bare
1564 C code. But, in the future, they may be supported only by CCL. */
1566 /* SJIS is a coding system encoding three character sets: ASCII, right
1567 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1568 as is. A character of charset katakana-jisx0201 is encoded by
1569 "position-code + 0x80". A character of charset japanese-jisx0208
1570 is encoded in two bytes, with its two position-codes divided and shifted
1571 so that they fit in the ranges below.
1573 --- CODE RANGE of SJIS ---
1574 (character set) (range)
1575 ASCII 0x00 .. 0x7F
1576 KATAKANA-JISX0201 0xA0 .. 0xDF
1577 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1578 (2nd byte) 0x40 .. 0xFF
1579 -------------------------------
1583 /* BIG5 is a coding system encoding two character sets: ASCII and
1584 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1585 character set and is encoded in two-byte.
1587 --- CODE RANGE of BIG5 ---
1588 (character set) (range)
1589 ASCII 0x00 .. 0x7F
1590 Big5 (1st byte) 0xA1 .. 0xFE
1591 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1592 --------------------------
1594 Since the number of characters in Big5 is larger than maximum
1595 characters in Emacs' charset (96x96), it can't be handled as one
1596 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1597 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1598 contains frequently used characters and the latter contains less
1599 frequently used characters. */
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   DECODE_BIG5 reads B1 and B2 and stores into CHARSET, C1 and C2;
   ENCODE_BIG5 reads CHARSET, C1 and C2 and stores into B1 and B2.
   Both number the characters linearly (BIG5_SAME_ROW per 1st byte in
   BIG5, 0xFF - 0xA1 per 1st position-code in Emacs); codes whose 1st
   byte is 0xC9 or above belong to `charset_big5_2', whose numbering
   restarts from zero.

   Every argument is parenthesized in the expansion, so expressions
   may be passed for the input operands; input operands are evaluated
   more than once.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int big5_index_                                            \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        big5_index_ -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                   \
      }                                                                 \
    (c1) = big5_index_ / (0xFF - 0xA1) + 0x21;                          \
    (c2) = big5_index_ % (0xFF - 0xA1) + 0x21;                          \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_index_                                            \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      big5_index_ += BIG5_SAME_ROW * (0xC9 - 0xA1);                     \
    (b1) = big5_index_ / BIG5_SAME_ROW + 0xA1;                          \
    (b2) = big5_index_ % BIG5_SAME_ROW;                                 \
    (b2) += ((b2) < 0x3F ? 0x40 : 0x62);                                \
  } while (0)
/* Produce in Emacs' internal format the character of CHARSET whose
   position-codes are C1 and C2 (C2 is a dummy for a one-byte
   character).  If the variable `unification_table' of the expanding
   function is non-nil and `unify_char' yields a non-negative
   character, that character is produced instead; C1 and C2 are
   overwritten by SPLIT_CHAR in that case.  A resulting charset that
   is CHARSET_ASCII or negative is produced as ASCII.  */

#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int target_charset = (charset);                                     \
    int unified = -1;                                                   \
                                                                        \
    if (!NILP (unification_table))                                      \
      unified = unify_char (unification_table, -1, (charset), c1, c2);  \
    if (unified >= 0)                                                   \
      {                                                                 \
        SPLIT_CHAR (unified, target_charset, c1, c2);                   \
      }                                                                 \
    if (target_charset == CHARSET_ASCII || target_charset < 0)          \
      {                                                                 \
        DECODE_CHARACTER_ASCII (c1);                                    \
      }                                                                 \
    else if (CHARSET_DIMENSION (target_charset) != 1)                   \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION2 (target_charset, c1, c2);           \
      }                                                                 \
    else                                                                \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION1 (target_charset, c1);               \
      }                                                                 \
  } while (0)
/* Encode at DST the character of CHARSET whose position-codes are C1
   and C2 (C2 is a dummy for a one-byte character), in SJIS when the
   variable `sjis_p' of the expanding function is non-zero and in BIG5
   otherwise.  If `unification_table' is non-nil and `unify_char'
   yields a non-negative character, that character is encoded
   instead; C1 and C2 are overwritten by SPLIT_CHAR in that case.

   Output per character:
     ASCII                              1 byte  (C1)
     katakana-jisx0201 in SJIS          1 byte  (C1)
     other DIMENSION1 charset           2 bytes (charset, C1)
     jisx0208 in SJIS, Big5 in BIG5     2 bytes
     other DIMENSION2 charset           3 bytes (charset, C1, C2)

   The definition deliberately does not end with a semicolon, so that
   `ENCODE_SJIS_BIG5_CHARACTER (...);' is exactly one statement and
   can be the body of an unbraced `if' that has an `else'.  */

#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (unification_table)                                         \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                        \
      {                                                                   \
        SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
      }                                                                   \
    else                                                                  \
      charset_alt = charset;                                              \
    if (charset_alt == charset_ascii)                                     \
      *dst++ = c1;                                                        \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
      {                                                                   \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
          *dst++ = c1;                                                    \
        else                                                              \
          {                                                               \
            *dst++ = charset_alt;                                         \
            *dst++ = c1;                                                  \
          }                                                               \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        c1 &= 0x7F;                                                       \
        c2 &= 0x7F;                                                       \
        if (sjis_p && charset_alt == charset_jisx0208)                    \
          {                                                               \
            unsigned char s1, s2;                                         \
                                                                          \
            ENCODE_SJIS (c1, c2, s1, s2);                                 \
            *dst++ = s1;                                                  \
            *dst++ = s2;                                                  \
          }                                                               \
        else if (!sjis_p                                                  \
                 && (charset_alt == charset_big5_1                        \
                     || charset_alt == charset_big5_2))                   \
          {                                                               \
            unsigned char b1, b2;                                         \
                                                                          \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
            *dst++ = b1;                                                  \
            *dst++ = b2;                                                  \
          }                                                               \
        else                                                              \
          {                                                               \
            *dst++ = charset_alt;                                         \
            *dst++ = c1;                                                  \
            *dst++ = c2;                                                  \
          }                                                               \
      }                                                                   \
  } while (0)
1691 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1692 Check if a text is encoded in SJIS. If it is, return
1693 CODING_CATEGORY_MASK_SJIS, else return 0. */
1696 detect_coding_sjis (src, src_end)
1697 unsigned char *src, *src_end;
1699 unsigned char c;
1701 while (src < src_end)
1703 c = *src++;
1704 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1705 return 0;
1706 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1708 if (src < src_end && *src++ < 0x40)
1709 return 0;
1712 return CODING_CATEGORY_MASK_SJIS;
1715 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1716 Check if a text is encoded in BIG5. If it is, return
1717 CODING_CATEGORY_MASK_BIG5, else return 0. */
1720 detect_coding_big5 (src, src_end)
1721 unsigned char *src, *src_end;
1723 unsigned char c;
1725 while (src < src_end)
1727 c = *src++;
1728 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1729 return 0;
1730 if (c >= 0xA1)
1732 if (src >= src_end)
1733 break;
1734 c = *src++;
1735 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1736 return 0;
1739 return CODING_CATEGORY_MASK_BIG5;
1742 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1743 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1746 decode_coding_sjis_big5 (coding, source, destination,
1747 src_bytes, dst_bytes, consumed, sjis_p)
1748 struct coding_system *coding;
1749 unsigned char *source, *destination;
1750 int src_bytes, dst_bytes;
1751 int *consumed;
1752 int sjis_p;
1754 unsigned char *src = source;
1755 unsigned char *src_end = source + src_bytes;
1756 unsigned char *dst = destination;
1757 unsigned char *dst_end = destination + dst_bytes;
1758 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1759 from DST_END to assure overflow checking is necessary only at the
1760 head of loop. */
1761 unsigned char *adjusted_dst_end = dst_end - 3;
1762 Lisp_Object unification_table
1763 = coding->character_unification_table_for_decode;
1765 if (!NILP (Venable_character_unification) && NILP (unification_table))
1766 unification_table = Vstandard_character_unification_table_for_decode;
1768 while (src < src_end && dst < adjusted_dst_end)
1770 /* SRC_BASE remembers the start position in source in each loop.
1771 The loop will be exited when there's not enough source text
1772 to analyze two-byte character (within macro ONE_MORE_BYTE).
1773 In that case, SRC is reset to SRC_BASE before exiting. */
1774 unsigned char *src_base = src;
1775 unsigned char c1 = *src++, c2, c3, c4;
1777 if (c1 == '\r')
1779 if (coding->eol_type == CODING_EOL_CRLF)
1781 ONE_MORE_BYTE (c2);
1782 if (c2 == '\n')
1783 *dst++ = c2;
1784 else
1785 /* To process C2 again, SRC is subtracted by 1. */
1786 *dst++ = c1, src--;
1788 else
1789 *dst++ = c1;
1791 else if (c1 < 0x20)
1792 *dst++ = c1;
1793 else if (c1 < 0x80)
1794 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1795 else if (c1 < 0xA0 || c1 >= 0xE0)
1797 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1798 if (sjis_p)
1800 ONE_MORE_BYTE (c2);
1801 DECODE_SJIS (c1, c2, c3, c4);
1802 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1804 else if (c1 >= 0xE0 && c1 < 0xFF)
1806 int charset;
1808 ONE_MORE_BYTE (c2);
1809 DECODE_BIG5 (c1, c2, charset, c3, c4);
1810 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1812 else /* Invalid code */
1813 *dst++ = c1;
1815 else
1817 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1818 if (sjis_p)
1819 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1820 else
1822 int charset;
1824 ONE_MORE_BYTE (c2);
1825 DECODE_BIG5 (c1, c2, charset, c3, c4);
1826 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1829 continue;
1831 label_end_of_loop:
1832 coding->carryover_size = src - src_base;
1833 bcopy (src_base, coding->carryover, coding->carryover_size);
1834 src = src_base;
1835 break;
1838 *consumed = src - source;
1839 return dst - destination;
1842 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1843 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1844 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1845 sure that all these charsets are registered as official charset
1846 (i.e. do not have extended leading-codes). Characters of other
1847 charsets are produced without any encoding. If SJIS_P is 1, encode
1848 SJIS text, else encode BIG5 text. */
1851 encode_coding_sjis_big5 (coding, source, destination,
1852 src_bytes, dst_bytes, consumed, sjis_p)
1853 struct coding_system *coding;
1854 unsigned char *source, *destination;
1855 int src_bytes, dst_bytes;
1856 int *consumed;
1857 int sjis_p;
1859 unsigned char *src = source;
1860 unsigned char *src_end = source + src_bytes;
1861 unsigned char *dst = destination;
1862 unsigned char *dst_end = destination + dst_bytes;
1863 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1864 from DST_END to assure overflow checking is necessary only at the
1865 head of loop. */
1866 unsigned char *adjusted_dst_end = dst_end - 1;
1867 Lisp_Object unification_table
1868 = coding->character_unification_table_for_encode;
1870 if (!NILP (Venable_character_unification) && NILP (unification_table))
1871 unification_table = Vstandard_character_unification_table_for_encode;
1873 while (src < src_end && dst < adjusted_dst_end)
1875 /* SRC_BASE remembers the start position in source in each loop.
1876 The loop will be exited when there's not enough source text
1877 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1878 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1879 before exiting. */
1880 unsigned char *src_base = src;
1881 unsigned char c1 = *src++, c2, c3, c4;
1883 if (coding->composing)
1885 if (c1 == 0xA0)
1887 ONE_MORE_BYTE (c1);
1888 c1 &= 0x7F;
1890 else if (c1 >= 0xA0)
1891 c1 -= 0x20;
1892 else
1893 coding->composing = 0;
1896 switch (emacs_code_class[c1])
1898 case EMACS_ascii_code:
1899 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1900 break;
1902 case EMACS_control_code:
1903 *dst++ = c1;
1904 break;
1906 case EMACS_carriage_return_code:
1907 if (!coding->selective)
1909 *dst++ = c1;
1910 break;
1912 /* fall down to treat '\r' as '\n' ... */
1914 case EMACS_linefeed_code:
1915 if (coding->eol_type == CODING_EOL_LF
1916 || coding->eol_type == CODING_EOL_UNDECIDED)
1917 *dst++ = '\n';
1918 else if (coding->eol_type == CODING_EOL_CRLF)
1919 *dst++ = '\r', *dst++ = '\n';
1920 else
1921 *dst++ = '\r';
1922 break;
1924 case EMACS_leading_code_2:
1925 ONE_MORE_BYTE (c2);
1926 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
1927 break;
1929 case EMACS_leading_code_3:
1930 TWO_MORE_BYTES (c2, c3);
1931 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
1932 break;
1934 case EMACS_leading_code_4:
1935 THREE_MORE_BYTES (c2, c3, c4);
1936 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
1937 break;
1939 case EMACS_leading_code_composition:
1940 coding->composing = 1;
1941 break;
1943 default: /* i.e. case EMACS_invalid_code: */
1944 *dst++ = c1;
1946 continue;
1948 label_end_of_loop:
1949 coding->carryover_size = src - src_base;
1950 bcopy (src_base, coding->carryover, coding->carryover_size);
1951 src = src_base;
1952 break;
1955 *consumed = src - source;
1956 return dst - destination;
1960 /*** 5. End-of-line handlers ***/
1962 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1963 This function is called only when `coding->eol_type' is
1964 CODING_EOL_CRLF or CODING_EOL_CR. */
1966 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1967 struct coding_system *coding;
1968 unsigned char *source, *destination;
1969 int src_bytes, dst_bytes;
1970 int *consumed;
1972 unsigned char *src = source;
1973 unsigned char *src_end = source + src_bytes;
1974 unsigned char *dst = destination;
1975 unsigned char *dst_end = destination + dst_bytes;
1976 int produced;
1978 switch (coding->eol_type)
1980 case CODING_EOL_CRLF:
1982 /* Since the maximum bytes produced by each loop is 2, we
1983 subtract 1 from DST_END to assure overflow checking is
1984 necessary only at the head of loop. */
1985 unsigned char *adjusted_dst_end = dst_end - 1;
1987 while (src < src_end && dst < adjusted_dst_end)
1989 unsigned char *src_base = src;
1990 unsigned char c = *src++;
1991 if (c == '\r')
1993 ONE_MORE_BYTE (c);
1994 if (c != '\n')
1995 *dst++ = '\r';
1996 *dst++ = c;
1998 else
1999 *dst++ = c;
2000 continue;
2002 label_end_of_loop:
2003 coding->carryover_size = src - src_base;
2004 bcopy (src_base, coding->carryover, coding->carryover_size);
2005 src = src_base;
2006 break;
2008 *consumed = src - source;
2009 produced = dst - destination;
2010 break;
2013 case CODING_EOL_CR:
2014 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2015 bcopy (source, destination, produced);
2016 dst_end = destination + produced;
2017 while (dst < dst_end)
2018 if (*dst++ == '\r') dst[-1] = '\n';
2019 *consumed = produced;
2020 break;
2022 default: /* i.e. case: CODING_EOL_LF */
2023 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2024 bcopy (source, destination, produced);
2025 *consumed = produced;
2026 break;
2029 return produced;
2032 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2033 format of end-of-line according to `coding->eol_type'. If
2034 `coding->selective' is 1, code '\r' in source text also means
2035 end-of-line. */
2037 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2038 struct coding_system *coding;
2039 unsigned char *source, *destination;
2040 int src_bytes, dst_bytes;
2041 int *consumed;
2043 unsigned char *src = source;
2044 unsigned char *dst = destination;
2045 int produced;
2047 if (src_bytes <= 0)
2048 return 0;
2050 switch (coding->eol_type)
2052 case CODING_EOL_LF:
2053 case CODING_EOL_UNDECIDED:
2054 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2055 bcopy (source, destination, produced);
2056 if (coding->selective)
2058 int i = produced;
2059 while (i--)
2060 if (*dst++ == '\r') dst[-1] = '\n';
2062 *consumed = produced;
2064 case CODING_EOL_CRLF:
2066 unsigned char c;
2067 unsigned char *src_end = source + src_bytes;
2068 unsigned char *dst_end = destination + dst_bytes;
2069 /* Since the maximum bytes produced by each loop is 2, we
2070 subtract 1 from DST_END to assure overflow checking is
2071 necessary only at the head of loop. */
2072 unsigned char *adjusted_dst_end = dst_end - 1;
2074 while (src < src_end && dst < adjusted_dst_end)
2076 c = *src++;
2077 if (c == '\n' || (c == '\r' && coding->selective))
2078 *dst++ = '\r', *dst++ = '\n';
2079 else
2080 *dst++ = c;
2082 produced = dst - destination;
2083 *consumed = src - source;
2084 break;
2087 default: /* i.e. case CODING_EOL_CR: */
2088 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2089 bcopy (source, destination, produced);
2091 int i = produced;
2092 while (i--)
2093 if (*dst++ == '\n') dst[-1] = '\r';
2095 *consumed = produced;
2098 return produced;
2102 /*** 6. C library functions ***/
2104 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2105 has a property `coding-system'. The value of this property is a
2106 vector of length 5 (called as coding-vector). Among elements of
2107 this vector, the first (element[0]) and the fifth (element[4])
2108 carry important information for decoding/encoding. Before
2109 decoding/encoding, this information should be set in fields of a
2110 structure of type `coding_system'.
2112 A value of property `coding-system' can be a symbol of another
2113 subsidiary coding-system. In that case, Emacs gets coding-vector
2114 from that symbol.
2116 `element[0]' contains information to be set in `coding->type'. The
2117 value and its meaning is as follows:
2119 0 -- coding_type_emacs_mule
2120 1 -- coding_type_sjis
2121 2 -- coding_type_iso2022
2122 3 -- coding_type_big5
2123 4 -- coding_type_ccl encoder/decoder written in CCL
2124 nil -- coding_type_no_conversion
2125 t -- coding_type_undecided (automatic conversion on decoding,
2126 no-conversion on encoding)
2128 `element[4]' contains information to be set in `coding->flags' and
2129 `coding->spec'. The meaning varies by `coding->type'.
2131 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2132 of length 32 (of which the first 15 sub-elements are used now).
2133 Meanings of these sub-elements are:
2135 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2136 If the value is an integer of valid charset, the charset is
2137 assumed to be designated to graphic register N initially.
2139 If the value is minus, it is a minus value of charset which
2140 reserves graphic register N, which means that the charset is
2141 not designated initially but should be designated to graphic
2142 register N just before encoding a character in that charset.
2144 If the value is nil, graphic register N is never used on
2145 encoding.
2147 sub-element[N] where N is 4 through 14: to be set in `coding->flags'
2148 Each value takes t or nil. See the section ISO2022 of
2149 `coding.h' for more information.
2151 If `coding->type' is `coding_type_big5', element[4] is t to denote
2152 BIG5-ETen or nil to denote BIG5-HKU.
2154 If `coding->type' takes the other value, element[4] is ignored.
2156 Emacs Lisp's coding system also carries information about format of
2157 end-of-line in a value of property `eol-type'. If the value is
2158 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2159 means CODING_EOL_CR. If it is not integer, it should be a vector
2160 of subsidiary coding systems of which property `eol-type' has one
2161 of above values.
2165 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2166 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2167 is setup so that no conversion is necessary and return -1, else
2168 return 0. */
2171 setup_coding_system (coding_system, coding)
2172 Lisp_Object coding_system;
2173 struct coding_system *coding;
2175 Lisp_Object type, eol_type;
2177 /* At first, set several fields to default values. */
2178 coding->require_flushing = 0;
2179 coding->last_block = 0;
2180 coding->selective = 0;
2181 coding->composing = 0;
2182 coding->direction = 0;
2183 coding->carryover_size = 0;
2184 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2185 coding->character_unification_table_for_decode = Qnil;
2186 coding->character_unification_table_for_encode = Qnil;
2188 Vlast_coding_system_used = coding->symbol = coding_system;
2189 eol_type = Qnil;
2190 /* Get value of property `coding-system' until we get a vector.
2191 While doing that, also get values of properties
2192 `post-read-conversion', `pre-write-conversion',
2193 `character-unification-table-for-decode',
2194 `character-unification-table-for-encode' and `eol-type'. */
2195 while (!NILP (coding_system) && SYMBOLP (coding_system))
2197 if (NILP (coding->post_read_conversion))
2198 coding->post_read_conversion = Fget (coding_system,
2199 Qpost_read_conversion);
2200 if (NILP (coding->pre_write_conversion))
2201 coding->pre_write_conversion = Fget (coding_system,
2202 Qpre_write_conversion);
2203 if (!inhibit_eol_conversion && NILP (eol_type))
2204 eol_type = Fget (coding_system, Qeol_type);
2206 if (NILP (coding->character_unification_table_for_decode))
2207 coding->character_unification_table_for_decode
2208 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2210 if (NILP (coding->character_unification_table_for_encode))
2211 coding->character_unification_table_for_encode
2212 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2214 coding_system = Fget (coding_system, Qcoding_system);
2217 while (!NILP (coding->character_unification_table_for_decode)
2218 && SYMBOLP (coding->character_unification_table_for_decode))
2219 coding->character_unification_table_for_decode
2220 = Fget (coding->character_unification_table_for_decode,
2221 Qcharacter_unification_table_for_decode);
2222 if (!NILP (coding->character_unification_table_for_decode)
2223 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2224 coding->character_unification_table_for_decode = Qnil;
2226 while (!NILP (coding->character_unification_table_for_encode)
2227 && SYMBOLP (coding->character_unification_table_for_encode))
2228 coding->character_unification_table_for_encode
2229 = Fget (coding->character_unification_table_for_encode,
2230 Qcharacter_unification_table_for_encode);
2231 if (!NILP (coding->character_unification_table_for_encode)
2232 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2233 coding->character_unification_table_for_encode = Qnil;
2235 if (!VECTORP (coding_system)
2236 || XVECTOR (coding_system)->size != 5)
2237 goto label_invalid_coding_system;
2239 if (VECTORP (eol_type))
2240 coding->eol_type = CODING_EOL_UNDECIDED;
2241 else if (XFASTINT (eol_type) == 1)
2242 coding->eol_type = CODING_EOL_CRLF;
2243 else if (XFASTINT (eol_type) == 2)
2244 coding->eol_type = CODING_EOL_CR;
2245 else
2246 coding->eol_type = CODING_EOL_LF;
2248 type = XVECTOR (coding_system)->contents[0];
2249 switch (XFASTINT (type))
2251 case 0:
2252 coding->type = coding_type_emacs_mule;
2253 break;
2255 case 1:
2256 coding->type = coding_type_sjis;
2257 break;
2259 case 2:
2260 coding->type = coding_type_iso2022;
2262 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2263 Lisp_Object *flags;
2264 int i, charset, default_reg_bits = 0;
2266 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2267 goto label_invalid_coding_system;
2269 flags = XVECTOR (val)->contents;
2270 coding->flags
2271 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2272 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2273 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2274 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2275 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2276 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2277 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2278 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2279 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2280 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2281 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2283 /* Invoke graphic register 0 to plane 0. */
2284 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2285 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2286 CODING_SPEC_ISO_INVOCATION (coding, 1)
2287 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2288 /* Not single shifting at first. */
2289 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2290 /* Beginning of buffer should also be regarded as bol. */
2291 CODING_SPEC_ISO_BOL(coding) = 1;
2293 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2294 FLAGS[REG] can be one of below:
2295 integer CHARSET: CHARSET occupies register I,
2296 t: designate nothing to REG initially, but can be used
2297 by any charsets,
2298 list of integer, nil, or t: designate the first
2299 element (if integer) to REG initially, the remaining
2300 elements (if integer) is designated to REG on request,
2301 if an element is t, REG can be used by any charset,
2302 nil: REG is never used. */
2303 for (charset = 0; charset <= MAX_CHARSET; charset++)
2304 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2305 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2306 for (i = 0; i < 4; i++)
2308 if (INTEGERP (flags[i])
2309 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2310 || (charset = get_charset_id (flags[i])) >= 0)
2312 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2313 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2315 else if (EQ (flags[i], Qt))
2317 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2318 default_reg_bits |= 1 << i;
2320 else if (CONSP (flags[i]))
2322 Lisp_Object tail = flags[i];
2324 if (INTEGERP (XCONS (tail)->car)
2325 && (charset = XINT (XCONS (tail)->car),
2326 CHARSET_VALID_P (charset))
2327 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2329 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2330 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2332 else
2333 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2334 tail = XCONS (tail)->cdr;
2335 while (CONSP (tail))
2337 if (INTEGERP (XCONS (tail)->car)
2338 && (charset = XINT (XCONS (tail)->car),
2339 CHARSET_VALID_P (charset))
2340 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2341 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2342 = i;
2343 else if (EQ (XCONS (tail)->car, Qt))
2344 default_reg_bits |= 1 << i;
2345 tail = XCONS (tail)->cdr;
2348 else
2349 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2351 CODING_SPEC_ISO_DESIGNATION (coding, i)
2352 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2355 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2357 /* REG 1 can be used only by locking shift in 7-bit env. */
2358 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2359 default_reg_bits &= ~2;
2360 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2361 /* Without any shifting, only REG 0 and 1 can be used. */
2362 default_reg_bits &= 3;
2365 for (charset = 0; charset <= MAX_CHARSET; charset++)
2366 if (CHARSET_VALID_P (charset)
2367 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2368 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2370 /* We have not yet decided where to designate CHARSET. */
2371 int reg_bits = default_reg_bits;
2373 if (CHARSET_CHARS (charset) == 96)
2374 /* A charset of CHARS96 can't be designated to REG 0. */
2375 reg_bits &= ~1;
2377 if (reg_bits)
2378 /* There exist some default graphic register. */
2379 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2380 = (reg_bits & 1
2381 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2382 else
2383 /* We anyway have to designate CHARSET to somewhere. */
2384 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2385 = (CHARSET_CHARS (charset) == 94
2387 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2388 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2390 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2391 ? 2 : 0)));
2394 coding->require_flushing = 1;
2395 break;
2397 case 3:
2398 coding->type = coding_type_big5;
2399 coding->flags
2400 = (NILP (XVECTOR (coding_system)->contents[4])
2401 ? CODING_FLAG_BIG5_HKU
2402 : CODING_FLAG_BIG5_ETEN);
2403 break;
2405 case 4:
2406 coding->type = coding_type_ccl;
2408 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2409 if (CONSP (val)
2410 && VECTORP (XCONS (val)->car)
2411 && VECTORP (XCONS (val)->cdr))
2413 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2414 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2416 else
2417 goto label_invalid_coding_system;
2419 coding->require_flushing = 1;
2420 break;
2422 default:
2423 if (EQ (type, Qt))
2424 coding->type = coding_type_undecided;
2425 else
2426 coding->type = coding_type_no_conversion;
2427 break;
2429 return 0;
2431 label_invalid_coding_system:
2432 coding->type = coding_type_no_conversion;
2433 coding->eol_type = CODING_EOL_LF;
2434 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2435 = Qnil;
2436 return -1;
2439 /* Emacs has a mechanism to automatically detect a coding system if it
2440 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2441 it's impossible to distinguish some coding systems accurately
2442 because they use the same range of codes. So, at first, coding
2443 systems are categorized into 9 categories, which are:
2445 o coding-category-emacs-mule
2447 The category for a coding system which has the same code range
2448 as Emacs' internal format. Assigned the coding-system (Lisp
2449 symbol) `emacs-mule' by default.
2451 o coding-category-sjis
2453 The category for a coding system which has the same code range
2454 as SJIS. Assigned the coding-system (Lisp
2455 symbol) `japanese-shift-jis' by default.
2457 o coding-category-iso-7
2459 The category for a coding system which has the same code range
2460 as ISO2022 of 7-bit environment. This doesn't use any locking
2461 shift and single shift functions. Assigned the coding-system
2462 (Lisp symbol) `iso-2022-7bit' by default.
2464 o coding-category-iso-8-1
2466 The category for a coding system which has the same code range
2467 as ISO2022 of 8-bit environment and graphic plane 1 used only
2468 for DIMENSION1 charset. This doesn't use any locking shift
2469 and single shift functions. Assigned the coding-system (Lisp
2470 symbol) `iso-latin-1' by default.
2472 o coding-category-iso-8-2
2474 The category for a coding system which has the same code range
2475 as ISO2022 of 8-bit environment and graphic plane 1 used only
2476 for DIMENSION2 charset. This doesn't use any locking shift
2477 and single shift functions. Assigned the coding-system (Lisp
2478 symbol) `japanese-iso-8bit' by default.
2480 o coding-category-iso-7-else
2482 The category for a coding system which has the same code range
2483 as ISO2022 of 7-bit environment but uses locking shift or
2484 single shift functions. Assigned the coding-system (Lisp
2485 symbol) `iso-2022-7bit-lock' by default.
2487 o coding-category-iso-8-else
2489 The category for a coding system which has the same code range
2490 as ISO2022 of 8-bit environment but uses locking shift or
2491 single shift functions. Assigned the coding-system (Lisp
2492 symbol) `iso-2022-8bit-ss2' by default.
2494 o coding-category-big5
2496 The category for a coding system which has the same code range
2497 as BIG5. Assigned the coding-system (Lisp symbol)
2498 `cn-big5' by default.
2500 o coding-category-binary
2502 The category for a coding system not categorized in any of the
2503 above. Assigned the coding-system (Lisp symbol)
2504 `no-conversion' by default.
2506 Each of them is a Lisp symbol and the value is an actual
2507 `coding-system's (this is also a Lisp symbol) assigned by a user.
2508 What Emacs does actually is to detect a category of coding system.
2509 Then, it uses a `coding-system' assigned to it. If Emacs can't
2510 decide only one possible category, it selects a category of the
2511 highest priority. Priorities of categories are also specified by a
2512 user in a Lisp variable `coding-category-list'.
2516 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2517 If it detects possible coding systems, return an integer in which
2518 appropriate flag bits are set. Flag bits are defined by macros
2519 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2522 detect_coding_mask (src, src_bytes)
2523 unsigned char *src;
2524 int src_bytes;
2526 register unsigned char c;
2527 unsigned char *src_end = src + src_bytes;
2528 int mask;
2530 /* At first, skip all ASCII characters and control characters except
2531 for three ISO2022 specific control characters. */
2532 label_loop_detect_coding:
2533 while (src < src_end)
2535 c = *src;
2536 if (c >= 0x80
2537 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2538 break;
2539 src++;
2542 if (src >= src_end)
2543 /* We found nothing other than ASCII. There's nothing to do. */
2544 return CODING_CATEGORY_MASK_ANY;
2546 /* The text seems to be encoded in some multilingual coding system.
2547 Now, try to find in which coding system the text is encoded. */
2548 if (c < 0x80)
2550 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2551 /* C is an ISO2022 specific control code of C0. */
2552 mask = detect_coding_iso2022 (src, src_end);
2553 src++;
2554 if (mask == CODING_CATEGORY_MASK_ANY)
2555 /* No valid ISO2022 code follows C. Try again. */
2556 goto label_loop_detect_coding;
2558 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2559 /* C is an ISO2022 specific control code of C1,
2560 or the first byte of SJIS's 2-byte character code,
2561 or a leading code of Emacs. */
2562 mask = (detect_coding_iso2022 (src, src_end)
2563 | detect_coding_sjis (src, src_end)
2564 | detect_coding_emacs_mule (src, src_end));
2566 else if (c < 0xA0)
2567 /* C is the first byte of SJIS character code,
2568 or a leading-code of Emacs. */
2569 mask = (detect_coding_sjis (src, src_end)
2570 | detect_coding_emacs_mule (src, src_end));
2572 else
2573 /* C is a character of ISO2022 in graphic plane right,
2574 or a SJIS's 1-byte character code (i.e. JISX0201),
2575 or the first byte of BIG5's 2-byte code. */
2576 mask = (detect_coding_iso2022 (src, src_end)
2577 | detect_coding_sjis (src, src_end)
2578 | detect_coding_big5 (src, src_end));
2580 return mask;
2583 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2584 The information of the detected coding system is set in CODING. */
2586 void
2587 detect_coding (coding, src, src_bytes)
2588 struct coding_system *coding;
2589 unsigned char *src;
2590 int src_bytes;
2592 int mask = detect_coding_mask (src, src_bytes);
2593 int idx;
2595 if (mask == CODING_CATEGORY_MASK_ANY)
2596 /* We found nothing other than ASCII. There's nothing to do. */
2597 return;
2599 if (!mask)
2600 /* The source text seems to be encoded in unknown coding system.
2601 Emacs regards the category of such a kind of coding system as
2602 `coding-category-binary'. We assume that a user has assigned
2603 an appropriate coding system for a `coding-category-binary'. */
2604 idx = CODING_CATEGORY_IDX_BINARY;
2605 else
2607 /* We found some plausible coding systems. Let's use a coding
2608 system of the highest priority. */
2609 Lisp_Object val = Vcoding_category_list;
2611 if (CONSP (val))
2612 while (!NILP (val))
2614 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2615 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2616 break;
2617 val = XCONS (val)->cdr;
2619 else
2620 val = Qnil;
2622 if (NILP (val))
2624 /* For unknown reason, `Vcoding_category_list' contains none
2625 of found categories. Let's use any of them. */
2626 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2627 if (mask & (1 << idx))
2628 break;
2631 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2634 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2635 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2636 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
2639 detect_eol_type (src, src_bytes)
2640 unsigned char *src;
2641 int src_bytes;
2643 unsigned char *src_end = src + src_bytes;
2644 unsigned char c;
2646 while (src < src_end)
2648 c = *src++;
2649 if (c == '\n')
2650 return CODING_EOL_LF;
2651 else if (c == '\r')
2653 if (src < src_end && *src == '\n')
2654 return CODING_EOL_CRLF;
2655 else
2656 return CODING_EOL_CR;
2659 return CODING_EOL_UNDECIDED;
2662 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2663 is encoded. If it detects an appropriate format of end-of-line, it
2664 sets the information in *CODING. */
2666 void
2667 detect_eol (coding, src, src_bytes)
2668 struct coding_system *coding;
2669 unsigned char *src;
2670 int src_bytes;
2672 Lisp_Object val;
2673 int eol_type = detect_eol_type (src, src_bytes);
2675 if (eol_type == CODING_EOL_UNDECIDED)
2676 /* We found no end-of-line in the source text. */
2677 return;
2679 val = Fget (coding->symbol, Qeol_type);
2680 if (VECTORP (val) && XVECTOR (val)->size == 3)
2681 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2684 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2685 decoding, it may detect coding system and format of end-of-line if
2686 those are not yet decided. */
/* Decode the SRC_BYTES bytes at SOURCE according to CODING, storing
   the result at DESTINATION, whose capacity is DST_BYTES.  Store the
   number of source bytes used in *CONSUMED and return the number of
   bytes produced (for the specialized decoders, whatever they
   return).  If CODING's type or end-of-line format is still
   undecided, try to detect it from SOURCE first; detection may set up
   CODING anew.  */
2689 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2690 struct coding_system *coding;
2691 unsigned char *source, *destination;
2692 int src_bytes, dst_bytes;
2693 int *consumed;
2695 int produced;
/* Nothing to decode: report zero bytes consumed and produced.  */
2697 if (src_bytes <= 0)
2699 *consumed = 0;
2700 return 0;
/* Resolve what is still undecided before dispatching on the type.  */
2703 if (coding->type == coding_type_undecided)
2704 detect_coding (coding, source, src_bytes);
2706 if (coding->eol_type == CODING_EOL_UNDECIDED)
2707 detect_eol (coding, source, src_bytes);
/* NOTE(review): this switch has no default case, so PRODUCED is
   returned uninitialized if `coding->type' matches none of the cases
   below -- confirm that cannot happen.  */
2709 coding->carryover_size = 0;
2710 switch (coding->type)
2712 case coding_type_no_conversion:
2713 label_no_conversion:
/* Plain copy of as many bytes as fit in DESTINATION.  */
2714 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2715 bcopy (source, destination, produced);
2716 *consumed = produced;
2717 break;
/* For these two types only the end-of-line format may need decoding;
   with LF or a still undecided format the text is copied as is.  */
2719 case coding_type_emacs_mule:
2720 case coding_type_undecided:
2721 if (coding->eol_type == CODING_EOL_LF
2722 || coding->eol_type == CODING_EOL_UNDECIDED)
2723 goto label_no_conversion;
2724 produced = decode_eol (coding, source, destination,
2725 src_bytes, dst_bytes, consumed);
2726 break;
/* SJIS and BIG5 share one decoder.  NOTE(review): the last argument
   of each call (presumably the flag telling the two apart) is not
   visible in this excerpt -- check it against the decoder's
   definition.  */
2728 case coding_type_sjis:
2729 produced = decode_coding_sjis_big5 (coding, source, destination,
2730 src_bytes, dst_bytes, consumed,
2732 break;
2734 case coding_type_iso2022:
2735 produced = decode_coding_iso2022 (coding, source, destination,
2736 src_bytes, dst_bytes, consumed);
2737 break;
2739 case coding_type_big5:
2740 produced = decode_coding_sjis_big5 (coding, source, destination,
2741 src_bytes, dst_bytes, consumed,
2743 break;
/* The CCL driver is run on the decoder program of CODING.  */
2745 case coding_type_ccl:
2746 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2747 src_bytes, dst_bytes, consumed);
2748 break;
2751 return produced;
2754 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
/* Encode the SRC_BYTES bytes at SOURCE according to CODING, storing
   the result at DESTINATION, whose capacity is DST_BYTES.  Store the
   number of source bytes used in *CONSUMED and return the number of
   bytes produced (for the specialized encoders, whatever they
   return).  Unlike `decode_coding', no detection is attempted and
   there is no early return for an empty source.  */
2757 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2758 struct coding_system *coding;
2759 unsigned char *source, *destination;
2760 int src_bytes, dst_bytes;
2761 int *consumed;
2763 int produced;
/* NOTE(review): this switch has no default case, so PRODUCED is
   returned uninitialized if `coding->type' matches none of the cases
   below -- confirm that cannot happen.  */
2765 coding->carryover_size = 0;
2766 switch (coding->type)
2768 case coding_type_no_conversion:
2769 label_no_conversion:
/* Plain copy of as many bytes as fit in DESTINATION; the copy is
   skipped when that count is not positive.  */
2770 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2771 if (produced > 0)
2773 bcopy (source, destination, produced);
/* With `coding->selective' set, each '\015' (carriage return) in the
   copied text is replaced by '\n'.  */
2774 if (coding->selective)
2776 unsigned char *p = destination, *pend = destination + produced;
2777 while (p < pend)
2778 if (*p++ == '\015') p[-1] = '\n';
2781 *consumed = produced;
2782 break;
/* For these two types only the end-of-line format may need encoding;
   with LF or an undecided format the no-conversion copy is used.  */
2784 case coding_type_emacs_mule:
2785 case coding_type_undecided:
2786 if (coding->eol_type == CODING_EOL_LF
2787 || coding->eol_type == CODING_EOL_UNDECIDED)
2788 goto label_no_conversion;
2789 produced = encode_eol (coding, source, destination,
2790 src_bytes, dst_bytes, consumed);
2791 break;
/* SJIS and BIG5 share one encoder.  NOTE(review): the last argument
   of each call (presumably the flag telling the two apart) is not
   visible in this excerpt -- check it against the encoder's
   definition.  */
2793 case coding_type_sjis:
2794 produced = encode_coding_sjis_big5 (coding, source, destination,
2795 src_bytes, dst_bytes, consumed,
2797 break;
2799 case coding_type_iso2022:
2800 produced = encode_coding_iso2022 (coding, source, destination,
2801 src_bytes, dst_bytes, consumed);
2802 break;
2804 case coding_type_big5:
2805 produced = encode_coding_sjis_big5 (coding, source, destination,
2806 src_bytes, dst_bytes, consumed,
2808 break;
/* The CCL driver is run on the encoder program of CODING.  */
2810 case coding_type_ccl:
2811 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2812 src_bytes, dst_bytes, consumed);
2813 break;
2816 return produced;
2819 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2821 /* Return maximum size (bytes) of a buffer enough for decoding
2822 SRC_BYTES of text encoded in CODING. */
2825 decoding_buffer_size (coding, src_bytes)
2826 struct coding_system *coding;
2827 int src_bytes;
2829 int magnification;
2831 if (coding->type == coding_type_iso2022)
2832 magnification = 3;
2833 else if (coding->type == coding_type_ccl)
2834 magnification = coding->spec.ccl.decoder.buf_magnification;
2835 else
2836 magnification = 2;
2838 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2841 /* Return maximum size (bytes) of a buffer enough for encoding
2842 SRC_BYTES of text to CODING. */
2845 encoding_buffer_size (coding, src_bytes)
2846 struct coding_system *coding;
2847 int src_bytes;
2849 int magnification;
2851 if (coding->type == coding_type_ccl)
2852 magnification = coding->spec.ccl.encoder.buf_magnification;
2853 else
2854 magnification = 3;
2856 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer for encoding and decoding, and its size in
   bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared `conversion_buffer' is returned;
   when it is too small it is replaced by a larger one, whose size is
   found by doubling the current size until it reaches SIZE.  The old
   contents are not preserved.
   NOTE(review): the buffer is obtained with `xmalloc' and the result
   is not checked here, so a NULL return on memory exhaustion depends
   on what `xmalloc' does -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      /* Doubling a size of zero (or less) never reaches SIZE, so
         start from the minimum size when no usable size has been
         recorded yet.  */
      int real_size = (conversion_buffer_size > 0
                       ? conversion_buffer_size * 2
                       : MINIMUM_CONVERSION_BUFFER_SIZE);

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
2889 #ifdef emacs
2890 /*** 7. Emacs Lisp library functions ***/
2892 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
2893 1, 1, 0,
2894 "Return coding-spec of CODING-SYSTEM.\n\
2895 If CODING-SYSTEM is not a valid coding-system, return nil.")
2896 (obj)
2897 Lisp_Object obj;
2899 while (SYMBOLP (obj) && !NILP (obj))
2900 obj = Fget (obj, Qcoding_system);
2901 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2902 ? Qnil : obj);
2905 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2906 "Return t if OBJECT is nil or a coding-system.\n\
2907 See document of make-coding-system for coding-system object.")
2908 (obj)
2909 Lisp_Object obj;
2911 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
2914 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2915 Sread_non_nil_coding_system, 1, 1, 0,
2916 "Read a coding system from the minibuffer, prompting with string PROMPT.")
2917 (prompt)
2918 Lisp_Object prompt;
2920 Lisp_Object val;
2923 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
2924 Qt, Qnil, Qnil, Qnil);
2926 while (XSTRING (val)->size == 0);
2927 return (Fintern (val, Qnil));
2930 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2931 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2932 (prompt)
2933 Lisp_Object prompt;
2935 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2936 Qt, Qnil, Qnil, Qnil);
2937 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
2940 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2941 1, 1, 0,
2942 "Check validity of CODING-SYSTEM.\n\
2943 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2944 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2945 The value of property should be a vector of length 5.")
2946 (coding_system)
2947 Lisp_Object coding_system;
2949 CHECK_SYMBOL (coding_system, 0);
2950 if (!NILP (Fcoding_system_p (coding_system)))
2951 return coding_system;
2952 while (1)
2953 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
2956 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2957 2, 2, 0,
2958 "Detect coding-system of the text in the region between START and END.\n\
2959 Return a list of possible coding-systems ordered by priority.\n\
2960 If only ASCII characters are found, it returns `undecided'\n\
2961 or its subsidiary coding-system according to a detected end-of-line format.")
2962 (b, e)
2963 Lisp_Object b, e;
2965 int coding_mask, eol_type;
2966 Lisp_Object val;
2967 int beg, end;
2969 validate_region (&b, &e);
2970 beg = XINT (b), end = XINT (e);
2971 if (beg < GPT && end >= GPT) move_gap (end);
2973 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2974 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
2976 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2978 val = intern ("undecided");
2979 if (eol_type != CODING_EOL_UNDECIDED)
2981 Lisp_Object val2 = Fget (val, Qeol_type);
2982 if (VECTORP (val2))
2983 val = XVECTOR (val2)->contents[eol_type];
2986 else
2988 Lisp_Object val2;
2990 /* At first, gather possible coding-systems in VAL in a reverse
2991 order. */
2992 val = Qnil;
2993 for (val2 = Vcoding_category_list;
2994 !NILP (val2);
2995 val2 = XCONS (val2)->cdr)
2997 int idx
2998 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2999 if (coding_mask & (1 << idx))
3000 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3003 /* Then, change the order of the list, while getting subsidiary
3004 coding-systems. */
3005 val2 = val;
3006 val = Qnil;
3007 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3009 if (eol_type == CODING_EOL_UNDECIDED)
3010 val = Fcons (XCONS (val2)->car, val);
3011 else
3013 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
3014 if (VECTORP (val3))
3015 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3016 else
3017 val = Fcons (XCONS (val2)->car, val);
3022 return val;
3025 /* Scan text in the region between *BEGP and *ENDP, skip characters
3026 which we never have to encode to (iff ENCODEP is 1) or decode from
3027 coding system CODING at the head and tail, then set BEGP and ENDP
3028 to the addresses of start and end of the text we actually convert. */
3030 void
3031 shrink_conversion_area (begp, endp, coding, encodep)
3032 unsigned char **begp, **endp;
3033 struct coding_system *coding;
3034 int encodep;
3036 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3038 if (coding->eol_type != CODING_EOL_LF
3039 && coding->eol_type != CODING_EOL_UNDECIDED)
3040 /* Since we anyway have to convert end-of-line format, it is not
3041 worth skipping at most 100 bytes or so. */
3042 return;
3044 if (encodep) /* for encoding */
3046 switch (coding->type)
3048 case coding_type_no_conversion:
3049 case coding_type_emacs_mule:
3050 case coding_type_undecided:
3051 /* We need no conversion. */
3052 *begp = *endp;
3053 return;
3054 case coding_type_ccl:
3055 /* We can't skip any data. */
3056 return;
3057 case coding_type_iso2022:
3058 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3060 unsigned char *bol = beg_addr;
3061 while (beg_addr < end_addr && *beg_addr < 0x80)
3063 beg_addr++;
3064 if (*(beg_addr - 1) == '\n')
3065 bol = beg_addr;
3067 beg_addr = bol;
3068 goto label_skip_tail;
3070 /* fall down ... */
3071 default:
3072 /* We can skip all ASCII characters at the head and tail. */
3073 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3074 label_skip_tail:
3075 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3076 break;
3079 else /* for decoding */
3081 switch (coding->type)
3083 case coding_type_no_conversion:
3084 /* We need no conversion. */
3085 *begp = *endp;
3086 return;
3087 case coding_type_emacs_mule:
3088 if (coding->eol_type == CODING_EOL_LF)
3090 /* We need no conversion. */
3091 *begp = *endp;
3092 return;
3094 /* We can skip all but carriage-return. */
3095 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3096 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3097 break;
3098 case coding_type_sjis:
3099 case coding_type_big5:
3100 /* We can skip all ASCII characters at the head. */
3101 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3102 /* We can skip all ASCII characters at the tail except for
3103 the second byte of SJIS or BIG5 code. */
3104 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3105 if (end_addr != *endp)
3106 end_addr++;
3107 break;
3108 case coding_type_ccl:
3109 /* We can't skip any data. */
3110 return;
3111 default: /* i.e. case coding_type_iso2022: */
3113 unsigned char c;
3115 /* We can skip all ASCII characters except for a few
3116 control codes at the head. */
3117 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3118 && c != ISO_CODE_CR && c != ISO_CODE_SO
3119 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3120 beg_addr++;
3122 break;
3125 *begp = beg_addr;
3126 *endp = end_addr;
3127 return;
3130 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3131 text between B and E. B and E are buffer position. */
3133 Lisp_Object
3134 code_convert_region (b, e, coding, encodep)
3135 Lisp_Object b, e;
3136 struct coding_system *coding;
3137 int encodep;
3139 int beg, end, len, consumed, produced;
3140 char *buf;
3141 unsigned char *begp, *endp;
3142 int pos = PT;
3144 validate_region (&b, &e);
3145 beg = XINT (b), end = XINT (e);
3146 if (beg < GPT && end >= GPT)
3147 move_gap (end);
3149 if (encodep && !NILP (coding->pre_write_conversion))
3151 /* We must call a pre-conversion function which may put a new
3152 text to be converted in a new buffer. */
3153 struct buffer *old = current_buffer, *new;
3155 TEMP_SET_PT (beg);
3156 call2 (coding->pre_write_conversion, b, e);
3157 if (old != current_buffer)
3159 /* Replace the original text by the text just generated. */
3160 len = ZV - BEGV;
3161 new = current_buffer;
3162 set_buffer_internal (old);
3163 del_range (beg, end);
3164 insert_from_buffer (new, 1, len, 0);
3165 end = beg + len;
3169 /* We may be able to shrink the conversion region. */
3170 begp = POS_ADDR (beg); endp = begp + (end - beg);
3171 shrink_conversion_area (&begp, &endp, coding, encodep);
3173 if (begp == endp)
3174 /* We need no conversion. */
3175 len = end - beg;
3176 else
3178 beg += begp - POS_ADDR (beg);
3179 end = beg + (endp - begp);
3181 if (encodep)
3182 len = encoding_buffer_size (coding, end - beg);
3183 else
3184 len = decoding_buffer_size (coding, end - beg);
3185 buf = get_conversion_buffer (len);
3187 coding->last_block = 1;
3188 produced = (encodep
3189 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3190 &consumed)
3191 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3192 &consumed));
3194 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3196 TEMP_SET_PT (beg);
3197 insert (buf, produced);
3198 del_range (PT, PT + end - beg);
3199 if (pos >= end)
3200 pos = PT + (pos - end);
3201 else if (pos > beg)
3202 pos = beg;
3203 TEMP_SET_PT (pos);
3206 if (!encodep && !NILP (coding->post_read_conversion))
3208 /* We must call a post-conversion function which may alter
3209 the text just converted. */
3210 Lisp_Object insval;
3212 beg = XINT (b);
3213 TEMP_SET_PT (beg);
3214 insval = call1 (coding->post_read_conversion, make_number (len));
3215 CHECK_NUMBER (insval, 0);
3216 len = XINT (insval);
3219 return make_number (len);
3222 Lisp_Object
3223 code_convert_string (str, coding, encodep, nocopy)
3224 Lisp_Object str, nocopy;
3225 struct coding_system *coding;
3226 int encodep;
3228 int len, consumed, produced;
3229 char *buf;
3230 unsigned char *begp, *endp;
3231 int head_skip, tail_skip;
3232 struct gcpro gcpro1;
3234 if (encodep && !NILP (coding->pre_write_conversion)
3235 || !encodep && !NILP (coding->post_read_conversion))
3237 /* Since we have to call Lisp functions which assume target text
3238 is in a buffer, after setting a temporary buffer, call
3239 code_convert_region. */
3240 int count = specpdl_ptr - specpdl;
3241 int len = XSTRING (str)->size;
3242 Lisp_Object result;
3243 struct buffer *old = current_buffer;
3245 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3246 temp_output_buffer_setup (" *code-converting-work*");
3247 set_buffer_internal (XBUFFER (Vstandard_output));
3248 insert_from_string (str, 0, len, 0);
3249 code_convert_region (make_number (BEGV), make_number (ZV),
3250 coding, encodep);
3251 result = make_buffer_string (BEGV, ZV, 0);
3252 set_buffer_internal (old);
3253 return unbind_to (count, result);
3256 /* We may be able to shrink the conversion region. */
3257 begp = XSTRING (str)->data;
3258 endp = begp + XSTRING (str)->size;
3259 shrink_conversion_area (&begp, &endp, coding, encodep);
3261 if (begp == endp)
3262 /* We need no conversion. */
3263 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3265 head_skip = begp - XSTRING (str)->data;
3266 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3268 GCPRO1 (str);
3270 if (encodep)
3271 len = encoding_buffer_size (coding, endp - begp);
3272 else
3273 len = decoding_buffer_size (coding, endp - begp);
3274 buf = get_conversion_buffer (len + head_skip + tail_skip);
3276 bcopy (XSTRING (str)->data, buf, head_skip);
3277 coding->last_block = 1;
3278 produced = (encodep
3279 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3280 buf + head_skip, endp - begp, len, &consumed)
3281 : decode_coding (coding, XSTRING (str)->data + head_skip,
3282 buf + head_skip, endp - begp, len, &consumed));
3283 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3284 buf + head_skip + produced,
3285 tail_skip);
3287 UNGCPRO;
3289 return make_string (buf, head_skip + produced + tail_skip);
3292 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3293 3, 3, "r\nzCoding system: ",
3294 "Decode current region by specified coding system.\n\
3295 When called from a program, takes three arguments:\n\
3296 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3297 Return length of decoded text.")
3298 (b, e, coding_system)
3299 Lisp_Object b, e, coding_system;
3301 struct coding_system coding;
3303 CHECK_NUMBER_COERCE_MARKER (b, 0);
3304 CHECK_NUMBER_COERCE_MARKER (e, 1);
3305 CHECK_SYMBOL (coding_system, 2);
3307 if (NILP (coding_system))
3308 return make_number (XFASTINT (e) - XFASTINT (b));
3309 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3310 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3312 return code_convert_region (b, e, &coding, 0);
3315 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3316 3, 3, "r\nzCoding system: ",
3317 "Encode current region by specified coding system.\n\
3318 When called from a program, takes three arguments:\n\
3319 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3320 Return length of encoded text.")
3321 (b, e, coding_system)
3322 Lisp_Object b, e, coding_system;
3324 struct coding_system coding;
3326 CHECK_NUMBER_COERCE_MARKER (b, 0);
3327 CHECK_NUMBER_COERCE_MARKER (e, 1);
3328 CHECK_SYMBOL (coding_system, 2);
3330 if (NILP (coding_system))
3331 return make_number (XFASTINT (e) - XFASTINT (b));
3332 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3333 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3335 return code_convert_region (b, e, &coding, 1);
3338 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3339 2, 3, 0,
3340 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3341 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3342 of decoding.")
3343 (string, coding_system, nocopy)
3344 Lisp_Object string, coding_system, nocopy;
3346 struct coding_system coding;
3348 CHECK_STRING (string, 0);
3349 CHECK_SYMBOL (coding_system, 1);
3351 if (NILP (coding_system))
3352 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3353 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3354 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3356 return code_convert_string (string, &coding, 0, nocopy);
3359 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3360 2, 3, 0,
3361 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3362 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3363 of encoding.")
3364 (string, coding_system, nocopy)
3365 Lisp_Object string, coding_system, nocopy;
3367 struct coding_system coding;
3369 CHECK_STRING (string, 0);
3370 CHECK_SYMBOL (coding_system, 1);
3372 if (NILP (coding_system))
3373 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3374 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3375 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3377 return code_convert_string (string, &coding, 1, nocopy);
3380 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3381 "Decode a JISX0208 character of shift-jis encoding.\n\
3382 CODE is the character code in SJIS.\n\
3383 Return the corresponding character.")
3384 (code)
3385 Lisp_Object code;
3387 unsigned char c1, c2, s1, s2;
3388 Lisp_Object val;
3390 CHECK_NUMBER (code, 0);
3391 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3392 DECODE_SJIS (s1, s2, c1, c2);
3393 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3394 return val;
3397 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3398 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3399 Return the corresponding character code in SJIS.")
3400 (ch)
3401 Lisp_Object ch;
3403 int charset, c1, c2, s1, s2;
3404 Lisp_Object val;
3406 CHECK_NUMBER (ch, 0);
3407 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3408 if (charset == charset_jisx0208)
3410 ENCODE_SJIS (c1, c2, s1, s2);
3411 XSETFASTINT (val, (s1 << 8) | s2);
3413 else
3414 XSETFASTINT (val, 0);
3415 return val;
3418 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3419 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3420 CODE is the character code in BIG5.\n\
3421 Return the corresponding character.")
3422 (code)
3423 Lisp_Object code;
3425 int charset;
3426 unsigned char b1, b2, c1, c2;
3427 Lisp_Object val;
3429 CHECK_NUMBER (code, 0);
3430 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3431 DECODE_BIG5 (b1, b2, charset, c1, c2);
3432 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3433 return val;
3436 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3437 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3438 Return the corresponding character code in Big5.")
3439 (ch)
3440 Lisp_Object ch;
3442 int charset, c1, c2, b1, b2;
3443 Lisp_Object val;
3445 CHECK_NUMBER (ch, 0);
3446 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3447 if (charset == charset_big5_1 || charset == charset_big5_2)
3449 ENCODE_BIG5 (charset, c1, c2, b1, b2);
3450 XSETFASTINT (val, (b1 << 8) | b2);
3452 else
3453 XSETFASTINT (val, 0);
3454 return val;
3457 DEFUN ("set-terminal-coding-system-internal",
3458 Fset_terminal_coding_system_internal,
3459 Sset_terminal_coding_system_internal, 1, 1, 0, "")
3460 (coding_system)
3461 Lisp_Object coding_system;
3463 CHECK_SYMBOL (coding_system, 0);
3464 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3465 return Qnil;
3468 DEFUN ("terminal-coding-system",
3469 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3470 "Return coding-system of your terminal.")
3473 return terminal_coding.symbol;
3476 DEFUN ("set-keyboard-coding-system-internal",
3477 Fset_keyboard_coding_system_internal,
3478 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3479 (coding_system)
3480 Lisp_Object coding_system;
3482 CHECK_SYMBOL (coding_system, 0);
3483 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3484 return Qnil;
3487 DEFUN ("keyboard-coding-system",
3488 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3489 "Return coding-system of what is sent from terminal keyboard.")
3492 return keyboard_coding.symbol;
3496 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3497 Sfind_operation_coding_system, 1, MANY, 0,
3498 "Choose a coding system for an operation based on the target name.\n\
3499 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3500 DECODING-SYSTEM is the coding system to use for decoding\n\
3501 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3502 for encoding (in case OPERATION does encoding).\n\
3504 The first argument OPERATION specifies an I/O primitive:\n\
3505 For file I/O, `insert-file-contents' or `write-region'.\n\
3506 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3507 For network I/O, `open-network-stream'.\n\
3509 The remaining arguments should be the same arguments that were passed\n\
3510 to the primitive. Depending on which primitive, one of those arguments\n\
3511 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3512 whichever argument specifies the file name is TARGET.\n\
3514 TARGET has a meaning which depends on OPERATION:\n\
3515 For file I/O, TARGET is a file name.\n\
3516 For process I/O, TARGET is a process name.\n\
3517 For network I/O, TARGET is a service name or a port number\n\
3519 This function looks up what specified for TARGET in,\n\
3520 `file-coding-system-alist', `process-coding-system-alist',\n\
3521 or `network-coding-system-alist' depending on OPERATION.\n\
3522 They may specify a coding system, a cons of coding systems,\n\
3523 or a function symbol to call.\n\
3524 In the last case, we call the function with one argument,\n\
3525 which is a list of all the arguments given to this function.")
3526 (nargs, args)
3527 int nargs;
3528 Lisp_Object *args;
3530 Lisp_Object operation, target_idx, target, val;
3531 register Lisp_Object chain;
3533 if (nargs < 2)
3534 error ("Too few arguments");
3535 operation = args[0];
3536 if (!SYMBOLP (operation)
3537 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3538 error ("Invalid first arguement");
3539 if (nargs < 1 + XINT (target_idx))
3540 error ("Too few arguments for operation: %s",
3541 XSYMBOL (operation)->name->data);
3542 target = args[XINT (target_idx) + 1];
3543 if (!(STRINGP (target)
3544 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3545 error ("Invalid %dth argument", XINT (target_idx) + 1);
3547 chain = ((EQ (operation, Qinsert_file_contents)
3548 || EQ (operation, Qwrite_region))
3549 ? Vfile_coding_system_alist
3550 : (EQ (operation, Qopen_network_stream)
3551 ? Vnetwork_coding_system_alist
3552 : Vprocess_coding_system_alist));
3553 if (NILP (chain))
3554 return Qnil;
3556 for (; CONSP (chain); chain = XCONS (chain)->cdr)
3558 Lisp_Object elt = XCONS (chain)->car;
3560 if (CONSP (elt)
3561 && ((STRINGP (target)
3562 && STRINGP (XCONS (elt)->car)
3563 && fast_string_match (XCONS (elt)->car, target) >= 0)
3564 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3566 val = XCONS (elt)->cdr;
3567 if (CONSP (val))
3568 return val;
3569 if (! SYMBOLP (val))
3570 return Qnil;
3571 if (! NILP (Fcoding_system_p (val)))
3572 return Fcons (val, val);
3573 if (!NILP (Fboundp (val)))
3574 return call1 (val, Flist (nargs, args));
3575 return Qnil;
3578 return Qnil;
3581 #endif /* emacs */
3584 /*** 8. Post-amble ***/
3586 init_coding_once ()
3588 int i;
3590 /* Emacs' internal format specific initialize routine. */
3591 for (i = 0; i <= 0x20; i++)
3592 emacs_code_class[i] = EMACS_control_code;
3593 emacs_code_class[0x0A] = EMACS_linefeed_code;
3594 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3595 for (i = 0x21 ; i < 0x7F; i++)
3596 emacs_code_class[i] = EMACS_ascii_code;
3597 emacs_code_class[0x7F] = EMACS_control_code;
3598 emacs_code_class[0x80] = EMACS_leading_code_composition;
3599 for (i = 0x81; i < 0xFF; i++)
3600 emacs_code_class[i] = EMACS_invalid_code;
3601 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3602 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3603 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3604 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3606 /* ISO2022 specific initialize routine. */
3607 for (i = 0; i < 0x20; i++)
3608 iso_code_class[i] = ISO_control_code;
3609 for (i = 0x21; i < 0x7F; i++)
3610 iso_code_class[i] = ISO_graphic_plane_0;
3611 for (i = 0x80; i < 0xA0; i++)
3612 iso_code_class[i] = ISO_control_code;
3613 for (i = 0xA1; i < 0xFF; i++)
3614 iso_code_class[i] = ISO_graphic_plane_1;
3615 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3616 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3617 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3618 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3619 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3620 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3621 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3622 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3623 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3624 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3626 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3627 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3629 setup_coding_system (Qnil, &keyboard_coding);
3630 setup_coding_system (Qnil, &terminal_coding);
3632 #if defined (MSDOS) || defined (WINDOWSNT)
3633 system_eol_type = CODING_EOL_CRLF;
3634 #else
3635 system_eol_type = CODING_EOL_LF;
3636 #endif
3639 #ifdef emacs
/* Register everything this file exposes to Lisp: intern and staticpro
   the symbols used above, attach `target-idx' properties to the I/O
   primitives' names, define the `coding-system-error' condition, number
   the coding categories, register the primitives with defsubr, and
   define the Lisp-visible variables with their initial values.  */
3641 syms_of_coding ()
/* `target-idx' is the property Ffind_operation_coding_system reads: the
   zero-based index of the TARGET among the arguments of the I/O
   primitive the property is put on.  */
3643 Qtarget_idx = intern ("target-idx");
3644 staticpro (&Qtarget_idx);
/* Qinsert_file_contents and Qwrite_region are not interned here;
   presumably they are set up by another file before this runs --
   TODO confirm.  */
3646 /* Target FILENAME is the first argument. */
3647 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3648 /* Target FILENAME is the third argument. */
3649 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3651 Qcall_process = intern ("call-process");
3652 staticpro (&Qcall_process);
3653 /* Target PROGRAM is the first argument. */
3654 Fput (Qcall_process, Qtarget_idx, make_number (0));
3656 Qcall_process_region = intern ("call-process-region");
3657 staticpro (&Qcall_process_region);
3658 /* Target PROGRAM is the third argument. */
3659 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3661 Qstart_process = intern ("start-process");
3662 staticpro (&Qstart_process);
3663 /* Target PROGRAM is the third argument. */
3664 Fput (Qstart_process, Qtarget_idx, make_number (2));
3666 Qopen_network_stream = intern ("open-network-stream");
3667 staticpro (&Qopen_network_stream);
3668 /* Target SERVICE is the fourth argument. */
3669 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
/* Plain symbols: each is interned and then protected with staticpro.  */
3671 Qcoding_system = intern ("coding-system");
3672 staticpro (&Qcoding_system);
3674 Qeol_type = intern ("eol-type");
3675 staticpro (&Qeol_type);
3677 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3678 staticpro (&Qbuffer_file_coding_system);
3680 Qpost_read_conversion = intern ("post-read-conversion");
3681 staticpro (&Qpost_read_conversion);
3683 Qpre_write_conversion = intern ("pre-write-conversion");
3684 staticpro (&Qpre_write_conversion);
3686 Qcoding_system_spec = intern ("coding-system-spec");
3687 staticpro (&Qcoding_system_spec);
3689 Qcoding_system_p = intern ("coding-system-p");
3690 staticpro (&Qcoding_system_p);
/* `coding-system-error' is the condition signaled by
   Fcheck_coding_system; give it its condition list (itself and
   `error') and its message.  */
3692 Qcoding_system_error = intern ("coding-system-error");
3693 staticpro (&Qcoding_system_error);
3695 Fput (Qcoding_system_error, Qerror_conditions,
3696 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3697 Fput (Qcoding_system_error, Qerror_message,
3698 build_string ("Invalid coding system"));
3700 Qcoding_category_index = intern ("coding-category-index");
3701 staticpro (&Qcoding_category_index);
/* Intern one symbol per coding category and record its table index in
   its `coding-category-index' property; Fdetect_coding_region turns
   that index into a bit of the detection mask.  */
3704 int i;
3705 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3707 coding_category_table[i] = intern (coding_category_name[i]);
3708 staticpro (&coding_category_table[i]);
3709 Fput (coding_category_table[i], Qcoding_category_index,
3710 make_number (i));
/* Symbols related to character unification tables.  The first one also
   gets a `char-table-extra-slots' property of 0.  */
3714 Qcharacter_unification_table = intern ("character-unification-table");
3715 staticpro (&Qcharacter_unification_table);
3716 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3717 make_number (0));
3719 Qcharacter_unification_table_for_decode
3720 = intern ("character-unification-table-for-decode");
3721 staticpro (&Qcharacter_unification_table_for_decode);
3723 Qcharacter_unification_table_for_encode
3724 = intern ("character-unification-table-for-encode");
3725 staticpro (&Qcharacter_unification_table_for_encode);
3727 Qemacs_mule = intern ("emacs-mule");
3728 staticpro (&Qemacs_mule);
/* Register the Lisp primitives defined in this file.  */
3730 defsubr (&Scoding_system_spec);
3731 defsubr (&Scoding_system_p);
3732 defsubr (&Sread_coding_system);
3733 defsubr (&Sread_non_nil_coding_system);
3734 defsubr (&Scheck_coding_system);
3735 defsubr (&Sdetect_coding_region);
3736 defsubr (&Sdecode_coding_region);
3737 defsubr (&Sencode_coding_region);
3738 defsubr (&Sdecode_coding_string);
3739 defsubr (&Sencode_coding_string);
3740 defsubr (&Sdecode_sjis_char);
3741 defsubr (&Sencode_sjis_char);
3742 defsubr (&Sdecode_big5_char);
3743 defsubr (&Sencode_big5_char);
3744 defsubr (&Sset_terminal_coding_system_internal);
3745 defsubr (&Sterminal_coding_system);
3746 defsubr (&Sset_keyboard_coding_system_internal);
3747 defsubr (&Skeyboard_coding_system);
3748 defsubr (&Sfind_operation_coding_system);
/* Lisp-visible variables and their initial values.  */
3750 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3751 "List of coding-categories (symbols) ordered by priority.");
/* The initial list is built back to front, so it ends up holding the
   category symbols in table-index order.  */
3753 int i;
3755 Vcoding_category_list = Qnil;
3756 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3757 Vcoding_category_list
3758 = Fcons (coding_category_table[i], Vcoding_category_list);
/* NOTE(review): the next two docstrings refer to `coding-system-alist',
   but no variable of that name is defined in this function -- confirm
   which alist is meant.  */
3761 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3762 "A variable of internal use only.\n\
3763 If the value is a coding system, it is used for decoding on read operation.\n\
3764 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3765 Vcoding_system_for_read = Qnil;
3767 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3768 "A variable of internal use only.\n\
3769 If the value is a coding system, it is used for encoding on write operation.\n\
3770 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3771 Vcoding_system_for_write = Qnil;
3773 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3774 "Coding-system used in the latest file or process I/O.");
3775 Vlast_coding_system_used = Qnil;
3777 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3778 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3779 inhibit_eol_conversion = 0;
/* The three alists searched by Ffind_operation_coding_system.  All
   start out empty.  */
3781 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3782 "Alist to decide a coding system to use for a file I/O operation.\n\
3783 The format is ((PATTERN . VAL) ...),\n\
3784 where PATTERN is a regular expression matching a file name,\n\
3785 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3786 If VAL is a coding system, it is used for both decoding and encoding\n\
3787 the file contents.\n\
3788 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3789 and the cdr part is used for encoding.\n\
3790 If VAL is a function symbol, the function must return a coding system\n\
3791 or a cons of coding systems which are used as above.\n\
3793 See also the function `find-operation-coding-system'.");
3794 Vfile_coding_system_alist = Qnil;
3796 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3797 "Alist to decide a coding system to use for a process I/O operation.\n\
3798 The format is ((PATTERN . VAL) ...),\n\
3799 where PATTERN is a regular expression matching a program name,\n\
3800 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3801 If VAL is a coding system, it is used for both decoding what received\n\
3802 from the program and encoding what sent to the program.\n\
3803 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3804 and the cdr part is used for encoding.\n\
3805 If VAL is a function symbol, the function must return a coding system\n\
3806 or a cons of coding systems which are used as above.\n\
3808 See also the function `find-operation-coding-system'.");
3809 Vprocess_coding_system_alist = Qnil;
3811 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3812 "Alist to decide a coding system to use for a network I/O operation.\n\
3813 The format is ((PATTERN . VAL) ...),\n\
3814 where PATTERN is a regular expression matching a network service name\n\
3815 or is a port number to connect to,\n\
3816 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3817 If VAL is a coding system, it is used for both decoding what received\n\
3818 from the network stream and encoding what sent to the network stream.\n\
3819 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3820 and the cdr part is used for encoding.\n\
3821 If VAL is a function symbol, the function must return a coding system\n\
3822 or a cons of coding systems which are used as above.\n\
3824 See also the function `find-operation-coding-system'.");
3825 Vnetwork_coding_system_alist = Qnil;
/* Mnemonic characters for the end-of-line formats.  Unix and undecided
   are both initialized to ':'.  */
3827 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3828 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3829 eol_mnemonic_unix = ':';
3831 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3832 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3833 eol_mnemonic_dos = '\\';
3835 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3836 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3837 eol_mnemonic_mac = '/';
3839 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3840 "Mnemonic character indicating end-of-line format is not yet decided.");
3841 eol_mnemonic_undecided = ':';
/* Character unification is enabled by default (Qt); both standard
   tables start out nil.  */
3843 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3844 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3845 Venable_character_unification = Qt;
3847 DEFVAR_LISP ("standard-character-unification-table-for-decode",
3848 &Vstandard_character_unification_table_for_decode,
3849 "Table for unifying characters when reading.");
3850 Vstandard_character_unification_table_for_decode = Qnil;
3852 DEFVAR_LISP ("standard-character-unification-table-for-encode",
3853 &Vstandard_character_unification_table_for_encode,
3854 "Table for unifying characters when writing.");
3855 Vstandard_character_unification_table_for_encode = Qnil;
/* NOTE(review): the Lisp name says "table" while the C variable and the
   docstring say "alist" -- confirm which name is intended.  */
3857 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3858 "Alist of charsets vs revision numbers.\n\
3859 While encoding, if a charset (car part of an element) is found,\n\
3860 designate it with the escape sequence identifing revision (cdr part of the element).");
3861 Vcharset_revision_alist = Qnil;
3863 DEFVAR_LISP ("default-process-coding-system",
3864 &Vdefault_process_coding_system,
3865 "Cons of coding systems used for process I/O by default.\n\
3866 The car part is used for decoding a process output,\n\
3867 the cdr part is used for encoding a text to be sent to a process.");
3868 Vdefault_process_coding_system = Qnil;
3871 #endif /* emacs */