(xmalloc, xrealloc, xfree): Define using POINTER_TYPE.
[emacs.git] / src / coding.c
blobd02c27dc478214bd46416cff9247e5ed40e0136f
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
24 0. General comments
25 1. Preamble
26 2. Emacs' internal format (emacs-mule) handlers
27 3. ISO2022 handlers
28 4. Shift-JIS and BIG5 handlers
29 5. CCL handlers
30 6. End-of-line handlers
31 7. C library functions
32 8. Emacs Lisp library functions
33 9. Post-amble */
37 /*** 0. General comments ***/
40 /*** GENERAL NOTE on CODING SYSTEM ***
42 Coding system is an encoding mechanism of one or more character
43 sets. Here's a list of coding systems which Emacs can handle. When
44 we say "decode", it means converting some other coding system to
45 Emacs' internal format (emacs-mule), and when we say "encode",
46 it means converting the coding system emacs-mule to some other
47 coding system.
49 0. Emacs' internal format (emacs-mule)
51 Emacs itself holds a multi-lingual character in a buffer and a string
52 in a special format. Details are described in section 2.
54 1. ISO2022
56 The most famous coding system for multiple character sets. X's
57 Compound Text, various EUCs (Extended Unix Code), and coding
58 systems used in Internet communication such as ISO-2022-JP are
59 all variants of ISO2022. Details are described in section 3.
61 2. SJIS (or Shift-JIS or MS-Kanji-Code)
63 A coding system to encode character sets: ASCII, JISX0201, and
64 JISX0208. Widely used for PC's in Japan. Details are described in
65 section 4.
67 3. BIG5
69 A coding system to encode character sets: ASCII and Big5. Widely
70 used by Chinese (mainly in Taiwan and Hong Kong). Details are
71 described in section 4. In this file, when we write "BIG5"
72 (all uppercase), we mean the coding system, and when we write
73 "Big5" (capitalized), we mean the character set.
75 4. Raw text
77 A coding system for a text containing random 8-bit code. Emacs does
78 no code conversion on such a text except for end-of-line format.
80 5. Other
82 If a user wants to read/write a text encoded in a coding system not
83 listed above, he can supply a decoder and an encoder for it in CCL
84 (Code Conversion Language) programs. Emacs executes the CCL program
85 while reading/writing.
87 Emacs represents a coding system by a Lisp symbol that has a property
88 `coding-system'. But, before actually using the coding system, the
89 information about it is set in a structure of type `struct
90 coding_system' for rapid processing. See section 6 for more details. */
94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
96 How the end-of-line of a text is encoded depends on the system. For
97 instance, Unix's format is just one byte of `line-feed' code,
98 whereas DOS's format is two-byte sequence of `carriage-return' and
99 `line-feed' codes. MacOS's format is usually one byte of
100 `carriage-return'.
102 Since text characters encoding and end-of-line encoding are
103 independent, any coding system described above can take
104 any format of end-of-line. So, Emacs has information of format of
105 end-of-line in each coding-system. See section 6 for more details. */
109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
111 These functions check if a text between SRC and SRC_END is encoded
112 in the coding system category XXX. Each returns an integer value in
113 which appropriate flag bits for the category XXX are set. The flag
114 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
115 template of these functions. */
116 #if 0
118 detect_coding_emacs_mule (src, src_end)
119 unsigned char *src, *src_end;
123 #endif
125 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
127 These functions decode SRC_BYTES length of unibyte text at SOURCE
128 encoded in CODING to Emacs' internal format. The resulting
129 multibyte text goes to a place pointed to by DESTINATION, the length
130 of which should not exceed DST_BYTES.
132 These functions set the information of original and decoded texts in
133 the members produced, produced_char, consumed, and consumed_char of
134 the structure *CODING. They also set the member result to one of
135 CODING_FINISH_XXX indicating how the decoding finished.
137 DST_BYTES zero means that source area and destination area are
138 overlapped, which means that we can produce a decoded text until it
139 reaches at the head of not-yet-decoded source text.
141 Below is a template of these functions. */
142 #if 0
143 static void
144 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
151 #endif
153 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
155 These functions encode SRC_BYTES length text at SOURCE of Emacs'
156 internal multibyte format to CODING. The resulting unibyte text
157 goes to a place pointed to by DESTINATION, the length of which
158 should not exceed DST_BYTES.
160 These functions set the information of original and encoded texts in
161 the members produced, produced_char, consumed, and consumed_char of
162 the structure *CODING. They also set the member result to one of
163 CODING_FINISH_XXX indicating how the encoding finished.
165 DST_BYTES zero means that source area and destination area are
166 overlapped, which means that we can produce an encoded text until it
167 reaches at the head of not-yet-encoded source text.
169 Below is a template of these functions. */
170 #if 0
171 static void
172 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
173 struct coding_system *coding;
174 unsigned char *source, *destination;
175 int src_bytes, dst_bytes;
179 #endif
181 /*** COMMONLY USED MACROS ***/
183 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
184 get one and two bytes from the source text respectively.
185 If there are not enough bytes in the source, they jump to
186 `label_end_of_loop'. The caller should set variables `coding',
187 `src' and `src_end' to appropriate pointer in advance. These
188 macros are called from decoding routines `decode_coding_XXX', thus
189 it is assumed that the source text is unibyte. */
/* Store the next source byte in C1 and advance `src'.  If `src' has
   already reached `src_end', set coding->result to
   CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'
   without touching C1.  */
#define ONE_MORE_BYTE(c1) \
  do { \
    if (src >= src_end) \
      { \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
        goto label_end_of_loop; \
      } \
    (c1) = *src++; \
  } while (0)
/* Store the next two source bytes in C1 and C2 and advance `src' by
   two.  If fewer than two bytes remain before `src_end', nothing is
   consumed: coding->result is set to CODING_FINISH_INSUFFICIENT_SRC
   and control jumps to `label_end_of_loop'.  The remaining length is
   computed as a pointer difference so that no pointer beyond
   `src_end' is ever formed.  */
#define TWO_MORE_BYTES(c1, c2) \
  do { \
    if (src_end - src < 2) \
      { \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
        goto label_end_of_loop; \
      } \
    (c1) = *src++; \
    (c2) = *src++; \
  } while (0)
213 /* Set C to the next character at the source text pointed by `src'.
214 If there are not enough characters in the source, jump to
215 `label_end_of_loop'. The caller should set variables `coding'
216 `src', `src_end', and `translation_table' to appropriate pointers
217 in advance. This macro is used in encoding routines
218 `encode_coding_XXX', thus it assumes that the source text is in
219 multibyte form except for 8-bit characters. 8-bit characters are
220 in multibyte form if coding->src_multibyte is nonzero, else they
221 are represented by a single byte. */
/* Fetch one character from `src' into C and advance `src' past it.
   When no bytes remain, set coding->result to
   CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'.
   A byte sequence is read as a multibyte character when
   coding->src_multibyte is nonzero or UNIBYTE_STR_AS_MULTIBYTE_P
   accepts it; otherwise a single byte is taken as is.  The character
   is then passed through `translation_table' unless that is nil.  */
#define ONE_MORE_CHAR(c) \
  do { \
    int len = src_end - src; \
    int bytes; \
    if (len <= 0) \
      { \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
        goto label_end_of_loop; \
      } \
    if (coding->src_multibyte \
        || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes)) \
      (c) = STRING_CHAR_AND_LENGTH (src, len, bytes); \
    else \
      { \
        (c) = *src; \
        bytes = 1; \
      } \
    if (!NILP (translation_table)) \
      (c) = translate_char (translation_table, (c), 0, 0, 0); \
    src += bytes; \
  } while (0)
243 /* Produce a multibyte form of character C to `dst'. Jump to
244 `label_end_of_loop' if there's not enough space at `dst'.
246 If we are now in the middle of composition sequence, the decoded
247 character may be ALTCHAR (for the current composition). In that
248 case, the character goes to coding->cmp_data->data instead of
249 `dst'.
251 This macro is used in decoding routines. */
/* The caller must have `coding', `src', `dst', `dst_end' and
   `dst_bytes' in scope.  When `dst_bytes' is zero the write limit is
   `src' instead of `dst_end' (source and destination overlap).

   C is written to `dst' unless a composition other than
   COMPOSITION_RELATIVE / COMPOSITION_WITH_RULE is in progress.
   Independently, while composing with any method other than
   COMPOSITION_RELATIVE, C is also recorded as a composition
   component, and coding->composition_rule_follows is set according
   to whether the method differs from COMPOSITION_WITH_ALTCHARS.  */
#define EMIT_CHAR(c) \
  do { \
    if (! COMPOSING_P (coding) \
        || coding->composing == COMPOSITION_RELATIVE \
        || coding->composing == COMPOSITION_WITH_RULE) \
      { \
        int bytes = CHAR_BYTES (c); \
        if ((dst + bytes) > (dst_bytes ? dst_end : src)) \
          { \
            coding->result = CODING_FINISH_INSUFFICIENT_DST; \
            goto label_end_of_loop; \
          } \
        dst += CHAR_STRING (c, dst); \
        coding->produced_char++; \
      } \
 \
    if (COMPOSING_P (coding) \
        && coding->composing != COMPOSITION_RELATIVE) \
      { \
        CODING_ADD_COMPOSITION_COMPONENT (coding, c); \
        coding->composition_rule_follows \
          = coding->composing != COMPOSITION_WITH_ALTCHARS; \
      } \
  } while (0)
/* Append the single byte C at `dst'.  The write limit is `dst_end',
   or `src' when `dst_bytes' is zero (overlapping buffers).  If `dst'
   has reached the limit, set coding->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'
   without writing.  */
#define EMIT_ONE_BYTE(c) \
  do { \
    if (dst >= (dst_bytes ? dst_end : src)) \
      { \
        coding->result = CODING_FINISH_INSUFFICIENT_DST; \
        goto label_end_of_loop; \
      } \
    *dst++ = (c); \
  } while (0)
/* Append the bytes C1 and C2, in that order, at `dst'.  The write
   limit is `dst_end', or `src' when `dst_bytes' is zero.  If the two
   bytes do not fit, nothing is written: coding->result is set to
   CODING_FINISH_INSUFFICIENT_DST and control jumps to
   `label_end_of_loop'.  */
#define EMIT_TWO_BYTES(c1, c2) \
  do { \
    if (dst + 2 > (dst_bytes ? dst_end : src)) \
      { \
        coding->result = CODING_FINISH_INSUFFICIENT_DST; \
        goto label_end_of_loop; \
      } \
    *dst++ = (c1); \
    *dst++ = (c2); \
  } while (0)
/* Copy the bytes in the range [FROM, TO) to `dst'.  FROM must be a
   modifiable pointer: it is advanced until it equals TO.  The write
   limit is `dst_end', or `src' when `dst_bytes' is zero.  If the
   whole range does not fit, nothing is copied: coding->result is set
   to CODING_FINISH_INSUFFICIENT_DST and control jumps to
   `label_end_of_loop'.  */
#define EMIT_BYTES(from, to) \
  do { \
    if (dst + ((to) - (from)) > (dst_bytes ? dst_end : src)) \
      { \
        coding->result = CODING_FINISH_INSUFFICIENT_DST; \
        goto label_end_of_loop; \
      } \
    while ((from) < (to)) \
      *dst++ = *(from)++; \
  } while (0)
311 /*** 1. Preamble ***/
313 #ifdef emacs
314 #include <config.h>
315 #endif
317 #include <stdio.h>
319 #ifdef emacs
321 #include "lisp.h"
322 #include "buffer.h"
323 #include "charset.h"
324 #include "composite.h"
325 #include "ccl.h"
326 #include "coding.h"
327 #include "window.h"
329 #else /* not emacs */
331 #include "mulelib.h"
333 #endif /* not emacs */
/* Lisp objects with the `Q' prefix.  NOTE(review): presumably these
   hold interned symbols set up during initialization, which is
   outside this part of the file -- confirm.  */
335 Lisp_Object Qcoding_system, Qeol_type;
336 Lisp_Object Qbuffer_file_coding_system;
337 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
338 Lisp_Object Qno_conversion, Qundecided;
339 Lisp_Object Qcoding_system_history;
340 Lisp_Object Qsafe_charsets;
341 Lisp_Object Qvalid_codes;
/* These two are only declared here; they are defined in another
   file.  */
343 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
344 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
345 Lisp_Object Qstart_process, Qopen_network_stream;
346 Lisp_Object Qtarget_idx;
/* NOTE(review): presumably a Lisp function called to choose a safe
   coding system; no use of it is visible in this part of the file --
   confirm.  */
348 Lisp_Object Vselect_safe_coding_system_function;
350 /* Mnemonic string for each format of end-of-line. */
351 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
352 /* Mnemonic string to indicate format of end-of-line is not yet
353 decided. */
354 Lisp_Object eol_mnemonic_undecided;
356 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
357 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
358 int system_eol_type;
/* The variables down to the matching #endif exist only when this
   file is compiled as part of Emacs (macro `emacs' defined).  */
360 #ifdef emacs
/* NOTE(review): presumably the list and alist of all defined coding
   systems; they are not populated in this part of the file --
   confirm.  */
362 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
364 Lisp_Object Qcoding_system_p, Qcoding_system_error;
366 /* Coding system emacs-mule and raw-text are for converting only
367 end-of-line format. */
368 Lisp_Object Qemacs_mule, Qraw_text;
370 /* Coding-systems are handed between Emacs Lisp programs and C internal
371 routines by the following three variables. */
372 /* Coding-system for reading files and receiving data from process. */
373 Lisp_Object Vcoding_system_for_read;
374 /* Coding-system for writing files and sending data to process. */
375 Lisp_Object Vcoding_system_for_write;
376 /* Coding-system actually used in the latest I/O. */
377 Lisp_Object Vlast_coding_system_used;
379 /* A vector of length 256 which contains information about special
380 Latin codes (especially for dealing with Microsoft codes). */
381 Lisp_Object Vlatin_extra_code_table;
383 /* Flag to inhibit code conversion of end-of-line format. */
384 int inhibit_eol_conversion;
386 /* Flag to make buffer-file-coding-system inherit from process-coding. */
387 int inherit_process_coding_system;
389 /* Coding system to be used to encode text for terminal display. */
390 struct coding_system terminal_coding;
392 /* Coding system to be used to encode text for terminal display when
393 terminal coding system is nil. */
394 struct coding_system safe_terminal_coding;
396 /* Coding system of what is sent from terminal keyboard. */
397 struct coding_system keyboard_coding;
399 /* Default coding system to be used to write a file. */
400 struct coding_system default_buffer_file_coding;
/* NOTE(review): judging by the names, alists that select a coding
   system for file, process and network I/O respectively; their
   format is not visible here -- confirm.  */
402 Lisp_Object Vfile_coding_system_alist;
403 Lisp_Object Vprocess_coding_system_alist;
404 Lisp_Object Vnetwork_coding_system_alist;
406 Lisp_Object Vlocale_coding_system;
408 #endif /* emacs */
410 Lisp_Object Qcoding_category, Qcoding_category_index;
412 /* List of symbols `coding-category-xxx' ordered by priority. */
413 Lisp_Object Vcoding_category_list;
415 /* Table of coding categories (Lisp symbols). */
416 Lisp_Object Vcoding_category_table;
418 /* Table of names of symbol for each coding-category. */
419 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
420 "coding-category-emacs-mule",
421 "coding-category-sjis",
422 "coding-category-iso-7",
423 "coding-category-iso-7-tight",
424 "coding-category-iso-8-1",
425 "coding-category-iso-8-2",
426 "coding-category-iso-7-else",
427 "coding-category-iso-8-else",
428 "coding-category-ccl",
429 "coding-category-big5",
430 "coding-category-utf-8",
431 "coding-category-utf-16-be",
432 "coding-category-utf-16-le",
433 "coding-category-raw-text",
434 "coding-category-binary"
437 /* Table of pointers to coding systems corresponding to each coding
438 categories. */
439 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
441 /* Table of coding category masks. Nth element is a mask for a coding
442 category of which priority is Nth. */
443 static
444 int coding_priorities[CODING_CATEGORY_IDX_MAX];
446 /* Flag to tell if we look up translation table on character code
447 conversion. */
448 Lisp_Object Venable_character_translation;
449 /* Standard translation table to look up on decoding (reading). */
450 Lisp_Object Vstandard_translation_table_for_decode;
451 /* Standard translation table to look up on encoding (writing). */
452 Lisp_Object Vstandard_translation_table_for_encode;
/* NOTE(review): presumably symbols naming translation-table
   properties; they are not used in this part of the file --
   confirm.  */
454 Lisp_Object Qtranslation_table;
455 Lisp_Object Qtranslation_table_id;
456 Lisp_Object Qtranslation_table_for_decode;
457 Lisp_Object Qtranslation_table_for_encode;
459 /* Alist of charsets vs revision number. */
460 Lisp_Object Vcharset_revision_alist;
462 /* Default coding systems used for process I/O. */
463 Lisp_Object Vdefault_process_coding_system;
465 /* Global flag to tell that we can't call post-read-conversion and
466 pre-write-conversion functions. Usually the value is zero, but it
467 is set to 1 temporarily while such functions are running. This is
468 to avoid infinite recursive call. */
469 static int inhibit_pre_post_conversion;
472 /*** 2. Emacs internal format (emacs-mule) handlers ***/
474 /* Emacs' internal format for encoding multiple character sets is a
475 kind of multi-byte encoding, i.e. characters are encoded by
476 variable-length sequences of one-byte codes.
478 ASCII characters and control characters (e.g. `tab', `newline') are
479 represented by one-byte sequences which are their ASCII codes, in
480 the range 0x00 through 0x7F.
482 8-bit characters of the range 0x80..0x9F are represented by
483 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
484 code + 0x20).
486 8-bit characters of the range 0xA0..0xFF are represented by
487 one-byte sequences which are their 8-bit code.
489 The other characters are represented by a sequence of `base
490 leading-code', optional `extended leading-code', and one or two
491 `position-code's. The length of the sequence is determined by the
492 base leading-code. Leading-code takes the range 0x80 through 0x9F,
493 whereas extended leading-code and position-code take the range 0xA0
494 through 0xFF. See `charset.h' for more details about leading-code
495 and position-code.
497 --- CODE RANGE of Emacs' internal format ---
498 character set range
499 ------------- -----
500 ascii 0x00..0x7F
501 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
502 eight-bit-graphic 0xA0..0xFF
503 ELSE 0x81..0x9F + [0xA0..0xFF]+
504 --------------------------------------------- */
/* A 256-entry table of `enum emacs_code_class_type'.
   NOTE(review): presumably indexed by a byte value to classify that
   byte for the emacs-mule format; it is neither filled in nor read
   in this part of the file -- confirm.  */
508 enum emacs_code_class_type emacs_code_class[256];
510 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
511 Check if a text is encoded in Emacs' internal format. If it is,
512 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
515 detect_coding_emacs_mule (src, src_end)
516 unsigned char *src, *src_end;
518 unsigned char c;
519 int composing = 0;
520 /* Dummy for ONE_MORE_BYTE. */
521 struct coding_system dummy_coding;
522 struct coding_system *coding = &dummy_coding;
524 while (1)
526 ONE_MORE_BYTE (c);
528 if (composing)
530 if (c < 0xA0)
531 composing = 0;
532 else if (c == 0xA0)
534 ONE_MORE_BYTE (c);
535 c &= 0x7F;
537 else
538 c -= 0x20;
541 if (c < 0x20)
543 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
544 return 0;
546 else if (c >= 0x80 && c < 0xA0)
548 if (c == 0x80)
549 /* Old leading code for a composite character. */
550 composing = 1;
551 else
553 unsigned char *src_base = src - 1;
554 int bytes;
556 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
557 bytes))
558 return 0;
559 src = src_base + bytes;
563 label_end_of_loop:
564 return CODING_CATEGORY_MASK_EMACS_MULE;
568 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
570 static void
571 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
572 struct coding_system *coding;
573 unsigned char *source, *destination;
574 int src_bytes, dst_bytes;
576 unsigned char *src = source;
577 unsigned char *src_end = source + src_bytes;
578 unsigned char *dst = destination;
579 unsigned char *dst_end = destination + dst_bytes;
580 /* SRC_BASE remembers the start position in source in each loop.
581 The loop will be exited when there's not enough source code, or
582 when there's not enough destination area to produce a
583 character. */
584 unsigned char *src_base;
586 coding->produced_char = 0;
587 while ((src_base = src) < src_end)
589 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
590 int bytes;
592 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
594 p = src;
595 src += bytes;
597 else
599 bytes = CHAR_STRING (*src, tmp);
600 p = tmp;
601 src++;
603 if (dst + bytes >= (dst_bytes ? dst_end : src))
605 coding->result = CODING_FINISH_INSUFFICIENT_DST;
606 break;
608 while (bytes--) *dst++ = *p++;
609 coding->produced_char++;
611 coding->consumed = coding->consumed_char = src_base - source;
612 coding->produced = dst - destination;
/* Encoding to emacs-mule is handed over to encode_eol with the same
   five arguments.  */
#define encode_coding_emacs_mule(cs, from, to, n_src, n_dst) \
  encode_eol (cs, from, to, n_src, n_dst)
620 /*** 3. ISO2022 handlers ***/
622 /* The following note describes the coding system ISO2022 briefly.
623 Since the intention of this note is to help understand the
624 functions in this file, some parts are NOT ACCURATE or OVERLY
625 SIMPLIFIED. For thorough understanding, please refer to the
626 original document of ISO2022.
628 ISO2022 provides many mechanisms to encode several character sets
629 in 7-bit and 8-bit environments. For 7-bit environments, all text
630 is encoded using bytes less than 128. This may make the encoded
631 text a little bit longer, but the text passes more easily through
632 several gateways, some of which strip off MSB (Most Significant Bit).
634 There are two kinds of character sets: control character set and
635 graphic character set. The former contains control characters such
636 as `newline' and `escape' to provide control functions (control
637 functions are also provided by escape sequences). The latter
638 contains graphic characters such as 'A' and '-'. Emacs recognizes
639 two control character sets and many graphic character sets.
641 Graphic character sets are classified into one of the following
642 four classes, according to the number of bytes (DIMENSION) and
643 number of characters in one dimension (CHARS) of the set:
644 - DIMENSION1_CHARS94
645 - DIMENSION1_CHARS96
646 - DIMENSION2_CHARS94
647 - DIMENSION2_CHARS96
649 In addition, each character set is assigned an identification tag,
650 unique for each set, called "final character" (denoted as <F>
651 hereafter). The <F> of each character set is decided by ECMA(*)
652 when it is registered in ISO. The code range of <F> is 0x30..0x7F
653 (0x30..0x3F are for private use only).
655 Note (*): ECMA = European Computer Manufacturers Association
657 Here are examples of graphic character set [NAME(<F>)]:
658 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
659 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
660 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
661 o DIMENSION2_CHARS96 -- none for the moment
663 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
664 C0 [0x00..0x1F] -- control character plane 0
665 GL [0x20..0x7F] -- graphic character plane 0
666 C1 [0x80..0x9F] -- control character plane 1
667 GR [0xA0..0xFF] -- graphic character plane 1
669 A control character set is directly designated and invoked to C0 or
670 C1 by an escape sequence. The most common case is that:
671 - ISO646's control character set is designated/invoked to C0, and
672 - ISO6429's control character set is designated/invoked to C1,
673 and usually these designations/invocations are omitted in encoded
674 text. In a 7-bit environment, only C0 can be used, and a control
675 character for C1 is encoded by an appropriate escape sequence to
676 fit into the environment. All control characters for C1 are
677 defined to have corresponding escape sequences.
679 A graphic character set is at first designated to one of four
680 graphic registers (G0 through G3), then these graphic registers are
681 invoked to GL or GR. These designations and invocations can be
682 done independently. The most common case is that G0 is invoked to
683 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
684 these invocations and designations are omitted in encoded text.
685 In a 7-bit environment, only GL can be used.
687 When a graphic character set of CHARS94 is invoked to GL, codes
688 0x20 and 0x7F of the GL area work as control characters SPACE and
689 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
690 be used.
692 There are two ways of invocation: locking-shift and single-shift.
693 With locking-shift, the invocation lasts until the next different
694 invocation, whereas with single-shift, the invocation affects the
695 following character only and doesn't affect the locking-shift
696 state. Invocations are done by the following control characters or
697 escape sequences:
699 ----------------------------------------------------------------------
700 abbrev function cntrl escape seq description
701 ----------------------------------------------------------------------
702 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
703 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
704 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
705 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
706 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
707 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
708 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
709 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
710 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
711 ----------------------------------------------------------------------
712 (*) These are not used by any known coding system.
714 Control characters for these functions are defined by macros
715 ISO_CODE_XXX in `coding.h'.
717 Designations are done by the following escape sequences:
718 ----------------------------------------------------------------------
719 escape sequence description
720 ----------------------------------------------------------------------
721 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
722 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
723 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
724 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
725 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
726 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
727 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
728 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
729 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
730 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
731 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
732 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
733 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
734 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
735 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
736 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
737 ----------------------------------------------------------------------
739 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
740 of dimension 1, chars 94, and final character <F>, etc...
742 Note (*): Although these designations are not allowed in ISO2022,
743 Emacs accepts them on decoding, and produces them on encoding
744 CHARS96 character sets in a coding system which is characterized as
745 7-bit environment, non-locking-shift, and non-single-shift.
747 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
748 '(' can be omitted. We refer to this as "short-form" hereafter.
750 Now you may notice that there are a lot of ways for encoding the
751 same multilingual text in ISO2022. Actually, there exist many
752 coding systems such as Compound Text (used in X11's inter client
753 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
754 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
755 localized platforms), and all of these are variants of ISO2022.
757 In addition to the above, Emacs handles two more kinds of escape
758 sequences: ISO6429's direction specification and Emacs' private
759 sequence for specifying character composition.
761 ISO6429's direction specification takes the following form:
762 o CSI ']' -- end of the current direction
763 o CSI '0' ']' -- end of the current direction
764 o CSI '1' ']' -- start of left-to-right text
765 o CSI '2' ']' -- start of right-to-left text
766 The control character CSI (0x9B: control sequence introducer) is
767 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
769 Character composition specification takes the following form:
770 o ESC '0' -- start relative composition
771 o ESC '1' -- end composition
772 o ESC '2' -- start rule-base composition (*)
773 o ESC '3' -- start relative composition with alternate chars (**)
774 o ESC '4' -- start rule-base composition with alternate chars (**)
775 Since these are not standard escape sequences of any ISO standard,
776 the use of them for these meaning is restricted to Emacs only.
778 (*) This form is used only in Emacs 20.5 and the older versions,
779 but the newer versions can safely decode it.
780 (**) This form is used only in Emacs 21.1 and the newer versions,
781 and the older versions can't decode it.
783 Here's a list of example usages of these composition escape
784 sequences (categorized by `enum composition_method').
786 COMPOSITION_RELATIVE:
787 ESC 0 CHAR [ CHAR ] ESC 1
788 COMPOSITION_WITH_RULE:
789 ESC 2 CHAR [ RULE CHAR ] ESC 1
790 COMPOSITION_WITH_ALTCHARS:
791 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
792 COMPOSITION_WITH_RULE_ALTCHARS:
793 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
/* A 256-entry table of `enum iso_code_class_type'.
   NOTE(review): presumably indexed by a byte value to classify that
   byte for ISO2022 processing; it is neither filled in nor read in
   this part of the file -- confirm.  */
795 enum iso_code_class_type iso_code_class[256];
/* Nonzero if the coding system registered for category index IDX
   exists and either lists CHARSET in its safe_charsets table or has
   a requested designation for CHARSET.  */
#define CHARSET_OK(idx, charset) \
  (coding_system_table[idx] \
   && (coding_system_table[idx]->safe_charsets[charset] \
       || (CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION \
           != CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
                                                     charset))))
/* Nonzero if the coding system registered for category index IDX
   exists and its initial designation for graphic register 1 is
   non-negative.  The table entry is checked first, as CHARSET_OK
   does, so that an unset category yields 0 instead of being
   dereferenced.  */
#define SHIFT_OUT_OK(idx) \
  (coding_system_table[idx] \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) \
       >= 0))
807 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
808 Check if a text is encoded in ISO2022. If it is, return an
809 integer in which some of the following flag bits:
810 CODING_CATEGORY_MASK_ISO_7
811 CODING_CATEGORY_MASK_ISO_7_TIGHT
812 CODING_CATEGORY_MASK_ISO_8_1
813 CODING_CATEGORY_MASK_ISO_8_2
814 CODING_CATEGORY_MASK_ISO_7_ELSE
815 CODING_CATEGORY_MASK_ISO_8_ELSE
816 are set. If a code which should never appear in ISO2022 is found,
817 returns 0. */
820 detect_coding_iso2022 (src, src_end)
821 unsigned char *src, *src_end;
823 int mask = CODING_CATEGORY_MASK_ISO;
824 int mask_found = 0;
825 int reg[4], shift_out = 0, single_shifting = 0;
826 int c, c1, i, charset;
827 /* Dummy for ONE_MORE_BYTE. */
828 struct coding_system dummy_coding;
829 struct coding_system *coding = &dummy_coding;
831 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
832 while (mask && src < src_end)
834 ONE_MORE_BYTE (c);
835 switch (c)
837 case ISO_CODE_ESC:
838 single_shifting = 0;
839 ONE_MORE_BYTE (c);
840 if (c >= '(' && c <= '/')
842 /* Designation sequence for a charset of dimension 1. */
843 ONE_MORE_BYTE (c1);
844 if (c1 < ' ' || c1 >= 0x80
845 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
846 /* Invalid designation sequence. Just ignore. */
847 break;
848 reg[(c - '(') % 4] = charset;
850 else if (c == '$')
852 /* Designation sequence for a charset of dimension 2. */
853 ONE_MORE_BYTE (c);
854 if (c >= '@' && c <= 'B')
855 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
856 reg[0] = charset = iso_charset_table[1][0][c];
857 else if (c >= '(' && c <= '/')
859 ONE_MORE_BYTE (c1);
860 if (c1 < ' ' || c1 >= 0x80
861 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
862 /* Invalid designation sequence. Just ignore. */
863 break;
864 reg[(c - '(') % 4] = charset;
866 else
867 /* Invalid designation sequence. Just ignore. */
868 break;
870 else if (c == 'N' || c == 'O')
872 /* ESC <Fe> for SS2 or SS3. */
873 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
874 break;
876 else if (c >= '0' && c <= '4')
878 /* ESC <Fp> for start/end composition. */
879 mask_found |= CODING_CATEGORY_MASK_ISO;
880 break;
882 else
883 /* Invalid escape sequence. Just ignore. */
884 break;
886 /* We found a valid designation sequence for CHARSET. */
887 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
888 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
889 mask_found |= CODING_CATEGORY_MASK_ISO_7;
890 else
891 mask &= ~CODING_CATEGORY_MASK_ISO_7;
892 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
893 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
894 else
895 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
896 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
897 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
898 else
899 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
900 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
901 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
902 else
903 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
904 break;
906 case ISO_CODE_SO:
907 single_shifting = 0;
908 if (shift_out == 0
909 && (reg[1] >= 0
910 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
911 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
913 /* Locking shift out. */
914 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
915 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
917 break;
919 case ISO_CODE_SI:
920 single_shifting = 0;
921 if (shift_out == 1)
923 /* Locking shift in. */
924 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
925 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
927 break;
929 case ISO_CODE_CSI:
930 single_shifting = 0;
931 case ISO_CODE_SS2:
932 case ISO_CODE_SS3:
934 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
936 if (c != ISO_CODE_CSI)
938 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
939 & CODING_FLAG_ISO_SINGLE_SHIFT)
940 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
941 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
942 & CODING_FLAG_ISO_SINGLE_SHIFT)
943 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
944 single_shifting = 1;
946 if (VECTORP (Vlatin_extra_code_table)
947 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
949 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
950 & CODING_FLAG_ISO_LATIN_EXTRA)
951 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
952 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
953 & CODING_FLAG_ISO_LATIN_EXTRA)
954 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
956 mask &= newmask;
957 mask_found |= newmask;
959 break;
961 default:
962 if (c < 0x80)
964 single_shifting = 0;
965 break;
967 else if (c < 0xA0)
969 single_shifting = 0;
970 if (VECTORP (Vlatin_extra_code_table)
971 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
973 int newmask = 0;
975 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
976 & CODING_FLAG_ISO_LATIN_EXTRA)
977 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
978 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
979 & CODING_FLAG_ISO_LATIN_EXTRA)
980 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
981 mask &= newmask;
982 mask_found |= newmask;
984 else
985 return 0;
987 else
989 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
990 | CODING_CATEGORY_MASK_ISO_7_ELSE);
991 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
992 /* Check the length of succeeding codes of the range
993 0xA0..0FF. If the byte length is odd, we exclude
994 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
995 when we are not single shifting. */
996 if (!single_shifting
997 && mask & CODING_CATEGORY_MASK_ISO_8_2)
999 int i = 1;
1000 while (src < src_end)
1002 ONE_MORE_BYTE (c);
1003 if (c < 0xA0)
1004 break;
1005 i++;
1008 if (i & 1 && src < src_end)
1009 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1010 else
1011 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1014 break;
1017 label_end_of_loop:
1018 return (mask & mask_found);
/* Build the character code for the character of CHARSET whose 1st
   position code is C1 and whose 2nd position code is C2.  When the
   variable `translation_table' is non-nil the character is produced
   by translate_char using that table; otherwise it is built directly
   with MAKE_CHAR.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
  (!NILP (translation_table)                                    \
   ? translate_char (translation_table, -1, charset, c1, c2)    \
   : MAKE_CHAR (charset, c1, c2))
/* Set designation state into CODING: designate the charset identified
   by DIMENSION, CHARS and FINAL_CHAR to graphic register REG.

   Control jumps to `label_invalid_code' when FINAL_CHAR is outside
   '0'..127, when no acceptable charset is found for REG, or when ASCII
   is designated to register 0 right after an invalid designation to
   register 0.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)               \
  do {                                                                      \
    int cs_;                                                                \
    int after_invalid_g0_;                                                  \
                                                                            \
    if (!(final_char >= '0' && final_char < 128))                           \
      goto label_invalid_code;                                              \
    cs_ = ISO_CHARSET_TABLE (make_number (dimension),                       \
                             make_number (chars),                           \
                             make_number (final_char));                     \
    if (cs_ < 0                                                             \
        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, cs_) != reg      \
            && !coding->safe_charsets[cs_]))                                \
      {                                                                     \
        /* Remember which register the rejected designation targeted.  */  \
        coding->spec.iso2022.last_invalid_designation_register = reg;       \
        goto label_invalid_code;                                            \
      }                                                                     \
    after_invalid_g0_                                                       \
      = (coding->spec.iso2022.last_invalid_designation_register == 0);      \
    coding->spec.iso2022.last_invalid_designation_register = -1;            \
    if (after_invalid_g0_ && reg == 0 && cs_ == CHARSET_ASCII)              \
      /* We should insert this designation sequence as is so that it       \
         is surely written back to a file.  */                              \
      goto label_invalid_code;                                              \
    if ((coding->mode & CODING_MODE_DIRECTION)                              \
        && CHARSET_REVERSE_CHARSET (cs_) >= 0)                              \
      cs_ = CHARSET_REVERSE_CHARSET (cs_);                                  \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = cs_;                        \
  } while (0)
1067 /* Allocate a memory block for storing information about compositions.
1068 The block is chained to the already allocated blocks. */
1070 void
1071 coding_allocate_composition_data (coding, char_offset)
1072 struct coding_system *coding;
1073 int char_offset;
1075 struct composition_data *cmp_data
1076 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1078 cmp_data->char_offset = char_offset;
1079 cmp_data->used = 0;
1080 cmp_data->prev = coding->cmp_data;
1081 cmp_data->next = NULL;
1082 if (coding->cmp_data)
1083 coding->cmp_data->next = cmp_data;
1084 coding->cmp_data = cmp_data;
1085 coding->cmp_data_start = 0;
/* Open a new composition record in CODING's current composition data
   block, storing the starting position START and the METHOD.  Slot 0
   is set to -1 and slot 2 is left untouched; four slots are reserved
   in total.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)     \
  do {                                                          \
    struct composition_data *blk_ = coding->cmp_data;           \
    int first_ = blk_->used;                                    \
    coding->cmp_data_start = first_;                            \
    blk_->data[first_] = -1;                                    \
    blk_->data[first_ + 1] = blk_->char_offset + start;         \
    blk_->data[first_ + 3] = (int) method;                      \
    blk_->used = first_ + 4;                                    \
  } while (0)
/* Close the current composition record: store the number of slots it
   occupies in slot 0 and the ending position END in slot 2.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                 \
  do {                                                          \
    struct composition_data *blk_ = coding->cmp_data;           \
    int head_ = coding->cmp_data_start;                         \
    blk_->data[head_] = blk_->used - head_;                     \
    blk_->data[head_ + 2] = blk_->char_offset + end;            \
  } while (0)
/* Append one COMPONENT (alternate character or composition rule) to
   CODING's current composition data block and advance its `used'
   count.  The value of the expression is COMPONENT as stored.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  ((coding)->cmp_data->data[(coding)->cmp_data->used++] = (component))
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte following ESC.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        /* Composition handling is off: copy the sequence through.  */    \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = c1 & 0x7f;                                                \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (COMPOSING_P (coding))                                         \
      {                                                                    \
        /* We are already handling a composition.  If the method is       \
           one of the following two, the codes following the current      \
           escape sequence are actual characters stored in a buffer.  */  \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* This is surely the start of a composition.  We must be sure    \
           that coding->cmp_data has enough space to store the            \
           information about the composition.  If not, terminate the      \
           current decoding loop, allocate one more memory block for      \
           coding->cmp_data in the caller, then start the decoding        \
           loop again.  We can't allocate memory here directly because    \
           it may cause buffer/string relocation.  */                     \
        if (!coding->cmp_data                                              \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        if (c1 == '0')                                                     \
          coding->composing = COMPOSITION_RELATIVE;                        \
        else if (c1 == '2')                                                \
          coding->composing = COMPOSITION_WITH_RULE;                       \
        else if (c1 == '3')                                                \
          coding->composing = COMPOSITION_WITH_ALTCHARS;                   \
        else                                                               \
          coding->composing = COMPOSITION_WITH_RULE_ALTCHARS;              \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
  } while (0)
/* Handle composition end sequence ESC 1.  C1 is the byte following
   ESC.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (coding->composing != COMPOSITION_DISABLED)                      \
      {                                                                 \
        /* Close the composition record at the current position.  */   \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Composition handling is off: copy the sequence through.  */ \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = c1;                                                    \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
/* Decode a composition rule from the byte C1 (and maybe one more byte
   read from SRC into C2) and store one encoded composition rule in
   coding->cmp_data.  C1 is modified: 32 is subtracted from it.  When
   the adjusted C1 is 93 or more, a rule value of 0 is stored.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int encoded_ = 0;                                                   \
                                                                        \
    (c1) -= 32;                                                         \
    if ((c1) < 81)              /* old format (before ver.21) */        \
      {                                                                 \
        int global_ref_ = (c1) / 9;                                     \
        int new_ref_ = (c1) % 9;                                        \
                                                                        \
        global_ref_ = (global_ref_ == 4 ? 10 : global_ref_);            \
        new_ref_ = (new_ref_ == 4 ? 10 : new_ref_);                     \
        encoded_ = COMPOSITION_ENCODE_RULE (global_ref_, new_ref_);     \
      }                                                                 \
    else if ((c1) < 93)         /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        encoded_ = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);          \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, encoded_);                \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1207 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1209 static void
1210 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1211 struct coding_system *coding;
1212 unsigned char *source, *destination;
1213 int src_bytes, dst_bytes;
1215 unsigned char *src = source;
1216 unsigned char *src_end = source + src_bytes;
1217 unsigned char *dst = destination;
1218 unsigned char *dst_end = destination + dst_bytes;
1219 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1220 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1221 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1222 /* SRC_BASE remembers the start position in source in each loop.
1223 The loop will be exited when there's not enough source code
1224 (within macro ONE_MORE_BYTE), or when there's not enough
1225 destination area to produce a character (within macro
1226 EMIT_CHAR). */
1227 unsigned char *src_base;
1228 int c, charset;
1229 Lisp_Object translation_table;
1231 if (NILP (Venable_character_translation))
1232 translation_table = Qnil;
1233 else
1235 translation_table = coding->translation_table_for_decode;
1236 if (NILP (translation_table))
1237 translation_table = Vstandard_translation_table_for_decode;
1240 coding->result = CODING_FINISH_NORMAL;
1242 while (1)
1244 int c1, c2;
1246 src_base = src;
1247 ONE_MORE_BYTE (c1);
1249 /* We produce no character or one character. */
1250 switch (iso_code_class [c1])
1252 case ISO_0x20_or_0x7F:
1253 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1255 DECODE_COMPOSITION_RULE (c1);
1256 continue;
1258 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1260 /* This is SPACE or DEL. */
1261 charset = CHARSET_ASCII;
1262 break;
1264 /* This is a graphic character, we fall down ... */
1266 case ISO_graphic_plane_0:
1267 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1269 DECODE_COMPOSITION_RULE (c1);
1270 continue;
1272 charset = charset0;
1273 break;
1275 case ISO_0xA0_or_0xFF:
1276 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1277 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1278 goto label_invalid_code;
1279 /* This is a graphic character, we fall down ... */
1281 case ISO_graphic_plane_1:
1282 if (charset1 < 0)
1283 goto label_invalid_code;
1284 charset = charset1;
1285 break;
1287 case ISO_control_0:
1288 if (COMPOSING_P (coding))
1289 DECODE_COMPOSITION_END ('1');
1291 /* All ISO2022 control characters in this class have the
1292 same representation in Emacs internal format. */
1293 if (c1 == '\n'
1294 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1295 && (coding->eol_type == CODING_EOL_CR
1296 || coding->eol_type == CODING_EOL_CRLF))
1298 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1299 goto label_end_of_loop;
1301 charset = CHARSET_ASCII;
1302 break;
1304 case ISO_control_1:
1305 if (COMPOSING_P (coding))
1306 DECODE_COMPOSITION_END ('1');
1307 goto label_invalid_code;
1309 case ISO_carriage_return:
1310 if (COMPOSING_P (coding))
1311 DECODE_COMPOSITION_END ('1');
1313 if (coding->eol_type == CODING_EOL_CR)
1314 c1 = '\n';
1315 else if (coding->eol_type == CODING_EOL_CRLF)
1317 ONE_MORE_BYTE (c1);
1318 if (c1 != ISO_CODE_LF)
1320 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1322 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1323 goto label_end_of_loop;
1325 src--;
1326 c1 = '\r';
1329 charset = CHARSET_ASCII;
1330 break;
1332 case ISO_shift_out:
1333 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1334 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1335 goto label_invalid_code;
1336 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1337 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1338 continue;
1340 case ISO_shift_in:
1341 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1342 goto label_invalid_code;
1343 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1344 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1345 continue;
1347 case ISO_single_shift_2_7:
1348 case ISO_single_shift_2:
1349 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1350 goto label_invalid_code;
1351 /* SS2 is handled as an escape sequence of ESC 'N' */
1352 c1 = 'N';
1353 goto label_escape_sequence;
1355 case ISO_single_shift_3:
1356 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1357 goto label_invalid_code;
1358 /* SS2 is handled as an escape sequence of ESC 'O' */
1359 c1 = 'O';
1360 goto label_escape_sequence;
1362 case ISO_control_sequence_introducer:
1363 /* CSI is handled as an escape sequence of ESC '[' ... */
1364 c1 = '[';
1365 goto label_escape_sequence;
1367 case ISO_escape:
1368 ONE_MORE_BYTE (c1);
1369 label_escape_sequence:
1370 /* Escape sequences handled by Emacs are invocation,
1371 designation, direction specification, and character
1372 composition specification. */
1373 switch (c1)
1375 case '&': /* revision of following character set */
1376 ONE_MORE_BYTE (c1);
1377 if (!(c1 >= '@' && c1 <= '~'))
1378 goto label_invalid_code;
1379 ONE_MORE_BYTE (c1);
1380 if (c1 != ISO_CODE_ESC)
1381 goto label_invalid_code;
1382 ONE_MORE_BYTE (c1);
1383 goto label_escape_sequence;
1385 case '$': /* designation of 2-byte character set */
1386 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1387 goto label_invalid_code;
1388 ONE_MORE_BYTE (c1);
1389 if (c1 >= '@' && c1 <= 'B')
1390 { /* designation of JISX0208.1978, GB2312.1980,
1391 or JISX0208.1980 */
1392 DECODE_DESIGNATION (0, 2, 94, c1);
1394 else if (c1 >= 0x28 && c1 <= 0x2B)
1395 { /* designation of DIMENSION2_CHARS94 character set */
1396 ONE_MORE_BYTE (c2);
1397 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1399 else if (c1 >= 0x2C && c1 <= 0x2F)
1400 { /* designation of DIMENSION2_CHARS96 character set */
1401 ONE_MORE_BYTE (c2);
1402 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1404 else
1405 goto label_invalid_code;
1406 /* We must update these variables now. */
1407 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1408 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1409 continue;
1411 case 'n': /* invocation of locking-shift-2 */
1412 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1413 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1414 goto label_invalid_code;
1415 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1416 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1417 continue;
1419 case 'o': /* invocation of locking-shift-3 */
1420 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1421 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1422 goto label_invalid_code;
1423 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1424 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1425 continue;
1427 case 'N': /* invocation of single-shift-2 */
1428 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1429 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1430 goto label_invalid_code;
1431 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1432 ONE_MORE_BYTE (c1);
1433 break;
1435 case 'O': /* invocation of single-shift-3 */
1436 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1437 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1438 goto label_invalid_code;
1439 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1440 ONE_MORE_BYTE (c1);
1441 break;
1443 case '0': case '2': case '3': case '4': /* start composition */
1444 DECODE_COMPOSITION_START (c1);
1445 continue;
1447 case '1': /* end composition */
1448 DECODE_COMPOSITION_END (c1);
1449 continue;
1451 case '[': /* specification of direction */
1452 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1453 goto label_invalid_code;
1454 /* For the moment, nested direction is not supported.
1455 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1456 left-to-right, and nozero means right-to-left. */
1457 ONE_MORE_BYTE (c1);
1458 switch (c1)
1460 case ']': /* end of the current direction */
1461 coding->mode &= ~CODING_MODE_DIRECTION;
1463 case '0': /* end of the current direction */
1464 case '1': /* start of left-to-right direction */
1465 ONE_MORE_BYTE (c1);
1466 if (c1 == ']')
1467 coding->mode &= ~CODING_MODE_DIRECTION;
1468 else
1469 goto label_invalid_code;
1470 break;
1472 case '2': /* start of right-to-left direction */
1473 ONE_MORE_BYTE (c1);
1474 if (c1 == ']')
1475 coding->mode |= CODING_MODE_DIRECTION;
1476 else
1477 goto label_invalid_code;
1478 break;
1480 default:
1481 goto label_invalid_code;
1483 continue;
1485 default:
1486 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1487 goto label_invalid_code;
1488 if (c1 >= 0x28 && c1 <= 0x2B)
1489 { /* designation of DIMENSION1_CHARS94 character set */
1490 ONE_MORE_BYTE (c2);
1491 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1493 else if (c1 >= 0x2C && c1 <= 0x2F)
1494 { /* designation of DIMENSION1_CHARS96 character set */
1495 ONE_MORE_BYTE (c2);
1496 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1498 else
1499 goto label_invalid_code;
1500 /* We must update these variables now. */
1501 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1502 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1503 continue;
1507 /* Now we know CHARSET and 1st position code C1 of a character.
1508 Produce a multibyte sequence for that character while getting
1509 2nd position code C2 if necessary. */
1510 if (CHARSET_DIMENSION (charset) == 2)
1512 ONE_MORE_BYTE (c2);
1513 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1514 /* C2 is not in a valid range. */
1515 goto label_invalid_code;
1517 c = DECODE_ISO_CHARACTER (charset, c1, c2);
1518 EMIT_CHAR (c);
1519 continue;
1521 label_invalid_code:
1522 coding->errors++;
1523 if (COMPOSING_P (coding))
1524 DECODE_COMPOSITION_END ('1');
1525 src = src_base;
1526 c = *src++;
1527 EMIT_CHAR (c);
1530 label_end_of_loop:
1531 coding->consumed = coding->consumed_char = src_base - source;
1532 coding->produced = dst - destination;
1533 return;
1537 /* ISO2022 encoding stuff. */
/* Saying just "ISO2022" is not enough when encoding; more details
   have to be specified.  In Emacs, each coding system of the ISO2022
   variant has the following specifications:
	1. Initial designation to G0 thru G3.
	2. Allows short-form designation?
	3. ASCII should be designated to G0 before control characters?
	4. ASCII should be designated to G0 at end of line?
	5. 7-bit environment or 8-bit environment?
	6. Use locking-shift?
	7. Use Single-shift?
   And the following two are only for Japanese:
	8. Use ASCII in place of JIS0201-1976-Roman?
	9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in `coding->flags' as flag bits
   defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
   details.  */
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG at DST, and increment DST.  A revision sequence
   (ESC & <rev>) is produced first when the revision number of CHARSET
   in CODING is less than 255.  If CHARSET is a 2-byte 94-char set
   designated to register 0, its <final-char> is '@', 'A', or 'B', and
   the coding system CODING allows, the short-form designation (without
   the intermediate character) is produced.  The designation is also
   recorded in CODING.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    int revision_ = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);  \
    unsigned char final_byte_ = CHARSET_ISO_FINAL_CHAR (charset);       \
    int two_byte_ = CHARSET_DIMENSION (charset) != 1;                   \
    int is_94_ = CHARSET_CHARS (charset) == 94;                         \
    /* Intermediate characters, indexed by register number.  */         \
    char *inter_ = is_94_ ? "()*+" : ",-./";                            \
    int short_form_ = (two_byte_ && is_94_                              \
                       && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)  \
                       && reg == 0                                      \
                       && final_byte_ >= '@' && final_byte_ <= 'B');    \
                                                                        \
    if (revision_ < 255)                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + revision_;                                       \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (two_byte_)                                                      \
      *dst++ = '$';                                                     \
    if (!short_form_)                                                   \
      *dst++ = (unsigned char) (inter_[reg]);                           \
    *dst++ = final_byte_;                                               \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
/* The following two macros produce codes for ISO2022 single-shift
   functions (single-shift-2 and single-shift-3): an escape sequence
   when the coding system has CODING_FLAG_ISO_SEVEN_BITS set, the
   single control character otherwise.  Both also set the
   single-shifting state of CODING.  */

#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    if (!(coding->flags & CODING_FLAG_ISO_SEVEN_BITS))  \
      *dst++ = ISO_CODE_SS2;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'N';                                   \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    if (!(coding->flags & CODING_FLAG_ISO_SEVEN_BITS))  \
      *dst++ = ISO_CODE_SS3;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'O';                                   \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3), and record in
   CODING which graphic register is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
  } while (0)
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary: the body loops
   until one of the branches below has written the character.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* The single-shifting state is set; write the code and        \
           clear the state.  */                                         \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? ((c1) & 0x7F) : ((c1) | 0x80));                     \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    if (coding->flags & CODING_FLAG_ISO_SAFE                            \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* We should not encode this character, instead produce one    \
           or two `?'s.  */                                             \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* Since CHARSET is not yet invoked to any graphic planes, we      \
       must invoke it, or, at first, designate it to some graphic      \
       register.  Then repeat the loop to actually produce the         \
       character.  */                                                   \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary: the body
   loops until one of the branches below has written the character.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* The single-shifting state is set; write the codes and       \
           clear the state.  */                                         \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = (c1) & 0x7F;                                       \
            *dst++ = (c2) & 0x7F;                                       \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = (c1) | 0x80;                                       \
            *dst++ = (c2) | 0x80;                                       \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        *dst++ = (c2) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        *dst++ = (c2) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    if (coding->flags & CODING_FLAG_ISO_SAFE                            \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* We should not encode this character, instead produce one    \
           or two `?'s.  */                                             \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* Since CHARSET is not yet invoked to any graphic planes, we      \
       must invoke it, or, at first, designate it to some graphic      \
       register.  Then repeat the loop to actually produce the         \
       character.  */                                                   \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
/* Produce codes for the character of CHARSET with position codes C1
   and C2.  If CHARSET is not defined, C1 (and C2 when it is not
   negative) are written as they are.  Otherwise the character is
   encoded through the DIMENSION1 or DIMENSION2 macro, after replacing
   ASCII by charset_latin_jisx0201 under CODING_FLAG_ISO_USE_ROMAN and
   charset_jisx0208 by charset_jisx0208_1978 under
   CODING_FLAG_ISO_USE_OLDJIS.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    int effective_ = charset;                                           \
                                                                        \
    if (!CHARSET_DEFINED_P (charset))                                   \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) == 1)                          \
      {                                                                 \
        if (charset == CHARSET_ASCII                                    \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN)               \
          effective_ = charset_latin_jisx0201;                          \
        ENCODE_ISO_CHARACTER_DIMENSION1 (effective_, c1);               \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if (charset == charset_jisx0208                                 \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)              \
          effective_ = charset_jisx0208_1978;                           \
        ENCODE_ISO_CHARACTER_DIMENSION2 (effective_, c1, c2);           \
      }                                                                 \
  } while (0)
1766 /* Produce designation and invocation codes at a place pointed by DST
1767 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1768 Return new DST. */
1770 unsigned char *
1771 encode_invocation_designation (charset, coding, dst)
1772 int charset;
1773 struct coding_system *coding;
1774 unsigned char *dst;
1776 int reg; /* graphic register number */
1778 /* At first, check designations. */
1779 for (reg = 0; reg < 4; reg++)
1780 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1781 break;
1783 if (reg >= 4)
1785 /* CHARSET is not yet designated to any graphic registers. */
1786 /* At first check the requested designation. */
1787 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1788 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1789 /* Since CHARSET requests no special designation, designate it
1790 to graphic register 0. */
1791 reg = 0;
1793 ENCODE_DESIGNATION (charset, reg, coding);
1796 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1797 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1799 /* Since the graphic register REG is not invoked to any graphic
1800 planes, invoke it to graphic plane 0. */
1801 switch (reg)
1803 case 0: /* graphic register 0 */
1804 ENCODE_SHIFT_IN;
1805 break;
1807 case 1: /* graphic register 1 */
1808 ENCODE_SHIFT_OUT;
1809 break;
1811 case 2: /* graphic register 2 */
1812 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1813 ENCODE_SINGLE_SHIFT_2;
1814 else
1815 ENCODE_LOCKING_SHIFT_2;
1816 break;
1818 case 3: /* graphic register 3 */
1819 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1820 ENCODE_SINGLE_SHIFT_3;
1821 else
1822 ENCODE_LOCKING_SHIFT_3;
1823 break;
1827 return dst;
/* Produce 2-byte codes for encoded composition rule RULE: the first
   byte is 32 + 81 plus the first reference point extracted by
   COMPOSITION_DECODE_RULE, the second byte is 32 plus the second.  */

#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int global_ref_;                                            \
    int new_ref_;                                               \
                                                                \
    COMPOSITION_DECODE_RULE (rule, global_ref_, new_ref_);      \
    *dst++ = 32 + 81 + global_ref_;                             \
    *dst++ = 32 + new_ref_;                                     \
  } while (0)
/* Produce codes for indicating the start of a composition sequence
   (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
   which specify information about the composition; DATA[3] is the
   composition method.  See the comment in coding.h for the format of
   DATA.  For any method other than COMPOSITION_RELATIVE, the component
   index of CODING is set to the slot just after the 4-slot header.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
/* Produce codes (ESC 1) for indicating the end of the current
   composition, and advance CODING past the composition record whose
   length is DATA[0].  When that exhausts the current composition data
   block and another block follows, move on to the next block.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    *dst++ = ISO_CODE_ESC;                                              \
    *dst++ = '1';                                                       \
    coding->composing = COMPOSITION_NO;                                 \
    coding->cmp_data_start += data[0];                                  \
    if (coding->cmp_data_start == coding->cmp_data->used)               \
      {                                                                 \
        if (coding->cmp_data->next)                                     \
          {                                                             \
            coding->cmp_data = coding->cmp_data->next;                  \
            coding->cmp_data_start = 0;                                 \
          }                                                             \
      }                                                                 \
  } while (0)
/* Produce composition start sequence ESC 0.  Here, this sequence
   doesn't mean the start of a new composition but means that we have
   just produced components (alternate chars and composition rules) of
   the composition and the actual text follows in SRC.  CODING's
   composing state becomes COMPOSITION_RELATIVE.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    coding->composing = COMPOSITION_RELATIVE;   \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = '0';                               \
  } while (0)
/* The following three macros produce codes for indicating direction
   of text.  They write to DST and advance it, and read the flags of
   CODING.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes ESC '[' when the
   CODING_FLAG_ISO_SEVEN_BITS bit is set in coding->flags, and the
   single byte ISO_CODE_CSI otherwise.  The bit is tested with `&' as
   in the other encoding macros; comparing the whole flag word with
   `==' would select the 7-bit form only when no other flag is set.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R write the introducer
   followed by "2]" and "0]" respectively.  They are statement macros
   (do ... while (0)), so use them as `ENCODE_DIRECTION_R2L;'.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state: shift in when graphic plane 0
   does not have register 0 invoked, then re-designate every register
   whose current designation differs from its (non-negative) initial
   designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                 \
  do {                                                                  \
    int gr_ = 0;                                                        \
                                                                        \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      ENCODE_SHIFT_IN;                                                  \
    while (gr_ < 4)                                                     \
      {                                                                 \
        int initial_ = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, gr_); \
                                                                        \
        if (initial_ >= 0                                               \
            && CODING_SPEC_ISO_DESIGNATION (coding, gr_) != initial_)   \
          ENCODE_DESIGNATION (initial_, gr_, coding);                   \
        gr_++;                                                          \
      }                                                                 \
  } while (0)
1919 /* Produce designation sequences of charsets in the line started from
1920 SRC to a place pointed by DST, and return updated DST.
1922 If the current block ends before any end-of-line, we may fail to
1923 find all the necessary designations. */
1925 static unsigned char *
1926 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
1927 struct coding_system *coding;
1928 Lisp_Object translation_table;
1929 unsigned char *src, *src_end, *dst;
1931 int charset, c, found = 0, reg;
1932 /* Table of charsets to be designated to each graphic register. */
1933 int r[4];
1935 for (reg = 0; reg < 4; reg++)
1936 r[reg] = -1;
1938 while (found < 4)
1940 ONE_MORE_CHAR (c);
1941 if (c == '\n')
1942 break;
1944 charset = CHAR_CHARSET (c);
1945 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1946 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1948 found++;
1949 r[reg] = charset;
1953 label_end_of_loop:
1954 if (found)
1956 for (reg = 0; reg < 4; reg++)
1957 if (r[reg] >= 0
1958 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1959 ENCODE_DESIGNATION (r[reg], reg, coding);
1962 return dst;
1965 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
/* Encode the SRC_BYTES bytes at SOURCE into ISO 2022 form, storing the
   result at DESTINATION.

   On return:
     CODING->consumed       = SRC_BASE - SOURCE, i.e. the source bytes
                              consumed before the last iteration begun;
     CODING->consumed_char  = number of iterations that ran to the end
                              of the loop body;
     CODING->produced and
     CODING->produced_char  = number of bytes stored at DESTINATION;
     CODING->errors         = number of single-byte non-ASCII characters
                              copied through unencoded.
   CODING->result is set to CODING_FINISH_INSUFFICIENT_DST here when
   the destination is too close to full at the head of an iteration.
   NOTE(review): ONE_MORE_CHAR presumably also sets CODING->result and
   jumps to label_end_of_loop when the source runs out -- confirm
   against its definition.  */
1967 static void
1968 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1969 struct coding_system *coding;
1970 unsigned char *source, *destination;
1971 int src_bytes, dst_bytes;
1973 unsigned char *src = source;
1974 unsigned char *src_end = source + src_bytes;
1975 unsigned char *dst = destination;
1976 unsigned char *dst_end = destination + dst_bytes;
1977 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1978 from DST_END to assure overflow checking is necessary only at the
1979 head of loop. */
1980 unsigned char *adjusted_dst_end = dst_end - 19;
1981 /* SRC_BASE remembers the start position in source in each loop.
1982 The loop will be exited when there's not enough source text to
1983 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
1984 there's not enough destination area to produce encoded codes
1985 (within macro EMIT_BYTES). */
1986 unsigned char *src_base;
1987 int c;
1988 Lisp_Object translation_table;
/* Character translation is skipped entirely when
   Venable_character_translation is nil; otherwise the coding
   system's own table for encoding is used, falling back to the
   standard one when the coding system has none.  */
1990 if (NILP (Venable_character_translation))
1991 translation_table = Qnil;
1992 else
1994 translation_table = coding->translation_table_for_encode;
1995 if (NILP (translation_table))
1996 translation_table = Vstandard_translation_table_for_encode;
1999 coding->consumed_char = 0;
2000 coding->errors = 0;
2001 while (1)
2003 int charset, c1, c2;
2005 src_base = src;
/* When DST_BYTES is zero the limit is taken relative to SRC rather
   than to DST_END.  NOTE(review): presumably the case where source
   and destination share one buffer -- confirm with the callers.  */
2007 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2009 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2010 break;
2013 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2014 && CODING_SPEC_ISO_BOL (coding))
2016 /* We have to produce designation sequences if any now. */
2017 dst = encode_designation_at_bol (coding, translation_table,
2018 src, src_end, dst);
2019 CODING_SPEC_ISO_BOL (coding) = 0;
2022 /* Check composition start and end. */
2023 if (coding->composing != COMPOSITION_DISABLED
2024 && coding->cmp_data_start < coding->cmp_data->used)
2026 struct composition_data *cmp_data = coding->cmp_data;
2027 int *data = cmp_data->data + coding->cmp_data_start;
/* Position of the next character to encode, in the same units as
   the positions stored in DATA[1] and DATA[2] below.  */
2028 int this_pos = cmp_data->char_offset + coding->consumed_char;
2030 if (coding->composing == COMPOSITION_RELATIVE)
2032 if (this_pos == data[2])
2034 ENCODE_COMPOSITION_END (coding, data);
/* Reload both pointers after ENCODE_COMPOSITION_END.
   NOTE(review): presumably the macro may advance
   CODING->cmp_data / cmp_data_start -- confirm.  */
2035 cmp_data = coding->cmp_data;
2036 data = cmp_data->data + coding->cmp_data_start;
2039 else if (COMPOSING_P (coding))
2041 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2042 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2043 /* We have consumed components of the composition.
2044 What follows in SRC is the compositions's base
2045 text. */
2046 ENCODE_COMPOSITION_FAKE_START (coding);
2047 else
/* Encode the next component stored in the composition data; it is
   either a rule or a character, as tracked by
   composition_rule_follows.  No source text is read in this
   iteration.  */
2049 int c = cmp_data->data[coding->cmp_data_index++];
2050 if (coding->composition_rule_follows)
2052 ENCODE_COMPOSITION_RULE (c);
2053 coding->composition_rule_follows = 0;
2055 else
2057 SPLIT_CHAR (c, charset, c1, c2);
2058 ENCODE_ISO_CHARACTER (charset, c1, c2);
2059 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2060 coding->composition_rule_follows = 1;
2062 continue;
2065 if (!COMPOSING_P (coding))
2067 if (this_pos == data[1])
2069 ENCODE_COMPOSITION_START (coding, data);
2070 continue;
2075 ONE_MORE_CHAR (c);
2077 /* Now encode the character C. */
2078 if (c < 0x20 || c == 0x7F)
2080 if (c == '\r')
2082 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2084 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2085 ENCODE_RESET_PLANE_AND_REGISTER;
2086 *dst++ = c;
/* This `continue' skips the increment of CODING->consumed_char at
   the end of the loop body.  */
2087 continue;
2089 /* fall down to treat '\r' as '\n' ... */
2090 c = '\n';
2092 if (c == '\n')
2094 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2095 ENCODE_RESET_PLANE_AND_REGISTER;
/* With CODING_FLAG_ISO_INIT_AT_BOL, the recorded current
   designations are overwritten with the initial ones; no bytes are
   produced for this.  */
2096 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2097 bcopy (coding->spec.iso2022.initial_designation,
2098 coding->spec.iso2022.current_designation,
2099 sizeof coding->spec.iso2022.initial_designation);
/* Produce the end-of-line in the format selected by
   CODING->eol_type; an undecided type is written as LF.  */
2100 if (coding->eol_type == CODING_EOL_LF
2101 || coding->eol_type == CODING_EOL_UNDECIDED)
2102 *dst++ = ISO_CODE_LF;
2103 else if (coding->eol_type == CODING_EOL_CRLF)
2104 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2105 else
2106 *dst++ = ISO_CODE_CR;
2107 CODING_SPEC_ISO_BOL (coding) = 1;
2109 else
2111 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2112 ENCODE_RESET_PLANE_AND_REGISTER;
2113 *dst++ = c;
2116 else if (ASCII_BYTE_P (c))
2117 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c, /* dummy */ c1);
2118 else if (SINGLE_BYTE_CHAR_P (c))
/* A single-byte character that is not ASCII is copied as is and
   counted as an error.  */
2120 *dst++ = c;
2121 coding->errors++;
2123 else
2125 SPLIT_CHAR (c, charset, c1, c2);
2126 ENCODE_ISO_CHARACTER (charset, c1, c2);
2129 coding->consumed_char++;
2132 label_end_of_loop:
2133 coding->consumed = src_base - source;
2134 coding->produced = coding->produced_char = dst - destination;
2138 /*** 4. SJIS and BIG5 handlers ***/
2140 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2141 quite widely. So, for the moment, Emacs supports them in the bare
2142 C code. But, in the future, they may be supported only by CCL. */
2144 /* SJIS is a coding system encoding three character sets: ASCII, right
2145 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2146 as is. A character of charset katakana-jisx0201 is encoded by
2147 "position-code + 0x80". A character of charset japanese-jisx0208
2148 is encoded in 2-byte but two position-codes are divided and shifted
2149 so that they fit in the ranges below.
2151 --- CODE RANGE of SJIS ---
2152 (character set) (range)
2153 ASCII 0x00 .. 0x7F
2154 KATAKANA-JISX0201 0xA0 .. 0xDF
2155 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2156 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2157 -------------------------------
2161 /* BIG5 is a coding system encoding two character sets: ASCII and
2162 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2163 character set and is encoded in two-byte.
2165 --- CODE RANGE of BIG5 ---
2166 (character set) (range)
2167 ASCII 0x00 .. 0x7F
2168 Big5 (1st byte) 0xA1 .. 0xFE
2169 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2170 --------------------------
2172 Since the number of characters in Big5 is larger than maximum
2173 characters in Emacs' charset (96x96), it can't be handled as one
2174 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2175 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2176 contains frequently used characters and the latter contains less
2177 frequently used characters. */
2179 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2180 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2181 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2182 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
/* Number of Big5 characters which have the same code in 1st byte.
   The 2nd byte ranges over 0x40..0x7E and 0xA1..0xFE.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Decode the BIG5 byte pair B1, B2.  Set CHARSET to `charset_big5_1'
   (B1 below 0xC9) or `charset_big5_2' (otherwise), and set C1 and C2
   to the 1st and 2nd position-codes within that charset.

   B1 and B2 are read before C1 and C2 are written, so the same
   variables may be passed for both pairs.  Every parameter is
   parenthesized so that expression arguments are grouped as the
   caller wrote them; B1 and B2 are evaluated more than once.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Inverse of DECODE_BIG5: from CHARSET and its position-codes C1 and
   C2, compute the BIG5 byte pair B1, B2.

   C1 and C2 are read before B1 and B2 are written, so the same
   variables may be passed for both pairs.  B2 is evaluated more than
   once and must be an lvalue.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2212 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2213 Check if a text is encoded in SJIS. If it is, return
2214 CODING_CATEGORY_MASK_SJIS, else return 0. */
2217 detect_coding_sjis (src, src_end)
2218 unsigned char *src, *src_end;
2220 int c;
2221 /* Dummy for ONE_MORE_BYTE. */
2222 struct coding_system dummy_coding;
2223 struct coding_system *coding = &dummy_coding;
2225 while (1)
2227 ONE_MORE_BYTE (c);
2228 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2230 ONE_MORE_BYTE (c);
2231 if (c < 0x40)
2232 return 0;
2235 label_end_of_loop:
2236 return CODING_CATEGORY_MASK_SJIS;
2239 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2240 Check if a text is encoded in BIG5. If it is, return
2241 CODING_CATEGORY_MASK_BIG5, else return 0. */
2244 detect_coding_big5 (src, src_end)
2245 unsigned char *src, *src_end;
2247 int c;
2248 /* Dummy for ONE_MORE_BYTE. */
2249 struct coding_system dummy_coding;
2250 struct coding_system *coding = &dummy_coding;
2252 while (1)
2254 ONE_MORE_BYTE (c);
2255 if (c >= 0xA1)
2257 ONE_MORE_BYTE (c);
2258 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2259 return 0;
2262 label_end_of_loop:
2263 return CODING_CATEGORY_MASK_BIG5;
2266 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2267 Check if a text is encoded in UTF-8. If it is, return
2268 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2270 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
2271 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
2272 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2273 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2274 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2275 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2276 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2279 detect_coding_utf_8 (src, src_end)
2280 unsigned char *src, *src_end;
2282 unsigned char c;
2283 int seq_maybe_bytes;
2284 /* Dummy for ONE_MORE_BYTE. */
2285 struct coding_system dummy_coding;
2286 struct coding_system *coding = &dummy_coding;
2288 while (1)
2290 ONE_MORE_BYTE (c);
2291 if (UTF_8_1_OCTET_P (c))
2292 continue;
2293 else if (UTF_8_2_OCTET_LEADING_P (c))
2294 seq_maybe_bytes = 1;
2295 else if (UTF_8_3_OCTET_LEADING_P (c))
2296 seq_maybe_bytes = 2;
2297 else if (UTF_8_4_OCTET_LEADING_P (c))
2298 seq_maybe_bytes = 3;
2299 else if (UTF_8_5_OCTET_LEADING_P (c))
2300 seq_maybe_bytes = 4;
2301 else if (UTF_8_6_OCTET_LEADING_P (c))
2302 seq_maybe_bytes = 5;
2303 else
2304 return 0;
2308 ONE_MORE_BYTE (c);
2309 if (!UTF_8_EXTRA_OCTET_P (c))
2310 return 0;
2311 seq_maybe_bytes--;
2313 while (seq_maybe_bytes > 0);
2316 label_end_of_loop:
2317 return CODING_CATEGORY_MASK_UTF_8;
2320 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2321 Check if a text starts with a UTF-16 byte order mark. If the
2322 first two bytes are 0xFF 0xFE, return CODING_CATEGORY_MASK_UTF_16_LE;
2323 if they are 0xFE 0xFF, return CODING_CATEGORY_MASK_UTF_16_BE;
2324 else return 0. */
/* Nonzero if the 16-bit code unit VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high surrogate, 0xD800..0xDBFF.  The mask
   keeps the top six bits, which are what distinguish the two
   surrogate ranges from each other and from everything else.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2337 detect_coding_utf_16 (src, src_end)
2338 unsigned char *src, *src_end;
2340 unsigned char c1, c2;
2341 /* Dummy for TWO_MORE_BYTES. */
2342 struct coding_system dummy_coding;
2343 struct coding_system *coding = &dummy_coding;
2345 TWO_MORE_BYTES (c1, c2);
2347 if ((c1 == 0xFF) && (c2 == 0xFE))
2348 return CODING_CATEGORY_MASK_UTF_16_LE;
2349 else if ((c1 == 0xFE) && (c2 == 0xFF))
2350 return CODING_CATEGORY_MASK_UTF_16_BE;
2352 label_end_of_loop:
2353 return 0;
2356 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2357 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2359 static void
2360 decode_coding_sjis_big5 (coding, source, destination,
2361 src_bytes, dst_bytes, sjis_p)
2362 struct coding_system *coding;
2363 unsigned char *source, *destination;
2364 int src_bytes, dst_bytes;
2365 int sjis_p;
2367 unsigned char *src = source;
2368 unsigned char *src_end = source + src_bytes;
2369 unsigned char *dst = destination;
2370 unsigned char *dst_end = destination + dst_bytes;
2371 /* SRC_BASE remembers the start position in source in each loop.
2372 The loop will be exited when there's not enough source code
2373 (within macro ONE_MORE_BYTE), or when there's not enough
2374 destination area to produce a character (within macro
2375 EMIT_CHAR). */
2376 unsigned char *src_base;
2377 Lisp_Object translation_table;
2379 if (NILP (Venable_character_translation))
2380 translation_table = Qnil;
2381 else
2383 translation_table = coding->translation_table_for_decode;
2384 if (NILP (translation_table))
2385 translation_table = Vstandard_translation_table_for_decode;
2388 coding->produced_char = 0;
2389 while (1)
2391 int c, charset, c1, c2;
2393 src_base = src;
2394 ONE_MORE_BYTE (c1);
2396 if (c1 < 0x80)
2398 charset = CHARSET_ASCII;
2399 if (c1 < 0x20)
2401 if (c1 == '\r')
2403 if (coding->eol_type == CODING_EOL_CRLF)
2405 ONE_MORE_BYTE (c2);
2406 if (c2 == '\n')
2407 c1 = c2;
2408 else if (coding->mode
2409 & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2411 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2412 goto label_end_of_loop;
2414 else
2415 /* To process C2 again, SRC is subtracted by 1. */
2416 src--;
2418 else if (coding->eol_type == CODING_EOL_CR)
2419 c1 = '\n';
2421 else if (c1 == '\n'
2422 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2423 && (coding->eol_type == CODING_EOL_CR
2424 || coding->eol_type == CODING_EOL_CRLF))
2426 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2427 goto label_end_of_loop;
2431 else
2433 if (sjis_p)
2435 if (c1 >= 0xF0)
2436 goto label_invalid_code;
2437 if (c1 < 0xA0 || c1 >= 0xE0)
2439 /* SJIS -> JISX0208 */
2440 ONE_MORE_BYTE (c2);
2441 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2442 goto label_invalid_code;
2443 DECODE_SJIS (c1, c2, c1, c2);
2444 charset = charset_jisx0208;
2446 else
2447 /* SJIS -> JISX0201-Kana */
2448 charset = charset_katakana_jisx0201;
2450 else
2452 /* BIG5 -> Big5 */
2453 if (c1 < 0xA1 || c1 > 0xFE)
2454 goto label_invalid_code;
2455 ONE_MORE_BYTE (c2);
2456 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2457 goto label_invalid_code;
2458 DECODE_BIG5 (c1, c2, charset, c1, c2);
2462 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2463 EMIT_CHAR (c);
2464 continue;
2466 label_invalid_code:
2467 coding->errors++;
2468 src = src_base;
2469 c = *src++;
2470 EMIT_CHAR (c);
2473 label_end_of_loop:
2474 coding->consumed = coding->consumed_char = src_base - source;
2475 coding->produced = dst - destination;
2476 return;
2479 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2480 This function can encode charsets `ascii', `katakana-jisx0201',
2481 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
2482 are sure that all these charsets are registered as official charset
2483 (i.e. do not have extended leading-codes). Characters of other
2484 charsets are produced without any encoding. If SJIS_P is 1, encode
2485 SJIS text, else encode BIG5 text. */
2487 static void
2488 encode_coding_sjis_big5 (coding, source, destination,
2489 src_bytes, dst_bytes, sjis_p)
2490 struct coding_system *coding;
2491 unsigned char *source, *destination;
2492 int src_bytes, dst_bytes;
2493 int sjis_p;
2495 unsigned char *src = source;
2496 unsigned char *src_end = source + src_bytes;
2497 unsigned char *dst = destination;
2498 unsigned char *dst_end = destination + dst_bytes;
2499 /* SRC_BASE remembers the start position in source in each loop.
2500 The loop will be exited when there's not enough source text to
2501 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2502 there's not enough destination area to produce encoded codes
2503 (within macro EMIT_BYTES). */
2504 unsigned char *src_base;
2505 Lisp_Object translation_table;
2507 if (NILP (Venable_character_translation))
2508 translation_table = Qnil;
2509 else
2511 translation_table = coding->translation_table_for_decode;
2512 if (NILP (translation_table))
2513 translation_table = Vstandard_translation_table_for_decode;
2516 while (1)
2518 int c, charset, c1, c2;
2520 src_base = src;
2521 ONE_MORE_CHAR (c);
2523 /* Now encode the character C. */
2524 if (SINGLE_BYTE_CHAR_P (c))
2526 switch (c)
2528 case '\r':
2529 if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2531 EMIT_ONE_BYTE (c);
2532 break;
2534 c = '\n';
2535 case '\n':
2536 if (coding->eol_type == CODING_EOL_CRLF)
2538 EMIT_TWO_BYTES ('\r', c);
2539 break;
2541 else if (coding->eol_type == CODING_EOL_CR)
2542 c = '\r';
2543 default:
2544 EMIT_ONE_BYTE (c);
2547 else
2549 SPLIT_CHAR (c, charset, c1, c2);
2550 if (sjis_p)
2552 if (charset == charset_jisx0208
2553 || charset == charset_jisx0208_1978)
2555 ENCODE_SJIS (c1, c2, c1, c2);
2556 EMIT_TWO_BYTES (c1, c2);
2558 else if (charset == charset_latin_jisx0201)
2559 EMIT_ONE_BYTE (c1);
2560 else
2561 /* There's no way other than producing the internal
2562 codes as is. */
2563 EMIT_BYTES (src_base, src);
2565 else
2567 if (charset == charset_big5_1 || charset == charset_big5_2)
2569 ENCODE_BIG5 (charset, c1, c2, c1, c2);
2570 EMIT_TWO_BYTES (c1, c2);
2572 else
2573 /* There's no way other than producing the internal
2574 codes as is. */
2575 EMIT_BYTES (src_base, src);
2578 coding->consumed_char++;
2581 label_end_of_loop:
2582 coding->consumed = src_base - source;
2583 coding->produced = coding->produced_char = dst - destination;
2587 /*** 5. CCL handlers ***/
2589 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2590 Check if a text is encoded in a coding system of which
2591 encoder/decoder are written in CCL program. If it is, return
2592 CODING_CATEGORY_MASK_CCL, else return 0. */
2595 detect_coding_ccl (src, src_end)
2596 unsigned char *src, *src_end;
2598 unsigned char *valid;
2599 int c;
2600 /* Dummy for ONE_MORE_BYTE. */
2601 struct coding_system dummy_coding;
2602 struct coding_system *coding = &dummy_coding;
2604 /* No coding system is assigned to coding-category-ccl. */
2605 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2606 return 0;
2608 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2609 while (1)
2611 ONE_MORE_BYTE (c);
2612 if (! valid[c])
2613 return 0;
2615 label_end_of_loop:
2616 return CODING_CATEGORY_MASK_CCL;
2620 /*** 6. End-of-line handlers ***/
2622 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2624 static void
2625 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2626 struct coding_system *coding;
2627 unsigned char *source, *destination;
2628 int src_bytes, dst_bytes;
2630 unsigned char *src = source;
2631 unsigned char *dst = destination;
2632 unsigned char *src_end = src + src_bytes;
2633 unsigned char *dst_end = dst + dst_bytes;
2634 Lisp_Object translation_table;
2635 /* SRC_BASE remembers the start position in source in each loop.
2636 The loop will be exited when there's not enough source code
2637 (within macro ONE_MORE_BYTE), or when there's not enough
2638 destination area to produce a character (within macro
2639 EMIT_CHAR). */
2640 unsigned char *src_base;
2641 int c;
2643 translation_table = Qnil;
2644 switch (coding->eol_type)
2646 case CODING_EOL_CRLF:
2647 while (1)
2649 src_base = src;
2650 ONE_MORE_BYTE (c);
2651 if (c == '\r')
2653 ONE_MORE_BYTE (c);
2654 if (c != '\n')
2656 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2658 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2659 goto label_end_of_loop;
2661 src--;
2662 c = '\r';
2665 else if (c == '\n'
2666 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2668 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2669 goto label_end_of_loop;
2671 EMIT_CHAR (c);
2673 break;
2675 case CODING_EOL_CR:
2676 while (1)
2678 src_base = src;
2679 ONE_MORE_BYTE (c);
2680 if (c == '\n')
2682 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2684 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2685 goto label_end_of_loop;
2688 else if (c == '\r')
2689 c = '\n';
2690 EMIT_CHAR (c);
2692 break;
2694 default: /* no need for EOL handling */
2695 while (1)
2697 src_base = src;
2698 ONE_MORE_BYTE (c);
2699 EMIT_CHAR (c);
2703 label_end_of_loop:
2704 coding->consumed = coding->consumed_char = src_base - source;
2705 coding->produced = dst - destination;
2706 return;
2709 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2710 format of end-of-line according to `coding->eol_type'. It also
2711 convert multibyte form 8-bit characers to unibyte if
2712 CODING->src_multibyte is nonzero. If `coding->mode &
2713 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2714 also means end-of-line. */
2716 static void
2717 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2718 struct coding_system *coding;
2719 unsigned char *source, *destination;
2720 int src_bytes, dst_bytes;
2722 unsigned char *src = source;
2723 unsigned char *dst = destination;
2724 unsigned char *src_end = src + src_bytes;
2725 unsigned char *dst_end = dst + dst_bytes;
2726 Lisp_Object translation_table;
2727 /* SRC_BASE remembers the start position in source in each loop.
2728 The loop will be exited when there's not enough source text to
2729 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2730 there's not enough destination area to produce encoded codes
2731 (within macro EMIT_BYTES). */
2732 unsigned char *src_base;
2733 int c;
2734 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2736 translation_table = Qnil;
2737 if (coding->src_multibyte
2738 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2740 src_end--;
2741 src_bytes--;
2742 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2745 if (coding->eol_type == CODING_EOL_CRLF)
2747 while (src < src_end)
2749 src_base = src;
2750 c = *src++;
2751 if (c >= 0x20)
2752 EMIT_ONE_BYTE (c);
2753 else if (c == '\n' || (c == '\r' && selective_display))
2754 EMIT_TWO_BYTES ('\r', '\n');
2755 else
2756 EMIT_ONE_BYTE (c);
2758 src_base = src;
2759 label_end_of_loop:
2762 else
2764 if (src_bytes <= dst_bytes)
2766 safe_bcopy (src, dst, src_bytes);
2767 src_base = src_end;
2768 dst += src_bytes;
2770 else
2772 if (coding->src_multibyte
2773 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2774 dst_bytes--;
2775 safe_bcopy (src, dst, dst_bytes);
2776 src_base = src + dst_bytes;
2777 dst = destination + dst_bytes;
2778 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2780 if (coding->eol_type == CODING_EOL_CR)
2782 for (src = destination; src < dst; src++)
2783 if (*src == '\n') *src = '\r';
2785 else if (selective_display)
2787 for (src = destination; src < dst; src++)
2788 if (*src == '\r') *src = '\n';
2791 if (coding->src_multibyte)
2792 dst = destination + str_as_unibyte (destination, dst - destination);
2794 coding->consumed = src_base - source;
2795 coding->produced = dst - destination;
2799 /*** 7. C library functions ***/
2801 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2802 has a property `coding-system'. The value of this property is a
2803 vector of length 5 (called as coding-vector). Among elements of
2804 this vector, the first (element[0]) and the fifth (element[4])
2805 carry important information for decoding/encoding. Before
2806 decoding/encoding, this information should be set in fields of a
2807 structure of type `coding_system'.
2809 A value of property `coding-system' can be a symbol of another
2810 subsidiary coding-system. In that case, Emacs gets coding-vector
2811 from that symbol.
2813 `element[0]' contains information to be set in `coding->type'. The
2814 value and its meaning is as follows:
2816 0 -- coding_type_emacs_mule
2817 1 -- coding_type_sjis
2818 2 -- coding_type_iso2022
2819 3 -- coding_type_big5
2820 4 -- coding_type_ccl encoder/decoder written in CCL
2821 nil -- coding_type_no_conversion
2822 t -- coding_type_undecided (automatic conversion on decoding,
2823 no-conversion on encoding)
2825 `element[4]' contains information to be set in `coding->flags' and
2826 `coding->spec'. The meaning varies by `coding->type'.
2828 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2829 of length 32 (of which the first 17 sub-elements are used now).
2830 Meanings of these sub-elements are:
2832 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2833 If the value is an integer of valid charset, the charset is
2834 assumed to be designated to graphic register N initially.
2836 If the value is minus, it is a minus value of charset which
2837 reserves graphic register N, which means that the charset is
2838 not designated initially but should be designated to graphic
2839 register N just before encoding a character in that charset.
2841 If the value is nil, graphic register N is never used on
2842 encoding.
2844 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2845 Each value takes t or nil. See the section ISO2022 of
2846 `coding.h' for more information.
2848 If `coding->type' is `coding_type_big5', element[4] is t to denote
2849 BIG5-ETen or nil to denote BIG5-HKU.
2851 If `coding->type' takes the other value, element[4] is ignored.
2853 Emacs Lisp's coding system also carries information about format of
2854 end-of-line in a value of property `eol-type'. If the value is
2855 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2856 means CODING_EOL_CR. If it is not integer, it should be a vector
2857 of subsidiary coding systems of which property `eol-type' has one
2858 of above values.
2862 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2863 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2864 is setup so that no conversion is necessary and return -1, else
2865 return 0. */
2868 setup_coding_system (coding_system, coding)
2869 Lisp_Object coding_system;
2870 struct coding_system *coding;
2872 Lisp_Object coding_spec, coding_type, eol_type, plist;
2873 Lisp_Object val;
2874 int i;
2876 /* Initialize some fields required for all kinds of coding systems. */
2877 coding->symbol = coding_system;
2878 coding->common_flags = 0;
2879 coding->mode = 0;
2880 coding->heading_ascii = -1;
2881 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2882 coding->composing = COMPOSITION_DISABLED;
2883 coding->cmp_data = NULL;
2885 if (NILP (coding_system))
2886 goto label_invalid_coding_system;
2888 coding_spec = Fget (coding_system, Qcoding_system);
2890 if (!VECTORP (coding_spec)
2891 || XVECTOR (coding_spec)->size != 5
2892 || !CONSP (XVECTOR (coding_spec)->contents[3]))
2893 goto label_invalid_coding_system;
2895 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2896 if (VECTORP (eol_type))
2898 coding->eol_type = CODING_EOL_UNDECIDED;
2899 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2901 else if (XFASTINT (eol_type) == 1)
2903 coding->eol_type = CODING_EOL_CRLF;
2904 coding->common_flags
2905 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2907 else if (XFASTINT (eol_type) == 2)
2909 coding->eol_type = CODING_EOL_CR;
2910 coding->common_flags
2911 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2913 else
2914 coding->eol_type = CODING_EOL_LF;
2916 coding_type = XVECTOR (coding_spec)->contents[0];
2917 /* Try short cut. */
2918 if (SYMBOLP (coding_type))
2920 if (EQ (coding_type, Qt))
2922 coding->type = coding_type_undecided;
2923 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2925 else
2926 coding->type = coding_type_no_conversion;
2927 return 0;
2930 /* Get values of coding system properties:
2931 `post-read-conversion', `pre-write-conversion',
2932 `translation-table-for-decode', `translation-table-for-encode'. */
2933 plist = XVECTOR (coding_spec)->contents[3];
2934 /* Pre & post conversion functions should be disabled if
2935 inhibit_pre_post_conversion is nonzero. This is the case that a code
2936 conversion function is called while those functions are running. */
2937 if (! inhibit_pre_post_conversion)
2939 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2940 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2942 val = Fplist_get (plist, Qtranslation_table_for_decode);
2943 if (SYMBOLP (val))
2944 val = Fget (val, Qtranslation_table_for_decode);
2945 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2946 val = Fplist_get (plist, Qtranslation_table_for_encode);
2947 if (SYMBOLP (val))
2948 val = Fget (val, Qtranslation_table_for_encode);
2949 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2950 val = Fplist_get (plist, Qcoding_category);
2951 if (!NILP (val))
2953 val = Fget (val, Qcoding_category_index);
2954 if (INTEGERP (val))
2955 coding->category_idx = XINT (val);
2956 else
2957 goto label_invalid_coding_system;
2959 else
2960 goto label_invalid_coding_system;
2962 val = Fplist_get (plist, Qsafe_charsets);
2963 if (EQ (val, Qt))
2965 for (i = 0; i <= MAX_CHARSET; i++)
2966 coding->safe_charsets[i] = 1;
2968 else
2970 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2971 while (CONSP (val))
2973 if ((i = get_charset_id (XCAR (val))) >= 0)
2974 coding->safe_charsets[i] = 1;
2975 val = XCDR (val);
2979 /* If the coding system has non-nil `composition' property, enable
2980 composition handling. */
2981 val = Fplist_get (plist, Qcomposition);
2982 if (!NILP (val))
2983 coding->composing = COMPOSITION_NO;
2985 switch (XFASTINT (coding_type))
2987 case 0:
2988 coding->type = coding_type_emacs_mule;
2989 if (!NILP (coding->post_read_conversion))
2990 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2991 if (!NILP (coding->pre_write_conversion))
2992 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2993 break;
2995 case 1:
2996 coding->type = coding_type_sjis;
2997 coding->common_flags
2998 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2999 break;
3001 case 2:
3002 coding->type = coding_type_iso2022;
3003 coding->common_flags
3004 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3006 Lisp_Object val, temp;
3007 Lisp_Object *flags;
3008 int i, charset, reg_bits = 0;
3010 val = XVECTOR (coding_spec)->contents[4];
3012 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3013 goto label_invalid_coding_system;
3015 flags = XVECTOR (val)->contents;
3016 coding->flags
3017 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3018 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3019 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3020 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3021 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3022 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3023 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3024 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3025 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3026 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3027 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3028 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3029 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3032 /* Invoke graphic register 0 to plane 0. */
3033 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3034 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3035 CODING_SPEC_ISO_INVOCATION (coding, 1)
3036 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3037 /* Not single shifting at first. */
3038 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3039 /* Beginning of buffer should also be regarded as bol. */
3040 CODING_SPEC_ISO_BOL (coding) = 1;
3042 for (charset = 0; charset <= MAX_CHARSET; charset++)
3043 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3044 val = Vcharset_revision_alist;
3045 while (CONSP (val))
3047 charset = get_charset_id (Fcar_safe (XCAR (val)));
3048 if (charset >= 0
3049 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3050 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3051 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3052 val = XCDR (val);
3055 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3056 FLAGS[REG] can be one of below:
3057 integer CHARSET: CHARSET occupies register I,
3058 t: designate nothing to REG initially, but can be used
3059 by any charsets,
3060 list of integer, nil, or t: designate the first
3061 element (if integer) to REG initially, the remaining
3062 elements (if integer) is designated to REG on request,
3063 if an element is t, REG can be used by any charsets,
3064 nil: REG is never used. */
3065 for (charset = 0; charset <= MAX_CHARSET; charset++)
3066 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3067 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3068 for (i = 0; i < 4; i++)
3070 if (INTEGERP (flags[i])
3071 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3072 || (charset = get_charset_id (flags[i])) >= 0)
3074 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3075 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3077 else if (EQ (flags[i], Qt))
3079 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3080 reg_bits |= 1 << i;
3081 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3083 else if (CONSP (flags[i]))
3085 Lisp_Object tail;
3086 tail = flags[i];
3088 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3089 if (INTEGERP (XCAR (tail))
3090 && (charset = XINT (XCAR (tail)),
3091 CHARSET_VALID_P (charset))
3092 || (charset = get_charset_id (XCAR (tail))) >= 0)
3094 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3095 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3097 else
3098 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3099 tail = XCDR (tail);
3100 while (CONSP (tail))
3102 if (INTEGERP (XCAR (tail))
3103 && (charset = XINT (XCAR (tail)),
3104 CHARSET_VALID_P (charset))
3105 || (charset = get_charset_id (XCAR (tail))) >= 0)
3106 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3107 = i;
3108 else if (EQ (XCAR (tail), Qt))
3109 reg_bits |= 1 << i;
3110 tail = XCDR (tail);
3113 else
3114 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3116 CODING_SPEC_ISO_DESIGNATION (coding, i)
3117 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3120 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3122 /* REG 1 can be used only by locking shift in 7-bit env. */
3123 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3124 reg_bits &= ~2;
3125 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3126 /* Without any shifting, only REG 0 and 1 can be used. */
3127 reg_bits &= 3;
3130 if (reg_bits)
3131 for (charset = 0; charset <= MAX_CHARSET; charset++)
3133 if (CHARSET_VALID_P (charset))
3135 /* There exist some default graphic registers to be
3136 used CHARSET. */
3138 /* We had better avoid designating a charset of
3139 CHARS96 to REG 0 as far as possible. */
3140 if (CHARSET_CHARS (charset) == 96)
3141 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3142 = (reg_bits & 2
3143 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3144 else
3145 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3146 = (reg_bits & 1
3147 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3151 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3152 coding->spec.iso2022.last_invalid_designation_register = -1;
3153 break;
3155 case 3:
3156 coding->type = coding_type_big5;
3157 coding->common_flags
3158 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3159 coding->flags
3160 = (NILP (XVECTOR (coding_spec)->contents[4])
3161 ? CODING_FLAG_BIG5_HKU
3162 : CODING_FLAG_BIG5_ETEN);
3163 break;
3165 case 4:
3166 coding->type = coding_type_ccl;
3167 coding->common_flags
3168 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3170 val = XVECTOR (coding_spec)->contents[4];
3171 if (! CONSP (val)
3172 || setup_ccl_program (&(coding->spec.ccl.decoder),
3173 XCAR (val)) < 0
3174 || setup_ccl_program (&(coding->spec.ccl.encoder),
3175 XCDR (val)) < 0)
3176 goto label_invalid_coding_system;
3178 bzero (coding->spec.ccl.valid_codes, 256);
3179 val = Fplist_get (plist, Qvalid_codes);
3180 if (CONSP (val))
3182 Lisp_Object this;
3184 for (; CONSP (val); val = XCDR (val))
3186 this = XCAR (val);
3187 if (INTEGERP (this)
3188 && XINT (this) >= 0 && XINT (this) < 256)
3189 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3190 else if (CONSP (this)
3191 && INTEGERP (XCAR (this))
3192 && INTEGERP (XCDR (this)))
3194 int start = XINT (XCAR (this));
3195 int end = XINT (XCDR (this));
3197 if (start >= 0 && start <= end && end < 256)
3198 while (start <= end)
3199 coding->spec.ccl.valid_codes[start++] = 1;
3204 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3205 coding->spec.ccl.cr_carryover = 0;
3206 break;
3208 case 5:
3209 coding->type = coding_type_raw_text;
3210 break;
3212 default:
3213 goto label_invalid_coding_system;
3215 return 0;
3217 label_invalid_coding_system:
3218 coding->type = coding_type_no_conversion;
3219 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3220 coding->common_flags = 0;
3221 coding->eol_type = CODING_EOL_LF;
3222 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3223 return -1;
3226 /* Free memory blocks allocated for storing composition information. */
3228 void
3229 coding_free_composition_data (coding)
3230 struct coding_system *coding;
3232 struct composition_data *cmp_data = coding->cmp_data, *next;
3234 if (!cmp_data)
3235 return;
3236 /* Memory blocks are chained. At first, rewind to the first, then,
3237 free blocks one by one. */
3238 while (cmp_data->prev)
3239 cmp_data = cmp_data->prev;
3240 while (cmp_data)
3242 next = cmp_data->next;
3243 xfree (cmp_data);
3244 cmp_data = next;
3246 coding->cmp_data = NULL;
3249 /* Set `char_offset' member of all memory blocks pointed by
3250 coding->cmp_data to POS. */
3252 void
3253 coding_adjust_composition_offset (coding, pos)
3254 struct coding_system *coding;
3255 int pos;
3257 struct composition_data *cmp_data;
3259 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3260 cmp_data->char_offset = pos;
3263 /* Setup raw-text or one of its subsidiaries in the structure
3264 coding_system CODING according to the already setup value eol_type
3265 in CODING. CODING should be setup for some coding system in
3266 advance. */
3268 void
3269 setup_raw_text_coding_system (coding)
3270 struct coding_system *coding;
3272 if (coding->type != coding_type_raw_text)
3274 coding->symbol = Qraw_text;
3275 coding->type = coding_type_raw_text;
3276 if (coding->eol_type != CODING_EOL_UNDECIDED)
3278 Lisp_Object subsidiaries;
3279 subsidiaries = Fget (Qraw_text, Qeol_type);
3281 if (VECTORP (subsidiaries)
3282 && XVECTOR (subsidiaries)->size == 3)
3283 coding->symbol
3284 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3286 setup_coding_system (coding->symbol, coding);
3288 return;
3291 /* Emacs has a mechanism to automatically detect a coding system if it
3292 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3293 it's impossible to distinguish some coding systems accurately
3294 because they use the same range of codes. So, at first, coding
3295 systems are first grouped into the categories listed below:
3297 o coding-category-emacs-mule
3299 The category for a coding system which has the same code range
3300 as Emacs' internal format. Assigned the coding-system (Lisp
3301 symbol) `emacs-mule' by default.
3303 o coding-category-sjis
3305 The category for a coding system which has the same code range
3306 as SJIS. Assigned the coding-system (Lisp
3307 symbol) `japanese-shift-jis' by default.
3309 o coding-category-iso-7
3311 The category for a coding system which has the same code range
3312 as ISO2022 of 7-bit environment. This doesn't use any locking
3313 shift and single shift functions. This can encode/decode all
3314 charsets. Assigned the coding-system (Lisp symbol)
3315 `iso-2022-7bit' by default.
3317 o coding-category-iso-7-tight
3319 Same as coding-category-iso-7 except that this can
3320 encode/decode only the specified charsets.
3322 o coding-category-iso-8-1
3324 The category for a coding system which has the same code range
3325 as ISO2022 of 8-bit environment and graphic plane 1 used only
3326 for DIMENSION1 charset. This doesn't use any locking shift
3327 and single shift functions. Assigned the coding-system (Lisp
3328 symbol) `iso-latin-1' by default.
3330 o coding-category-iso-8-2
3332 The category for a coding system which has the same code range
3333 as ISO2022 of 8-bit environment and graphic plane 1 used only
3334 for DIMENSION2 charset. This doesn't use any locking shift
3335 and single shift functions. Assigned the coding-system (Lisp
3336 symbol) `japanese-iso-8bit' by default.
3338 o coding-category-iso-7-else
3340 The category for a coding system which has the same code range
3341 as ISO2022 of 7-bit environment but uses locking shift or
3342 single shift functions. Assigned the coding-system (Lisp
3343 symbol) `iso-2022-7bit-lock' by default.
3345 o coding-category-iso-8-else
3347 The category for a coding system which has the same code range
3348 as ISO2022 of 8-bit environment but uses locking shift or
3349 single shift functions. Assigned the coding-system (Lisp
3350 symbol) `iso-2022-8bit-ss2' by default.
3352 o coding-category-big5
3354 The category for a coding system which has the same code range
3355 as BIG5. Assigned the coding-system (Lisp symbol)
3356 `cn-big5' by default.
3358 o coding-category-utf-8
3360 The category for a coding system which has the same code range
3361 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
3362 symbol) `utf-8' by default.
3364 o coding-category-utf-16-be
3366 The category for a coding system in which a text has an
3367 Unicode signature (cf. Unicode Standard) in the order of BIG
3368 endian at the head. Assigned the coding-system (Lisp symbol)
3369 `utf-16-be' by default.
3371 o coding-category-utf-16-le
3373 The category for a coding system in which a text has an
3374 Unicode signature (cf. Unicode Standard) in the order of
3375 LITTLE endian at the head. Assigned the coding-system (Lisp
3376 symbol) `utf-16-le' by default.
3378 o coding-category-ccl
3380 The category for a coding system of which encoder/decoder is
3381 written in CCL programs. The default value is nil, i.e., no
3382 coding system is assigned.
3384 o coding-category-binary
3386 The category for a coding system not categorized in any of the
3387 above. Assigned the coding-system (Lisp symbol)
3388 `no-conversion' by default.
3390 Each of them is a Lisp symbol and the value is an actual
3391 `coding-system's (this is also a Lisp symbol) assigned by a user.
3392 What Emacs does actually is to detect a category of coding system.
3393 Then, it uses a `coding-system' assigned to it. If Emacs can't
3394 decide only one possible category, it selects a category of the
3395 highest priority. Priorities of categories are also specified by a
3396 user in a Lisp variable `coding-category-list'.
3400 static
3401 int ascii_skip_code[256];
3403 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3404 If it detects possible coding systems, return an integer in which
3405 appropriate flag bits are set. Flag bits are defined by macros
3406 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
3407 it should point the table `coding_priorities'. In that case, only
3408 the flag bit for a coding system of the highest priority is set in
3409 the returned value.
3411 How many ASCII characters are at the head is returned as *SKIP. */
3413 static int
3414 detect_coding_mask (source, src_bytes, priorities, skip)
3415 unsigned char *source;
3416 int src_bytes, *priorities, *skip;
3418 register unsigned char c;
3419 unsigned char *src = source, *src_end = source + src_bytes;
3420 unsigned int mask, utf16_examined_p, iso2022_examined_p;
3421 int i, idx;
3423 /* At first, skip all ASCII characters and control characters except
3424 for three ISO2022 specific control characters. */
3425 ascii_skip_code[ISO_CODE_SO] = 0;
3426 ascii_skip_code[ISO_CODE_SI] = 0;
3427 ascii_skip_code[ISO_CODE_ESC] = 0;
3429 label_loop_detect_coding:
3430 while (src < src_end && ascii_skip_code[*src]) src++;
3431 *skip = src - source;
3433 if (src >= src_end)
3434 /* We found nothing other than ASCII. There's nothing to do. */
3435 return 0;
3437 c = *src;
3438 /* The text seems to be encoded in some multilingual coding system.
3439 Now, try to find in which coding system the text is encoded. */
3440 if (c < 0x80)
3442 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3443 /* C is an ISO2022 specific control code of C0. */
3444 mask = detect_coding_iso2022 (src, src_end);
3445 if (mask == 0)
3447 /* No valid ISO2022 code follows C. Try again. */
3448 src++;
3449 if (c == ISO_CODE_ESC)
3450 ascii_skip_code[ISO_CODE_ESC] = 1;
3451 else
3452 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3453 goto label_loop_detect_coding;
3455 if (priorities)
3457 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3459 if (mask & priorities[i])
3460 return priorities[i];
3462 return CODING_CATEGORY_MASK_RAW_TEXT;
3465 else
3467 int try;
3469 if (c < 0xA0)
3471 /* C is the first byte of SJIS character code,
3472 or a leading-code of Emacs' internal format (emacs-mule),
3473 or the first byte of UTF-16. */
3474 try = (CODING_CATEGORY_MASK_SJIS
3475 | CODING_CATEGORY_MASK_EMACS_MULE
3476 | CODING_CATEGORY_MASK_UTF_16_BE
3477 | CODING_CATEGORY_MASK_UTF_16_LE);
3479 /* Or, if C is a special latin extra code,
3480 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3481 or is an ISO2022 control-sequence-introducer (CSI),
3482 we should also consider the possibility of ISO2022 codings. */
3483 if ((VECTORP (Vlatin_extra_code_table)
3484 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3485 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3486 || (c == ISO_CODE_CSI
3487 && (src < src_end
3488 && (*src == ']'
3489 || ((*src == '0' || *src == '1' || *src == '2')
3490 && src + 1 < src_end
3491 && src[1] == ']')))))
3492 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3493 | CODING_CATEGORY_MASK_ISO_8BIT);
3495 else
3496 /* C is a character of ISO2022 in graphic plane right,
3497 or a SJIS's 1-byte character code (i.e. JISX0201),
3498 or the first byte of BIG5's 2-byte code,
3499 or the first byte of UTF-8/16. */
3500 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3501 | CODING_CATEGORY_MASK_ISO_8BIT
3502 | CODING_CATEGORY_MASK_SJIS
3503 | CODING_CATEGORY_MASK_BIG5
3504 | CODING_CATEGORY_MASK_UTF_8
3505 | CODING_CATEGORY_MASK_UTF_16_BE
3506 | CODING_CATEGORY_MASK_UTF_16_LE);
3508 /* Or, we may have to consider the possibility of CCL. */
3509 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3510 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3511 ->spec.ccl.valid_codes)[c])
3512 try |= CODING_CATEGORY_MASK_CCL;
3514 mask = 0;
3515 utf16_examined_p = iso2022_examined_p = 0;
3516 if (priorities)
3518 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3520 if (!iso2022_examined_p
3521 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3523 mask |= detect_coding_iso2022 (src, src_end);
3524 iso2022_examined_p = 1;
3526 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3527 mask |= detect_coding_sjis (src, src_end);
3528 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3529 mask |= detect_coding_utf_8 (src, src_end);
3530 else if (!utf16_examined_p
3531 && (priorities[i] & try &
3532 CODING_CATEGORY_MASK_UTF_16_BE_LE))
3534 mask |= detect_coding_utf_16 (src, src_end);
3535 utf16_examined_p = 1;
3537 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3538 mask |= detect_coding_big5 (src, src_end);
3539 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3540 mask |= detect_coding_emacs_mule (src, src_end);
3541 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3542 mask |= detect_coding_ccl (src, src_end);
3543 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3544 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3545 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3546 mask |= CODING_CATEGORY_MASK_BINARY;
3547 if (mask & priorities[i])
3548 return priorities[i];
3550 return CODING_CATEGORY_MASK_RAW_TEXT;
3552 if (try & CODING_CATEGORY_MASK_ISO)
3553 mask |= detect_coding_iso2022 (src, src_end);
3554 if (try & CODING_CATEGORY_MASK_SJIS)
3555 mask |= detect_coding_sjis (src, src_end);
3556 if (try & CODING_CATEGORY_MASK_BIG5)
3557 mask |= detect_coding_big5 (src, src_end);
3558 if (try & CODING_CATEGORY_MASK_UTF_8)
3559 mask |= detect_coding_utf_8 (src, src_end);
3560 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3561 mask |= detect_coding_utf_16 (src, src_end);
3562 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3563 mask |= detect_coding_emacs_mule (src, src_end);
3564 if (try & CODING_CATEGORY_MASK_CCL)
3565 mask |= detect_coding_ccl (src, src_end);
3567 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3570 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3571 The information of the detected coding system is set in CODING. */
3573 void
3574 detect_coding (coding, src, src_bytes)
3575 struct coding_system *coding;
3576 unsigned char *src;
3577 int src_bytes;
3579 unsigned int idx;
3580 int skip, mask, i;
3581 Lisp_Object val;
3583 val = Vcoding_category_list;
3584 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3585 coding->heading_ascii = skip;
3587 if (!mask) return;
3589 /* We found a single coding system of the highest priority in MASK. */
3590 idx = 0;
3591 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3592 if (! mask)
3593 idx = CODING_CATEGORY_IDX_RAW_TEXT;
3595 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3597 if (coding->eol_type != CODING_EOL_UNDECIDED)
3599 Lisp_Object tmp;
3601 tmp = Fget (val, Qeol_type);
3602 if (VECTORP (tmp))
3603 val = XVECTOR (tmp)->contents[coding->eol_type];
3606 /* Setup this new coding system while preserving some slots. */
3608 int src_multibyte = coding->src_multibyte;
3609 int dst_multibyte = coding->dst_multibyte;
3611 setup_coding_system (val, coding);
3612 coding->src_multibyte = src_multibyte;
3613 coding->dst_multibyte = dst_multibyte;
3614 coding->heading_ascii = skip;
3618 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3619 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3620 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3622 How many non-eol characters are at the head is returned as *SKIP. */
3624 #define MAX_EOL_CHECK_COUNT 3
3626 static int
3627 detect_eol_type (source, src_bytes, skip)
3628 unsigned char *source;
3629 int src_bytes, *skip;
3631 unsigned char *src = source, *src_end = src + src_bytes;
3632 unsigned char c;
3633 int total = 0; /* How many end-of-lines are found so far. */
3634 int eol_type = CODING_EOL_UNDECIDED;
3635 int this_eol_type;
3637 *skip = 0;
3639 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3641 c = *src++;
3642 if (c == '\n' || c == '\r')
3644 if (*skip == 0)
3645 *skip = src - 1 - source;
3646 total++;
3647 if (c == '\n')
3648 this_eol_type = CODING_EOL_LF;
3649 else if (src >= src_end || *src != '\n')
3650 this_eol_type = CODING_EOL_CR;
3651 else
3652 this_eol_type = CODING_EOL_CRLF, src++;
3654 if (eol_type == CODING_EOL_UNDECIDED)
3655 /* This is the first end-of-line. */
3656 eol_type = this_eol_type;
3657 else if (eol_type != this_eol_type)
3659 /* The found type is different from what found before. */
3660 eol_type = CODING_EOL_INCONSISTENT;
3661 break;
3666 if (*skip == 0)
3667 *skip = src_end - source;
3668 return eol_type;
3671 /* Like detect_eol_type, but detect EOL type in 2-octet
3672 big-endian/little-endian format for coding systems utf-16-be and
3673 utf-16-le. */
3675 static int
3676 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3677 unsigned char *source;
3678 int src_bytes, *skip;
3680 unsigned char *src = source, *src_end = src + src_bytes;
3681 unsigned int c1, c2;
3682 int total = 0; /* How many end-of-lines are found so far. */
3683 int eol_type = CODING_EOL_UNDECIDED;
3684 int this_eol_type;
3685 int msb, lsb;
3687 if (big_endian_p)
3688 msb = 0, lsb = 1;
3689 else
3690 msb = 1, lsb = 0;
3692 *skip = 0;
3694 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3696 c1 = (src[msb] << 8) | (src[lsb]);
3697 src += 2;
3699 if (c1 == '\n' || c1 == '\r')
3701 if (*skip == 0)
3702 *skip = src - 2 - source;
3703 total++;
3704 if (c1 == '\n')
3706 this_eol_type = CODING_EOL_LF;
3708 else
3710 if ((src + 1) >= src_end)
3712 this_eol_type = CODING_EOL_CR;
3714 else
3716 c2 = (src[msb] << 8) | (src[lsb]);
3717 if (c2 == '\n')
3718 this_eol_type = CODING_EOL_CRLF, src += 2;
3719 else
3720 this_eol_type = CODING_EOL_CR;
3724 if (eol_type == CODING_EOL_UNDECIDED)
3725 /* This is the first end-of-line. */
3726 eol_type = this_eol_type;
3727 else if (eol_type != this_eol_type)
3729 /* The found type is different from what found before. */
3730 eol_type = CODING_EOL_INCONSISTENT;
3731 break;
3736 if (*skip == 0)
3737 *skip = src_end - source;
3738 return eol_type;
3741 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3742 is encoded. If it detects an appropriate format of end-of-line, it
3743 sets the information in *CODING. */
3745 void
3746 detect_eol (coding, src, src_bytes)
3747 struct coding_system *coding;
3748 unsigned char *src;
3749 int src_bytes;
3751 Lisp_Object val;
3752 int skip;
3753 int eol_type;
3755 switch (coding->category_idx)
3757 case CODING_CATEGORY_IDX_UTF_16_BE:
3758 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3759 break;
3760 case CODING_CATEGORY_IDX_UTF_16_LE:
3761 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3762 break;
3763 default:
3764 eol_type = detect_eol_type (src, src_bytes, &skip);
3765 break;
3768 if (coding->heading_ascii > skip)
3769 coding->heading_ascii = skip;
3770 else
3771 skip = coding->heading_ascii;
3773 if (eol_type == CODING_EOL_UNDECIDED)
3774 return;
3775 if (eol_type == CODING_EOL_INCONSISTENT)
3777 #if 0
3778 /* This code is suppressed until we find a better way to
3779 distinguish raw text file and binary file. */
3781 /* If we have already detected that the coding is raw-text, the
3782 coding should actually be no-conversion. */
3783 if (coding->type == coding_type_raw_text)
3785 setup_coding_system (Qno_conversion, coding);
3786 return;
3788 /* Else, let's decode only text code anyway. */
3789 #endif /* 0 */
3790 eol_type = CODING_EOL_LF;
3793 val = Fget (coding->symbol, Qeol_type);
3794 if (VECTORP (val) && XVECTOR (val)->size == 3)
3796 int src_multibyte = coding->src_multibyte;
3797 int dst_multibyte = coding->dst_multibyte;
3799 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3800 coding->src_multibyte = src_multibyte;
3801 coding->dst_multibyte = dst_multibyte;
3802 coding->heading_ascii = skip;
3806 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3808 #define DECODING_BUFFER_MAG(coding) \
3809 (coding->type == coding_type_iso2022 \
3810 ? 3 \
3811 : (coding->type == coding_type_ccl \
3812 ? coding->spec.ccl.decoder.buf_magnification \
3813 : 2))
3815 /* Return maximum size (bytes) of a buffer enough for decoding
3816 SRC_BYTES of text encoded in CODING. */
3819 decoding_buffer_size (coding, src_bytes)
3820 struct coding_system *coding;
3821 int src_bytes;
3823 return (src_bytes * DECODING_BUFFER_MAG (coding)
3824 + CONVERSION_BUFFER_EXTRA_ROOM);
3827 /* Return maximum size (bytes) of a buffer enough for encoding
3828 SRC_BYTES of text to CODING. */
3831 encoding_buffer_size (coding, src_bytes)
3832 struct coding_system *coding;
3833 int src_bytes;
3835 int magnification;
3837 if (coding->type == coding_type_ccl)
3838 magnification = coding->spec.ccl.encoder.buf_magnification;
3839 else if (CODING_REQUIRE_ENCODING (coding))
3840 magnification = 3;
3841 else
3842 magnification = 1;
3844 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared conversion buffer and its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.  The shared buffer is returned as is when it is
   already large enough; otherwise it is replaced by a larger one,
   whose size is found by repeated doubling, and the old contents are
   discarded.
   NOTE(review): the result of xmalloc is used unchecked, so this
   function has no out-of-memory path of its own; presumably xmalloc
   does not return on failure -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling a size of zero never grows, so start from the
         minimum size when the buffer has not been sized yet.  */
      if (real_size <= 0)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      /* There may be no previous buffer when the size was zero.  */
      if (conversion_buffer)
        xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3877 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3878 struct coding_system *coding;
3879 unsigned char *source, *destination;
3880 int src_bytes, dst_bytes, encodep;
3882 struct ccl_program *ccl
3883 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3884 int result;
3886 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3887 if (encodep)
3888 ccl->eol_type = coding->eol_type;
3889 coding->produced = ccl_driver (ccl, source, destination,
3890 src_bytes, dst_bytes, &(coding->consumed));
3891 if (encodep)
3892 coding->produced_char = coding->produced;
3893 else
3895 int bytes
3896 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
3897 coding->produced = str_as_multibyte (destination, bytes,
3898 coding->produced,
3899 &(coding->produced_char));
3902 switch (ccl->status)
3904 case CCL_STAT_SUSPEND_BY_SRC:
3905 result = CODING_FINISH_INSUFFICIENT_SRC;
3906 break;
3907 case CCL_STAT_SUSPEND_BY_DST:
3908 result = CODING_FINISH_INSUFFICIENT_DST;
3909 break;
3910 case CCL_STAT_QUIT:
3911 case CCL_STAT_INVALID_CMD:
3912 result = CODING_FINISH_INTERRUPT;
3913 break;
3914 default:
3915 result = CODING_FINISH_NORMAL;
3916 break;
3918 return result;
3921 /* Decode EOL format of the text at PTR of BYTES length destructively
3922 according to CODING->eol_type. This is called after the CCL
3923 program produced a decoded text at PTR. If we do CRLF->LF
3924 conversion, update CODING->produced and CODING->produced_char. */
3926 static void
3927 decode_eol_post_ccl (coding, ptr, bytes)
3928 struct coding_system *coding;
3929 unsigned char *ptr;
3930 int bytes;
3932 Lisp_Object val, saved_coding_symbol;
3933 unsigned char *pend = ptr + bytes;
3934 int dummy;
3936 /* Remember the current coding system symbol. We set it back when
3937 an inconsistent EOL is found so that `last-coding-system-used' is
3938 set to the coding system that doesn't specify EOL conversion. */
3939 saved_coding_symbol = coding->symbol;
3941 coding->spec.ccl.cr_carryover = 0;
3942 if (coding->eol_type == CODING_EOL_UNDECIDED)
3944 /* Here, to avoid the call of setup_coding_system, we directly
3945 call detect_eol_type. */
3946 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
3947 val = Fget (coding->symbol, Qeol_type);
3948 if (VECTORP (val) && XVECTOR (val)->size == 3)
3949 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
3950 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
3953 if (coding->eol_type == CODING_EOL_LF)
3955 /* We have nothing to do. */
3956 ptr = pend;
3958 else if (coding->eol_type == CODING_EOL_CRLF)
3960 unsigned char *pstart = ptr, *p = ptr;
3962 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
3963 && *(pend - 1) == '\r')
3965 /* If the last character is CR, we can't handle it here
3966 because LF will be in the not-yet-decoded source text.
3967 Recorded that the CR is not yet processed. */
3968 coding->spec.ccl.cr_carryover = 1;
3969 coding->produced--;
3970 coding->produced_char--;
3971 pend--;
3973 while (ptr < pend)
3975 if (*ptr == '\r')
3977 if (ptr + 1 < pend && *(ptr + 1) == '\n')
3979 *p++ = '\n';
3980 ptr += 2;
3982 else
3984 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3985 goto undo_eol_conversion;
3986 *p++ = *ptr++;
3989 else if (*ptr == '\n'
3990 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3991 goto undo_eol_conversion;
3992 else
3993 *p++ = *ptr++;
3994 continue;
3996 undo_eol_conversion:
3997 /* We have faced with inconsistent EOL format at PTR.
3998 Convert all LFs before PTR back to CRLFs. */
3999 for (p--, ptr--; p >= pstart; p--)
4001 if (*p == '\n')
4002 *ptr-- = '\n', *ptr-- = '\r';
4003 else
4004 *ptr-- = *p;
4006 /* If carryover is recorded, cancel it because we don't
4007 convert CRLF anymore. */
4008 if (coding->spec.ccl.cr_carryover)
4010 coding->spec.ccl.cr_carryover = 0;
4011 coding->produced++;
4012 coding->produced_char++;
4013 pend++;
4015 p = ptr = pend;
4016 coding->eol_type = CODING_EOL_LF;
4017 coding->symbol = saved_coding_symbol;
4019 if (p < pend)
4021 /* As each two-byte sequence CRLF was converted to LF, (PEND
4022 - P) is the number of deleted characters. */
4023 coding->produced -= pend - p;
4024 coding->produced_char -= pend - p;
4027 else /* i.e. coding->eol_type == CODING_EOL_CR */
4029 unsigned char *p = ptr;
4031 for (; ptr < pend; ptr++)
4033 if (*ptr == '\r')
4034 *ptr = '\n';
4035 else if (*ptr == '\n'
4036 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4038 for (; p < ptr; p++)
4040 if (*p == '\n')
4041 *p = '\r';
4043 ptr = pend;
4044 coding->eol_type = CODING_EOL_LF;
4045 coding->symbol = saved_coding_symbol;
4051 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4052 decoding, it may detect coding system and format of end-of-line if
4053 those are not yet decided. The source should be unibyte, the
4054 result is multibyte if CODING->dst_multibyte is nonzero, else
4055 unibyte. */
4058 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4059 struct coding_system *coding;
4060 unsigned char *source, *destination;
4061 int src_bytes, dst_bytes;
4063 if (coding->type == coding_type_undecided)
4064 detect_coding (coding, source, src_bytes);
4066 if (coding->eol_type == CODING_EOL_UNDECIDED
4067 && coding->type != coding_type_ccl)
4068 detect_eol (coding, source, src_bytes);
4070 coding->produced = coding->produced_char = 0;
4071 coding->consumed = coding->consumed_char = 0;
4072 coding->errors = 0;
4073 coding->result = CODING_FINISH_NORMAL;
4075 switch (coding->type)
4077 case coding_type_sjis:
4078 decode_coding_sjis_big5 (coding, source, destination,
4079 src_bytes, dst_bytes, 1);
4080 break;
4082 case coding_type_iso2022:
4083 decode_coding_iso2022 (coding, source, destination,
4084 src_bytes, dst_bytes);
4085 break;
4087 case coding_type_big5:
4088 decode_coding_sjis_big5 (coding, source, destination,
4089 src_bytes, dst_bytes, 0);
4090 break;
4092 case coding_type_emacs_mule:
4093 decode_coding_emacs_mule (coding, source, destination,
4094 src_bytes, dst_bytes);
4095 break;
4097 case coding_type_ccl:
4098 if (coding->spec.ccl.cr_carryover)
4100 /* Set the CR which is not processed by the previous call of
4101 decode_eol_post_ccl in DESTINATION. */
4102 *destination = '\r';
4103 coding->produced++;
4104 coding->produced_char++;
4105 dst_bytes--;
4107 ccl_coding_driver (coding, source,
4108 destination + coding->spec.ccl.cr_carryover,
4109 src_bytes, dst_bytes, 0);
4110 if (coding->eol_type != CODING_EOL_LF)
4111 decode_eol_post_ccl (coding, destination, coding->produced);
4112 break;
4114 default:
4115 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4118 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4119 && coding->consumed == src_bytes)
4120 coding->result = CODING_FINISH_NORMAL;
4122 if (coding->mode & CODING_MODE_LAST_BLOCK
4123 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4125 unsigned char *src = source + coding->consumed;
4126 unsigned char *dst = destination + coding->produced;
4128 src_bytes -= coding->consumed;
4129 coding->errors++;
4130 if (COMPOSING_P (coding))
4131 DECODE_COMPOSITION_END ('1');
4132 while (src_bytes--)
4134 int c = *src++;
4135 dst += CHAR_STRING (c, dst);
4136 coding->produced_char++;
4138 coding->consumed = coding->consumed_char = src - source;
4139 coding->produced = dst - destination;
4142 if (!coding->dst_multibyte)
4144 coding->produced = str_as_unibyte (destination, coding->produced);
4145 coding->produced_char = coding->produced;
4148 return coding->result;
4151 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4152 multibyteness of the source is CODING->src_multibyte, the
4153 multibyteness of the result is always unibyte. */
4156 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4157 struct coding_system *coding;
4158 unsigned char *source, *destination;
4159 int src_bytes, dst_bytes;
4161 coding->produced = coding->produced_char = 0;
4162 coding->consumed = coding->consumed_char = 0;
4163 coding->errors = 0;
4164 coding->result = CODING_FINISH_NORMAL;
4166 switch (coding->type)
4168 case coding_type_sjis:
4169 encode_coding_sjis_big5 (coding, source, destination,
4170 src_bytes, dst_bytes, 1);
4171 break;
4173 case coding_type_iso2022:
4174 encode_coding_iso2022 (coding, source, destination,
4175 src_bytes, dst_bytes);
4176 break;
4178 case coding_type_big5:
4179 encode_coding_sjis_big5 (coding, source, destination,
4180 src_bytes, dst_bytes, 0);
4181 break;
4183 case coding_type_emacs_mule:
4184 encode_coding_emacs_mule (coding, source, destination,
4185 src_bytes, dst_bytes);
4186 break;
4188 case coding_type_ccl:
4189 ccl_coding_driver (coding, source, destination,
4190 src_bytes, dst_bytes, 1);
4191 break;
4193 default:
4194 encode_eol (coding, source, destination, src_bytes, dst_bytes);
4197 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4198 && coding->consumed == src_bytes)
4199 coding->result = CODING_FINISH_NORMAL;
4201 if (coding->mode & CODING_MODE_LAST_BLOCK)
4203 unsigned char *src = source + coding->consumed;
4204 unsigned char *src_end = src + src_bytes;
4205 unsigned char *dst = destination + coding->produced;
4207 if (coding->type == coding_type_iso2022)
4208 ENCODE_RESET_PLANE_AND_REGISTER;
4209 if (COMPOSING_P (coding))
4210 *dst++ = ISO_CODE_ESC, *dst++ = '1';
4211 if (coding->consumed < src_bytes)
4213 int len = src_bytes - coding->consumed;
4215 BCOPY_SHORT (source + coding->consumed, dst, len);
4216 if (coding->src_multibyte)
4217 len = str_as_unibyte (dst, len);
4218 dst += len;
4219 coding->consumed = src_bytes;
4221 coding->produced = coding->produced_char = dst - destination;
4224 return coding->result;
4227 /* Scan text in the region between *BEG and *END (byte positions),
4228 skip characters which we don't have to decode by coding system
4229 CODING at the head and tail, then set *BEG and *END to the region
4230 of the text we actually have to convert. The caller should move
4231 the gap out of the region in advance if the region is from a
4232 buffer.
4234 If STR is not NULL, *BEG and *END are indices into STR. */
4236 static void
4237 shrink_decoding_region (beg, end, coding, str)
4238 int *beg, *end;
4239 struct coding_system *coding;
4240 unsigned char *str;
4242 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4243 int eol_conversion;
4244 Lisp_Object translation_table;
4246 if (coding->type == coding_type_ccl
4247 || coding->type == coding_type_undecided
4248 || coding->eol_type != CODING_EOL_LF
4249 || !NILP (coding->post_read_conversion)
4250 || coding->composing != COMPOSITION_DISABLED)
4252 /* We can't skip any data. */
4253 return;
4255 if (coding->type == coding_type_no_conversion
4256 || coding->type == coding_type_raw_text
4257 || coding->type == coding_type_emacs_mule)
4259 /* We need no conversion, but don't have to skip any data here.
4260 Decoding routine handles them effectively anyway. */
4261 return;
4264 translation_table = coding->translation_table_for_decode;
4265 if (NILP (translation_table) && !NILP (Venable_character_translation))
4266 translation_table = Vstandard_translation_table_for_decode;
4267 if (CHAR_TABLE_P (translation_table))
4269 int i;
4270 for (i = 0; i < 128; i++)
4271 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4272 break;
4273 if (i < 128)
4274 /* Some ASCII character should be translated. We give up
4275 shrinking. */
4276 return;
4279 if (coding->heading_ascii >= 0)
4280 /* Detection routine has already found how much we can skip at the
4281 head. */
4282 *beg += coding->heading_ascii;
4284 if (str)
4286 begp_orig = begp = str + *beg;
4287 endp_orig = endp = str + *end;
4289 else
4291 begp_orig = begp = BYTE_POS_ADDR (*beg);
4292 endp_orig = endp = begp + *end - *beg;
4295 eol_conversion = (coding->eol_type == CODING_EOL_CR
4296 || coding->eol_type == CODING_EOL_CRLF);
4298 switch (coding->type)
4300 case coding_type_sjis:
4301 case coding_type_big5:
4302 /* We can skip all ASCII characters at the head. */
4303 if (coding->heading_ascii < 0)
4305 if (eol_conversion)
4306 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4307 else
4308 while (begp < endp && *begp < 0x80) begp++;
4310 /* We can skip all ASCII characters at the tail except for the
4311 second byte of SJIS or BIG5 code. */
4312 if (eol_conversion)
4313 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4314 else
4315 while (begp < endp && endp[-1] < 0x80) endp--;
4316 /* Do not consider LF as ascii if preceded by CR, since that
4317 confuses eol decoding. */
4318 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4319 endp++;
4320 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4321 endp++;
4322 break;
4324 case coding_type_iso2022:
4325 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4326 /* We can't skip any data. */
4327 break;
4328 if (coding->heading_ascii < 0)
4330 /* We can skip all ASCII characters at the head except for a
4331 few control codes. */
4332 while (begp < endp && (c = *begp) < 0x80
4333 && c != ISO_CODE_CR && c != ISO_CODE_SO
4334 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4335 && (!eol_conversion || c != ISO_CODE_LF))
4336 begp++;
4338 switch (coding->category_idx)
4340 case CODING_CATEGORY_IDX_ISO_8_1:
4341 case CODING_CATEGORY_IDX_ISO_8_2:
4342 /* We can skip all ASCII characters at the tail. */
4343 if (eol_conversion)
4344 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4345 else
4346 while (begp < endp && endp[-1] < 0x80) endp--;
4347 /* Do not consider LF as ascii if preceded by CR, since that
4348 confuses eol decoding. */
4349 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4350 endp++;
4351 break;
4353 case CODING_CATEGORY_IDX_ISO_7:
4354 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4356 /* We can skip all charactes at the tail except for 8-bit
4357 codes and ESC and the following 2-byte at the tail. */
4358 unsigned char *eight_bit = NULL;
4360 if (eol_conversion)
4361 while (begp < endp
4362 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4364 if (!eight_bit && c & 0x80) eight_bit = endp;
4365 endp--;
4367 else
4368 while (begp < endp
4369 && (c = endp[-1]) != ISO_CODE_ESC)
4371 if (!eight_bit && c & 0x80) eight_bit = endp;
4372 endp--;
4374 /* Do not consider LF as ascii if preceded by CR, since that
4375 confuses eol decoding. */
4376 if (begp < endp && endp < endp_orig
4377 && endp[-1] == '\r' && endp[0] == '\n')
4378 endp++;
4379 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4381 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4382 /* This is an ASCII designation sequence. We can
4383 surely skip the tail. But, if we have
4384 encountered an 8-bit code, skip only the codes
4385 after that. */
4386 endp = eight_bit ? eight_bit : endp + 2;
4387 else
4388 /* Hmmm, we can't skip the tail. */
4389 endp = endp_orig;
4391 else if (eight_bit)
4392 endp = eight_bit;
4395 break;
4397 default:
4398 abort ();
4400 *beg += begp - begp_orig;
4401 *end += endp - endp_orig;
4402 return;
4405 /* Like shrink_decoding_region but for encoding. */
4407 static void
4408 shrink_encoding_region (beg, end, coding, str)
4409 int *beg, *end;
4410 struct coding_system *coding;
4411 unsigned char *str;
4413 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4414 int eol_conversion;
4415 Lisp_Object translation_table;
4417 if (coding->type == coding_type_ccl
4418 || coding->eol_type == CODING_EOL_CRLF
4419 || coding->eol_type == CODING_EOL_CR
4420 || coding->cmp_data && coding->cmp_data->used > 0)
4422 /* We can't skip any data. */
4423 return;
4425 if (coding->type == coding_type_no_conversion
4426 || coding->type == coding_type_raw_text
4427 || coding->type == coding_type_emacs_mule
4428 || coding->type == coding_type_undecided)
4430 /* We need no conversion, but don't have to skip any data here.
4431 Encoding routine handles them effectively anyway. */
4432 return;
4435 translation_table = coding->translation_table_for_encode;
4436 if (NILP (translation_table) && !NILP (Venable_character_translation))
4437 translation_table = Vstandard_translation_table_for_encode;
4438 if (CHAR_TABLE_P (translation_table))
4440 int i;
4441 for (i = 0; i < 128; i++)
4442 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4443 break;
4444 if (i < 128)
4445 /* Some ASCII character should be tranlsated. We give up
4446 shrinking. */
4447 return;
4450 if (str)
4452 begp_orig = begp = str + *beg;
4453 endp_orig = endp = str + *end;
4455 else
4457 begp_orig = begp = BYTE_POS_ADDR (*beg);
4458 endp_orig = endp = begp + *end - *beg;
4461 eol_conversion = (coding->eol_type == CODING_EOL_CR
4462 || coding->eol_type == CODING_EOL_CRLF);
4464 /* Here, we don't have to check coding->pre_write_conversion because
4465 the caller is expected to have handled it already. */
4466 switch (coding->type)
4468 case coding_type_iso2022:
4469 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4470 /* We can't skip any data. */
4471 break;
4472 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4474 unsigned char *bol = begp;
4475 while (begp < endp && *begp < 0x80)
4477 begp++;
4478 if (begp[-1] == '\n')
4479 bol = begp;
4481 begp = bol;
4482 goto label_skip_tail;
4484 /* fall down ... */
4486 case coding_type_sjis:
4487 case coding_type_big5:
4488 /* We can skip all ASCII characters at the head and tail. */
4489 if (eol_conversion)
4490 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4491 else
4492 while (begp < endp && *begp < 0x80) begp++;
4493 label_skip_tail:
4494 if (eol_conversion)
4495 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4496 else
4497 while (begp < endp && *(endp - 1) < 0x80) endp--;
4498 break;
4500 default:
4501 abort ();
4504 *beg += begp - begp_orig;
4505 *end += endp - endp_orig;
4506 return;
4509 /* As shrinking conversion region requires some overhead, we don't try
4510 shrinking if the length of conversion region is not greater than
4511 this value. */
4512 static int shrink_conversion_region_threshhold = 1024;
/* Narrow the range *BEG .. *END (BEG and END are pointers to int) for
   CODING by calling shrink_encoding_region if ENCODEP is nonzero and
   shrink_decoding_region otherwise, but only when the range is longer
   than shrink_conversion_region_threshhold.  STR is handed through to
   those functions unchanged.  */
4514 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep) \
4515 do { \
4516 if (*(end) - *(beg) > shrink_conversion_region_threshhold) \
4518 if (encodep) shrink_encoding_region (beg, end, coding, str); \
4519 else shrink_decoding_region (beg, end, coding, str); \
4521 } while (0)
4523 static Lisp_Object
4524 code_convert_region_unwind (dummy)
4525 Lisp_Object dummy;
4527 inhibit_pre_post_conversion = 0;
4528 return Qnil;
4531 /* Store information about all compositions in the range FROM and TO
4532 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
4533 buffer or a string, defaults to the current buffer.
   Does nothing if CODING->composing is COMPOSITION_DISABLED.
   Positions are recorded relative to FROM.  On return after
   recording, CODING->cmp_data points to the first memory block and
   CODING->cmp_data_start is 0. */
4535 void
4536 coding_save_composition (coding, from, to, obj)
4537 struct coding_system *coding;
4538 int from, to;
4539 Lisp_Object obj;
4541 Lisp_Object prop;
4542 int start, end;
4544 if (coding->composing == COMPOSITION_DISABLED)
4545 return;
4546 if (!coding->cmp_data)
4547 coding_allocate_composition_data (coding, from);
/* Give up if no composition is found, or if the one found ends
   after TO. */
4548 if (!find_composition (from, to, &start, &end, &prop, obj)
4549 || end > to)
4550 return;
/* The composition found starts before FROM; look for the next one
   starting from its end. */
4551 if (start < from
4552 && (!find_composition (end, to, &start, &end, &prop, obj)
4553 || end > to))
4554 return;
4555 coding->composing = COMPOSITION_NO;
4558 if (COMPOSITION_VALID_P (start, end, prop))
4560 enum composition_method method = COMPOSITION_METHOD (prop);
/* Get a new memory block if the current one may not have room for
   one more bunch of data. */
4561 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4562 >= COMPOSITION_DATA_SIZE)
4563 coding_allocate_composition_data (coding, from);
4564 /* For relative composition, we remember start and end
4565 positions, for the other compositions, we also remember
4566 components. */
4567 CODING_ADD_COMPOSITION_START (coding, start - from, method);
4568 if (method != COMPOSITION_RELATIVE)
4570 /* We must also store the components, which may be given as a
   list, a vector, a string, or a single integer. */
4571 Lisp_Object val, ch;
4573 val = COMPOSITION_COMPONENTS (prop);
4574 if (CONSP (val))
4575 while (CONSP (val))
4577 ch = XCAR (val), val = XCDR (val);
4578 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4580 else if (VECTORP (val) || STRINGP (val))
4582 int len = (VECTORP (val)
4583 ? XVECTOR (val)->size : XSTRING (val)->size);
4584 int i;
4585 for (i = 0; i < len; i++)
4587 ch = (STRINGP (val)
4588 ? Faref (val, make_number (i))
4589 : XVECTOR (val)->contents[i]);
4590 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4593 else /* INTEGERP (val) */
4594 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4596 CODING_ADD_COMPOSITION_END (coding, end - from);
/* Continue the search from the end of this composition. */
4598 start = end;
4600 while (start < to
4601 && find_composition (start, to, &start, &end, &prop, obj)
4602 && end <= to);
4604 /* Make coding->cmp_data point to the first memory block. */
4605 while (coding->cmp_data->prev)
4606 coding->cmp_data = coding->cmp_data->prev;
4607 coding->cmp_data_start = 0;
4610 /* Reflect the saved information about compositions to OBJ.
4611 CODING->cmp_data points to a memory block for the informaiton. OBJ
4612 is a buffer or a string, defaults to the current buffer. */
4614 void
4615 coding_restore_composition (coding, obj)
4616 struct coding_system *coding;
4617 Lisp_Object obj;
4619 struct composition_data *cmp_data = coding->cmp_data;
4621 if (!cmp_data)
4622 return;
4624 while (cmp_data->prev)
4625 cmp_data = cmp_data->prev;
4627 while (cmp_data)
4629 int i;
4631 for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4633 int *data = cmp_data->data + i;
4634 enum composition_method method = (enum composition_method) data[3];
4635 Lisp_Object components;
4637 if (method == COMPOSITION_RELATIVE)
4638 components = Qnil;
4639 else
4641 int len = data[0] - 4, j;
4642 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4644 for (j = 0; j < len; j++)
4645 args[j] = make_number (data[4 + j]);
4646 components = (method == COMPOSITION_WITH_ALTCHARS
4647 ? Fstring (len, args) : Fvector (len, args));
4649 compose_text (data[1], data[2], components, Qnil, obj);
4651 cmp_data = cmp_data->next;
4655 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4656 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4657 coding system CODING, and return the status code of code conversion
4658 (currently, this value has no meaning).
4660 How many characters (and bytes) are converted to how many
4661 characters (and bytes) are recorded in members of the structure
4662 CODING.
4664 If REPLACE is nonzero, we do various things as if the original text
4665 is deleted and a new text is inserted. See the comments in
4666 replace_range (insdel.c) to know what we are doing.
4668 If REPLACE is zero, it is assumed that the source text is unibyte.
4669 Otherwise, it is assumed that the source text is multibyte. */
4672 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4673 int from, from_byte, to, to_byte, encodep, replace;
4674 struct coding_system *coding;
4676 int len = to - from, len_byte = to_byte - from_byte;
4677 int require, inserted, inserted_byte;
4678 int head_skip, tail_skip, total_skip = 0;
4679 Lisp_Object saved_coding_symbol;
4680 int first = 1;
4681 unsigned char *src, *dst;
4682 Lisp_Object deletion;
4683 int orig_point = PT, orig_len = len;
4684 int prev_Z;
4685 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4687 coding->src_multibyte = replace && multibyte_p;
4688 coding->dst_multibyte = multibyte_p;
4690 deletion = Qnil;
4691 saved_coding_symbol = Qnil;
/* If point is strictly inside the region, move it to FROM and
   remember FROM as the position to restore at the end. */
4693 if (from < PT && PT < to)
4695 TEMP_SET_PT_BOTH (from, from_byte);
4696 orig_point = from;
4699 if (replace)
4701 int saved_from = from;
4703 prepare_to_modify_buffer (from, to, &from);
/* FROM may have been changed by the call above; recompute the
   positions that depend on it. */
4704 if (saved_from != from)
4706 to = from + len;
4707 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4708 len_byte = to_byte - from_byte;
4712 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4714 /* We must detect encoding of text and eol format. */
4716 if (from < GPT && to > GPT)
4717 move_gap_both (from, from_byte);
4718 if (coding->type == coding_type_undecided)
4720 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4721 if (coding->type == coding_type_undecided)
4722 /* It seems that the text contains only ASCII, but we
4723 should not leave it undecided because the deeper
4724 decoding routine (decode_coding) tries to detect the
4725 encodings again in vain. */
4726 coding->type = coding_type_emacs_mule;
4728 if (coding->eol_type == CODING_EOL_UNDECIDED
4729 && coding->type != coding_type_ccl)
4731 saved_coding_symbol = coding->symbol;
4732 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4733 if (coding->eol_type == CODING_EOL_UNDECIDED)
4734 coding->eol_type = CODING_EOL_LF;
4735 /* We had better recover the original eol format if we
4736 encounter an inconsistent eol format while decoding. */
4737 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4741 /* Now we convert the text. */
4743 /* For encoding, we must process pre-write-conversion in advance. */
4744 if (! inhibit_pre_post_conversion
4745 && encodep
4746 && SYMBOLP (coding->pre_write_conversion)
4747 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4749 /* The function in pre-write-conversion may put a new text in a
4750 new buffer. */
4751 struct buffer *prev = current_buffer;
4752 Lisp_Object new;
4753 int count = specpdl_ptr - specpdl;
4755 record_unwind_protect (code_convert_region_unwind, Qnil);
4756 /* We should not call any more pre-write/post-read-conversion
4757 functions while this pre-write-conversion is running. */
4758 inhibit_pre_post_conversion = 1;
4759 call2 (coding->pre_write_conversion,
4760 make_number (from), make_number (to));
4761 inhibit_pre_post_conversion = 0;
4762 /* Discard the unwind protect. */
4763 specpdl_ptr--;
/* The function switched buffers: replace the region in the original
   buffer with the accessible text of the new buffer, kill the new
   buffer, and recompute the region and the saved point. */
4765 if (current_buffer != prev)
4767 len = ZV - BEGV;
4768 new = Fcurrent_buffer ();
4769 set_buffer_internal_1 (prev);
4770 del_range_2 (from, from_byte, to, to_byte, 0);
4771 TEMP_SET_PT_BOTH (from, from_byte);
4772 insert_from_buffer (XBUFFER (new), 1, len, 0);
4773 Fkill_buffer (new);
4774 if (orig_point >= to)
4775 orig_point += len - orig_len;
4776 else if (orig_point > from)
4777 orig_point = from;
4778 orig_len = len;
4779 to = from + len;
4780 from_byte = CHAR_TO_BYTE (from);
4781 to_byte = CHAR_TO_BYTE (to);
4782 len_byte = to_byte - from_byte;
4783 TEMP_SET_PT_BOTH (from, from_byte);
/* Keep a copy of the text being replaced; it is handed to
   adjust_after_replace below. */
4787 if (replace)
4788 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4790 if (coding->composing != COMPOSITION_DISABLED)
4792 if (encodep)
4793 coding_save_composition (coding, from, to, Fcurrent_buffer ());
4794 else
4795 coding_allocate_composition_data (coding, from);
4798 /* Try to skip the heading and tailing ASCIIs. */
4800 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4802 if (from < GPT && GPT < to)
4803 move_gap_both (from, from_byte);
4804 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
/* Nothing is left to convert after shrinking, no post-read-conversion
   has to run, and no flushing is required: report the text as
   produced unchanged and return. */
4805 if (from_byte == to_byte
4806 && (encodep || NILP (coding->post_read_conversion))
4807 && ! CODING_REQUIRE_FLUSHING (coding))
4809 coding->produced = len_byte;
4810 coding->produced_char = len;
4811 if (!replace)
4812 /* We must record and adjust for this new text now. */
4813 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4814 return 0;
/* The same skip counts are applied to character and byte positions
   below. */
4817 head_skip = from_byte - from_byte_orig;
4818 tail_skip = to_byte_orig - to_byte;
4819 total_skip = head_skip + tail_skip;
4820 from += head_skip;
4821 to -= tail_skip;
4822 len -= total_skip; len_byte -= total_skip;
4825 /* The code conversion routine can not preserve text properties for
4826 now. So, we must remove all text properties in the region.
4827 Here, we must suppress all modification hooks. */
4828 if (replace)
4830 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4831 inhibit_modification_hooks = 1;
4832 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4833 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4836 /* For conversion, we must put the gap before the text in addition to
4837 making the gap larger for efficient decoding. The required gap
4838 size starts from 2000 which is the magic number used in make_gap.
4839 But, after one batch of conversion, it will be incremented if we
4840 find that it is not enough. */
4841 require = 2000;
4843 if (GAP_SIZE < require)
4844 make_gap (require - GAP_SIZE);
4845 move_gap_both (from, from_byte);
4847 inserted = inserted_byte = 0;
/* Account for the source text as part of the gap: its bytes stay in
   place at the end of the gap (see the diagrams below) but are no
   longer counted in the buffer sizes. */
4849 GAP_SIZE += len_byte;
4850 ZV -= len;
4851 Z -= len;
4852 ZV_BYTE -= len_byte;
4853 Z_BYTE -= len_byte;
4855 if (GPT - BEG < BEG_UNCHANGED)
4856 BEG_UNCHANGED = GPT - BEG;
4857 if (Z - GPT < END_UNCHANGED)
4858 END_UNCHANGED = Z - GPT;
4860 if (!encodep && coding->src_multibyte)
4862 /* Decoding routines expects that the source text is unibyte.
4863 We must convert 8-bit characters of multibyte form to
4864 unibyte. */
4865 int len_byte_orig = len_byte;
4866 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
/* The text became shorter; move it so that it again ends at
   GAP_END_ADDR. */
4867 if (len_byte < len_byte_orig)
4868 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
4869 len_byte);
4870 coding->src_multibyte = 0;
4873 for (;;)
4875 int result;
4877 /* The buffer memory is now:
4878 +--------+converted-text+---------+-------original-text-------+---+
4879 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4880 |<---------------------- GAP ----------------------->| */
4881 src = GAP_END_ADDR - len_byte;
4882 dst = GPT_ADDR + inserted_byte;
4884 if (encodep)
4885 result = encode_coding (coding, src, dst, len_byte, 0);
4886 else
4887 result = decode_coding (coding, src, dst, len_byte, 0);
4889 /* The buffer memory is now:
4890 +--------+-------converted-text----+--+------original-text----+---+
4891 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
4892 |<---------------------- GAP ----------------------->| */
4894 inserted += coding->produced_char;
4895 inserted_byte += coding->produced;
4896 len_byte -= coding->consumed;
/* The composition data block is full: allocate another one and go
   on converting. */
4898 if (result == CODING_FINISH_INSUFFICIENT_CMP)
4900 coding_allocate_composition_data (coding, from + inserted);
4901 continue;
4904 src += coding->consumed;
4905 dst += coding->produced;
4907 if (result == CODING_FINISH_NORMAL)
4909 src += len_byte;
4910 break;
4912 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4914 unsigned char *pend = dst, *p = pend - inserted_byte;
4915 Lisp_Object eol_type;
4917 /* Encode LFs back to the original eol format (CR or CRLF). */
4918 if (coding->eol_type == CODING_EOL_CR)
4920 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4922 else
4924 int count = 0;
4926 while (p < pend) if (*p++ == '\n') count++;
4927 if (src - dst < count)
4929 /* We don't have sufficient room for encoding LFs
4930 back to CRLF. We must record converted and
4931 not-yet-converted text back to the buffer
4932 content, enlarge the gap, then record them out of
4933 the buffer contents again. */
4934 int add = len_byte + inserted_byte;
4936 GAP_SIZE -= add;
4937 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4938 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4939 make_gap (count - GAP_SIZE);
4940 GAP_SIZE += add;
4941 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4942 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4943 /* Don't forget to update SRC, DST, and PEND. */
4944 src = GAP_END_ADDR - len_byte;
4945 dst = GPT_ADDR + inserted_byte;
4946 pend = dst;
4948 inserted += count;
4949 inserted_byte += count;
4950 coding->produced += count;
/* Copy the converted text backward to its new end, putting a CR in
   front of each LF, until all COUNT LFs are handled. */
4951 p = dst = pend + count;
4952 while (count)
4954 *--p = *--pend;
4955 if (*p == '\n') count--, *--p = '\r';
4959 /* Suppress eol-format conversion in the further conversion. */
4960 coding->eol_type = CODING_EOL_LF;
4962 /* Set the coding system symbol to that for Unix-like EOL. */
4963 eol_type = Fget (saved_coding_symbol, Qeol_type);
4964 if (VECTORP (eol_type)
4965 && XVECTOR (eol_type)->size == 3
4966 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4967 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4968 else
4969 coding->symbol = saved_coding_symbol;
4971 continue;
/* All source text is consumed.  For a CCL coding system, run the
   conversion once more with CODING_MODE_LAST_BLOCK set before
   stopping. */
4973 if (len_byte <= 0)
4975 if (coding->type != coding_type_ccl
4976 || coding->mode & CODING_MODE_LAST_BLOCK)
4977 break;
4978 coding->mode |= CODING_MODE_LAST_BLOCK;
4979 continue;
4981 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4983 /* The source text ends in invalid codes. Let's just
4984 make them valid buffer contents, and finish conversion. */
4985 inserted += len_byte;
4986 inserted_byte += len_byte;
4987 while (len_byte--)
4988 *dst++ = *src++;
4989 break;
4991 if (result == CODING_FINISH_INTERRUPT)
4993 /* The conversion procedure was interrupted by a user. */
4994 break;
4996 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4997 if (coding->consumed < 1)
4999 /* It's quite strange to require more memory without
5000 consuming any bytes. Perhaps CCL program bug. */
5001 break;
5003 if (first)
5005 /* We have just done the first batch of conversion which was
5006 stopped because of insufficient gap. Let's reconsider the
5007 required gap size (i.e. SRC - DST) now.
5009 We have converted ORIG bytes (== coding->consumed) into
5010 NEW bytes (coding->produced). To convert the remaining
5011 LEN bytes, we may need REQUIRE bytes of gap, where:
5012 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5013 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5014 Here, we are sure that NEW >= ORIG. */
5015 float ratio = coding->produced - coding->consumed;
5016 ratio /= coding->consumed;
5017 require = len_byte * ratio;
5018 first = 0;
5020 if ((src - dst) < (require + 2000))
5022 /* See the comment above the previous call of make_gap. */
5023 int add = len_byte + inserted_byte;
5025 GAP_SIZE -= add;
5026 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5027 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5028 make_gap (require + 2000);
5029 GAP_SIZE += add;
5030 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5031 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5034 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5036 if (encodep && coding->dst_multibyte)
5038 /* The output is unibyte. We must convert 8-bit characters to
5039 multibyte form. */
5040 if (inserted_byte * 2 > GAP_SIZE)
5042 GAP_SIZE -= inserted_byte;
5043 ZV += inserted_byte; Z += inserted_byte;
5044 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5045 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5046 make_gap (inserted_byte - GAP_SIZE);
5047 GAP_SIZE += inserted_byte;
5048 ZV -= inserted_byte; Z -= inserted_byte;
5049 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5050 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5052 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5055 /* If we have shrunk the conversion area, adjust it now. */
5056 if (total_skip > 0)
5058 if (tail_skip > 0)
5059 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5060 inserted += total_skip; inserted_byte += total_skip;
5061 GAP_SIZE += total_skip;
5062 GPT -= head_skip; GPT_BYTE -= head_skip;
5063 ZV -= total_skip; ZV_BYTE -= total_skip;
5064 Z -= total_skip; Z_BYTE -= total_skip;
5065 from -= head_skip; from_byte -= head_skip;
5066 to += tail_skip; to_byte += tail_skip;
/* Use the change in Z across the call as the number of characters
   inserted. */
5069 prev_Z = Z;
5070 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5071 inserted = Z - prev_Z;
5073 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5074 coding_restore_composition (coding, Fcurrent_buffer ());
5075 coding_free_composition_data (coding);
5077 if (! inhibit_pre_post_conversion
5078 && ! encodep && ! NILP (coding->post_read_conversion))
5080 Lisp_Object val;
5081 int count = specpdl_ptr - specpdl;
5083 if (from != PT)
5084 TEMP_SET_PT_BOTH (from, from_byte);
5085 prev_Z = Z;
5086 record_unwind_protect (code_convert_region_unwind, Qnil);
5087 /* We should not call any more pre-write/post-read-conversion
5088 functions while this post-read-conversion is running. */
5089 inhibit_pre_post_conversion = 1;
5090 val = call1 (coding->post_read_conversion, make_number (inserted));
5091 inhibit_pre_post_conversion = 0;
5092 /* Discard the unwind protect. */
5093 specpdl_ptr--;
5094 CHECK_NUMBER (val, 0);
/* Account for any change in buffer size made by the function. */
5095 inserted += Z - prev_Z;
/* Restore point: shift it by the change in length if it was at or
   after the end of the original text, else put it at FROM. */
5098 if (orig_point >= from)
5100 if (orig_point >= from + orig_len)
5101 orig_point += inserted - orig_len;
5102 else
5103 orig_point = from;
5104 TEMP_SET_PT (orig_point);
5107 if (replace)
5109 signal_after_change (from, to - from, inserted);
5110 update_compositions (from, from + inserted, CHECK_BORDER);
5114 coding->consumed = to_byte - from_byte;
5115 coding->consumed_char = to - from;
5116 coding->produced = inserted_byte;
5117 coding->produced_char = inserted;
5120 return 0;
5123 Lisp_Object
5124 run_pre_post_conversion_on_str (str, coding, encodep)
5125 Lisp_Object str;
5126 struct coding_system *coding;
5127 int encodep;
5129 int count = specpdl_ptr - specpdl;
5130 struct gcpro gcpro1;
5131 struct buffer *prev = current_buffer;
5132 int multibyte = STRING_MULTIBYTE (str);
5134 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5135 record_unwind_protect (code_convert_region_unwind, Qnil);
5136 GCPRO1 (str);
5137 temp_output_buffer_setup (" *code-converting-work*");
5138 set_buffer_internal (XBUFFER (Vstandard_output));
5139 /* We must insert the contents of STR as is without
5140 unibyte<->multibyte conversion. For that, we adjust the
5141 multibyteness of the working buffer to that of STR. */
5142 Ferase_buffer ();
5143 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5144 insert_from_string (str, 0, 0,
5145 XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5146 UNGCPRO;
5147 inhibit_pre_post_conversion = 1;
5148 if (encodep)
5149 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5150 else
5152 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5153 call1 (coding->post_read_conversion, make_number (Z - BEG));
5155 inhibit_pre_post_conversion = 0;
5156 str = make_buffer_string (BEG, Z, 0);
5157 return unbind_to (count, str);
/* Decode the string STR by coding system CODING and return the
   result.  When no decoding is needed, or nothing is left to decode
   after skipping ASCII at the head and tail, STR itself is returned
   if NOCOPY is nonzero and a copy of it otherwise.  When text is
   actually decoded, the result is a new multibyte string; any saved
   compositions are restored on it, and CODING's post-read-conversion
   function is run on it if that is a symbol with a function
   definition. */
5160 Lisp_Object
5161 decode_coding_string (str, coding, nocopy)
5162 Lisp_Object str;
5163 struct coding_system *coding;
5164 int nocopy;
5166 int len;
5167 char *buf;
5168 int from, to, to_byte;
5169 struct gcpro gcpro1;
5170 Lisp_Object saved_coding_symbol;
5171 int result;
5173 from = 0;
5174 to = XSTRING (str)->size;
5175 to_byte = STRING_BYTES (XSTRING (str));
5177 saved_coding_symbol = Qnil;
5178 if (CODING_REQUIRE_DETECTION (coding))
5180 /* See the comments in code_convert_region. */
5181 if (coding->type == coding_type_undecided)
5183 detect_coding (coding, XSTRING (str)->data, to_byte);
5184 if (coding->type == coding_type_undecided)
5185 coding->type = coding_type_emacs_mule;
5187 if (coding->eol_type == CODING_EOL_UNDECIDED
5188 && coding->type != coding_type_ccl)
5190 saved_coding_symbol = coding->symbol;
5191 detect_eol (coding, XSTRING (str)->data, to_byte);
5192 if (coding->eol_type == CODING_EOL_UNDECIDED)
5193 coding->eol_type = CODING_EOL_LF;
5194 /* We had better recover the original eol format if we
5195 encounter an inconsistent eol format while decoding. */
5196 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
/* No decoding is required: return STR (made multibyte if it is
   unibyte) or a copy of it. */
5200 if (! CODING_REQUIRE_DECODING (coding))
5202 if (!STRING_MULTIBYTE (str))
5204 str = Fstring_as_multibyte (str);
5205 nocopy = 1;
5207 return (nocopy ? str : Fcopy_sequence (str));
5210 if (STRING_MULTIBYTE (str))
5212 /* Decoding routines expect the source text to be unibyte. */
5213 str = Fstring_as_unibyte (str);
5214 nocopy = 1;
5215 coding->src_multibyte = 0;
5217 coding->dst_multibyte = 1;
5219 if (coding->composing != COMPOSITION_DISABLED)
5220 coding_allocate_composition_data (coding, from);
5222 /* Try to skip the heading and tailing ASCIIs. */
5224 int from_orig = from;
5226 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5228 if (from == to_byte)
5229 return (nocopy ? str : Fcopy_sequence (str));
/* Size the work buffer for the decoded middle part plus the skipped
   head (FROM bytes) and the skipped tail. */
5232 len = decoding_buffer_size (coding, to_byte - from);
5233 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5234 GCPRO1 (str);
5235 buf = get_conversion_buffer (len);
5236 UNGCPRO;
/* Copy the skipped head as is, then decode the middle part right
   after it. */
5238 if (from > 0)
5239 bcopy (XSTRING (str)->data, buf, from);
5240 result = decode_coding (coding, XSTRING (str)->data + from,
5241 buf + from, to_byte - from, len);
5242 if (result == CODING_FINISH_INCONSISTENT_EOL)
5244 /* We simply try to decode the whole string again but without
5245 eol-conversion this time. */
5246 coding->eol_type = CODING_EOL_LF;
5247 coding->symbol = saved_coding_symbol;
5248 coding_free_composition_data (coding);
5249 return decode_coding_string (str, coding, nocopy);
/* Append the skipped tail after the decoded text. */
5252 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5253 STRING_BYTES (XSTRING (str)) - to_byte);
/* LEN is now the number of bytes copied without decoding (head plus
   tail); it is added to both the character and the byte count
   below. */
5255 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5256 str = make_multibyte_string (buf, len + coding->produced_char,
5257 len + coding->produced);
5259 if (coding->cmp_data && coding->cmp_data->used)
5260 coding_restore_composition (coding, str);
5261 coding_free_composition_data (coding);
5263 if (SYMBOLP (coding->post_read_conversion)
5264 && !NILP (Ffboundp (coding->post_read_conversion)))
5265 str = run_pre_post_conversion_on_str (str, coding, 0);
5267 return str;
5270 Lisp_Object
5271 encode_coding_string (str, coding, nocopy)
5272 Lisp_Object str;
5273 struct coding_system *coding;
5274 int nocopy;
5276 int len;
5277 char *buf;
5278 int from, to, to_byte;
5279 struct gcpro gcpro1;
5280 Lisp_Object saved_coding_symbol;
5281 int result;
5283 if (SYMBOLP (coding->pre_write_conversion)
5284 && !NILP (Ffboundp (coding->pre_write_conversion)))
5285 str = run_pre_post_conversion_on_str (str, coding, 1);
5287 from = 0;
5288 to = XSTRING (str)->size;
5289 to_byte = STRING_BYTES (XSTRING (str));
5291 saved_coding_symbol = Qnil;
5292 if (! CODING_REQUIRE_ENCODING (coding))
5294 if (STRING_MULTIBYTE (str))
5296 str = Fstring_as_unibyte (str);
5297 nocopy = 1;
5299 return (nocopy ? str : Fcopy_sequence (str));
5302 /* Encoding routines determine the multibyteness of the source text
5303 by coding->src_multibyte. */
5304 coding->src_multibyte = STRING_MULTIBYTE (str);
5305 coding->dst_multibyte = 0;
5307 if (coding->composing != COMPOSITION_DISABLED)
5308 coding_save_composition (coding, from, to, str);
5310 /* Try to skip the heading and tailing ASCIIs. */
5312 int from_orig = from;
5314 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5316 if (from == to_byte)
5317 return (nocopy ? str : Fcopy_sequence (str));
5320 len = encoding_buffer_size (coding, to_byte - from);
5321 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5322 GCPRO1 (str);
5323 buf = get_conversion_buffer (len);
5324 UNGCPRO;
5326 if (from > 0)
5327 bcopy (XSTRING (str)->data, buf, from);
5328 result = encode_coding (coding, XSTRING (str)->data + from,
5329 buf + from, to_byte - from, len);
5330 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5331 STRING_BYTES (XSTRING (str)) - to_byte);
5333 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5334 str = make_unibyte_string (buf, len + coding->produced);
5335 coding_free_composition_data (coding);
5337 return str;
5341 #ifdef emacs
5342 /*** 8. Emacs Lisp library functions ***/
5344 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5345 "Return t if OBJECT is nil or a coding-system.\n\
5346 See the documentation of `make-coding-system' for information\n\
5347 about coding-system objects.")
5348 (obj)
5349 Lisp_Object obj;
5351 if (NILP (obj))
5352 return Qt;
5353 if (!SYMBOLP (obj))
5354 return Qnil;
5355 /* Get coding-spec vector for OBJ. */
5356 obj = Fget (obj, Qcoding_system);
5357 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5358 ? Qt : Qnil);
5361 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5362 Sread_non_nil_coding_system, 1, 1, 0,
5363 "Read a coding system from the minibuffer, prompting with string PROMPT.")
5364 (prompt)
5365 Lisp_Object prompt;
5367 Lisp_Object val;
5370 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5371 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5373 while (XSTRING (val)->size == 0);
5374 return (Fintern (val, Qnil));
5377 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5378 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5379 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5380 (prompt, default_coding_system)
5381 Lisp_Object prompt, default_coding_system;
5383 Lisp_Object val;
5384 if (SYMBOLP (default_coding_system))
5385 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5386 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5387 Qt, Qnil, Qcoding_system_history,
5388 default_coding_system, Qnil);
5389 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5392 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5393 1, 1, 0,
5394 "Check validity of CODING-SYSTEM.\n\
5395 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5396 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5397 The value of property should be a vector of length 5.")
5398 (coding_system)
5399 Lisp_Object coding_system;
5401 CHECK_SYMBOL (coding_system, 0);
5402 if (!NILP (Fcoding_system_p (coding_system)))
5403 return coding_system;
5404 while (1)
5405 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5408 Lisp_Object
5409 detect_coding_system (src, src_bytes, highest)
5410 unsigned char *src;
5411 int src_bytes, highest;
5413 int coding_mask, eol_type;
5414 Lisp_Object val, tmp;
5415 int dummy;
5417 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5418 eol_type = detect_eol_type (src, src_bytes, &dummy);
5419 if (eol_type == CODING_EOL_INCONSISTENT)
5420 eol_type = CODING_EOL_UNDECIDED;
5422 if (!coding_mask)
5424 val = Qundecided;
5425 if (eol_type != CODING_EOL_UNDECIDED)
5427 Lisp_Object val2;
5428 val2 = Fget (Qundecided, Qeol_type);
5429 if (VECTORP (val2))
5430 val = XVECTOR (val2)->contents[eol_type];
5432 return (highest ? val : Fcons (val, Qnil));
5435 /* At first, gather possible coding systems in VAL. */
5436 val = Qnil;
5437 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5439 Lisp_Object category_val, category_index;
5441 category_index = Fget (XCAR (tmp), Qcoding_category_index);
5442 category_val = Fsymbol_value (XCAR (tmp));
5443 if (!NILP (category_val)
5444 && NATNUMP (category_index)
5445 && (coding_mask & (1 << XFASTINT (category_index))))
5447 val = Fcons (category_val, val);
5448 if (highest)
5449 break;
5452 if (!highest)
5453 val = Fnreverse (val);
5455 /* Then, replace the elements with subsidiary coding systems. */
5456 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5458 if (eol_type != CODING_EOL_UNDECIDED
5459 && eol_type != CODING_EOL_INCONSISTENT)
5461 Lisp_Object eol;
5462 eol = Fget (XCAR (tmp), Qeol_type);
5463 if (VECTORP (eol))
5464 XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5467 return (highest ? XCAR (val) : val);
5470 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5471 2, 3, 0,
5472 "Detect coding system of the text in the region between START and END.\n\
5473 Return a list of possible coding systems ordered by priority.\n\
5475 If only ASCII characters are found, it returns a list of single element\n\
5476 `undecided' or its subsidiary coding system according to a detected\n\
5477 end-of-line format.\n\
5479 If optional argument HIGHEST is non-nil, return the coding system of\n\
5480 highest priority.")
5481 (start, end, highest)
5482 Lisp_Object start, end, highest;
5484 int from, to;
5485 int from_byte, to_byte;
5487 CHECK_NUMBER_COERCE_MARKER (start, 0);
5488 CHECK_NUMBER_COERCE_MARKER (end, 1);
5490 validate_region (&start, &end);
5491 from = XINT (start), to = XINT (end);
5492 from_byte = CHAR_TO_BYTE (from);
5493 to_byte = CHAR_TO_BYTE (to);
5495 if (from < GPT && to >= GPT)
5496 move_gap_both (to, to_byte);
5498 return detect_coding_system (BYTE_POS_ADDR (from_byte),
5499 to_byte - from_byte,
5500 !NILP (highest));
5503 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5504 1, 2, 0,
5505 "Detect coding system of the text in STRING.\n\
5506 Return a list of possible coding systems ordered by priority.\n\
5508 If only ASCII characters are found, it returns a list of single element\n\
5509 `undecided' or its subsidiary coding system according to a detected\n\
5510 end-of-line format.\n\
5512 If optional argument HIGHEST is non-nil, return the coding system of\n\
5513 highest priority.")
5514 (string, highest)
5515 Lisp_Object string, highest;
5517 CHECK_STRING (string, 0);
5519 return detect_coding_system (XSTRING (string)->data,
5520 STRING_BYTES (XSTRING (string)),
5521 !NILP (highest));
5524 Lisp_Object
5525 code_convert_region1 (start, end, coding_system, encodep)
5526 Lisp_Object start, end, coding_system;
5527 int encodep;
5529 struct coding_system coding;
5530 int from, to, len;
5532 CHECK_NUMBER_COERCE_MARKER (start, 0);
5533 CHECK_NUMBER_COERCE_MARKER (end, 1);
5534 CHECK_SYMBOL (coding_system, 2);
5536 validate_region (&start, &end);
5537 from = XFASTINT (start);
5538 to = XFASTINT (end);
5540 if (NILP (coding_system))
5541 return make_number (to - from);
5543 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5544 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5546 coding.mode |= CODING_MODE_LAST_BLOCK;
5547 coding.src_multibyte = coding.dst_multibyte
5548 = !NILP (current_buffer->enable_multibyte_characters);
5549 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5550 &coding, encodep, 1);
5551 Vlast_coding_system_used = coding.symbol;
5552 return make_number (coding.produced_char);
5555 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5556 3, 3, "r\nzCoding system: ",
5557 "Decode the current region by specified coding system.\n\
5558 When called from a program, takes three arguments:\n\
5559 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5560 This function sets `last-coding-system-used' to the precise coding system\n\
5561 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5562 not fully specified.)\n\
5563 It returns the length of the decoded text.")
5564 (start, end, coding_system)
5565 Lisp_Object start, end, coding_system;
5567 return code_convert_region1 (start, end, coding_system, 0);
5570 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5571 3, 3, "r\nzCoding system: ",
5572 "Encode the current region by specified coding system.\n\
5573 When called from a program, takes three arguments:\n\
5574 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5575 This function sets `last-coding-system-used' to the precise coding system\n\
5576 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5577 not fully specified.)\n\
5578 It returns the length of the encoded text.")
5579 (start, end, coding_system)
5580 Lisp_Object start, end, coding_system;
5582 return code_convert_region1 (start, end, coding_system, 1);
5585 Lisp_Object
5586 code_convert_string1 (string, coding_system, nocopy, encodep)
5587 Lisp_Object string, coding_system, nocopy;
5588 int encodep;
5590 struct coding_system coding;
5592 CHECK_STRING (string, 0);
5593 CHECK_SYMBOL (coding_system, 1);
5595 if (NILP (coding_system))
5596 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5598 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5599 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5601 coding.mode |= CODING_MODE_LAST_BLOCK;
5602 string = (encodep
5603 ? encode_coding_string (string, &coding, !NILP (nocopy))
5604 : decode_coding_string (string, &coding, !NILP (nocopy)));
5605 Vlast_coding_system_used = coding.symbol;
5607 return string;
5610 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5611 2, 3, 0,
5612 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5613 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5614 if the decoding operation is trivial.\n\
5615 This function sets `last-coding-system-used' to the precise coding system\n\
5616 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5617 not fully specified.)")
5618 (string, coding_system, nocopy)
5619 Lisp_Object string, coding_system, nocopy;
5621 return code_convert_string1 (string, coding_system, nocopy, 0);
5624 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5625 2, 3, 0,
5626 "Encode STRING to CODING-SYSTEM, and return the result.\n\
5627 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5628 if the encoding operation is trivial.\n\
5629 This function sets `last-coding-system-used' to the precise coding system\n\
5630 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5631 not fully specified.)")
5632 (string, coding_system, nocopy)
5633 Lisp_Object string, coding_system, nocopy;
5635 return code_convert_string1 (string, coding_system, nocopy, 1);
5638 /* Encode or decode STRING according to CODING_SYSTEM.
5639 Do not set Vlast_coding_system_used.
5641 This function is called only from macros DECODE_FILE and
5642 ENCODE_FILE, thus we ignore character composition. */
5644 Lisp_Object
5645 code_convert_string_norecord (string, coding_system, encodep)
5646 Lisp_Object string, coding_system;
5647 int encodep;
5649 struct coding_system coding;
5651 CHECK_STRING (string, 0);
5652 CHECK_SYMBOL (coding_system, 1);
5654 if (NILP (coding_system))
5655 return string;
5657 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5658 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5660 coding.composing = COMPOSITION_DISABLED;
5661 coding.mode |= CODING_MODE_LAST_BLOCK;
5662 return (encodep
5663 ? encode_coding_string (string, &coding, 1)
5664 : decode_coding_string (string, &coding, 1));
5667 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5668 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5669 Return the corresponding character.")
5670 (code)
5671 Lisp_Object code;
5673 unsigned char c1, c2, s1, s2;
5674 Lisp_Object val;
5676 CHECK_NUMBER (code, 0);
5677 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5678 if (s1 == 0)
5680 if (s2 < 0x80)
5681 XSETFASTINT (val, s2);
5682 else if (s2 >= 0xA0 || s2 <= 0xDF)
5683 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
5684 else
5685 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5687 else
5689 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5690 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5691 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5692 DECODE_SJIS (s1, s2, c1, c2);
5693 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
5695 return val;
5698 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5699 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5700 Return the corresponding code in SJIS.")
5701 (ch)
5702 Lisp_Object ch;
5704 int charset, c1, c2, s1, s2;
5705 Lisp_Object val;
5707 CHECK_NUMBER (ch, 0);
5708 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5709 if (charset == CHARSET_ASCII)
5711 val = ch;
5713 else if (charset == charset_jisx0208
5714 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5716 ENCODE_SJIS (c1, c2, s1, s2);
5717 XSETFASTINT (val, (s1 << 8) | s2);
5719 else if (charset == charset_katakana_jisx0201
5720 && c1 > 0x20 && c2 < 0xE0)
5722 XSETFASTINT (val, c1 | 0x80);
5724 else
5725 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5726 return val;
5729 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5730 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5731 Return the corresponding character.")
5732 (code)
5733 Lisp_Object code;
5735 int charset;
5736 unsigned char b1, b2, c1, c2;
5737 Lisp_Object val;
5739 CHECK_NUMBER (code, 0);
5740 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5741 if (b1 == 0)
5743 if (b2 >= 0x80)
5744 error ("Invalid BIG5 code: %x", XFASTINT (code));
5745 val = code;
5747 else
5749 if ((b1 < 0xA1 || b1 > 0xFE)
5750 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5751 error ("Invalid BIG5 code: %x", XFASTINT (code));
5752 DECODE_BIG5 (b1, b2, charset, c1, c2);
5753 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
5755 return val;
5758 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5759 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5760 Return the corresponding character code in Big5.")
5761 (ch)
5762 Lisp_Object ch;
5764 int charset, c1, c2, b1, b2;
5765 Lisp_Object val;
5767 CHECK_NUMBER (ch, 0);
5768 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5769 if (charset == CHARSET_ASCII)
5771 val = ch;
5773 else if ((charset == charset_big5_1
5774 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5775 || (charset == charset_big5_2
5776 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5778 ENCODE_BIG5 (charset, c1, c2, b1, b2);
5779 XSETFASTINT (val, (b1 << 8) | b2);
5781 else
5782 error ("Can't encode to Big5: %d", XFASTINT (ch));
5783 return val;
5786 DEFUN ("set-terminal-coding-system-internal",
5787 Fset_terminal_coding_system_internal,
5788 Sset_terminal_coding_system_internal, 1, 1, 0, "")
5789 (coding_system)
5790 Lisp_Object coding_system;
5792 CHECK_SYMBOL (coding_system, 0);
5793 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5794 /* We had better not send unsafe characters to terminal. */
5795 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5796 /* Characer composition should be disabled. */
5797 terminal_coding.composing = COMPOSITION_DISABLED;
5798 terminal_coding.src_multibyte = 1;
5799 terminal_coding.dst_multibyte = 0;
5800 return Qnil;
5803 DEFUN ("set-safe-terminal-coding-system-internal",
5804 Fset_safe_terminal_coding_system_internal,
5805 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5806 (coding_system)
5807 Lisp_Object coding_system;
5809 CHECK_SYMBOL (coding_system, 0);
5810 setup_coding_system (Fcheck_coding_system (coding_system),
5811 &safe_terminal_coding);
5812 /* Characer composition should be disabled. */
5813 safe_terminal_coding.composing = COMPOSITION_DISABLED;
5814 safe_terminal_coding.src_multibyte = 1;
5815 safe_terminal_coding.dst_multibyte = 0;
5816 return Qnil;
5819 DEFUN ("terminal-coding-system",
5820 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5821 "Return coding system specified for terminal output.")
5824 return terminal_coding.symbol;
5827 DEFUN ("set-keyboard-coding-system-internal",
5828 Fset_keyboard_coding_system_internal,
5829 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5830 (coding_system)
5831 Lisp_Object coding_system;
5833 CHECK_SYMBOL (coding_system, 0);
5834 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5835 /* Characer composition should be disabled. */
5836 keyboard_coding.composing = COMPOSITION_DISABLED;
5837 return Qnil;
5840 DEFUN ("keyboard-coding-system",
5841 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5842 "Return coding system specified for decoding keyboard input.")
5845 return keyboard_coding.symbol;
5849 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5850 Sfind_operation_coding_system, 1, MANY, 0,
5851 "Choose a coding system for an operation based on the target name.\n\
5852 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5853 DECODING-SYSTEM is the coding system to use for decoding\n\
5854 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5855 for encoding (in case OPERATION does encoding).\n\
5857 The first argument OPERATION specifies an I/O primitive:\n\
5858 For file I/O, `insert-file-contents' or `write-region'.\n\
5859 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5860 For network I/O, `open-network-stream'.\n\
5862 The remaining arguments should be the same arguments that were passed\n\
5863 to the primitive. Depending on which primitive, one of those arguments\n\
5864 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5865 whichever argument specifies the file name is TARGET.\n\
5867 TARGET has a meaning which depends on OPERATION:\n\
5868 For file I/O, TARGET is a file name.\n\
5869 For process I/O, TARGET is a process name.\n\
5870 For network I/O, TARGET is a service name or a port number\n\
5872 This function looks up what specified for TARGET in,\n\
5873 `file-coding-system-alist', `process-coding-system-alist',\n\
5874 or `network-coding-system-alist' depending on OPERATION.\n\
5875 They may specify a coding system, a cons of coding systems,\n\
5876 or a function symbol to call.\n\
5877 In the last case, we call the function with one argument,\n\
5878 which is a list of all the arguments given to this function.")
5879 (nargs, args)
5880 int nargs;
5881 Lisp_Object *args;
5883 Lisp_Object operation, target_idx, target, val;
5884 register Lisp_Object chain;
5886 if (nargs < 2)
5887 error ("Too few arguments");
5888 operation = args[0];
5889 if (!SYMBOLP (operation)
5890 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5891 error ("Invalid first arguement");
5892 if (nargs < 1 + XINT (target_idx))
5893 error ("Too few arguments for operation: %s",
5894 XSYMBOL (operation)->name->data);
5895 target = args[XINT (target_idx) + 1];
5896 if (!(STRINGP (target)
5897 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5898 error ("Invalid %dth argument", XINT (target_idx) + 1);
5900 chain = ((EQ (operation, Qinsert_file_contents)
5901 || EQ (operation, Qwrite_region))
5902 ? Vfile_coding_system_alist
5903 : (EQ (operation, Qopen_network_stream)
5904 ? Vnetwork_coding_system_alist
5905 : Vprocess_coding_system_alist));
5906 if (NILP (chain))
5907 return Qnil;
5909 for (; CONSP (chain); chain = XCDR (chain))
5911 Lisp_Object elt;
5912 elt = XCAR (chain);
5914 if (CONSP (elt)
5915 && ((STRINGP (target)
5916 && STRINGP (XCAR (elt))
5917 && fast_string_match (XCAR (elt), target) >= 0)
5918 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5920 val = XCDR (elt);
5921 /* Here, if VAL is both a valid coding system and a valid
5922 function symbol, we return VAL as a coding system. */
5923 if (CONSP (val))
5924 return val;
5925 if (! SYMBOLP (val))
5926 return Qnil;
5927 if (! NILP (Fcoding_system_p (val)))
5928 return Fcons (val, val);
5929 if (! NILP (Ffboundp (val)))
5931 val = call1 (val, Flist (nargs, args));
5932 if (CONSP (val))
5933 return val;
5934 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5935 return Fcons (val, val);
5937 return Qnil;
5940 return Qnil;
5943 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5944 Supdate_coding_systems_internal, 0, 0, 0,
5945 "Update internal database for ISO2022 and CCL based coding systems.\n\
5946 When values of any coding categories are changed, you must\n\
5947 call this function")
5950 int i;
5952 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
5954 Lisp_Object val;
5956 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5957 if (!NILP (val))
5959 if (! coding_system_table[i])
5960 coding_system_table[i] = ((struct coding_system *)
5961 xmalloc (sizeof (struct coding_system)));
5962 setup_coding_system (val, coding_system_table[i]);
5964 else if (coding_system_table[i])
5966 xfree (coding_system_table[i]);
5967 coding_system_table[i] = NULL;
5971 return Qnil;
5974 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5975 Sset_coding_priority_internal, 0, 0, 0,
5976 "Update internal database for the current value of `coding-category-list'.\n\
5977 This function is internal use only.")
5980 int i = 0, idx;
5981 Lisp_Object val;
5983 val = Vcoding_category_list;
5985 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5987 if (! SYMBOLP (XCAR (val)))
5988 break;
5989 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
5990 if (idx >= CODING_CATEGORY_IDX_MAX)
5991 break;
5992 coding_priorities[i++] = (1 << idx);
5993 val = XCDR (val);
5995 /* If coding-category-list is valid and contains all coding
5996 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5997 the following code saves Emacs from crashing. */
5998 while (i < CODING_CATEGORY_IDX_MAX)
5999 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
6001 return Qnil;
6004 #endif /* emacs */
6007 /*** 9. Post-amble ***/
6009 void
6010 init_coding ()
6012 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
6015 void
6016 init_coding_once ()
6018 int i;
6020 /* Emacs' internal format specific initialize routine. */
6021 for (i = 0; i <= 0x20; i++)
6022 emacs_code_class[i] = EMACS_control_code;
6023 emacs_code_class[0x0A] = EMACS_linefeed_code;
6024 emacs_code_class[0x0D] = EMACS_carriage_return_code;
6025 for (i = 0x21 ; i < 0x7F; i++)
6026 emacs_code_class[i] = EMACS_ascii_code;
6027 emacs_code_class[0x7F] = EMACS_control_code;
6028 for (i = 0x80; i < 0xFF; i++)
6029 emacs_code_class[i] = EMACS_invalid_code;
6030 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
6031 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
6032 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
6033 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
6035 /* ISO2022 specific initialize routine. */
6036 for (i = 0; i < 0x20; i++)
6037 iso_code_class[i] = ISO_control_0;
6038 for (i = 0x21; i < 0x7F; i++)
6039 iso_code_class[i] = ISO_graphic_plane_0;
6040 for (i = 0x80; i < 0xA0; i++)
6041 iso_code_class[i] = ISO_control_1;
6042 for (i = 0xA1; i < 0xFF; i++)
6043 iso_code_class[i] = ISO_graphic_plane_1;
6044 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
6045 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
6046 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
6047 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
6048 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
6049 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
6050 iso_code_class[ISO_CODE_ESC] = ISO_escape;
6051 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
6052 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
6053 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
6055 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
6057 setup_coding_system (Qnil, &keyboard_coding);
6058 setup_coding_system (Qnil, &terminal_coding);
6059 setup_coding_system (Qnil, &safe_terminal_coding);
6060 setup_coding_system (Qnil, &default_buffer_file_coding);
6062 bzero (coding_system_table, sizeof coding_system_table);
6064 bzero (ascii_skip_code, sizeof ascii_skip_code);
6065 for (i = 0; i < 128; i++)
6066 ascii_skip_code[i] = 1;
6068 #if defined (MSDOS) || defined (WINDOWSNT)
6069 system_eol_type = CODING_EOL_CRLF;
6070 #else
6071 system_eol_type = CODING_EOL_LF;
6072 #endif
6074 inhibit_pre_post_conversion = 0;
6077 #ifdef emacs
6079 void
6080 syms_of_coding ()
6082 Qtarget_idx = intern ("target-idx");
6083 staticpro (&Qtarget_idx);
6085 Qcoding_system_history = intern ("coding-system-history");
6086 staticpro (&Qcoding_system_history);
6087 Fset (Qcoding_system_history, Qnil);
6089 /* Target FILENAME is the first argument. */
6090 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
6091 /* Target FILENAME is the third argument. */
6092 Fput (Qwrite_region, Qtarget_idx, make_number (2));
6094 Qcall_process = intern ("call-process");
6095 staticpro (&Qcall_process);
6096 /* Target PROGRAM is the first argument. */
6097 Fput (Qcall_process, Qtarget_idx, make_number (0));
6099 Qcall_process_region = intern ("call-process-region");
6100 staticpro (&Qcall_process_region);
6101 /* Target PROGRAM is the third argument. */
6102 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
6104 Qstart_process = intern ("start-process");
6105 staticpro (&Qstart_process);
6106 /* Target PROGRAM is the third argument. */
6107 Fput (Qstart_process, Qtarget_idx, make_number (2));
6109 Qopen_network_stream = intern ("open-network-stream");
6110 staticpro (&Qopen_network_stream);
6111 /* Target SERVICE is the fourth argument. */
6112 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
6114 Qcoding_system = intern ("coding-system");
6115 staticpro (&Qcoding_system);
6117 Qeol_type = intern ("eol-type");
6118 staticpro (&Qeol_type);
6120 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
6121 staticpro (&Qbuffer_file_coding_system);
6123 Qpost_read_conversion = intern ("post-read-conversion");
6124 staticpro (&Qpost_read_conversion);
6126 Qpre_write_conversion = intern ("pre-write-conversion");
6127 staticpro (&Qpre_write_conversion);
6129 Qno_conversion = intern ("no-conversion");
6130 staticpro (&Qno_conversion);
6132 Qundecided = intern ("undecided");
6133 staticpro (&Qundecided);
6135 Qcoding_system_p = intern ("coding-system-p");
6136 staticpro (&Qcoding_system_p);
6138 Qcoding_system_error = intern ("coding-system-error");
6139 staticpro (&Qcoding_system_error);
6141 Fput (Qcoding_system_error, Qerror_conditions,
6142 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
6143 Fput (Qcoding_system_error, Qerror_message,
6144 build_string ("Invalid coding system"));
6146 Qcoding_category = intern ("coding-category");
6147 staticpro (&Qcoding_category);
6148 Qcoding_category_index = intern ("coding-category-index");
6149 staticpro (&Qcoding_category_index);
6151 Vcoding_category_table
6152 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6153 staticpro (&Vcoding_category_table);
6155 int i;
6156 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6158 XVECTOR (Vcoding_category_table)->contents[i]
6159 = intern (coding_category_name[i]);
6160 Fput (XVECTOR (Vcoding_category_table)->contents[i],
6161 Qcoding_category_index, make_number (i));
6165 Qtranslation_table = intern ("translation-table");
6166 staticpro (&Qtranslation_table);
6167 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6169 Qtranslation_table_id = intern ("translation-table-id");
6170 staticpro (&Qtranslation_table_id);
6172 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6173 staticpro (&Qtranslation_table_for_decode);
6175 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6176 staticpro (&Qtranslation_table_for_encode);
6178 Qsafe_charsets = intern ("safe-charsets");
6179 staticpro (&Qsafe_charsets);
6181 Qvalid_codes = intern ("valid-codes");
6182 staticpro (&Qvalid_codes);
6184 Qemacs_mule = intern ("emacs-mule");
6185 staticpro (&Qemacs_mule);
6187 Qraw_text = intern ("raw-text");
6188 staticpro (&Qraw_text);
6190 defsubr (&Scoding_system_p);
6191 defsubr (&Sread_coding_system);
6192 defsubr (&Sread_non_nil_coding_system);
6193 defsubr (&Scheck_coding_system);
6194 defsubr (&Sdetect_coding_region);
6195 defsubr (&Sdetect_coding_string);
6196 defsubr (&Sdecode_coding_region);
6197 defsubr (&Sencode_coding_region);
6198 defsubr (&Sdecode_coding_string);
6199 defsubr (&Sencode_coding_string);
6200 defsubr (&Sdecode_sjis_char);
6201 defsubr (&Sencode_sjis_char);
6202 defsubr (&Sdecode_big5_char);
6203 defsubr (&Sencode_big5_char);
6204 defsubr (&Sset_terminal_coding_system_internal);
6205 defsubr (&Sset_safe_terminal_coding_system_internal);
6206 defsubr (&Sterminal_coding_system);
6207 defsubr (&Sset_keyboard_coding_system_internal);
6208 defsubr (&Skeyboard_coding_system);
6209 defsubr (&Sfind_operation_coding_system);
6210 defsubr (&Supdate_coding_systems_internal);
6211 defsubr (&Sset_coding_priority_internal);
6213 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6214 "List of coding systems.\n\
6216 Do not alter the value of this variable manually. This variable should be\n\
6217 updated by the functions `make-coding-system' and\n\
6218 `define-coding-system-alias'.");
6219 Vcoding_system_list = Qnil;
6221 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6222 "Alist of coding system names.\n\
6223 Each element is one element list of coding system name.\n\
6224 This variable is given to `completing-read' as TABLE argument.\n\
6226 Do not alter the value of this variable manually. This variable should be\n\
6227 updated by the functions `make-coding-system' and\n\
6228 `define-coding-system-alias'.");
6229 Vcoding_system_alist = Qnil;
6231 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6232 "List of coding-categories (symbols) ordered by priority.");
6234 int i;
6236 Vcoding_category_list = Qnil;
6237 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6238 Vcoding_category_list
6239 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6240 Vcoding_category_list);
6243 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6244 "Specify the coding system for read operations.\n\
6245 It is useful to bind this variable with `let', but do not set it globally.\n\
6246 If the value is a coding system, it is used for decoding on read operation.\n\
6247 If not, an appropriate element is used from one of the coding system alists:\n\
6248 There are three such tables, `file-coding-system-alist',\n\
6249 `process-coding-system-alist', and `network-coding-system-alist'.");
6250 Vcoding_system_for_read = Qnil;
6252 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6253 "Specify the coding system for write operations.\n\
6254 Programs bind this variable with `let', but you should not set it globally.\n\
6255 If the value is a coding system, it is used for encoding of output,\n\
6256 when writing it to a file and when sending it to a file or subprocess.\n\
6258 If this does not specify a coding system, an appropriate element\n\
6259 is used from one of the coding system alists:\n\
6260 There are three such tables, `file-coding-system-alist',\n\
6261 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6262 For output to files, if the above procedure does not specify a coding system,\n\
6263 the value of `buffer-file-coding-system' is used.");
6264 Vcoding_system_for_write = Qnil;
6266 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6267 "Coding system used in the latest file or process I/O.");
6268 Vlast_coding_system_used = Qnil;
6270 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6271 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6272 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6273 such conversion.");
6274 inhibit_eol_conversion = 0;
6276 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6277 "Non-nil means process buffer inherits coding system of process output.\n\
6278 Bind it to t if the process output is to be treated as if it were a file\n\
6279 read from some filesystem.");
6280 inherit_process_coding_system = 0;
6282 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6283 "Alist to decide a coding system to use for a file I/O operation.\n\
6284 The format is ((PATTERN . VAL) ...),\n\
6285 where PATTERN is a regular expression matching a file name,\n\
6286 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6287 If VAL is a coding system, it is used for both decoding and encoding\n\
6288 the file contents.\n\
6289 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6290 and the cdr part is used for encoding.\n\
6291 If VAL is a function symbol, the function must return a coding system\n\
6292 or a cons of coding systems which are used as above.\n\
6294 See also the function `find-operation-coding-system'\n\
6295 and the variable `auto-coding-alist'.");
6296 Vfile_coding_system_alist = Qnil;
6298 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6299 "Alist to decide a coding system to use for a process I/O operation.\n\
6300 The format is ((PATTERN . VAL) ...),\n\
6301 where PATTERN is a regular expression matching a program name,\n\
6302 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6303 If VAL is a coding system, it is used for both decoding what received\n\
6304 from the program and encoding what sent to the program.\n\
6305 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6306 and the cdr part is used for encoding.\n\
6307 If VAL is a function symbol, the function must return a coding system\n\
6308 or a cons of coding systems which are used as above.\n\
6310 See also the function `find-operation-coding-system'.");
6311 Vprocess_coding_system_alist = Qnil;
6313 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6314 "Alist to decide a coding system to use for a network I/O operation.\n\
6315 The format is ((PATTERN . VAL) ...),\n\
6316 where PATTERN is a regular expression matching a network service name\n\
6317 or is a port number to connect to,\n\
6318 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6319 If VAL is a coding system, it is used for both decoding what received\n\
6320 from the network stream and encoding what sent to the network stream.\n\
6321 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6322 and the cdr part is used for encoding.\n\
6323 If VAL is a function symbol, the function must return a coding system\n\
6324 or a cons of coding systems which are used as above.\n\
6326 See also the function `find-operation-coding-system'.");
6327 Vnetwork_coding_system_alist = Qnil;
6329 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6330 "Coding system to use with system messages.");
6331 Vlocale_coding_system = Qnil;
6333 /* The eol mnemonics are reset in startup.el system-dependently. */
6334 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6335 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6336 eol_mnemonic_unix = build_string (":");
6338 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6339 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6340 eol_mnemonic_dos = build_string ("\\");
6342 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6343 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6344 eol_mnemonic_mac = build_string ("/");
6346 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6347 "*String displayed in mode line when end-of-line format is not yet determined.");
6348 eol_mnemonic_undecided = build_string (":");
6350 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6351 "*Non-nil enables character translation while encoding and decoding.");
6352 Venable_character_translation = Qt;
6354 DEFVAR_LISP ("standard-translation-table-for-decode",
6355 &Vstandard_translation_table_for_decode,
6356 "Table for translating characters while decoding.");
6357 Vstandard_translation_table_for_decode = Qnil;
6359 DEFVAR_LISP ("standard-translation-table-for-encode",
6360 &Vstandard_translation_table_for_encode,
6361 "Table for translating characters while encoding.");
6362 Vstandard_translation_table_for_encode = Qnil;
6364 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6365 "Alist of charsets vs revision numbers.\n\
6366 While encoding, if a charset (car part of an element) is found,\n\
6367 designate it with the escape sequence identifying revision (cdr part of the element).");
6368 Vcharset_revision_alist = Qnil;
6370 DEFVAR_LISP ("default-process-coding-system",
6371 &Vdefault_process_coding_system,
6372 "Cons of coding systems used for process I/O by default.\n\
6373 The car part is used for decoding a process output,\n\
6374 the cdr part is used for encoding a text to be sent to a process.");
6375 Vdefault_process_coding_system = Qnil;
6377 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6378 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6379 This is a vector of length 256.\n\
6380 If Nth element is non-nil, the existence of code N in a file\n\
6381 \(or output of subprocess) doesn't prevent it to be detected as\n\
6382 a coding system of ISO 2022 variant which has a flag\n\
6383 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6384 or reading output of a subprocess.\n\
6385 Only the 128th through 159th elements have a meaning.");
6386 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6388 DEFVAR_LISP ("select-safe-coding-system-function",
6389 &Vselect_safe_coding_system_function,
6390 "Function to call to select safe coding system for encoding a text.\n\
6392 If set, this function is called to force a user to select a proper\n\
6393 coding system which can encode the text in the case that a default\n\
6394 coding system used in each operation can't encode the text.\n\
6396 The default value is `select-safe-coding-system' (which see).");
6397 Vselect_safe_coding_system_function = Qnil;
/* Return the system error message for ERROR_NUMBER.

   The message is obtained from strerror, which is called only after
   synchronize_system_messages_locale ().  If Vlocale_coding_system
   is non-nil, the message is turned into a Lisp string with
   build_string, passed through code_convert_string_norecord together
   with that coding system, and the data of the resulting Lisp string
   is returned instead of the raw strerror result.

   NOTE(review): in the converted case the returned pointer refers to
   the data of the Lisp string DEC, not to storage owned by strerror;
   presumably the caller must use it before that string can be
   reclaimed -- confirm.  */
6401 char *
6402 emacs_strerror (error_number)
6403 int error_number;
6405 char *str;
/* Presumably selects the locale used for system messages before the
   lookup; the helper is defined elsewhere -- confirm.  */
6407 synchronize_system_messages_locale ();
6408 str = strerror (error_number);
/* Convert only when a locale coding system has been configured.  */
6410 if (! NILP (Vlocale_coding_system))
/* NOTE(review): the last argument of this call (embedded line 6414)
   and the brace-only lines are absent from this listing; verify
   them against the original file.  */
6412 Lisp_Object dec = code_convert_string_norecord (build_string (str),
6413 Vlocale_coding_system,
6415 str = (char *) XSTRING (dec)->data;
6418 return str;
6421 #endif /* emacs */