(smerge-auto-leave): New function and variable.
[emacs.git] / src / coding.c
blobfcd5e89e0041e83289247b8ffbedf584e55e4c84
1 /* Coding system handler (conversion, detection, etc.).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. CCL handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
32 9. Post-amble
36 /*** GENERAL NOTE on CODING SYSTEM ***
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting text in some other coding system
41 to Emacs' internal format (emacs-mule), and when we say "encode",
42 it means converting text in the coding system emacs-mule to some other
43 coding system.
45 0. Emacs' internal format (emacs-mule)
47 Emacs itself holds a multi-lingual character in a buffer and a string
48 in a special format. Details are described in section 2.
50 1. ISO2022
52 The most famous coding system for multiple character sets. X's
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
61 section 4.
63 3. BIG5
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
71 4. Raw text
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
76 5. Other
78 If a user wants to read/write a text encoded in a coding system not
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
85 information about it is set in a structure of type `struct
86 coding_system' for rapid processing. See section 6 for more details.
90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
94 whereas DOS's format is two-byte sequence of `carriage-return' and
95 `line-feed' codes. MacOS's format is usually one byte of
96 `carriage-return'.
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
100 any format of end-of-line. So, Emacs has information of format of
101 end-of-line in each coding-system. See section 6 for more details.
105 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
107 These functions check if a text between SRC and SRC_END is encoded
108 in the coding system category XXX. Each returns an integer value in
109 which appropriate flag bits for the category XXX is set. The flag
110 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
111 template of these functions. */
112 #if 0
114 detect_coding_emacs_mule (src, src_end)
115 unsigned char *src, *src_end;
119 #endif
121 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
123 These functions decode SRC_BYTES length text at SOURCE encoded in
124 CODING to Emacs' internal format (emacs-mule). The resulting text
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
133 DST_BYTES zero means that the source area and the destination area
134 overlap, which means that we can produce decoded text only until it
135 reaches the head of the not-yet-decoded source text.
137 Below is a template of these functions. */
138 #if 0
139 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
140 struct coding_system *coding;
141 unsigned char *source, *destination;
142 int src_bytes, dst_bytes;
146 #endif
148 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
152 a place pointed to by DESTINATION, the length of which should not
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
160 DST_BYTES zero means that the source area and the destination area
161 overlap, which means that we can produce encoded text only until it
162 reaches the head of the not-yet-encoded source text.
164 Below is a template of these functions. */
165 #if 0
166 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
167 struct coding_system *coding;
168 unsigned char *source, *destination;
169 int src_bytes, dst_bytes;
173 #endif
175 /*** COMMONLY USED MACROS ***/
177 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
178 THREE_MORE_BYTES safely get one, two, and three bytes from the
179 source text respectively. If there are not enough bytes in the
180 source, they jump to `label_end_of_loop'. The caller should set
181 variables `src' and `src_end' to appropriate areas in advance. */
/* Fetch one byte from `src' into C1 and advance `src' past it.  If
   `src' has already reached `src_end', nothing is consumed and
   control jumps to `label_end_of_loop'.  The variables `src' and
   `src_end' and the label must all exist at the expansion site.  */
183 #define ONE_MORE_BYTE(c1) \
184 do { \
185 if (src < src_end) \
186 c1 = *src++; \
187 else \
188 goto label_end_of_loop; \
189 } while (0)
/* Fetch two bytes from `src' into C1 and C2 (in that order) and
   advance `src' past both.  If fewer than two bytes remain before
   `src_end', nothing is consumed and control jumps to
   `label_end_of_loop'.  The variables `src' and `src_end' and the
   label must all exist at the expansion site.  */
191 #define TWO_MORE_BYTES(c1, c2) \
192 do { \
193 if (src + 1 < src_end) \
194 c1 = *src++, c2 = *src++; \
195 else \
196 goto label_end_of_loop; \
197 } while (0)
/* Fetch three bytes from `src' into C1, C2, and C3 (in that order)
   and advance `src' past all of them.  If fewer than three bytes
   remain before `src_end', nothing is consumed and control jumps to
   `label_end_of_loop'.  The variables `src' and `src_end' and the
   label must all exist at the expansion site.  */
199 #define THREE_MORE_BYTES(c1, c2, c3) \
200 do { \
201 if (src + 2 < src_end) \
202 c1 = *src++, c2 = *src++, c3 = *src++; \
203 else \
204 goto label_end_of_loop; \
205 } while (0)
207 /* The following three macros DECODE_CHARACTER_ASCII,
208 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
209 the multi-byte form of a character of each class at the place
210 pointed by `dst'. The caller should set the variable `dst' to
211 point to an appropriate area and the variable `coding' to point to
212 the coding-system of the currently decoding text in advance. */
214 /* Decode one ASCII character C: store C with its high bit cleared
   (C & 0x7F) at `dst', advance `dst' by one byte, and count one more
   character in coding->produced_char.  The variables `dst' and
   `coding' must exist at the expansion site.  */
216 #define DECODE_CHARACTER_ASCII(c) \
217 do { \
218 *dst++ = (c) & 0x7F; \
219 coding->produced_char++; \
220 } while (0)
222 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
223 position-code is C.

   The bytes written at `dst' are, in order: the base leading code of
   CHARSET, the extended leading code of CHARSET (only when
   CHARSET_LEADING_CODE_EXT yields a value greater than zero), and C
   with bit 0x80 set.  `dst' is advanced past the written bytes and
   coding->produced_char is incremented once.  Note that CHARSET is
   evaluated twice.  The variables `dst' and `coding' must exist at
   the expansion site.  */
225 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
226 do { \
227 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
229 *dst++ = leading_code; \
230 if ((leading_code = CHARSET_LEADING_CODE_EXT (charset)) > 0) \
231 *dst++ = leading_code; \
232 *dst++ = (c) | 0x80; \
233 coding->produced_char++; \
234 } while (0)
236 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
237 position-codes are C1 and C2.

   The leading code(s) and C1 are emitted by
   DECODE_CHARACTER_DIMENSION1, which also performs the single
   increment of coding->produced_char; this macro then appends C2
   with bit 0x80 set.  */
239 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
240 do { \
241 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
242 *dst++ = (c2) | 0x80; \
243 } while (0)
246 /*** 1. Preamble ***/
248 #ifdef emacs
249 #include <config.h>
250 #endif
252 #include <stdio.h>
254 #ifdef emacs
256 #include "lisp.h"
257 #include "buffer.h"
258 #include "charset.h"
259 #include "composite.h"
260 #include "ccl.h"
261 #include "coding.h"
262 #include "window.h"
264 #else /* not emacs */
266 #include "mulelib.h"
268 #endif /* not emacs */
/* Lisp objects used by the coding-system code.  NOTE(review): going
   by the Q<name> naming, these presumably hold the Lisp symbols of
   the corresponding names (e.g. `coding-system', `eol-type'); their
   initialization is not visible in this part of the file, so confirm
   before relying on that.  */
270 Lisp_Object Qcoding_system, Qeol_type;
271 Lisp_Object Qbuffer_file_coding_system;
272 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
273 Lisp_Object Qno_conversion, Qundecided;
274 Lisp_Object Qcoding_system_history;
275 Lisp_Object Qsafe_charsets;
276 Lisp_Object Qvalid_codes;
/* These two are only declared here; they are defined in another
   file.  */
278 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
279 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
280 Lisp_Object Qstart_process, Qopen_network_stream;
281 Lisp_Object Qtarget_idx;
/* NOTE(review): presumably the value of a Lisp variable naming a
   function used to select a safe coding system -- confirm against
   the place where it is registered.  */
283 Lisp_Object Vselect_safe_coding_system_function;
285 /* Mnemonic string for each format of end-of-line. */
286 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
287 /* Mnemonic string to indicate format of end-of-line is not yet
288 decided. */
289 Lisp_Object eol_mnemonic_undecided;
291 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
292 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
293 int system_eol_type;
295 #ifdef emacs
297 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
299 Lisp_Object Qcoding_system_p, Qcoding_system_error;
301 /* Coding system emacs-mule and raw-text are for converting only
302 end-of-line format. */
303 Lisp_Object Qemacs_mule, Qraw_text;
305 /* Coding-systems are handed between Emacs Lisp programs and C internal
306 routines by the following three variables. */
307 /* Coding-system for reading files and receiving data from process. */
308 Lisp_Object Vcoding_system_for_read;
309 /* Coding-system for writing files and sending data to process. */
310 Lisp_Object Vcoding_system_for_write;
311 /* Coding-system actually used in the latest I/O. */
312 Lisp_Object Vlast_coding_system_used;
314 /* A vector of length 256 which contains information about special
315 Latin codes (especially for dealing with Microsoft codes). */
316 Lisp_Object Vlatin_extra_code_table;
318 /* Flag to inhibit code conversion of end-of-line format. */
319 int inhibit_eol_conversion;
321 /* Flag to make buffer-file-coding-system inherit from process-coding. */
322 int inherit_process_coding_system;
324 /* Coding system to be used to encode text for terminal display. */
325 struct coding_system terminal_coding;
327 /* Coding system to be used to encode text for terminal display when
328 terminal coding system is nil. */
329 struct coding_system safe_terminal_coding;
331 /* Coding system of what is sent from terminal keyboard. */
332 struct coding_system keyboard_coding;
334 /* Default coding system to be used to write a file. */
335 struct coding_system default_buffer_file_coding;
337 Lisp_Object Vfile_coding_system_alist;
338 Lisp_Object Vprocess_coding_system_alist;
339 Lisp_Object Vnetwork_coding_system_alist;
341 Lisp_Object Vlocale_coding_system;
343 #endif /* emacs */
345 Lisp_Object Qcoding_category, Qcoding_category_index;
347 /* List of symbols `coding-category-xxx' ordered by priority. */
348 Lisp_Object Vcoding_category_list;
350 /* Table of coding categories (Lisp symbols). */
351 Lisp_Object Vcoding_category_table;
353 /* Table of the symbol names of the coding categories.  The array
   has CODING_CATEGORY_IDX_MAX slots and fifteen names are listed.
   NOTE(review): the order presumably has to match the numbering of
   the CODING_CATEGORY_IDX_XXX constants -- confirm against
   `coding.h' before reordering or adding entries.  */
354 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
355 "coding-category-emacs-mule",
356 "coding-category-sjis",
357 "coding-category-iso-7",
358 "coding-category-iso-7-tight",
359 "coding-category-iso-8-1",
360 "coding-category-iso-8-2",
361 "coding-category-iso-7-else",
362 "coding-category-iso-8-else",
363 "coding-category-ccl",
364 "coding-category-big5",
365 "coding-category-utf-8",
366 "coding-category-utf-16-be",
367 "coding-category-utf-16-le",
368 "coding-category-raw-text",
369 "coding-category-binary"
372 /* Table of pointers to coding systems corresponding to each coding
373 categories. */
374 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
376 /* Table of coding category masks. Nth element is a mask for a coding
377 category whose priority is Nth. */
378 static
379 int coding_priorities[CODING_CATEGORY_IDX_MAX];
381 /* Flag to tell if we look up translation table on character code
382 conversion. */
383 Lisp_Object Venable_character_translation;
384 /* Standard translation table to look up on decoding (reading). */
385 Lisp_Object Vstandard_translation_table_for_decode;
386 /* Standard translation table to look up on encoding (writing). */
387 Lisp_Object Vstandard_translation_table_for_encode;
389 Lisp_Object Qtranslation_table;
390 Lisp_Object Qtranslation_table_id;
391 Lisp_Object Qtranslation_table_for_decode;
392 Lisp_Object Qtranslation_table_for_encode;
394 /* Alist of charsets vs revision number. */
395 Lisp_Object Vcharset_revision_alist;
397 /* Default coding systems used for process I/O. */
398 Lisp_Object Vdefault_process_coding_system;
400 /* Global flag to tell that we can't call post-read-conversion and
401 pre-write-conversion functions. Usually the value is zero, but it
402 is set to 1 temporarily while such functions are running. This is
403 to avoid infinite recursive call. */
404 static int inhibit_pre_post_conversion;
407 /*** 2. Emacs internal format (emacs-mule) handlers ***/
409 /* Emacs' internal format for encoding multiple character sets is a
410 kind of multi-byte encoding, i.e. characters are encoded by
411 variable-length sequences of one-byte codes. ASCII characters
412 and control characters (e.g. `tab', `newline') are represented by
413 one-byte sequences which are their ASCII codes, in the range 0x00
414 through 0x7F. The other characters are represented by a sequence
415 of `base leading-code', optional `extended leading-code', and one
416 or two `position-code's. The length of the sequence is determined
417 by the base leading-code. Leading-code takes the range 0x80
418 through 0x9F, whereas extended leading-code and position-code take
419 the range 0xA0 through 0xFF. See `charset.h' for more details
420 about leading-code and position-code.
422 --- CODE RANGE of Emacs' internal format ---
423 (character set) (range)
424 ASCII 0x00 .. 0x7F
425 ELSE (1st byte) 0x81 .. 0x9F
426 (rest bytes) 0xA0 .. 0xFF
427 ---------------------------------------------
431 enum emacs_code_class_type emacs_code_class[256];
433 /* Go to the next statement only if *SRC is accessible and the code
434 there is 0xA0 or greater; SRC is then advanced past that code.
   If SRC has already reached SRC_END, jump to `label_end_of_switch'
   without reading anything.  If the code is less than 0xA0, make the
   enclosing function return 0 (SRC has been advanced in that case
   too, but the function is left immediately).  */
435 #define CHECK_CODE_RANGE_A0_FF \
436 do { \
437 if (src >= src_end) \
438 goto label_end_of_switch; \
439 else if (*src++ < 0xA0) \
440 return 0; \
441 } while (0)
443 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
444 Check if a text is encoded in Emacs' internal format. If it is,
445 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.

   The bytes from SRC up to (not including) SRC_END are scanned one
   character at a time.  0 is returned as soon as one of these is
   seen: the control code ESC, SI, or SO; a byte classified as
   EMACS_invalid_code; or a byte below 0xA0 where a leading code
   requires a following byte (see CHECK_CODE_RANGE_A0_FF).  Running
   out of input in the middle of a multi-byte sequence is not treated
   as an error: the scan simply ends and the text is accepted.  */
448 detect_coding_emacs_mule (src, src_end)
449 unsigned char *src, *src_end;
451 unsigned char c;
452 int composing = 0; /* Nonzero after the `case 0x80' branch below has been taken.  */
454 while (src < src_end)
456 c = *src++;
/* While composing, a byte of 0xA0 or above is shifted down by 0x20
   before it is classified; a byte below 0xA0 ends the composing
   state and is classified as is.  */
458 if (composing)
460 if (c < 0xA0)
461 composing = 0;
462 else
463 c -= 0x20;
466 switch (emacs_code_class[c])
468 case EMACS_ascii_code:
469 case EMACS_linefeed_code:
470 break;
/* ESC, SI, and SO are rejected here.  */
472 case EMACS_control_code:
473 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
474 return 0;
475 break;
477 case EMACS_invalid_code:
478 return 0;
/* The three cases below cascade so that leading_code_4 checks three
   following bytes, leading_code_3 two, and leading_code_2 one.  */
480 case EMACS_leading_code_4:
481 CHECK_CODE_RANGE_A0_FF;
482 /* fall down to check it two more times ... */
484 case EMACS_leading_code_3:
485 CHECK_CODE_RANGE_A0_FF;
486 /* fall down to check it one more time ... */
488 case EMACS_leading_code_2:
489 CHECK_CODE_RANGE_A0_FF;
490 break;
/* NOTE(review): the switch operand is the class emacs_code_class[c],
   not the byte C itself, so this label is reached only if the class
   table stores the value 0x80 for some byte -- confirm against the
   table's initialization.  */
492 case 0x80: /* Old leading code for a composite character. */
493 if (composing)
494 CHECK_CODE_RANGE_A0_FF;
495 else
496 composing = 1;
497 break;
/* `label_end_of_switch' is the target CHECK_CODE_RANGE_A0_FF jumps
   to when the input is exhausted.  */
499 default:
500 label_end_of_switch:
501 break;
504 return CODING_CATEGORY_MASK_EMACS_MULE;
508 /*** 3. ISO2022 handlers ***/
510 /* The following note describes the coding system ISO2022 briefly.
511 Since the intention of this note is to help understand the
512 functions in this file, some parts are NOT ACCURATE or OVERLY
513 SIMPLIFIED. For thorough understanding, please refer to the
514 original document of ISO2022.
516 ISO2022 provides many mechanisms to encode several character sets
517 in 7-bit and 8-bit environments. For 7-bit environments, all text
518 is encoded using bytes less than 128. This may make the encoded
519 text a little bit longer, but the text passes more easily through
520 several gateways, some of which strip off the MSB (Most Significant Bit).
522 There are two kinds of character sets: control character set and
523 graphic character set. The former contains control characters such
524 as `newline' and `escape' to provide control functions (control
525 functions are also provided by escape sequences). The latter
526 contains graphic characters such as 'A' and '-'. Emacs recognizes
527 two control character sets and many graphic character sets.
529 Graphic character sets are classified into one of the following
530 four classes, according to the number of bytes (DIMENSION) and
531 number of characters in one dimension (CHARS) of the set:
532 - DIMENSION1_CHARS94
533 - DIMENSION1_CHARS96
534 - DIMENSION2_CHARS94
535 - DIMENSION2_CHARS96
537 In addition, each character set is assigned an identification tag,
538 unique for each set, called "final character" (denoted as <F>
539 hereafter). The <F> of each character set is decided by ECMA(*)
540 when it is registered in ISO. The code range of <F> is 0x30..0x7F
541 (0x30..0x3F are for private use only).
543 Note (*): ECMA = European Computer Manufacturers Association
545 Here are examples of graphic character set [NAME(<F>)]:
546 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
547 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
548 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
549 o DIMENSION2_CHARS96 -- none for the moment
551 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
552 C0 [0x00..0x1F] -- control character plane 0
553 GL [0x20..0x7F] -- graphic character plane 0
554 C1 [0x80..0x9F] -- control character plane 1
555 GR [0xA0..0xFF] -- graphic character plane 1
557 A control character set is directly designated and invoked to C0 or
558 C1 by an escape sequence. The most common case is that:
559 - ISO646's control character set is designated/invoked to C0, and
560 - ISO6429's control character set is designated/invoked to C1,
561 and usually these designations/invocations are omitted in encoded
562 text. In a 7-bit environment, only C0 can be used, and a control
563 character for C1 is encoded by an appropriate escape sequence to
564 fit into the environment. All control characters for C1 are
565 defined to have corresponding escape sequences.
567 A graphic character set is at first designated to one of four
568 graphic registers (G0 through G3), then these graphic registers are
569 invoked to GL or GR. These designations and invocations can be
570 done independently. The most common case is that G0 is invoked to
571 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
572 these invocations and designations are omitted in encoded text.
573 In a 7-bit environment, only GL can be used.
575 When a graphic character set of CHARS94 is invoked to GL, codes
576 0x20 and 0x7F of the GL area work as control characters SPACE and
577 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
578 be used.
580 There are two ways of invocation: locking-shift and single-shift.
581 With locking-shift, the invocation lasts until the next different
582 invocation, whereas with single-shift, the invocation affects the
583 following character only and doesn't affect the locking-shift
584 state. Invocations are done by the following control characters or
585 escape sequences:
587 ----------------------------------------------------------------------
588 abbrev function cntrl escape seq description
589 ----------------------------------------------------------------------
590 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
591 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
592 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
593 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
594 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
595 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
596 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
597 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
598 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
599 ----------------------------------------------------------------------
600 (*) These are not used by any known coding system.
602 Control characters for these functions are defined by macros
603 ISO_CODE_XXX in `coding.h'.
605 Designations are done by the following escape sequences:
606 ----------------------------------------------------------------------
607 escape sequence description
608 ----------------------------------------------------------------------
609 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
610 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
611 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
612 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
613 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
614 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
615 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
616 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
617 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
618 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
619 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
620 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
621 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
622 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
623 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
624 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
625 ----------------------------------------------------------------------
627 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
628 of dimension 1, chars 94, and final character <F>, etc...
630 Note (*): Although these designations are not allowed in ISO2022,
631 Emacs accepts them on decoding, and produces them on encoding
632 CHARS96 character sets in a coding system which is characterized as
633 7-bit environment, non-locking-shift, and non-single-shift.
635 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
636 '(' can be omitted. We refer to this as "short-form" hereafter.
638 Now you may notice that there are a lot of ways for encoding the
639 same multilingual text in ISO2022. Actually, there exist many
640 coding systems such as Compound Text (used in X11's inter-client
641 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
642 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
643 localized platforms), and all of these are variants of ISO2022.
645 In addition to the above, Emacs handles two more kinds of escape
646 sequences: ISO6429's direction specification and Emacs' private
647 sequence for specifying character composition.
649 ISO6429's direction specification takes the following form:
650 o CSI ']' -- end of the current direction
651 o CSI '0' ']' -- end of the current direction
652 o CSI '1' ']' -- start of left-to-right text
653 o CSI '2' ']' -- start of right-to-left text
654 The control character CSI (0x9B: control sequence introducer) is
655 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
657 Character composition specification takes the following form:
658 o ESC '0' -- start relative composition
659 o ESC '1' -- end composition
660 o ESC '2' -- start rule-base composition (*)
661 o ESC '3' -- start relative composition with alternate chars (**)
662 o ESC '4' -- start rule-base composition with alternate chars (**)
663 Since these are not standard escape sequences of any ISO standard,
664 the use of them with these meanings is restricted to Emacs only.
666 (*) This form is used only in Emacs 20.5 and the older versions,
667 but the newer versions can safely decode it.
668 (**) This form is used only in Emacs 21.1 and the newer versions,
669 and the older versions can't decode it.
671 Here's a list of example usages of these composition escape
672 sequences (categorized by `enum composition_method').
674 COMPOSITION_RELATIVE:
675 ESC 0 CHAR [ CHAR ] ESC 1
676 COMPOSITION_WITH_RULE:
677 ESC 2 CHAR [ RULE CHAR ] ESC 1
678 COMPOSITION_WITH_ALTCHARS:
679 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
680 COMPOSITION_WITH_RULE_ALTCHARS:
681 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
683 enum iso_code_class_type iso_code_class[256];
/* Nonzero if the coding system registered for category index IDX can
   handle CHARSET.  That requires coding_system_table[IDX] to be
   non-null and, in addition, either a nonzero safe_charsets[CHARSET]
   entry in that coding system or a requested designation for CHARSET
   other than CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.  IDX is
   evaluated up to three times and CHARSET up to twice.  */
685 #define CHARSET_OK(idx, charset) \
686 (coding_system_table[idx] \
687 && (coding_system_table[idx]->safe_charsets[charset] \
688 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION \
689 (coding_system_table[idx], charset) \
690 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
/* Nonzero if the coding system registered for category index IDX has
   a non-negative initial designation for register 1 (presumably
   graphic register G1 -- confirm against `coding.h').
   NOTE(review): unlike CHARSET_OK, no null check of
   coding_system_table[IDX] is visible here; confirm that the entry is
   always set, or that the CODING_SPEC macro copes with null.  */
692 #define SHIFT_OUT_OK(idx) \
693 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
695 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
696 Check if a text is encoded in ISO2022. If it is, return an
697 integer in which the appropriate flag bits, any of:
698 CODING_CATEGORY_MASK_ISO_7
699 CODING_CATEGORY_MASK_ISO_7_TIGHT
700 CODING_CATEGORY_MASK_ISO_8_1
701 CODING_CATEGORY_MASK_ISO_8_2
702 CODING_CATEGORY_MASK_ISO_7_ELSE
703 CODING_CATEGORY_MASK_ISO_8_ELSE
704 are set. If a code which should never appear in ISO2022 is found,
705 return 0.

   Two masks are maintained while scanning the bytes from SRC up to
   (not including) SRC_END.  MASK starts as CODING_CATEGORY_MASK_ISO
   and loses the bits of categories that a byte rules out; MASK_FOUND
   starts empty and collects the bits of categories for which
   evidence was seen.  The result is MASK & MASK_FOUND, so a category
   is reported only when it was both seen and never ruled out.  Text
   that contains no ISO2022-specific byte therefore yields 0 as
   well.  */
708 detect_coding_iso2022 (src, src_end)
709 unsigned char *src, *src_end;
711 int mask = CODING_CATEGORY_MASK_ISO;
712 int mask_found = 0;
/* NOTE(review): `shift_out' is never assigned after this
   initialization, and `i' is not used anywhere in this function as
   shown.  */
713 int reg[4], shift_out = 0, single_shifting = 0;
714 int c, c1, i, charset;
/* REG[n] records the charset designated to register n by the escape
   sequences seen so far; -1 means nothing has been designated.  */
716 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
/* The scan stops early once every category has been ruled out.  */
717 while (mask && src < src_end)
719 c = *src++;
720 switch (c)
722 case ISO_CODE_ESC:
723 single_shifting = 0;
/* Input ends right after ESC: nothing more to learn from it.  */
724 if (src >= src_end)
725 break;
726 c = *src++;
727 if (c >= '(' && c <= '/')
729 /* Designation sequence for a charset of dimension 1. */
730 if (src >= src_end)
731 break;
732 c1 = *src++;
/* The middle index of iso_charset_table is 0 for the intermediate
   characters '(' ')' '*' '+' and 1 for ',' '-' '.' '/'.  */
733 if (c1 < ' ' || c1 >= 0x80
734 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
735 /* Invalid designation sequence. Just ignore. */
736 break;
/* Both groups of four intermediate characters map onto registers
   0 through 3.  */
737 reg[(c - '(') % 4] = charset;
739 else if (c == '$')
741 /* Designation sequence for a charset of dimension 2. */
742 if (src >= src_end)
743 break;
744 c = *src++;
/* Short form: the final character directly follows ESC '$' and the
   charset goes to register 0.  */
745 if (c >= '@' && c <= 'B')
746 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
747 reg[0] = charset = iso_charset_table[1][0][c];
748 else if (c >= '(' && c <= '/')
750 if (src >= src_end)
751 break;
752 c1 = *src++;
753 if (c1 < ' ' || c1 >= 0x80
754 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
755 /* Invalid designation sequence. Just ignore. */
756 break;
757 reg[(c - '(') % 4] = charset;
759 else
760 /* Invalid designation sequence. Just ignore. */
761 break;
763 else if (c == 'N' || c == 'O')
765 /* ESC <Fe> for SS2 or SS3. */
/* Only the ISO_7_ELSE category survives this sequence.  */
766 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
767 break;
769 else if (c >= '0' && c <= '4')
771 /* ESC <Fp> for start/end composition. */
772 mask_found |= CODING_CATEGORY_MASK_ISO;
773 break;
775 else
776 /* Invalid escape sequence. Just ignore. */
777 break;
779 /* We found a valid designation sequence for CHARSET. */
/* A designation sequence rules out the categories in
   CODING_CATEGORY_MASK_ISO_8BIT.  Each of the four categories tested
   below is then either marked as found or ruled out, depending on
   whether its coding system accepts CHARSET.  */
780 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
781 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
782 mask_found |= CODING_CATEGORY_MASK_ISO_7;
783 else
784 mask &= ~CODING_CATEGORY_MASK_ISO_7;
785 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
786 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
787 else
788 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
789 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
790 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
791 else
792 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
793 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
794 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
795 else
796 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
797 break;
799 case ISO_CODE_SO:
800 single_shifting = 0;
/* SO counts as a locking shift only if something has been designated
   to register 1 or one of the two `else' categories has an initial
   designation for it.  */
801 if (shift_out == 0
802 && (reg[1] >= 0
803 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
804 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
806 /* Locking shift out. */
807 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
808 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
810 break;
812 case ISO_CODE_SI:
813 single_shifting = 0;
/* NOTE(review): since `shift_out' is never set to 1 in this function
   as shown, this condition cannot become true -- confirm whether the
   SO branch above was meant to record the shift-out state.  */
814 if (shift_out == 1)
816 /* Locking shift in. */
817 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
818 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
820 break;
/* CSI clears the single-shift state and then shares the handling
   below with SS2 and SS3.  */
822 case ISO_CODE_CSI:
823 single_shifting = 0;
824 case ISO_CODE_SS2:
825 case ISO_CODE_SS3:
/* After this byte only ISO_8_ELSE remains possible, plus ISO_8_1 and
   ISO_8_2 when their coding systems allow single shift (SS2/SS3
   only) or treat this code as a Latin extra code.
   NOTE(review): coding_system_table[...] is dereferenced here
   without the null check that CHARSET_OK performs -- confirm the
   entries are always set.  */
827 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
829 if (c != ISO_CODE_CSI)
831 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
832 & CODING_FLAG_ISO_SINGLE_SHIFT)
833 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
834 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
835 & CODING_FLAG_ISO_SINGLE_SHIFT)
836 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
837 single_shifting = 1;
839 if (VECTORP (Vlatin_extra_code_table)
840 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
842 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
843 & CODING_FLAG_ISO_LATIN_EXTRA)
844 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
845 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
846 & CODING_FLAG_ISO_LATIN_EXTRA)
847 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
849 mask &= newmask;
850 mask_found |= newmask;
852 break;
854 default:
/* Codes below 0x80 carry no information for detection.  */
855 if (c < 0x80)
857 single_shifting = 0;
858 break;
/* Remaining codes 0x80..0x9F: acceptable only when the Latin extra
   code table has a non-nil entry for the code; otherwise the text is
   rejected.  */
860 else if (c < 0xA0)
862 single_shifting = 0;
863 if (VECTORP (Vlatin_extra_code_table)
864 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
866 int newmask = 0;
868 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
869 & CODING_FLAG_ISO_LATIN_EXTRA)
870 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
871 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
872 & CODING_FLAG_ISO_LATIN_EXTRA)
873 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
874 mask &= newmask;
875 mask_found |= newmask;
877 else
878 return 0;
/* Codes 0xA0..0xFF.  */
880 else
/* SRC already points just past C, so SRC_BEGIN marks the second byte
   of the run.  */
882 unsigned char *src_begin = src;
884 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
885 | CODING_CATEGORY_MASK_ISO_7_ELSE);
886 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
887 /* Check the length of succeeding codes of the range
888 0xA0..0xFF. If the byte length is odd, we exclude
889 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
890 when we are not single shifting. */
891 if (!single_shifting)
893 while (src < src_end && *src >= 0xA0)
894 src++;
/* (src - src_begin - 1) is odd exactly when the whole run, C
   included, has an odd length.  An odd run that was cut off by the
   end of the input is not held against ISO_8_2.  */
895 if ((src - src_begin - 1) & 1 && src < src_end)
896 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
897 else
898 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
901 break;
/* Report only categories that were both seen and never ruled out.  */
905 return (mask & mask_found);
908 /* Decode a character of which charset is CHARSET and the 1st position
909 code is C1. If dimension of CHARSET is 2, the 2nd position code is
910 fetched from SRC and set to C2. If CHARSET is negative, it means
911 that we are decoding ill formed text, and what we can do is just to
912 read C1 as is.

914 If we are now in the middle of composition sequence, the decoded
915 character may be ALTCHAR (see the comment above). In that case,
916 the character goes to coding->cmp_data->data instead of DST.

   Details of the expansion:

   - The expansion site must provide the variables `c2', `src',
     `src_end', `dst', `coding', and `translation_table', and the
     label `label_end_of_loop' (the target of ONE_MORE_BYTE when the
     2nd position code is missing).  C1 is also passed to SPLIT_CHAR,
     which presumably stores into it, so it should be a plain
     variable.

   - If the fetched 2nd position code, masked with 0x7F, is
     classified as neither ISO_0x20_or_0x7F nor ISO_graphic_plane_0,
     it is pushed back (`src' is decremented) and C1 is decoded as an
     ASCII character instead.

   - If `translation_table' is non-nil and translate_char returns a
     non-negative character, that character replaces the charset and
     position codes before anything is written.

   - The multi-byte form is written to `dst' when not composing, or
     when composing with COMPOSITION_RELATIVE or
     COMPOSITION_WITH_RULE.  When composing with any method other than
     COMPOSITION_RELATIVE, the character is also added as a
     composition component, and coding->composition_rule_follows is
     set to nonzero unless the method is
     COMPOSITION_WITH_ALTCHARS.  */
918 #define DECODE_ISO_CHARACTER(charset, c1) \
919 do { \
920 int c_alt = -1, charset_alt = (charset); \
921 if (charset_alt >= 0) \
923 if (CHARSET_DIMENSION (charset_alt) == 2) \
925 ONE_MORE_BYTE (c2); \
926 if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F \
927 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0) \
929 src--; \
930 charset_alt = CHARSET_ASCII; \
933 if (!NILP (translation_table) \
934 && ((c_alt = translate_char (translation_table, \
935 -1, charset_alt, c1, c2)) >= 0)) \
936 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
938 if (! COMPOSING_P (coding) \
939 || coding->composing == COMPOSITION_RELATIVE \
940 || coding->composing == COMPOSITION_WITH_RULE) \
942 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
943 DECODE_CHARACTER_ASCII (c1); \
944 else if (CHARSET_DIMENSION (charset_alt) == 1) \
945 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
946 else \
947 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
949 if (COMPOSING_P (coding) \
950 && coding->composing != COMPOSITION_RELATIVE) \
952 if (c_alt < 0) \
953 c_alt = MAKE_CHAR (charset_alt, c1, c2); \
954 CODING_ADD_COMPOSITION_COMPONENT (coding, c_alt); \
955 coding->composition_rule_follows \
956 = coding->composing != COMPOSITION_WITH_ALTCHARS; \
958 } while (0)
/* Record in CODING that the charset identified by DIMENSION, CHARS and
   FINAL_CHAR is designated to graphic register REG.  Jumps to
   label_invalid_code when FINAL_CHAR is out of range, when no such
   charset is known, or when the designation is neither the requested
   one for that charset nor covered by coding->safe_charsets.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
  do {                                                                  \
    int desig_reg_ = (reg);                                             \
    int desig_final_ = (final_char);                                    \
    int desig_charset_;                                                 \
                                                                        \
    if (desig_final_ < '0' || desig_final_ >= 128)                      \
      goto label_invalid_code;                                          \
    desig_charset_ = ISO_CHARSET_TABLE (make_number (dimension),        \
                                        make_number (chars),            \
                                        make_number (desig_final_));    \
    if (desig_charset_ < 0                                              \
        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, desig_charset_) \
            != desig_reg_                                               \
            && !coding->safe_charsets[desig_charset_]))                 \
      {                                                                 \
        coding->spec.iso2022.last_invalid_designation_register          \
          = desig_reg_;                                                 \
        goto label_invalid_code;                                        \
      }                                                                 \
    if (coding->spec.iso2022.last_invalid_designation_register == 0     \
        && desig_reg_ == 0                                              \
        && desig_charset_ == CHARSET_ASCII)                             \
      {                                                                 \
        /* We should insert this designation sequence as is so         \
           that it is surely written back to a file.  */                \
        coding->spec.iso2022.last_invalid_designation_register = -1;    \
        goto label_invalid_code;                                        \
      }                                                                 \
    coding->spec.iso2022.last_invalid_designation_register = -1;        \
    if ((coding->mode & CODING_MODE_DIRECTION)                          \
        && CHARSET_REVERSE_CHARSET (desig_charset_) >= 0)               \
      desig_charset_ = CHARSET_REVERSE_CHARSET (desig_charset_);        \
    CODING_SPEC_ISO_DESIGNATION (coding, desig_reg_) = desig_charset_;  \
  } while (0)
996 /* Allocate a memory block for storing information about compositions.
997 The block is chained to the already allocated blocks. */
999 static void
1000 coding_allocate_composition_data (coding, char_offset)
1001 struct coding_system *coding;
1002 int char_offset;
1004 struct composition_data *cmp_data
1005 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1007 cmp_data->char_offset = char_offset;
1008 cmp_data->used = 0;
1009 cmp_data->prev = coding->cmp_data;
1010 cmp_data->next = NULL;
1011 if (coding->cmp_data)
1012 coding->cmp_data->next = cmp_data;
1013 coding->cmp_data = cmp_data;
1014 coding->cmp_data_start = 0;
/* Open a new four-slot composition record in CODING's current block:
   slot 0 is set to -1, slot 1 to the starting position START, and
   slot 3 to METHOD.  Slot 2 is left untouched here.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)             \
  do {                                                                  \
    struct composition_data *cmp_blk_ = (coding)->cmp_data;             \
    int cmp_head_ = cmp_blk_->used;                                     \
    (coding)->cmp_data_start = cmp_head_;                               \
    cmp_blk_->data[cmp_head_] = -1;                                     \
    cmp_blk_->data[cmp_head_ + 1] = cmp_blk_->char_offset + (start);    \
    cmp_blk_->data[cmp_head_ + 3] = (int) method;                       \
    cmp_blk_->used = cmp_head_ + 4;                                     \
  } while (0)
/* Close the composition record that begins at coding->cmp_data_start:
   store its length in slot 0 and the ending position END in slot 2.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                         \
  do {                                                                  \
    struct composition_data *cmp_blk_ = (coding)->cmp_data;             \
    int cmp_head_ = (coding)->cmp_data_start;                           \
    cmp_blk_->data[cmp_head_] = cmp_blk_->used - cmp_head_;             \
    cmp_blk_->data[cmp_head_ + 2] = cmp_blk_->char_offset + (end);      \
  } while (0)
/* Append one COMPONENT (an alternate character or a composition rule)
   to CODING's current composition data block.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  ((coding)->cmp_data->data[(coding)->cmp_data->used++] = (component))
/* Handle a composition start sequence: ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.  */

#define DECODE_COMPOSITION_START(c1)                                    \
  do {                                                                  \
    if (coding->composing == COMPOSITION_DISABLED)                      \
      {                                                                 \
        /* Composition handling is off: copy the sequence through.  */ \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = c1 & 0x7f;                                             \
        coding->produced_char += 2;                                     \
      }                                                                 \
    else if (COMPOSING_P (coding))                                      \
      {                                                                 \
        /* A composition is already in progress.  For the two          \
           methods below, the codes after this escape sequence are     \
           the actual characters stored in a buffer.  */                \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS              \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)     \
          {                                                             \
            coding->composing = COMPOSITION_RELATIVE;                   \
            coding->composition_rule_follows = 0;                       \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* This is surely the start of a composition.  Make sure       \
           coding->cmp_data has room for the information about it.     \
           If not, terminate the current decoding loop so that the     \
           caller can allocate one more memory block for               \
           coding->cmp_data and restart the loop.  Memory can't be     \
           allocated here directly because that may cause              \
           buffer/string relocation.  */                                \
        if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH  \
            >= COMPOSITION_DATA_SIZE)                                   \
          {                                                             \
            result = CODING_FINISH_INSUFFICIENT_CMP;                    \
            goto label_end_of_loop_2;                                   \
          }                                                             \
        if (c1 == '0')                                                  \
          coding->composing = COMPOSITION_RELATIVE;                     \
        else if (c1 == '2')                                             \
          coding->composing = COMPOSITION_WITH_RULE;                    \
        else if (c1 == '3')                                             \
          coding->composing = COMPOSITION_WITH_ALTCHARS;                \
        else                                                            \
          coding->composing = COMPOSITION_WITH_RULE_ALTCHARS;           \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,    \
                                      coding->composing);               \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
  } while (0)
/* Handle the composition end sequence ESC 1.  C1 is the byte that
   followed ESC.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (coding->composing != COMPOSITION_DISABLED)                      \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Composition handling is off: copy the sequence through.  */ \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = c1;                                                    \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
/* Decode a composition rule from the byte C1 (and, for the new
   format, one more byte read from SRC into C2) and append the encoded
   rule to coding->cmp_data.  Note that C1 itself is modified: 32 is
   subtracted from it.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int decoded_rule_ = 0;                                              \
    (c1) -= 32;                                                         \
    if (c1 < 81)                                                        \
      {                                                                 \
        /* Old format (before ver.21).  */                              \
        int global_ref_ = (c1) / 9;                                     \
        int new_ref_ = (c1) % 9;                                        \
        if (global_ref_ == 4)                                           \
          global_ref_ = 10;                                             \
        if (new_ref_ == 4)                                              \
          new_ref_ = 10;                                                \
        decoded_rule_ = COMPOSITION_ENCODE_RULE (global_ref_, new_ref_); \
      }                                                                 \
    else if (c1 < 93)                                                   \
      {                                                                 \
        /* New format (after ver.21).  */                               \
        ONE_MORE_BYTE (c2);                                             \
        decoded_rule_ = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);     \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, decoded_rule_);           \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1135 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1138 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1139 struct coding_system *coding;
1140 unsigned char *source, *destination;
1141 int src_bytes, dst_bytes;
1143 unsigned char *src = source;
1144 unsigned char *src_end = source + src_bytes;
1145 unsigned char *dst = destination;
1146 unsigned char *dst_end = destination + dst_bytes;
1147 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1148 from DST_END to assure that overflow checking is necessary only
1149 at the head of loop. */
1150 unsigned char *adjusted_dst_end = dst_end - 6;
1151 int charset;
1152 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1153 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1154 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1155 Lisp_Object translation_table
1156 = coding->translation_table_for_decode;
1157 int result = CODING_FINISH_NORMAL;
1159 if (!NILP (Venable_character_translation) && NILP (translation_table))
1160 translation_table = Vstandard_translation_table_for_decode;
1162 coding->produced_char = 0;
1163 coding->fake_multibyte = 0;
1164 while (src < src_end && (dst_bytes
1165 ? (dst < adjusted_dst_end)
1166 : (dst < src - 6)))
1168 /* SRC_BASE remembers the start position in source in each loop.
1169 The loop will be exited when there's not enough source text
1170 to analyze long escape sequence or 2-byte code (within macros
1171 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1172 to SRC_BASE before exiting. */
1173 unsigned char *src_base = src;
1174 int c1 = *src++, c2;
1176 /* We produce no character or one character. */
1177 switch (iso_code_class [c1])
1179 case ISO_0x20_or_0x7F:
1180 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1182 DECODE_COMPOSITION_RULE (c1);
1183 break;
1185 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1187 /* This is SPACE or DEL. */
1188 *dst++ = c1;
1189 coding->produced_char++;
1190 break;
1192 /* This is a graphic character, we fall down ... */
1194 case ISO_graphic_plane_0:
1195 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1196 DECODE_COMPOSITION_RULE (c1);
1197 else
1198 DECODE_ISO_CHARACTER (charset0, c1);
1199 break;
1201 case ISO_0xA0_or_0xFF:
1202 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1203 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1204 goto label_invalid_code;
1205 /* This is a graphic character, we fall down ... */
1207 case ISO_graphic_plane_1:
1208 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1209 goto label_invalid_code;
1210 DECODE_ISO_CHARACTER (charset1, c1);
1211 break;
1213 case ISO_control_code:
1214 if (COMPOSING_P (coding))
1215 DECODE_COMPOSITION_END ('1');
1217 /* All ISO2022 control characters in this class have the
1218 same representation in Emacs internal format. */
1219 if (c1 == '\n'
1220 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1221 && (coding->eol_type == CODING_EOL_CR
1222 || coding->eol_type == CODING_EOL_CRLF))
1224 result = CODING_FINISH_INCONSISTENT_EOL;
1225 goto label_end_of_loop_2;
1227 *dst++ = c1;
1228 coding->produced_char++;
1229 break;
1231 case ISO_carriage_return:
1232 if (COMPOSING_P (coding))
1233 DECODE_COMPOSITION_END ('1');
1235 if (coding->eol_type == CODING_EOL_CR)
1236 *dst++ = '\n';
1237 else if (coding->eol_type == CODING_EOL_CRLF)
1239 ONE_MORE_BYTE (c1);
1240 if (c1 == ISO_CODE_LF)
1241 *dst++ = '\n';
1242 else
1244 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1246 result = CODING_FINISH_INCONSISTENT_EOL;
1247 goto label_end_of_loop_2;
1249 src--;
1250 *dst++ = '\r';
1253 else
1254 *dst++ = c1;
1255 coding->produced_char++;
1256 break;
1258 case ISO_shift_out:
1259 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1260 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1261 goto label_invalid_code;
1262 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1263 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1264 break;
1266 case ISO_shift_in:
1267 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1268 goto label_invalid_code;
1269 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1270 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1271 break;
1273 case ISO_single_shift_2_7:
1274 case ISO_single_shift_2:
1275 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1276 goto label_invalid_code;
1277 /* SS2 is handled as an escape sequence of ESC 'N' */
1278 c1 = 'N';
1279 goto label_escape_sequence;
1281 case ISO_single_shift_3:
1282 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1283 goto label_invalid_code;
1284 /* SS2 is handled as an escape sequence of ESC 'O' */
1285 c1 = 'O';
1286 goto label_escape_sequence;
1288 case ISO_control_sequence_introducer:
1289 /* CSI is handled as an escape sequence of ESC '[' ... */
1290 c1 = '[';
1291 goto label_escape_sequence;
1293 case ISO_escape:
1294 ONE_MORE_BYTE (c1);
1295 label_escape_sequence:
1296 /* Escape sequences handled by Emacs are invocation,
1297 designation, direction specification, and character
1298 composition specification. */
1299 switch (c1)
1301 case '&': /* revision of following character set */
1302 ONE_MORE_BYTE (c1);
1303 if (!(c1 >= '@' && c1 <= '~'))
1304 goto label_invalid_code;
1305 ONE_MORE_BYTE (c1);
1306 if (c1 != ISO_CODE_ESC)
1307 goto label_invalid_code;
1308 ONE_MORE_BYTE (c1);
1309 goto label_escape_sequence;
1311 case '$': /* designation of 2-byte character set */
1312 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1313 goto label_invalid_code;
1314 ONE_MORE_BYTE (c1);
1315 if (c1 >= '@' && c1 <= 'B')
1316 { /* designation of JISX0208.1978, GB2312.1980,
1317 or JISX0208.1980 */
1318 DECODE_DESIGNATION (0, 2, 94, c1);
1320 else if (c1 >= 0x28 && c1 <= 0x2B)
1321 { /* designation of DIMENSION2_CHARS94 character set */
1322 ONE_MORE_BYTE (c2);
1323 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1325 else if (c1 >= 0x2C && c1 <= 0x2F)
1326 { /* designation of DIMENSION2_CHARS96 character set */
1327 ONE_MORE_BYTE (c2);
1328 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1330 else
1331 goto label_invalid_code;
1332 break;
1334 case 'n': /* invocation of locking-shift-2 */
1335 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1336 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1337 goto label_invalid_code;
1338 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1339 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1340 break;
1342 case 'o': /* invocation of locking-shift-3 */
1343 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1344 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1345 goto label_invalid_code;
1346 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1347 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1348 break;
1350 case 'N': /* invocation of single-shift-2 */
1351 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1352 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1353 goto label_invalid_code;
1354 ONE_MORE_BYTE (c1);
1355 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1356 DECODE_ISO_CHARACTER (charset, c1);
1357 break;
1359 case 'O': /* invocation of single-shift-3 */
1360 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1361 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1362 goto label_invalid_code;
1363 ONE_MORE_BYTE (c1);
1364 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1365 DECODE_ISO_CHARACTER (charset, c1);
1366 break;
1368 case '0': case '2': case '3': case '4': /* start composition */
1369 DECODE_COMPOSITION_START (c1);
1370 break;
1372 case '1': /* end composition */
1373 DECODE_COMPOSITION_END (c1);
1374 break;
1376 case '[': /* specification of direction */
1377 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1378 goto label_invalid_code;
1379 /* For the moment, nested direction is not supported.
1380 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1381 left-to-right, and nozero means right-to-left. */
1382 ONE_MORE_BYTE (c1);
1383 switch (c1)
1385 case ']': /* end of the current direction */
1386 coding->mode &= ~CODING_MODE_DIRECTION;
1388 case '0': /* end of the current direction */
1389 case '1': /* start of left-to-right direction */
1390 ONE_MORE_BYTE (c1);
1391 if (c1 == ']')
1392 coding->mode &= ~CODING_MODE_DIRECTION;
1393 else
1394 goto label_invalid_code;
1395 break;
1397 case '2': /* start of right-to-left direction */
1398 ONE_MORE_BYTE (c1);
1399 if (c1 == ']')
1400 coding->mode |= CODING_MODE_DIRECTION;
1401 else
1402 goto label_invalid_code;
1403 break;
1405 default:
1406 goto label_invalid_code;
1408 break;
1410 default:
1411 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1412 goto label_invalid_code;
1413 if (c1 >= 0x28 && c1 <= 0x2B)
1414 { /* designation of DIMENSION1_CHARS94 character set */
1415 ONE_MORE_BYTE (c2);
1416 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1418 else if (c1 >= 0x2C && c1 <= 0x2F)
1419 { /* designation of DIMENSION1_CHARS96 character set */
1420 ONE_MORE_BYTE (c2);
1421 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1423 else
1425 goto label_invalid_code;
1428 /* We must update these variables now. */
1429 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1430 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1431 break;
1433 label_invalid_code:
1434 if (COMPOSING_P (coding))
1435 DECODE_COMPOSITION_END ('1');
1436 coding->produced_char += src - src_base;
1437 while (src_base < src)
1438 *dst++ = (*src_base++) & 0x7F;
1440 continue;
1442 label_end_of_loop:
1443 result = CODING_FINISH_INSUFFICIENT_SRC;
1444 label_end_of_loop_2:
1445 src = src_base;
1446 break;
1449 if (src < src_end)
1451 if (result == CODING_FINISH_NORMAL)
1452 result = CODING_FINISH_INSUFFICIENT_DST;
1453 else if (result != CODING_FINISH_INCONSISTENT_EOL
1454 && coding->mode & CODING_MODE_LAST_BLOCK)
1456 /* This is the last block of the text to be decoded. We had
1457 better just flush out all remaining codes in the text
1458 although they are not valid characters. */
1459 if (COMPOSING_P (coding))
1460 DECODE_COMPOSITION_END ('1');
1461 src_bytes = src_end - src;
1462 if (dst_bytes && (dst_end - dst < src_end - src))
1463 src_end = src + (dst_end - dst);
1464 coding->produced_char += src_end - src;
1465 while (src < src_end)
1466 *dst++ = (*src++) & 0x7F;
1470 coding->consumed = coding->consumed_char = src - source;
1471 coding->produced = dst - destination;
1472 return result;
1475 /* ISO2022 encoding stuff. */
1478 It is not enough to say just "ISO2022" on encoding, we have to
1479 specify more details. In Emacs, each coding system of ISO2022
1480 variant has the following specifications:
1481 1. Initial designation to G0 thru G3.
1482 2. Allows short-form designation?
1483 3. ASCII should be designated to G0 before control characters?
1484 4. ASCII should be designated to G0 at end of line?
1485 5. 7-bit environment or 8-bit environment?
1486 6. Use locking-shift?
1487 7. Use Single-shift?
1488 And the following two are only for Japanese:
1489 8. Use ASCII in place of JIS0201-1976-Roman?
1490 9. Use JISX0208-1983 in place of JISX0208-1978?
1491 These specifications are encoded in `coding->flags' as flag bits
1492 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1493 details.
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, and record the designation in CODING.  When CHARSET
   is a two-dimensional 94-charset whose <final-char> is '@', 'A', or
   'B', REG is 0, and CODING allows it, the short form (without the
   intermediate character) is produced.  A revision announcement
   (ESC & <rev>) is emitted first when CODING records a revision
   number below 255 for CHARSET.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final_ = CHARSET_ISO_FINAL_CHAR (charset);      \
    int desig_rev_ = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);  \
    int desig_is_94_ = (CHARSET_CHARS (charset) == 94);                 \
    int desig_two_byte_ = (CHARSET_DIMENSION (charset) != 1);           \
    /* Intermediate characters indexed by register number.  */         \
    char *desig_inter_ = desig_is_94_ ? "()*+" : ",-./";                \
    int desig_short_ = (desig_two_byte_                                 \
                        && desig_is_94_                                 \
                        && (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
                        && reg == 0                                     \
                        && desig_final_ >= '@'                          \
                        && desig_final_ <= 'B');                        \
    if (desig_rev_ < 255)                                               \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + desig_rev_;                                      \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (desig_two_byte_)                                                \
      *dst++ = '$';                                                     \
    if (! desig_short_)                                                 \
      *dst++ = (unsigned char) (desig_inter_[reg]);                     \
    *dst++ = desig_final_;                                              \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
/* The two macros below emit the code for an ISO2022 single-shift
   function (single-shift-2 and single-shift-3 respectively): the
   escape sequence form when CODING is 7-bit, the control character
   otherwise.  Both mark CODING as being in single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))                 \
      {                                                                 \
        *dst++ = ISO_CODE_SS2;                                          \
        coding->fake_multibyte = 1;                                     \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = 'N';                                                   \
      }                                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;                       \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))                 \
      {                                                                 \
        *dst++ = ISO_CODE_SS3;                                          \
        coding->fake_multibyte = 1;                                     \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = 'O';                                                   \
      }                                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;                       \
  } while (0)
/* The four macros below emit the code (control character or escape
   sequence) for an ISO2022 locking-shift function (shift-in,
   shift-out, locking-shift-2, and locking-shift-3) and record the
   graphic register now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
/* Emit the code for a DIMENSION1 character of charset CHARSET whose
   position code is C1.  If CHARSET is not currently invoked to a
   graphic plane, the necessary designation and invocation sequences
   are emitted first and the loop runs again to emit the character.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if (coding->flags & CODING_FLAG_ISO_SAFE                            \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* This character must not be encoded; emit one or two         \
           substitution characters instead.  */                         \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not yet invoked to any graphic plane: invoke it     \
       (designating it to a graphic register first if needed), then    \
       go around again to emit the character itself.  */                \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
/* Emit the codes for a DIMENSION2 character of charset CHARSET whose
   position codes are C1 and C2.  If CHARSET is not currently invoked
   to a graphic plane, the necessary designation and invocation
   sequences are emitted first and the loop runs again to emit the
   character.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if (coding->flags & CODING_FLAG_ISO_SAFE                            \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* This character must not be encoded; emit one or two         \
           substitution characters instead.  */                         \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not yet invoked to any graphic plane: invoke it     \
       (designating it to a graphic register first if needed), then    \
       go around again to emit the character itself.  */                \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
/* Encode one character of charset CHARSET with position codes C1 and
   C2, after applying TRANSLATION_TABLE if there is one.  A character
   whose (translated) charset is not defined is copied out byte by
   byte, with the 8th bit cleared when CODING is 7-bit.  Increments
   coding->consumed_char.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    int enc_char_ = -1;                                                 \
    int enc_charset_ = charset;                                         \
                                                                        \
    if (!NILP (translation_table))                                      \
      enc_char_ = translate_char (translation_table, -1,                \
                                  charset, c1, c2);                     \
    if (enc_char_ >= 0)                                                 \
      SPLIT_CHAR (enc_char_, enc_charset_, c1, c2);                     \
    if (! CHARSET_DEFINED_P (enc_charset_))                             \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = charset & 0x7f;                                    \
            *dst++ = c1 & 0x7f;                                         \
            if (c2)                                                     \
              *dst++ = c2 & 0x7f;                                       \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = charset;                                           \
            *dst++ = c1;                                                \
            if (c2)                                                     \
              *dst++ = c2;                                              \
          }                                                             \
      }                                                                 \
    else if (CHARSET_DIMENSION (enc_charset_) == 1)                     \
      {                                                                 \
        if (charset == CHARSET_ASCII                                    \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN)               \
          enc_charset_ = charset_latin_jisx0201;                        \
        ENCODE_ISO_CHARACTER_DIMENSION1 (enc_charset_, c1);             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if (charset == charset_jisx0208                                 \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)              \
          enc_charset_ = charset_jisx0208_1978;                         \
        ENCODE_ISO_CHARACTER_DIMENSION2 (enc_charset_, c1, c2);         \
      }                                                                 \
    coding->consumed_char++;                                            \
  } while (0)
1729 /* Produce designation and invocation codes at a place pointed by DST
1730 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1731 Return new DST. */
1733 unsigned char *
1734 encode_invocation_designation (charset, coding, dst)
1735 int charset;
1736 struct coding_system *coding;
1737 unsigned char *dst;
1739 int reg; /* graphic register number */
1741 /* At first, check designations. */
1742 for (reg = 0; reg < 4; reg++)
1743 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1744 break;
1746 if (reg >= 4)
1748 /* CHARSET is not yet designated to any graphic registers. */
1749 /* At first check the requested designation. */
1750 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1751 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1752 /* Since CHARSET requests no special designation, designate it
1753 to graphic register 0. */
1754 reg = 0;
1756 ENCODE_DESIGNATION (charset, reg, coding);
1759 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1760 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1762 /* Since the graphic register REG is not invoked to any graphic
1763 planes, invoke it to graphic plane 0. */
1764 switch (reg)
1766 case 0: /* graphic register 0 */
1767 ENCODE_SHIFT_IN;
1768 break;
1770 case 1: /* graphic register 1 */
1771 ENCODE_SHIFT_OUT;
1772 break;
1774 case 2: /* graphic register 2 */
1775 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1776 ENCODE_SINGLE_SHIFT_2;
1777 else
1778 ENCODE_LOCKING_SHIFT_2;
1779 break;
1781 case 3: /* graphic register 3 */
1782 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1783 ENCODE_SINGLE_SHIFT_3;
1784 else
1785 ENCODE_LOCKING_SHIFT_3;
1786 break;
1789 return dst;
/* Emit at DST the two-byte code for the encoded composition rule
   RULE.  */

#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int global_ref_, new_ref_;                                  \
    COMPOSITION_DECODE_RULE (rule, global_ref_, new_ref_);      \
    *dst++ = 32 + 81 + global_ref_;                             \
    *dst++ = 32 + new_ref_;                                     \
  } while (0)
/* Emit at DST the sequence that opens a composition (ESC 0, ESC 3, or
   ESC 4).  DATA points to an array of integers describing the
   composition; see the comment in coding.h for its format.  The
   method is taken from DATA[3].  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        /* Components start right after the four-slot header.  */      \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
/* Emit at DST the sequence that closes the current composition
   (ESC 1), step past its record of length DATA[0], and move on to the
   next composition data block when the current one is exhausted.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    *dst++ = ISO_CODE_ESC;                                              \
    *dst++ = '1';                                                       \
    coding->composing = COMPOSITION_NO;                                 \
    coding->cmp_data_start += data[0];                                  \
    if (coding->cmp_data->next                                          \
        && coding->cmp_data_start == coding->cmp_data->used)            \
      {                                                                 \
        coding->cmp_data_start = 0;                                     \
        coding->cmp_data = coding->cmp_data->next;                      \
      }                                                                 \
  } while (0)
/* Emit the composition start sequence ESC 0.  Here the sequence does
   not open a new composition; it marks that the components (alternate
   chars and composition rules) of the composition have just been
   produced and that the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    coding->composing = COMPOSITION_RELATIVE;           \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = '0';                                       \
  } while (0)
/* The following three macros produce codes for indicating direction
   of text.  Each expands to a single statement that writes through
   DST and reads the flags of CODING.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits the two-byte form ESC '['
   when the CODING_FLAG_ISO_SEVEN_BITS bit is set in coding->flags
   (whatever other flag bits are set), and the single byte
   ISO_CODE_CSI otherwise.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

/* Start of right-to-left direction: CSI '2' ']'.  */
#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2';                                       \
    *dst++ = ']';                                       \
  } while (0)

/* End of the current direction: CSI '0' ']'.  */
#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0';                                       \
    *dst++ = ']';                                       \
  } while (0)
/* Emit the invocation and designation codes that bring graphic plane
   0 and the graphic registers back to their initial state: shift-in
   if plane 0 does not invoke register 0, then a designation for every
   register whose current charset differs from its (non-negative)
   initial one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reset_reg_ = 0;                                                     \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (reset_reg_ < 4)                                                  \
      {                                                                     \
        int reset_initial_                                                  \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg_);       \
        if (reset_initial_ >= 0                                             \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reset_reg_)            \
                != reset_initial_))                                         \
          ENCODE_DESIGNATION (reset_initial_, reset_reg_, coding);          \
        reset_reg_++;                                                       \
      }                                                                     \
  } while (0)
1881 /* Produce designation sequences of charsets in the line started from
1882 SRC to a place pointed by *DSTP, and update DSTP.
1884 If the current block ends before any end-of-line, we may fail to
1885 find all the necessary designations. */
1887 void
1888 encode_designation_at_bol (coding, table, src, src_end, dstp)
1889 struct coding_system *coding;
1890 Lisp_Object table;
1891 unsigned char *src, *src_end, **dstp;
1893 int charset, c, found = 0, reg;
1894 /* Table of charsets to be designated to each graphic register. */
1895 int r[4];
1896 unsigned char *dst = *dstp;
1898 for (reg = 0; reg < 4; reg++)
1899 r[reg] = -1;
1901 while (src < src_end && *src != '\n' && found < 4)
1903 int bytes = BYTES_BY_CHAR_HEAD (*src);
1905 if (NILP (table))
1906 charset = CHARSET_AT (src);
1907 else
1909 int c_alt;
1910 unsigned char c1, c2;
1912 SPLIT_STRING(src, bytes, charset, c1, c2);
1913 if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
1914 charset = CHAR_CHARSET (c_alt);
1917 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1918 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1920 found++;
1921 r[reg] = charset;
1924 src += bytes;
1927 if (found)
1929 for (reg = 0; reg < 4; reg++)
1930 if (r[reg] >= 0
1931 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1932 ENCODE_DESIGNATION (r[reg], reg, coding);
1933 *dstp = dst;
1937 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
/* Encode the SRC_BYTES bytes at SOURCE and store the result at
   DESTINATION.  When DST_BYTES is nonzero the main loop stops once
   fewer than 14 bytes remain before DESTINATION + DST_BYTES; when it
   is zero the loop instead requires DST to stay more than 13 bytes
   behind SRC.  On return CODING->consumed, CODING->produced and
   CODING->produced_char are set, and the value is
   CODING_FINISH_NORMAL, CODING_FINISH_INSUFFICIENT_SRC or
   CODING_FINISH_INSUFFICIENT_DST.  */
1940 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1941 struct coding_system *coding;
1942 unsigned char *source, *destination;
1943 int src_bytes, dst_bytes;
1945 unsigned char *src = source;
1946 unsigned char *src_end = source + src_bytes;
1947 unsigned char *dst = destination;
1948 unsigned char *dst_end = destination + dst_bytes;
1949 /* Since the maximum bytes produced by each loop is 14, we subtract 13
1950 from DST_END to assure overflow checking is necessary only at the
1951 head of loop. */
1952 unsigned char *adjusted_dst_end = dst_end - 13;
1953 Lisp_Object translation_table
1954 = coding->translation_table_for_encode;
1955 int result = CODING_FINISH_NORMAL;
/* Fall back to the standard table only when translation is enabled
   and the coding system supplies no table of its own.  */
1957 if (!NILP (Venable_character_translation) && NILP (translation_table))
1958 translation_table = Vstandard_translation_table_for_encode;
1960 coding->consumed_char = 0;
1961 coding->fake_multibyte = 0;
1962 while (src < src_end && (dst_bytes
1963 ? (dst < adjusted_dst_end)
1964 : (dst < src - 13)))
1966 /* SRC_BASE remembers the start position in source in each loop.
1967 The loop will be exited when there's not enough source text
1968 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1969 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1970 reset to SRC_BASE before exiting. */
1971 unsigned char *src_base = src;
1972 int charset, c1, c2, c3, c4;
1974 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1975 && CODING_SPEC_ISO_BOL (coding))
1977 /* We have to produce designation sequences if any now. */
1978 encode_designation_at_bol (coding, translation_table,
1979 src, src_end, &dst);
1980 CODING_SPEC_ISO_BOL (coding) = 0;
1983 /* Check composition start and end. */
1984 if (coding->composing != COMPOSITION_DISABLED
1985 && coding->cmp_data_start < coding->cmp_data->used)
1987 struct composition_data *cmp_data = coding->cmp_data;
1988 int *data = cmp_data->data + coding->cmp_data_start;
1989 int this_pos = cmp_data->char_offset + coding->consumed_char;
1991 if (coding->composing == COMPOSITION_RELATIVE)
1993 if (this_pos == data[2])
1995 ENCODE_COMPOSITION_END (coding, data);
/* CMP_DATA and DATA are re-read after the macro above.
   NOTE(review): presumably the macro can move
   coding->cmp_data / cmp_data_start -- confirm.  */
1996 cmp_data = coding->cmp_data;
1997 data = cmp_data->data + coding->cmp_data_start;
2000 else if (COMPOSING_P (coding))
2002 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2003 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2004 /* We have consumed components of the composition.
2005 What follows in SRC is the composition's base
2006 text. */
2007 ENCODE_COMPOSITION_FAKE_START (coding);
2008 else
2010 int c = cmp_data->data[coding->cmp_data_index++];
2011 if (coding->composition_rule_follows)
2013 ENCODE_COMPOSITION_RULE (c);
2014 coding->composition_rule_follows = 0;
2016 else
2018 SPLIT_CHAR (c, charset, c1, c2);
2019 ENCODE_ISO_CHARACTER (charset, c1, c2);
2020 /* But, we didn't consume a character in SRC. */
2021 coding->consumed_char--;
2022 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2023 coding->composition_rule_follows = 1;
2025 continue;
2028 if (!COMPOSING_P (coding))
2030 if (this_pos == data[1])
2032 ENCODE_COMPOSITION_START (coding, data);
2033 continue;
2038 c1 = *src++;
2039 /* Now encode one character. C1 is a control character, an
2040 ASCII character, or a leading-code of multi-byte character. */
2041 switch (emacs_code_class[c1])
2043 case EMACS_ascii_code:
2044 c2 = 0;
2045 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
2046 break;
2048 case EMACS_control_code:
2049 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2050 ENCODE_RESET_PLANE_AND_REGISTER;
2051 *dst++ = c1;
2052 coding->consumed_char++;
2053 break;
2055 case EMACS_carriage_return_code:
2056 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2058 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2059 ENCODE_RESET_PLANE_AND_REGISTER;
2060 *dst++ = c1;
2061 coding->consumed_char++;
2062 break;
2064 /* fall down to treat '\r' as '\n' ... */
2066 case EMACS_linefeed_code:
2067 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2068 ENCODE_RESET_PLANE_AND_REGISTER;
2069 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2070 bcopy (coding->spec.iso2022.initial_designation,
2071 coding->spec.iso2022.current_designation,
2072 sizeof coding->spec.iso2022.initial_designation);
/* Emit the end-of-line in the form selected by coding->eol_type;
   an undecided type is written as a bare LF.  */
2073 if (coding->eol_type == CODING_EOL_LF
2074 || coding->eol_type == CODING_EOL_UNDECIDED)
2075 *dst++ = ISO_CODE_LF;
2076 else if (coding->eol_type == CODING_EOL_CRLF)
2077 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2078 else
2079 *dst++ = ISO_CODE_CR;
2080 CODING_SPEC_ISO_BOL (coding) = 1;
2081 coding->consumed_char++;
2082 break;
2084 case EMACS_leading_code_2:
2085 ONE_MORE_BYTE (c2);
2086 c3 = 0;
2087 if (c2 < 0xA0)
2089 /* invalid sequence */
/* Only the leading byte is passed through; SRC is moved back so
   that the following byte is examined again on the next pass.  */
2090 *dst++ = c1;
2091 src--;
2092 coding->consumed_char++;
2094 else
2095 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
2096 break;
2098 case EMACS_leading_code_3:
2099 TWO_MORE_BYTES (c2, c3);
2100 c4 = 0;
2101 if (c2 < 0xA0 || c3 < 0xA0)
2103 /* invalid sequence */
2104 *dst++ = c1;
2105 src -= 2;
2106 coding->consumed_char++;
2108 else if (c1 < LEADING_CODE_PRIVATE_11)
2109 ENCODE_ISO_CHARACTER (c1, c2, c3);
2110 else
/* C1 is at or above LEADING_CODE_PRIVATE_11, so the charset is
   taken from C2 and C3 is the only position code.  */
2111 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
2112 break;
2114 case EMACS_leading_code_4:
2115 THREE_MORE_BYTES (c2, c3, c4);
2116 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
2118 /* invalid sequence */
2119 *dst++ = c1;
2120 src -= 3;
2121 coding->consumed_char++;
2123 else
2124 ENCODE_ISO_CHARACTER (c2, c3, c4);
2125 break;
2127 case EMACS_invalid_code:
2128 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2129 ENCODE_RESET_PLANE_AND_REGISTER;
2130 *dst++ = c1;
2131 coding->consumed_char++;
2132 break;
2134 continue;
2135 label_end_of_loop:
2136 result = CODING_FINISH_INSUFFICIENT_SRC;
2137 src = src_base;
2138 break;
/* Source left over without an INSUFFICIENT_SRC exit means the loop
   stopped on the destination bound.  */
2141 if (src < src_end && result == CODING_FINISH_NORMAL)
2142 result = CODING_FINISH_INSUFFICIENT_DST;
2144 /* If this is the last block of the text to be encoded, we must
2145 reset graphic planes and registers to the initial state, and
2146 flush out the carryover if any. */
2147 if (coding->mode & CODING_MODE_LAST_BLOCK)
2149 ENCODE_RESET_PLANE_AND_REGISTER;
2150 if (COMPOSING_P (coding))
2151 *dst++ = ISO_CODE_ESC, *dst++ = '1';
2152 if (result == CODING_FINISH_INSUFFICIENT_SRC)
/* Copy the unconsumed tail through unchanged, as far as the
   destination allows.  */
2154 while (src < src_end && dst < dst_end)
2155 *dst++ = *src++;
2158 coding->consumed = src - source;
2159 coding->produced = coding->produced_char = dst - destination;
2160 return result;
2164 /*** 4. SJIS and BIG5 handlers ***/
2166 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2167 quite widely. So, for the moment, Emacs supports them in the bare
2168 C code. But, in the future, they may be supported only by CCL. */
2170 /* SJIS is a coding system encoding three character sets: ASCII, right
2171 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2172 as is. A character of charset katakana-jisx0201 is encoded by
2173 "position-code + 0x80". A character of charset japanese-jisx0208
2174 is encoded in 2-byte but two position-codes are divided and shifted
2175 so that it fit in the range below.
2177 --- CODE RANGE of SJIS ---
2178 (character set) (range)
2179 ASCII 0x00 .. 0x7F
2180 KATAKANA-JISX0201 0xA0 .. 0xDF
2181 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2182 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2183 -------------------------------
2187 /* BIG5 is a coding system encoding two character sets: ASCII and
2188 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2189 character set and is encoded in two-byte.
2191 --- CODE RANGE of BIG5 ---
2192 (character set) (range)
2193 ASCII 0x00 .. 0x7F
2194 Big5 (1st byte) 0xA1 .. 0xFE
2195 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2196 --------------------------
2198 Since the number of characters in Big5 is larger than maximum
2199 characters in Emacs' charset (96x96), it can't be handled as one
2200 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2201 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2202 contains frequently used characters and the latter contains less
2203 frequently used characters. */
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   DECODE_BIG5 reads B1 and B2 and stores into CHARSET, C1 and C2.
   ENCODE_BIG5 reads CHARSET, C1 and C2 and stores into B1 and B2.
   Every argument is parenthesized in the expansion, so the inputs
   may be arbitrary expressions.  Inputs can be evaluated more than
   once and must therefore be free of side effects; outputs must be
   lvalues.  Both macros expect `charset_big5_1' and `charset_big5_2'
   to be visible where they are expanded.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    /* Linear index of the character within the whole Big5 table.  */  \
    unsigned int temp                                                   \
      = ((b1) - 0xA1) * BIG5_SAME_ROW                                   \
        + (b2) - ((b2) < 0x7F ? 0x40 : 0x62);                           \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* Skip the gap between the two valid 2nd-byte ranges.  */         \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
/* Hand one decoded character, given as CHARSET plus position codes C1
   and C2, to the matching DECODE_CHARACTER_* macro.  If the variable
   `translation_table' (which must be in scope where this macro is
   expanded) is non-nil and translate_char returns a non-negative
   character, that character is split back into charset and position
   codes and used instead; in that case C1 and C2 are overwritten.
   A charset that ends up as CHARSET_ASCII or negative goes to
   DECODE_CHARACTER_ASCII, a one-dimensional charset to
   DECODE_CHARACTER_DIMENSION1, anything else to
   DECODE_CHARACTER_DIMENSION2.  */
2238 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
2239 do { \
2240 int c_alt, charset_alt = (charset); \
2241 if (!NILP (translation_table) \
2242 && ((c_alt = translate_char (translation_table, \
2243 -1, (charset), c1, c2)) >= 0)) \
2244 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
2245 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
2246 DECODE_CHARACTER_ASCII (c1); \
2247 else if (CHARSET_DIMENSION (charset_alt) == 1) \
2248 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
2249 else \
2250 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
2251 } while (0)
/* Encode one character, given as CHARSET plus position codes C1 and
   C2, and append the result through `dst'.  The variables `dst',
   `coding', `sjis_p' and `translation_table' must be in scope where
   this macro is expanded.  The character is first passed through
   `translation_table' when that is non-nil.

   Bytes written per character:
     1 -- ASCII; with `sjis_p', also katakana-jisx0201 and
          latin-jisx0201 (the latter with the top bit cleared);
     2 -- JISX0208 (with `sjis_p') or Big5 (without), and any other
          one-dimensional charset (charset byte + C1);
     3 -- any other two-dimensional charset (charset byte + C1 + C2).
   The branches that write bytes outside the target encoding also set
   coding->fake_multibyte.  coding->consumed_char is incremented once
   per expansion.  */
2253 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
2254 do { \
2255 int c_alt, charset_alt; \
2256 if (!NILP (translation_table) \
2257 && ((c_alt = translate_char (translation_table, -1, \
2258 charset, c1, c2)) \
2259 >= 0)) \
2260 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
2261 else \
2262 charset_alt = charset; \
2263 if (charset_alt == charset_ascii) \
2264 *dst++ = c1; \
2265 else if (CHARSET_DIMENSION (charset_alt) == 1) \
2267 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
2268 *dst++ = c1; \
2269 else if (sjis_p && charset_alt == charset_latin_jisx0201) \
2270 *dst++ = c1 & 0x7F; \
2271 else \
2273 *dst++ = charset_alt, *dst++ = c1; \
2274 coding->fake_multibyte = 1; \
2277 else \
2279 c1 &= 0x7F, c2 &= 0x7F; \
2280 if (sjis_p && (charset_alt == charset_jisx0208 \
2281 || charset_alt == charset_jisx0208_1978))\
2283 unsigned char s1, s2; \
2285 ENCODE_SJIS (c1, c2, s1, s2); \
2286 *dst++ = s1, *dst++ = s2; \
2287 coding->fake_multibyte = 1; \
2289 else if (!sjis_p \
2290 && (charset_alt == charset_big5_1 \
2291 || charset_alt == charset_big5_2)) \
2293 unsigned char b1, b2; \
2295 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
2296 *dst++ = b1, *dst++ = b2; \
2298 else \
2300 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
2301 coding->fake_multibyte = 1; \
2304 coding->consumed_char++; \
2305 } while (0)
2307 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2308 Check if a text is encoded in SJIS. If it is, return
2309 CODING_CATEGORY_MASK_SJIS, else return 0. */
2312 detect_coding_sjis (src, src_end)
2313 unsigned char *src, *src_end;
2315 unsigned char c;
2317 while (src < src_end)
2319 c = *src++;
2320 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2322 if (src < src_end && *src++ < 0x40)
2323 return 0;
2326 return CODING_CATEGORY_MASK_SJIS;
2329 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2330 Check if a text is encoded in BIG5. If it is, return
2331 CODING_CATEGORY_MASK_BIG5, else return 0. */
2334 detect_coding_big5 (src, src_end)
2335 unsigned char *src, *src_end;
2337 unsigned char c;
2339 while (src < src_end)
2341 c = *src++;
2342 if (c >= 0xA1)
2344 if (src >= src_end)
2345 break;
2346 c = *src++;
2347 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2348 return 0;
2351 return CODING_CATEGORY_MASK_BIG5;
2354 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2355 Check if a text is encoded in UTF-8. If it is, return
2356 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2358 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
2359 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
2360 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2361 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2362 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2363 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2364 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2367 detect_coding_utf_8 (src, src_end)
2368 unsigned char *src, *src_end;
2370 unsigned char c;
2371 int seq_maybe_bytes;
2373 while (src < src_end)
2375 c = *src++;
2376 if (UTF_8_1_OCTET_P (c))
2377 continue;
2378 else if (UTF_8_2_OCTET_LEADING_P (c))
2379 seq_maybe_bytes = 1;
2380 else if (UTF_8_3_OCTET_LEADING_P (c))
2381 seq_maybe_bytes = 2;
2382 else if (UTF_8_4_OCTET_LEADING_P (c))
2383 seq_maybe_bytes = 3;
2384 else if (UTF_8_5_OCTET_LEADING_P (c))
2385 seq_maybe_bytes = 4;
2386 else if (UTF_8_6_OCTET_LEADING_P (c))
2387 seq_maybe_bytes = 5;
2388 else
2389 return 0;
2393 if (src >= src_end)
2394 return CODING_CATEGORY_MASK_UTF_8;
2396 c = *src++;
2397 if (!UTF_8_EXTRA_OCTET_P (c))
2398 return 0;
2399 seq_maybe_bytes--;
2401 while (seq_maybe_bytes > 0);
2404 return CODING_CATEGORY_MASK_UTF_8;
2407 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2408 Check if a text is encoded in UTF-16 by looking for a byte order
2409 mark in its first two bytes. If they are FE FF (big endian) or
2410 FF FE (little endian), return CODING_CATEGORY_MASK_UTF_16_BE or
2411 CODING_CATEGORY_MASK_UTF_16_LE respectively, else return 0. */
/* Nonzero if the 16-bit code unit VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if the 16-bit code unit VAL is a high (leading) surrogate,
   i.e. lies in 0xD800..0xDBFF.  The top six bits identify which half
   of a surrogate pair a code unit is, hence the 0xFC00 mask.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit code unit VAL is a low (trailing) surrogate,
   i.e. lies in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2424 detect_coding_utf_16 (src, src_end)
2425 unsigned char *src, *src_end;
2427 if ((src + 1) >= src_end) return 0;
2429 if ((src[0] == 0xFF) && (src[1] == 0xFE))
2430 return CODING_CATEGORY_MASK_UTF_16_LE;
2431 else if ((src[0] == 0xFE) && (src[1] == 0xFF))
2432 return CODING_CATEGORY_MASK_UTF_16_BE;
2434 return 0;
2437 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2438 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2441 decode_coding_sjis_big5 (coding, source, destination,
2442 src_bytes, dst_bytes, sjis_p)
2443 struct coding_system *coding;
2444 unsigned char *source, *destination;
2445 int src_bytes, dst_bytes;
2446 int sjis_p;
2448 unsigned char *src = source;
2449 unsigned char *src_end = source + src_bytes;
2450 unsigned char *dst = destination;
2451 unsigned char *dst_end = destination + dst_bytes;
2452 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2453 from DST_END to assure overflow checking is necessary only at the
2454 head of loop. */
2455 unsigned char *adjusted_dst_end = dst_end - 3;
2456 Lisp_Object translation_table
2457 = coding->translation_table_for_decode;
2458 int result = CODING_FINISH_NORMAL;
2460 if (!NILP (Venable_character_translation) && NILP (translation_table))
2461 translation_table = Vstandard_translation_table_for_decode;
2463 coding->produced_char = 0;
2464 coding->fake_multibyte = 0;
2465 while (src < src_end && (dst_bytes
2466 ? (dst < adjusted_dst_end)
2467 : (dst < src - 3)))
2469 /* SRC_BASE remembers the start position in source in each loop.
2470 The loop will be exited when there's not enough source text
2471 to analyze two-byte character (within macro ONE_MORE_BYTE).
2472 In that case, SRC is reset to SRC_BASE before exiting. */
2473 unsigned char *src_base = src;
2474 unsigned char c1 = *src++, c2, c3, c4;
2476 if (c1 < 0x20)
2478 if (c1 == '\r')
2480 if (coding->eol_type == CODING_EOL_CRLF)
2482 ONE_MORE_BYTE (c2);
2483 if (c2 == '\n')
2484 *dst++ = c2;
2485 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2487 result = CODING_FINISH_INCONSISTENT_EOL;
2488 goto label_end_of_loop_2;
2490 else
2491 /* To process C2 again, SRC is subtracted by 1. */
2492 *dst++ = c1, src--;
2494 else if (coding->eol_type == CODING_EOL_CR)
2495 *dst++ = '\n';
2496 else
2497 *dst++ = c1;
2499 else if (c1 == '\n'
2500 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2501 && (coding->eol_type == CODING_EOL_CR
2502 || coding->eol_type == CODING_EOL_CRLF))
2504 result = CODING_FINISH_INCONSISTENT_EOL;
2505 goto label_end_of_loop_2;
2507 else
2508 *dst++ = c1;
2509 coding->produced_char++;
2511 else if (c1 < 0x80)
2513 c2 = 0; /* avoid warning */
2514 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2516 else
2518 if (sjis_p)
2520 if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
2522 /* SJIS -> JISX0208 */
2523 ONE_MORE_BYTE (c2);
2524 if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
2526 DECODE_SJIS (c1, c2, c3, c4);
2527 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2529 else
2530 goto label_invalid_code_2;
2532 else if (c1 < 0xE0)
2533 /* SJIS -> JISX0201-Kana */
2535 c2 = 0; /* avoid warning */
2536 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2537 /* dummy */ c2);
2539 else
2540 goto label_invalid_code_1;
2542 else
2544 /* BIG5 -> Big5 */
2545 if (c1 >= 0xA1 && c1 <= 0xFE)
2547 ONE_MORE_BYTE (c2);
2548 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2550 int charset;
2552 DECODE_BIG5 (c1, c2, charset, c3, c4);
2553 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2555 else
2556 goto label_invalid_code_2;
2558 else
2559 goto label_invalid_code_1;
2562 continue;
2564 label_invalid_code_1:
2565 *dst++ = c1;
2566 coding->produced_char++;
2567 coding->fake_multibyte = 1;
2568 continue;
2570 label_invalid_code_2:
2571 *dst++ = c1; *dst++= c2;
2572 coding->produced_char += 2;
2573 coding->fake_multibyte = 1;
2574 continue;
2576 label_end_of_loop:
2577 result = CODING_FINISH_INSUFFICIENT_SRC;
2578 label_end_of_loop_2:
2579 src = src_base;
2580 break;
2583 if (src < src_end)
2585 if (result == CODING_FINISH_NORMAL)
2586 result = CODING_FINISH_INSUFFICIENT_DST;
2587 else if (result != CODING_FINISH_INCONSISTENT_EOL
2588 && coding->mode & CODING_MODE_LAST_BLOCK)
2590 src_bytes = src_end - src;
2591 if (dst_bytes && (dst_end - dst < src_bytes))
2592 src_bytes = dst_end - dst;
2593 bcopy (dst, src, src_bytes);
2594 src += src_bytes;
2595 dst += src_bytes;
2596 coding->fake_multibyte = 1;
2600 coding->consumed = coding->consumed_char = src - source;
2601 coding->produced = dst - destination;
2602 return result;
2605 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2606 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2607 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2608 sure that all these charsets are registered as official charset
2609 (i.e. do not have extended leading-codes). Characters of other
2610 charsets are produced without any encoding. If SJIS_P is 1, encode
2611 SJIS text, else encode BIG5 text. */
2614 encode_coding_sjis_big5 (coding, source, destination,
2615 src_bytes, dst_bytes, sjis_p)
2616 struct coding_system *coding;
2617 unsigned char *source, *destination;
2618 int src_bytes, dst_bytes;
2619 int sjis_p;
2621 unsigned char *src = source;
2622 unsigned char *src_end = source + src_bytes;
2623 unsigned char *dst = destination;
2624 unsigned char *dst_end = destination + dst_bytes;
2625 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2626 from DST_END to assure overflow checking is necessary only at the
2627 head of loop. */
2628 unsigned char *adjusted_dst_end = dst_end - 1;
2629 Lisp_Object translation_table
2630 = coding->translation_table_for_encode;
2631 int result = CODING_FINISH_NORMAL;
2633 if (!NILP (Venable_character_translation) && NILP (translation_table))
2634 translation_table = Vstandard_translation_table_for_encode;
2636 coding->consumed_char = 0;
2637 coding->fake_multibyte = 0;
2638 while (src < src_end && (dst_bytes
2639 ? (dst < adjusted_dst_end)
2640 : (dst < src - 1)))
2642 /* SRC_BASE remembers the start position in source in each loop.
2643 The loop will be exited when there's not enough source text
2644 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2645 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2646 before exiting. */
2647 unsigned char *src_base = src;
2648 unsigned char c1 = *src++, c2, c3, c4;
2650 switch (emacs_code_class[c1])
2652 case EMACS_ascii_code:
2653 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2654 break;
2656 case EMACS_control_code:
2657 *dst++ = c1;
2658 coding->consumed_char++;
2659 break;
2661 case EMACS_carriage_return_code:
2662 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2664 *dst++ = c1;
2665 coding->consumed_char++;
2666 break;
2668 /* fall down to treat '\r' as '\n' ... */
2670 case EMACS_linefeed_code:
2671 if (coding->eol_type == CODING_EOL_LF
2672 || coding->eol_type == CODING_EOL_UNDECIDED)
2673 *dst++ = '\n';
2674 else if (coding->eol_type == CODING_EOL_CRLF)
2675 *dst++ = '\r', *dst++ = '\n';
2676 else
2677 *dst++ = '\r';
2678 coding->consumed_char++;
2679 break;
2681 case EMACS_leading_code_2:
2682 ONE_MORE_BYTE (c2);
2683 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2684 break;
2686 case EMACS_leading_code_3:
2687 TWO_MORE_BYTES (c2, c3);
2688 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2689 break;
2691 case EMACS_leading_code_4:
2692 THREE_MORE_BYTES (c2, c3, c4);
2693 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2694 break;
2696 default: /* i.e. case EMACS_invalid_code: */
2697 *dst++ = c1;
2698 coding->consumed_char++;
2700 continue;
2702 label_end_of_loop:
2703 result = CODING_FINISH_INSUFFICIENT_SRC;
2704 src = src_base;
2705 break;
2708 if (result == CODING_FINISH_NORMAL
2709 && src < src_end)
2710 result = CODING_FINISH_INSUFFICIENT_DST;
2711 coding->consumed = src - source;
2712 coding->produced = coding->produced_char = dst - destination;
2713 return result;
2717 /*** 5. CCL handlers ***/
2719 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2720 Check if a text is encoded in a coding system of which
2721 encoder/decoder are written in CCL program. If it is, return
2722 CODING_CATEGORY_MASK_CCL, else return 0. */
2725 detect_coding_ccl (src, src_end)
2726 unsigned char *src, *src_end;
2728 unsigned char *valid;
2730 /* No coding system is assigned to coding-category-ccl. */
2731 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2732 return 0;
2734 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2735 while (src < src_end)
2737 if (! valid[*src]) return 0;
2738 src++;
2740 return CODING_CATEGORY_MASK_CCL;
2744 /*** 6. End-of-line handlers ***/
2746 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2747 This function is called only when `coding->eol_type' is
2748 CODING_EOL_CRLF or CODING_EOL_CR. */
/* Copy the SRC_BYTES bytes at SOURCE to DESTINATION, converting the
   end-of-line format: for CODING_EOL_CRLF each "\r\n" becomes '\n',
   for CODING_EOL_CR each '\r' becomes '\n'; any other eol_type is
   copied unchanged.  When DST_BYTES is nonzero it bounds the output.
   If CODING_MODE_INHIBIT_INCONSISTENT_EOL is set in CODING->mode,
   conversion stops at the first end-of-line that does not match the
   expected format and CODING_FINISH_INCONSISTENT_EOL is returned.
   On return the consumed/produced counters of CODING are set.  */
2751 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2752 struct coding_system *coding;
2753 unsigned char *source, *destination;
2754 int src_bytes, dst_bytes;
2756 unsigned char *src = source;
2757 unsigned char *src_end = source + src_bytes;
2758 unsigned char *dst = destination;
2759 unsigned char *dst_end = destination + dst_bytes;
2760 unsigned char c;
2761 int result = CODING_FINISH_NORMAL;
2763 coding->fake_multibyte = 0;
/* Nothing to convert: report zero bytes consumed and produced.  */
2765 if (src_bytes <= 0)
2767 coding->produced = coding->produced_char = 0;
2768 coding->consumed = coding->consumed_char = 0;
2769 return result;
2772 switch (coding->eol_type)
2774 case CODING_EOL_CRLF:
2776 /* Since the maximum bytes produced by each loop is 2, we
2777 subtract 1 from DST_END to assure overflow checking is
2778 necessary only at the head of loop. */
2779 unsigned char *adjusted_dst_end = dst_end - 1;
2781 while (src < src_end && (dst_bytes
2782 ? (dst < adjusted_dst_end)
2783 : (dst < src - 1)))
/* SRC_BASE is where this iteration started; SRC is reset to it at
   the labels below when the iteration has to be abandoned.  */
2785 unsigned char *src_base = src;
2787 c = *src++;
2788 if (c == '\r')
2790 ONE_MORE_BYTE (c);
2791 if (c == '\n')
2792 *dst++ = c;
2793 else
2795 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2797 result = CODING_FINISH_INCONSISTENT_EOL;
2798 goto label_end_of_loop_2;
/* A lone '\r' is kept as is; SRC is moved back so that the byte
   after it is processed on the next pass.  */
2800 src--;
2801 *dst++ = '\r';
2802 if (BASE_LEADING_CODE_P (c))
2803 coding->fake_multibyte = 1;
2806 else if (c == '\n'
2807 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2809 result = CODING_FINISH_INCONSISTENT_EOL;
2810 goto label_end_of_loop_2;
2812 else
2814 *dst++ = c;
2815 if (BASE_LEADING_CODE_P (c))
2816 coding->fake_multibyte = 1;
2818 continue;
2820 label_end_of_loop:
2821 result = CODING_FINISH_INSUFFICIENT_SRC;
2822 label_end_of_loop_2:
2823 src = src_base;
2824 break;
2826 if (src < src_end)
2828 if (result == CODING_FINISH_NORMAL)
2829 result = CODING_FINISH_INSUFFICIENT_DST;
2830 else if (result != CODING_FINISH_INCONSISTENT_EOL
2831 && coding->mode & CODING_MODE_LAST_BLOCK)
2833 /* This is the last block of the text to be decoded.
2834 We flush out all remaining codes. */
2835 src_bytes = src_end - src;
2836 if (dst_bytes && (dst_end - dst < src_bytes))
2837 src_bytes = dst_end - dst;
2838 bcopy (src, dst, src_bytes);
2839 dst += src_bytes;
2840 src += src_bytes;
2844 break;
2846 case CODING_EOL_CR:
2847 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
/* Look for a '\n', which is inconsistent with CR end-of-lines.  If
   one is found, only the text before it is converted.  */
2849 while (src < src_end)
2851 if ((c = *src++) == '\n')
2852 break;
2853 if (BASE_LEADING_CODE_P (c))
2854 coding->fake_multibyte = 1;
2856 if (*--src == '\n')
2858 src_bytes = src - source;
2859 result = CODING_FINISH_INCONSISTENT_EOL;
2862 if (dst_bytes && src_bytes > dst_bytes)
2864 result = CODING_FINISH_INSUFFICIENT_DST;
2865 src_bytes = dst_bytes;
2867 if (dst_bytes)
2868 bcopy (source, destination, src_bytes);
2869 else
2870 safe_bcopy (source, destination, src_bytes);
2871 src = source + src_bytes;
/* Rewrite every '\r' in the copied text as '\n'; DST ends up just
   past the last byte copied.  */
2872 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
2873 break;
2875 default: /* i.e. case: CODING_EOL_LF */
2876 if (dst_bytes && src_bytes > dst_bytes)
2878 result = CODING_FINISH_INSUFFICIENT_DST;
2879 src_bytes = dst_bytes;
2881 if (dst_bytes)
2882 bcopy (source, destination, src_bytes);
2883 else
2884 safe_bcopy (source, destination, src_bytes);
2885 src += src_bytes;
2886 dst += src_bytes;
2887 coding->fake_multibyte = 1;
2888 break;
2891 coding->consumed = coding->consumed_char = src - source;
2892 coding->produced = coding->produced_char = dst - destination;
2893 return result;
2896 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2897 format of end-of-line according to `coding->eol_type'. If
2898 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2899 '\r' in source text also means end-of-line. */
2902 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2903 struct coding_system *coding;
2904 unsigned char *source, *destination;
2905 int src_bytes, dst_bytes;
2907 unsigned char *src = source;
2908 unsigned char *dst = destination;
2909 int result = CODING_FINISH_NORMAL;
2911 coding->fake_multibyte = 0;
2913 if (coding->eol_type == CODING_EOL_CRLF)
2915 unsigned char c;
2916 unsigned char *src_end = source + src_bytes;
2917 unsigned char *dst_end = destination + dst_bytes;
2918 /* Since the maximum bytes produced by each loop is 2, we
2919 subtract 1 from DST_END to assure overflow checking is
2920 necessary only at the head of loop. */
2921 unsigned char *adjusted_dst_end = dst_end - 1;
2923 while (src < src_end && (dst_bytes
2924 ? (dst < adjusted_dst_end)
2925 : (dst < src - 1)))
2927 c = *src++;
2928 if (c == '\n'
2929 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2930 *dst++ = '\r', *dst++ = '\n';
2931 else
2933 *dst++ = c;
2934 if (BASE_LEADING_CODE_P (c))
2935 coding->fake_multibyte = 1;
2938 if (src < src_end)
2939 result = CODING_FINISH_INSUFFICIENT_DST;
2941 else
2943 unsigned char c;
2945 if (dst_bytes && src_bytes > dst_bytes)
2947 src_bytes = dst_bytes;
2948 result = CODING_FINISH_INSUFFICIENT_DST;
2950 if (dst_bytes)
2951 bcopy (source, destination, src_bytes);
2952 else
2953 safe_bcopy (source, destination, src_bytes);
2954 dst_bytes = src_bytes;
2955 if (coding->eol_type == CODING_EOL_CR)
2957 while (src_bytes--)
2959 if ((c = *dst++) == '\n')
2960 dst[-1] = '\r';
2961 else if (BASE_LEADING_CODE_P (c))
2962 coding->fake_multibyte = 1;
2965 else
2967 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2969 while (src_bytes--)
2970 if (*dst++ == '\r') dst[-1] = '\n';
2972 coding->fake_multibyte = 1;
2974 src = source + dst_bytes;
2975 dst = destination + dst_bytes;
2978 coding->consumed = coding->consumed_char = src - source;
2979 coding->produced = coding->produced_char = dst - destination;
2980 return result;
2984 /*** 7. C library functions ***/
2986 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2987 has a property `coding-system'. The value of this property is a
2988 vector of length 5 (called as coding-vector). Among elements of
2989 this vector, the first (element[0]) and the fifth (element[4])
2990 carry important information for decoding/encoding. Before
2991 decoding/encoding, this information should be set in fields of a
2992 structure of type `coding_system'.
2994 A value of property `coding-system' can be a symbol of another
2995 subsidiary coding-system. In that case, Emacs gets coding-vector
2996 from that symbol.
2998 `element[0]' contains information to be set in `coding->type'. The
2999 value and its meaning is as follows:
3001 0 -- coding_type_emacs_mule
3002 1 -- coding_type_sjis
3003 2 -- coding_type_iso2022
3004 3 -- coding_type_big5
3005 4 -- coding_type_ccl encoder/decoder written in CCL
3006 nil -- coding_type_no_conversion
3007 t -- coding_type_undecided (automatic conversion on decoding,
3008 no-conversion on encoding)
3010 `element[4]' contains information to be set in `coding->flags' and
3011 `coding->spec'. The meaning varies by `coding->type'.
3013 If `coding->type' is `coding_type_iso2022', element[4] is a vector
3014 of length 32 (of which the first 13 sub-elements are used now).
3015 Meanings of these sub-elements are:
3017 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3018 If the value is an integer of valid charset, the charset is
3019 assumed to be designated to graphic register N initially.
3021 If the value is minus, it is a minus value of charset which
3022 reserves graphic register N, which means that the charset is
3023 not designated initially but should be designated to graphic
3024 register N just before encoding a character in that charset.
3026 If the value is nil, graphic register N is never used on
3027 encoding.
3029 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3030 Each value takes t or nil. See the section ISO2022 of
3031 `coding.h' for more information.
3033 If `coding->type' is `coding_type_big5', element[4] is t to denote
3034 BIG5-ETen or nil to denote BIG5-HKU.
3036 If `coding->type' takes the other value, element[4] is ignored.
3038 Emacs Lisp's coding system also carries information about format of
3039 end-of-line in a value of property `eol-type'. If the value is
3040 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3041 means CODING_EOL_CR. If it is not integer, it should be a vector
3042 of subsidiary coding systems of which property `eol-type' has one
3043 of above values.
3047 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3048 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3049 is setup so that no conversion is necessary and return -1, else
3050 return 0. */
3053 setup_coding_system (coding_system, coding)
3054 Lisp_Object coding_system;
3055 struct coding_system *coding;
3057 Lisp_Object coding_spec, coding_type, eol_type, plist;
3058 Lisp_Object val;
3059 int i;
3061 /* Initialize some fields required for all kinds of coding systems. */
3062 coding->symbol = coding_system;
3063 coding->common_flags = 0;
3064 coding->mode = 0;
3065 coding->heading_ascii = -1;
3066 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3067 coding->composing = COMPOSITION_DISABLED;
3068 coding->cmp_data = NULL;
3070 if (NILP (coding_system))
3071 goto label_invalid_coding_system;
3073 coding_spec = Fget (coding_system, Qcoding_system);
3075 if (!VECTORP (coding_spec)
3076 || XVECTOR (coding_spec)->size != 5
3077 || !CONSP (XVECTOR (coding_spec)->contents[3]))
3078 goto label_invalid_coding_system;
3080 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3081 if (VECTORP (eol_type))
3083 coding->eol_type = CODING_EOL_UNDECIDED;
3084 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3086 else if (XFASTINT (eol_type) == 1)
3088 coding->eol_type = CODING_EOL_CRLF;
3089 coding->common_flags
3090 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3092 else if (XFASTINT (eol_type) == 2)
3094 coding->eol_type = CODING_EOL_CR;
3095 coding->common_flags
3096 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3098 else
3099 coding->eol_type = CODING_EOL_LF;
3101 coding_type = XVECTOR (coding_spec)->contents[0];
3102 /* Try short cut. */
3103 if (SYMBOLP (coding_type))
3105 if (EQ (coding_type, Qt))
3107 coding->type = coding_type_undecided;
3108 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3110 else
3111 coding->type = coding_type_no_conversion;
3112 return 0;
3115 /* Get values of coding system properties:
3116 `post-read-conversion', `pre-write-conversion',
3117 `translation-table-for-decode', `translation-table-for-encode'. */
3118 plist = XVECTOR (coding_spec)->contents[3];
3119 /* Pre & post conversion functions should be disabled if
3120 inhibit_eol_conversion is nozero. This is the case that a code
3121 conversion function is called while those functions are running. */
3122 if (! inhibit_pre_post_conversion)
3124 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3125 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3127 val = Fplist_get (plist, Qtranslation_table_for_decode);
3128 if (SYMBOLP (val))
3129 val = Fget (val, Qtranslation_table_for_decode);
3130 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3131 val = Fplist_get (plist, Qtranslation_table_for_encode);
3132 if (SYMBOLP (val))
3133 val = Fget (val, Qtranslation_table_for_encode);
3134 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3135 val = Fplist_get (plist, Qcoding_category);
3136 if (!NILP (val))
3138 val = Fget (val, Qcoding_category_index);
3139 if (INTEGERP (val))
3140 coding->category_idx = XINT (val);
3141 else
3142 goto label_invalid_coding_system;
3144 else
3145 goto label_invalid_coding_system;
3147 val = Fplist_get (plist, Qsafe_charsets);
3148 if (EQ (val, Qt))
3150 for (i = 0; i <= MAX_CHARSET; i++)
3151 coding->safe_charsets[i] = 1;
3153 else
3155 bzero (coding->safe_charsets, MAX_CHARSET + 1);
3156 while (CONSP (val))
3158 if ((i = get_charset_id (XCAR (val))) >= 0)
3159 coding->safe_charsets[i] = 1;
3160 val = XCDR (val);
3164 /* If the coding system has non-nil `composition' property, enable
3165 composition handling. */
3166 val = Fplist_get (plist, Qcomposition);
3167 if (!NILP (val))
3168 coding->composing = COMPOSITION_NO;
3170 switch (XFASTINT (coding_type))
3172 case 0:
3173 coding->type = coding_type_emacs_mule;
3174 if (!NILP (coding->post_read_conversion))
3175 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3176 if (!NILP (coding->pre_write_conversion))
3177 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3178 break;
3180 case 1:
3181 coding->type = coding_type_sjis;
3182 coding->common_flags
3183 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3184 break;
3186 case 2:
3187 coding->type = coding_type_iso2022;
3188 coding->common_flags
3189 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3191 Lisp_Object val, temp;
3192 Lisp_Object *flags;
3193 int i, charset, reg_bits = 0;
3195 val = XVECTOR (coding_spec)->contents[4];
3197 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3198 goto label_invalid_coding_system;
3200 flags = XVECTOR (val)->contents;
3201 coding->flags
3202 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3203 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3204 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3205 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3206 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3207 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3208 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3209 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3210 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3211 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3212 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3213 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3214 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3217 /* Invoke graphic register 0 to plane 0. */
3218 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3219 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3220 CODING_SPEC_ISO_INVOCATION (coding, 1)
3221 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3222 /* Not single shifting at first. */
3223 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3224 /* Beginning of buffer should also be regarded as bol. */
3225 CODING_SPEC_ISO_BOL (coding) = 1;
3227 for (charset = 0; charset <= MAX_CHARSET; charset++)
3228 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3229 val = Vcharset_revision_alist;
3230 while (CONSP (val))
3232 charset = get_charset_id (Fcar_safe (XCAR (val)));
3233 if (charset >= 0
3234 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3235 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3236 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3237 val = XCDR (val);
3240 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3241 FLAGS[REG] can be one of below:
3242 integer CHARSET: CHARSET occupies register I,
3243 t: designate nothing to REG initially, but can be used
3244 by any charsets,
3245 list of integer, nil, or t: designate the first
3246 element (if integer) to REG initially, the remaining
3247 elements (if integer) is designated to REG on request,
3248 if an element is t, REG can be used by any charsets,
3249 nil: REG is never used. */
3250 for (charset = 0; charset <= MAX_CHARSET; charset++)
3251 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3252 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3253 for (i = 0; i < 4; i++)
3255 if (INTEGERP (flags[i])
3256 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3257 || (charset = get_charset_id (flags[i])) >= 0)
3259 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3260 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3262 else if (EQ (flags[i], Qt))
3264 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3265 reg_bits |= 1 << i;
3266 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3268 else if (CONSP (flags[i]))
3270 Lisp_Object tail;
3271 tail = flags[i];
3273 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3274 if (INTEGERP (XCAR (tail))
3275 && (charset = XINT (XCAR (tail)),
3276 CHARSET_VALID_P (charset))
3277 || (charset = get_charset_id (XCAR (tail))) >= 0)
3279 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3280 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3282 else
3283 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3284 tail = XCDR (tail);
3285 while (CONSP (tail))
3287 if (INTEGERP (XCAR (tail))
3288 && (charset = XINT (XCAR (tail)),
3289 CHARSET_VALID_P (charset))
3290 || (charset = get_charset_id (XCAR (tail))) >= 0)
3291 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3292 = i;
3293 else if (EQ (XCAR (tail), Qt))
3294 reg_bits |= 1 << i;
3295 tail = XCDR (tail);
3298 else
3299 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3301 CODING_SPEC_ISO_DESIGNATION (coding, i)
3302 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3305 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3307 /* REG 1 can be used only by locking shift in 7-bit env. */
3308 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3309 reg_bits &= ~2;
3310 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3311 /* Without any shifting, only REG 0 and 1 can be used. */
3312 reg_bits &= 3;
3315 if (reg_bits)
3316 for (charset = 0; charset <= MAX_CHARSET; charset++)
3318 if (CHARSET_VALID_P (charset))
3320 /* There exist some default graphic registers to be
3321 used CHARSET. */
3323 /* We had better avoid designating a charset of
3324 CHARS96 to REG 0 as far as possible. */
3325 if (CHARSET_CHARS (charset) == 96)
3326 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3327 = (reg_bits & 2
3328 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3329 else
3330 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3331 = (reg_bits & 1
3332 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3336 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3337 coding->spec.iso2022.last_invalid_designation_register = -1;
3338 break;
3340 case 3:
3341 coding->type = coding_type_big5;
3342 coding->common_flags
3343 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3344 coding->flags
3345 = (NILP (XVECTOR (coding_spec)->contents[4])
3346 ? CODING_FLAG_BIG5_HKU
3347 : CODING_FLAG_BIG5_ETEN);
3348 break;
3350 case 4:
3351 coding->type = coding_type_ccl;
3352 coding->common_flags
3353 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3355 val = XVECTOR (coding_spec)->contents[4];
3356 if (! CONSP (val)
3357 || setup_ccl_program (&(coding->spec.ccl.decoder),
3358 XCAR (val)) < 0
3359 || setup_ccl_program (&(coding->spec.ccl.encoder),
3360 XCDR (val)) < 0)
3361 goto label_invalid_coding_system;
3363 bzero (coding->spec.ccl.valid_codes, 256);
3364 val = Fplist_get (plist, Qvalid_codes);
3365 if (CONSP (val))
3367 Lisp_Object this;
3369 for (; CONSP (val); val = XCDR (val))
3371 this = XCAR (val);
3372 if (INTEGERP (this)
3373 && XINT (this) >= 0 && XINT (this) < 256)
3374 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3375 else if (CONSP (this)
3376 && INTEGERP (XCAR (this))
3377 && INTEGERP (XCDR (this)))
3379 int start = XINT (XCAR (this));
3380 int end = XINT (XCDR (this));
3382 if (start >= 0 && start <= end && end < 256)
3383 while (start <= end)
3384 coding->spec.ccl.valid_codes[start++] = 1;
3389 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3390 break;
3392 case 5:
3393 coding->type = coding_type_raw_text;
3394 break;
3396 default:
3397 goto label_invalid_coding_system;
3399 return 0;
3401 label_invalid_coding_system:
3402 coding->type = coding_type_no_conversion;
3403 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3404 coding->common_flags = 0;
3405 coding->eol_type = CODING_EOL_LF;
3406 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3407 return -1;
3410 /* Free memory blocks allocated for storing composition information. */
3412 void
3413 coding_free_composition_data (coding)
3414 struct coding_system *coding;
3416 struct composition_data *cmp_data = coding->cmp_data, *next;
3418 if (!cmp_data)
3419 return;
3420 /* Memory blocks are chained. At first, rewind to the first, then,
3421 free blocks one by one. */
3422 while (cmp_data->prev)
3423 cmp_data = cmp_data->prev;
3424 while (cmp_data)
3426 next = cmp_data->next;
3427 xfree (cmp_data);
3428 cmp_data = next;
3430 coding->cmp_data = NULL;
3433 /* Set `char_offset' member of all memory blocks pointed by
3434 coding->cmp_data to POS. */
3436 void
3437 coding_adjust_composition_offset (coding, pos)
3438 struct coding_system *coding;
3439 int pos;
3441 struct composition_data *cmp_data;
3443 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3444 cmp_data->char_offset = pos;
3447 /* Setup raw-text or one of its subsidiaries in the structure
3448 coding_system CODING according to the already setup value eol_type
3449 in CODING. CODING should be setup for some coding system in
3450 advance. */
3452 void
3453 setup_raw_text_coding_system (coding)
3454 struct coding_system *coding;
3456 if (coding->type != coding_type_raw_text)
3458 coding->symbol = Qraw_text;
3459 coding->type = coding_type_raw_text;
3460 if (coding->eol_type != CODING_EOL_UNDECIDED)
3462 Lisp_Object subsidiaries;
3463 subsidiaries = Fget (Qraw_text, Qeol_type);
3465 if (VECTORP (subsidiaries)
3466 && XVECTOR (subsidiaries)->size == 3)
3467 coding->symbol
3468 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3470 setup_coding_system (coding->symbol, coding);
3472 return;
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are categorized into the following categories:
3481 o coding-category-emacs-mule
3483 The category for a coding system which has the same code range
3484 as Emacs' internal format. Assigned the coding-system (Lisp
3485 symbol) `emacs-mule' by default.
3487 o coding-category-sjis
3489 The category for a coding system which has the same code range
3490 as SJIS. Assigned the coding-system (Lisp
3491 symbol) `japanese-shift-jis' by default.
3493 o coding-category-iso-7
3495 The category for a coding system which has the same code range
3496 as ISO2022 of 7-bit environment. This doesn't use any locking
3497 shift and single shift functions. This can encode/decode all
3498 charsets. Assigned the coding-system (Lisp symbol)
3499 `iso-2022-7bit' by default.
3501 o coding-category-iso-7-tight
3503 Same as coding-category-iso-7 except that this can
3504 encode/decode only the specified charsets.
3506 o coding-category-iso-8-1
3508 The category for a coding system which has the same code range
3509 as ISO2022 of 8-bit environment and graphic plane 1 used only
3510 for DIMENSION1 charset. This doesn't use any locking shift
3511 and single shift functions. Assigned the coding-system (Lisp
3512 symbol) `iso-latin-1' by default.
3514 o coding-category-iso-8-2
3516 The category for a coding system which has the same code range
3517 as ISO2022 of 8-bit environment and graphic plane 1 used only
3518 for DIMENSION2 charset. This doesn't use any locking shift
3519 and single shift functions. Assigned the coding-system (Lisp
3520 symbol) `japanese-iso-8bit' by default.
   o coding-category-iso-7-else

	The category for a coding system which has the same code range
	as ISO2022 of 7-bit environment but uses locking shift or
	single shift functions.  Assigned the coding-system (Lisp
	symbol) `iso-2022-7bit-lock' by default.
   o coding-category-iso-8-else

	The category for a coding system which has the same code range
	as ISO2022 of 8-bit environment but uses locking shift or
	single shift functions.  Assigned the coding-system (Lisp
	symbol) `iso-2022-8bit-ss2' by default.
3536 o coding-category-big5
3538 The category for a coding system which has the same code range
3539 as BIG5. Assigned the coding-system (Lisp symbol)
3540 `cn-big5' by default.
3542 o coding-category-utf-8
3544 The category for a coding system which has the same code range
3545 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
3546 symbol) `utf-8' by default.
3548 o coding-category-utf-16-be
3550 The category for a coding system in which a text has an
3551 Unicode signature (cf. Unicode Standard) in the order of BIG
3552 endian at the head. Assigned the coding-system (Lisp symbol)
3553 `utf-16-be' by default.
3555 o coding-category-utf-16-le
3557 The category for a coding system in which a text has an
3558 Unicode signature (cf. Unicode Standard) in the order of
3559 LITTLE endian at the head. Assigned the coding-system (Lisp
3560 symbol) `utf-16-le' by default.
3562 o coding-category-ccl
3564 The category for a coding system of which encoder/decoder is
3565 written in CCL programs. The default value is nil, i.e., no
3566 coding system is assigned.
3568 o coding-category-binary
3570 The category for a coding system not categorized in any of the
3571 above. Assigned the coding-system (Lisp symbol)
3572 `no-conversion' by default.
   Each of them is a Lisp symbol whose value is the actual
   `coding-system' (this is also a Lisp symbol) assigned by a user.
   What Emacs actually does is to detect a category of coding system.
   Then, it uses the `coding-system' assigned to that category.  If
   Emacs can't narrow the text down to a single category, it selects
   the category of the highest priority.  Priorities of categories are
   also specified by a user in a Lisp variable `coding-category-list'.

*/
3584 static
3585 int ascii_skip_code[256];
3587 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3588 If it detects possible coding systems, return an integer in which
3589 appropriate flag bits are set. Flag bits are defined by macros
3590 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
3591 it should point the table `coding_priorities'. In that case, only
3592 the flag bit for a coding system of the highest priority is set in
3593 the returned value.
3595 How many ASCII characters are at the head is returned as *SKIP. */
3597 static int
3598 detect_coding_mask (source, src_bytes, priorities, skip)
3599 unsigned char *source;
3600 int src_bytes, *priorities, *skip;
3602 register unsigned char c;
3603 unsigned char *src = source, *src_end = source + src_bytes;
3604 unsigned int mask, utf16_examined_p, iso2022_examined_p;
3605 int i, idx;
3607 /* At first, skip all ASCII characters and control characters except
3608 for three ISO2022 specific control characters. */
3609 ascii_skip_code[ISO_CODE_SO] = 0;
3610 ascii_skip_code[ISO_CODE_SI] = 0;
3611 ascii_skip_code[ISO_CODE_ESC] = 0;
3613 label_loop_detect_coding:
3614 while (src < src_end && ascii_skip_code[*src]) src++;
3615 *skip = src - source;
3617 if (src >= src_end)
3618 /* We found nothing other than ASCII. There's nothing to do. */
3619 return 0;
3621 c = *src;
3622 /* The text seems to be encoded in some multilingual coding system.
3623 Now, try to find in which coding system the text is encoded. */
3624 if (c < 0x80)
3626 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3627 /* C is an ISO2022 specific control code of C0. */
3628 mask = detect_coding_iso2022 (src, src_end);
3629 if (mask == 0)
3631 /* No valid ISO2022 code follows C. Try again. */
3632 src++;
3633 if (c == ISO_CODE_ESC)
3634 ascii_skip_code[ISO_CODE_ESC] = 1;
3635 else
3636 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3637 goto label_loop_detect_coding;
3639 if (priorities)
3641 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3643 if (mask & priorities[i])
3644 return priorities[i];
3646 return CODING_CATEGORY_MASK_RAW_TEXT;
3649 else
3651 int try;
3653 if (c < 0xA0)
3655 /* C is the first byte of SJIS character code,
3656 or a leading-code of Emacs' internal format (emacs-mule),
3657 or the first byte of UTF-16. */
3658 try = (CODING_CATEGORY_MASK_SJIS
3659 | CODING_CATEGORY_MASK_EMACS_MULE
3660 | CODING_CATEGORY_MASK_UTF_16_BE
3661 | CODING_CATEGORY_MASK_UTF_16_LE);
3663 /* Or, if C is a special latin extra code,
3664 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3665 or is an ISO2022 control-sequence-introducer (CSI),
3666 we should also consider the possibility of ISO2022 codings. */
3667 if ((VECTORP (Vlatin_extra_code_table)
3668 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3669 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3670 || (c == ISO_CODE_CSI
3671 && (src < src_end
3672 && (*src == ']'
3673 || ((*src == '0' || *src == '1' || *src == '2')
3674 && src + 1 < src_end
3675 && src[1] == ']')))))
3676 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3677 | CODING_CATEGORY_MASK_ISO_8BIT);
3679 else
3680 /* C is a character of ISO2022 in graphic plane right,
3681 or a SJIS's 1-byte character code (i.e. JISX0201),
3682 or the first byte of BIG5's 2-byte code,
3683 or the first byte of UTF-8/16. */
3684 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3685 | CODING_CATEGORY_MASK_ISO_8BIT
3686 | CODING_CATEGORY_MASK_SJIS
3687 | CODING_CATEGORY_MASK_BIG5
3688 | CODING_CATEGORY_MASK_UTF_8
3689 | CODING_CATEGORY_MASK_UTF_16_BE
3690 | CODING_CATEGORY_MASK_UTF_16_LE);
3692 /* Or, we may have to consider the possibility of CCL. */
3693 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3694 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3695 ->spec.ccl.valid_codes)[c])
3696 try |= CODING_CATEGORY_MASK_CCL;
3698 mask = 0;
3699 utf16_examined_p = iso2022_examined_p = 0;
3700 if (priorities)
3702 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3704 if (!iso2022_examined_p
3705 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3707 mask |= detect_coding_iso2022 (src, src_end);
3708 iso2022_examined_p = 1;
3710 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3711 mask |= detect_coding_sjis (src, src_end);
3712 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3713 mask |= detect_coding_utf_8 (src, src_end);
3714 else if (!utf16_examined_p
3715 && (priorities[i] & try &
3716 CODING_CATEGORY_MASK_UTF_16_BE_LE))
3718 mask |= detect_coding_utf_16 (src, src_end);
3719 utf16_examined_p = 1;
3721 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3722 mask |= detect_coding_big5 (src, src_end);
3723 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3724 mask |= detect_coding_emacs_mule (src, src_end);
3725 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3726 mask |= detect_coding_ccl (src, src_end);
3727 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3728 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3729 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3730 mask |= CODING_CATEGORY_MASK_BINARY;
3731 if (mask & priorities[i])
3732 return priorities[i];
3734 return CODING_CATEGORY_MASK_RAW_TEXT;
3736 if (try & CODING_CATEGORY_MASK_ISO)
3737 mask |= detect_coding_iso2022 (src, src_end);
3738 if (try & CODING_CATEGORY_MASK_SJIS)
3739 mask |= detect_coding_sjis (src, src_end);
3740 if (try & CODING_CATEGORY_MASK_BIG5)
3741 mask |= detect_coding_big5 (src, src_end);
3742 if (try & CODING_CATEGORY_MASK_UTF_8)
3743 mask |= detect_coding_utf_8 (src, src_end);
3744 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3745 mask |= detect_coding_utf_16 (src, src_end);
3746 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3747 mask |= detect_coding_emacs_mule (src, src_end);
3748 if (try & CODING_CATEGORY_MASK_CCL)
3749 mask |= detect_coding_ccl (src, src_end);
3751 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3754 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3755 The information of the detected coding system is set in CODING. */
3757 void
3758 detect_coding (coding, src, src_bytes)
3759 struct coding_system *coding;
3760 unsigned char *src;
3761 int src_bytes;
3763 unsigned int idx;
3764 int skip, mask, i;
3765 Lisp_Object val;
3767 val = Vcoding_category_list;
3768 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3769 coding->heading_ascii = skip;
3771 if (!mask) return;
3773 /* We found a single coding system of the highest priority in MASK. */
3774 idx = 0;
3775 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3776 if (! mask)
3777 idx = CODING_CATEGORY_IDX_RAW_TEXT;
3779 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3781 if (coding->eol_type != CODING_EOL_UNDECIDED)
3783 Lisp_Object tmp;
3785 tmp = Fget (val, Qeol_type);
3786 if (VECTORP (tmp))
3787 val = XVECTOR (tmp)->contents[coding->eol_type];
3789 setup_coding_system (val, coding);
3790 /* Set this again because setup_coding_system reset this member. */
3791 coding->heading_ascii = skip;
3794 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3795 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3796 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3798 How many non-eol characters are at the head is returned as *SKIP. */
3800 #define MAX_EOL_CHECK_COUNT 3
3802 static int
3803 detect_eol_type (source, src_bytes, skip)
3804 unsigned char *source;
3805 int src_bytes, *skip;
3807 unsigned char *src = source, *src_end = src + src_bytes;
3808 unsigned char c;
3809 int total = 0; /* How many end-of-lines are found so far. */
3810 int eol_type = CODING_EOL_UNDECIDED;
3811 int this_eol_type;
3813 *skip = 0;
3815 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3817 c = *src++;
3818 if (c == '\n' || c == '\r')
3820 if (*skip == 0)
3821 *skip = src - 1 - source;
3822 total++;
3823 if (c == '\n')
3824 this_eol_type = CODING_EOL_LF;
3825 else if (src >= src_end || *src != '\n')
3826 this_eol_type = CODING_EOL_CR;
3827 else
3828 this_eol_type = CODING_EOL_CRLF, src++;
3830 if (eol_type == CODING_EOL_UNDECIDED)
3831 /* This is the first end-of-line. */
3832 eol_type = this_eol_type;
3833 else if (eol_type != this_eol_type)
3835 /* The found type is different from what found before. */
3836 eol_type = CODING_EOL_INCONSISTENT;
3837 break;
3842 if (*skip == 0)
3843 *skip = src_end - source;
3844 return eol_type;
3847 /* Like detect_eol_type, but detect EOL type in 2-octet
3848 big-endian/little-endian format for coding systems utf-16-be and
3849 utf-16-le. */
3851 static int
3852 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3853 unsigned char *source;
3854 int src_bytes, *skip;
3856 unsigned char *src = source, *src_end = src + src_bytes;
3857 unsigned int c1, c2;
3858 int total = 0; /* How many end-of-lines are found so far. */
3859 int eol_type = CODING_EOL_UNDECIDED;
3860 int this_eol_type;
3861 int msb, lsb;
3863 if (big_endian_p)
3864 msb = 0, lsb = 1;
3865 else
3866 msb = 1, lsb = 0;
3868 *skip = 0;
3870 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3872 c1 = (src[msb] << 8) | (src[lsb]);
3873 src += 2;
3875 if (c1 == '\n' || c1 == '\r')
3877 if (*skip == 0)
3878 *skip = src - 2 - source;
3879 total++;
3880 if (c1 == '\n')
3882 this_eol_type = CODING_EOL_LF;
3884 else
3886 if ((src + 1) >= src_end)
3888 this_eol_type = CODING_EOL_CR;
3890 else
3892 c2 = (src[msb] << 8) | (src[lsb]);
3893 if (c2 == '\n')
3894 this_eol_type = CODING_EOL_CRLF, src += 2;
3895 else
3896 this_eol_type = CODING_EOL_CR;
3900 if (eol_type == CODING_EOL_UNDECIDED)
3901 /* This is the first end-of-line. */
3902 eol_type = this_eol_type;
3903 else if (eol_type != this_eol_type)
3905 /* The found type is different from what found before. */
3906 eol_type = CODING_EOL_INCONSISTENT;
3907 break;
3912 if (*skip == 0)
3913 *skip = src_end - source;
3914 return eol_type;
3917 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3918 is encoded. If it detects an appropriate format of end-of-line, it
3919 sets the information in *CODING. */
3921 void
3922 detect_eol (coding, src, src_bytes)
3923 struct coding_system *coding;
3924 unsigned char *src;
3925 int src_bytes;
3927 Lisp_Object val;
3928 int skip;
3929 int eol_type;
3931 switch (coding->category_idx)
3933 case CODING_CATEGORY_IDX_UTF_16_BE:
3934 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3935 break;
3936 case CODING_CATEGORY_IDX_UTF_16_LE:
3937 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3938 break;
3939 default:
3940 eol_type = detect_eol_type (src, src_bytes, &skip);
3941 break;
3944 if (coding->heading_ascii > skip)
3945 coding->heading_ascii = skip;
3946 else
3947 skip = coding->heading_ascii;
3949 if (eol_type == CODING_EOL_UNDECIDED)
3950 return;
3951 if (eol_type == CODING_EOL_INCONSISTENT)
3953 #if 0
3954 /* This code is suppressed until we find a better way to
3955 distinguish raw text file and binary file. */
3957 /* If we have already detected that the coding is raw-text, the
3958 coding should actually be no-conversion. */
3959 if (coding->type == coding_type_raw_text)
3961 setup_coding_system (Qno_conversion, coding);
3962 return;
3964 /* Else, let's decode only text code anyway. */
3965 #endif /* 0 */
3966 eol_type = CODING_EOL_LF;
3969 val = Fget (coding->symbol, Qeol_type);
3970 if (VECTORP (val) && XVECTOR (val)->size == 3)
3972 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3973 coding->heading_ascii = skip;
/* Number of bytes added on top of the estimated size of a
   conversion buffer.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the size of a text is multiplied to estimate the
   size of its decoded form: 3 for ISO2022, 2 for SJIS and BIG5, 1 for
   raw-text, the decoder's `buf_magnification' for CCL, and 2 for
   everything else.  CODING is a pointer to a struct coding_system; it
   may be any pointer expression, but it is evaluated more than once,
   so it must not have side effects.  */
#define DECODING_BUFFER_MAG(coding)				\
  ((coding)->type == coding_type_iso2022			\
   ? 3								\
   : (((coding)->type == coding_type_sjis			\
       || (coding)->type == coding_type_big5)			\
      ? 2							\
      : ((coding)->type == coding_type_raw_text			\
	 ? 1							\
	 : ((coding)->type == coding_type_ccl			\
	    ? (coding)->spec.ccl.decoder.buf_magnification	\
	    : 2))))
3990 /* Return maximum size (bytes) of a buffer enough for decoding
3991 SRC_BYTES of text encoded in CODING. */
3994 decoding_buffer_size (coding, src_bytes)
3995 struct coding_system *coding;
3996 int src_bytes;
3998 return (src_bytes * DECODING_BUFFER_MAG (coding)
3999 + CONVERSION_BUFFER_EXTRA_ROOM);
4002 /* Return maximum size (bytes) of a buffer enough for encoding
4003 SRC_BYTES of text to CODING. */
4006 encoding_buffer_size (coding, src_bytes)
4007 struct coding_system *coding;
4008 int src_bytes;
4010 int magnification;
4012 if (coding->type == coding_type_ccl)
4013 magnification = coding->spec.ccl.encoder.buf_magnification;
4014 else
4015 magnification = 3;
4017 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared conversion buffer and its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The buffer is shared: it is replaced by a
   larger one when SIZE exceeds its current size, and the previous
   contents are not preserved.

   NOTE(review): the old comment said NULL is returned when memory
   runs out; that depends on what xmalloc does on failure, which is
   not visible here.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling zero never reaches SIZE, so start from the minimum
	 size when no buffer has been set up yet.  */
      if (real_size <= 0)
	real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size)
	real_size *= 2;

      /* Allocate the new buffer before releasing the old one.  */
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
4050 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4051 struct coding_system *coding;
4052 unsigned char *source, *destination;
4053 int src_bytes, dst_bytes, encodep;
4055 struct ccl_program *ccl
4056 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4057 int result;
4059 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4061 coding->produced = ccl_driver (ccl, source, destination,
4062 src_bytes, dst_bytes, &(coding->consumed));
4063 coding->produced_char
4064 = (encodep
4065 ? coding->produced
4066 : multibyte_chars_in_text (destination, coding->produced));
4067 coding->consumed_char
4068 = multibyte_chars_in_text (source, coding->consumed);
4070 switch (ccl->status)
4072 case CCL_STAT_SUSPEND_BY_SRC:
4073 result = CODING_FINISH_INSUFFICIENT_SRC;
4074 break;
4075 case CCL_STAT_SUSPEND_BY_DST:
4076 result = CODING_FINISH_INSUFFICIENT_DST;
4077 break;
4078 case CCL_STAT_QUIT:
4079 case CCL_STAT_INVALID_CMD:
4080 result = CODING_FINISH_INTERRUPT;
4081 break;
4082 default:
4083 result = CODING_FINISH_NORMAL;
4084 break;
4086 return result;
4089 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4090 decoding, it may detect coding system and format of end-of-line if
4091 those are not yet decided.
4093 This function does not make full use of DESTINATION buffer. For
4094 instance, if coding->type is coding_type_iso2022, it uses only
4095 (DST_BYTES - 7) bytes of DESTINATION buffer. In the case that
4096 DST_BYTES is decided by the function decoding_buffer_size, it
4097 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
4098 So, this function can decode the full SOURCE. But, in the other
4099 case, if you want to avoid carry over, you must supply at least 7
4100 bytes more area in DESTINATION buffer than expected maximum bytes
4101 that will be produced by this function. */
4104 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4105 struct coding_system *coding;
4106 unsigned char *source, *destination;
4107 int src_bytes, dst_bytes;
4109 int result;
4111 if (src_bytes <= 0
4112 && coding->type != coding_type_ccl
4113 && ! (coding->mode & CODING_MODE_LAST_BLOCK
4114 && CODING_REQUIRE_FLUSHING (coding)))
4116 coding->produced = coding->produced_char = 0;
4117 coding->consumed = coding->consumed_char = 0;
4118 coding->fake_multibyte = 0;
4119 return CODING_FINISH_NORMAL;
4122 if (coding->type == coding_type_undecided)
4123 detect_coding (coding, source, src_bytes);
4125 if (coding->eol_type == CODING_EOL_UNDECIDED)
4126 detect_eol (coding, source, src_bytes);
4128 switch (coding->type)
4130 case coding_type_emacs_mule:
4131 case coding_type_undecided:
4132 case coding_type_raw_text:
4133 if (coding->eol_type == CODING_EOL_LF
4134 || coding->eol_type == CODING_EOL_UNDECIDED)
4135 goto label_no_conversion;
4136 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
4137 break;
4139 case coding_type_sjis:
4140 result = decode_coding_sjis_big5 (coding, source, destination,
4141 src_bytes, dst_bytes, 1);
4142 break;
4144 case coding_type_iso2022:
4145 result = decode_coding_iso2022 (coding, source, destination,
4146 src_bytes, dst_bytes);
4147 break;
4149 case coding_type_big5:
4150 result = decode_coding_sjis_big5 (coding, source, destination,
4151 src_bytes, dst_bytes, 0);
4152 break;
4154 case coding_type_ccl:
4155 result = ccl_coding_driver (coding, source, destination,
4156 src_bytes, dst_bytes, 0);
4157 break;
4159 default: /* i.e. case coding_type_no_conversion: */
4160 label_no_conversion:
4161 if (dst_bytes && src_bytes > dst_bytes)
4163 coding->produced = dst_bytes;
4164 result = CODING_FINISH_INSUFFICIENT_DST;
4166 else
4168 coding->produced = src_bytes;
4169 result = CODING_FINISH_NORMAL;
4171 if (dst_bytes)
4172 bcopy (source, destination, coding->produced);
4173 else
4174 safe_bcopy (source, destination, coding->produced);
4175 coding->fake_multibyte = 1;
4176 coding->consumed
4177 = coding->consumed_char = coding->produced_char = coding->produced;
4178 break;
4181 return result;
4184 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".
4186 This function does not make full use of DESTINATION buffer. For
4187 instance, if coding->type is coding_type_iso2022, it uses only
4188 (DST_BYTES - 20) bytes of DESTINATION buffer. In the case that
4189 DST_BYTES is decided by the function encoding_buffer_size, it
4190 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
4191 So, this function can encode the full SOURCE. But, in the other
4192 case, if you want to avoid carry over, you must supply at least 20
4193 bytes more area in DESTINATION buffer than expected maximum bytes
4194 that will be produced by this function. */
4197 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4198 struct coding_system *coding;
4199 unsigned char *source, *destination;
4200 int src_bytes, dst_bytes;
4202 int result;
4204 if (src_bytes <= 0
4205 && ! (coding->mode & CODING_MODE_LAST_BLOCK
4206 && CODING_REQUIRE_FLUSHING (coding)))
4208 coding->produced = coding->produced_char = 0;
4209 coding->consumed = coding->consumed_char = 0;
4210 coding->fake_multibyte = 0;
4211 return CODING_FINISH_NORMAL;
4214 switch (coding->type)
4216 case coding_type_emacs_mule:
4217 case coding_type_undecided:
4218 case coding_type_raw_text:
4219 if (coding->eol_type == CODING_EOL_LF
4220 || coding->eol_type == CODING_EOL_UNDECIDED)
4221 goto label_no_conversion;
4222 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
4223 break;
4225 case coding_type_sjis:
4226 result = encode_coding_sjis_big5 (coding, source, destination,
4227 src_bytes, dst_bytes, 1);
4228 break;
4230 case coding_type_iso2022:
4231 result = encode_coding_iso2022 (coding, source, destination,
4232 src_bytes, dst_bytes);
4233 break;
4235 case coding_type_big5:
4236 result = encode_coding_sjis_big5 (coding, source, destination,
4237 src_bytes, dst_bytes, 0);
4238 break;
4240 case coding_type_ccl:
4241 result = ccl_coding_driver (coding, source, destination,
4242 src_bytes, dst_bytes, 1);
4243 break;
4245 default: /* i.e. case coding_type_no_conversion: */
4246 label_no_conversion:
4247 if (dst_bytes && src_bytes > dst_bytes)
4249 coding->produced = dst_bytes;
4250 result = CODING_FINISH_INSUFFICIENT_DST;
4252 else
4254 coding->produced = src_bytes;
4255 result = CODING_FINISH_NORMAL;
4257 if (dst_bytes)
4258 bcopy (source, destination, coding->produced);
4259 else
4260 safe_bcopy (source, destination, coding->produced);
4261 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
4263 unsigned char *p = destination, *pend = p + coding->produced;
4264 while (p < pend)
4265 if (*p++ == '\015') p[-1] = '\n';
4267 coding->fake_multibyte = 1;
4268 coding->consumed
4269 = coding->consumed_char = coding->produced_char = coding->produced;
4270 break;
4273 return result;
4276 /* Scan text in the region between *BEG and *END (byte positions),
4277 skip characters which we don't have to decode by coding system
4278 CODING at the head and tail, then set *BEG and *END to the region
4279 of the text we actually have to convert. The caller should move
4280 the gap out of the region in advance.
4282 If STR is not NULL, *BEG and *END are indices into STR. */
4284 static void
4285 shrink_decoding_region (beg, end, coding, str)
4286 int *beg, *end;
4287 struct coding_system *coding;
4288 unsigned char *str;
4290 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4291 int eol_conversion;
4292 Lisp_Object translation_table;
4294 if (coding->type == coding_type_ccl
4295 || coding->type == coding_type_undecided
4296 || !NILP (coding->post_read_conversion))
4298 /* We can't skip any data. */
4299 return;
4301 else if (coding->type == coding_type_no_conversion)
4303 /* We need no conversion, but don't have to skip any data here.
4304 Decoding routine handles them effectively anyway. */
4305 return;
4308 translation_table = coding->translation_table_for_decode;
4309 if (NILP (translation_table) && !NILP (Venable_character_translation))
4310 translation_table = Vstandard_translation_table_for_decode;
4311 if (CHAR_TABLE_P (translation_table))
4313 int i;
4314 for (i = 0; i < 128; i++)
4315 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4316 break;
4317 if (i < 128)
4318 /* Some ASCII character should be tranlsated. We give up
4319 shrinking. */
4320 return;
4323 eol_conversion = (coding->eol_type != CODING_EOL_LF);
4325 if ((! eol_conversion) && (coding->heading_ascii >= 0))
4326 /* Detection routine has already found how much we can skip at the
4327 head. */
4328 *beg += coding->heading_ascii;
4330 if (str)
4332 begp_orig = begp = str + *beg;
4333 endp_orig = endp = str + *end;
4335 else
4337 begp_orig = begp = BYTE_POS_ADDR (*beg);
4338 endp_orig = endp = begp + *end - *beg;
4341 switch (coding->type)
4343 case coding_type_emacs_mule:
4344 case coding_type_raw_text:
4345 if (eol_conversion)
4347 if (coding->heading_ascii < 0)
4348 while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
4349 while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
4350 endp--;
4351 /* Do not consider LF as ascii if preceded by CR, since that
4352 confuses eol decoding. */
4353 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4354 endp++;
4356 else
4357 begp = endp;
4358 break;
4360 case coding_type_sjis:
4361 case coding_type_big5:
4362 /* We can skip all ASCII characters at the head. */
4363 if (coding->heading_ascii < 0)
4365 if (eol_conversion)
4366 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4367 else
4368 while (begp < endp && *begp < 0x80) begp++;
4370 /* We can skip all ASCII characters at the tail except for the
4371 second byte of SJIS or BIG5 code. */
4372 if (eol_conversion)
4373 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4374 else
4375 while (begp < endp && endp[-1] < 0x80) endp--;
4376 /* Do not consider LF as ascii if preceded by CR, since that
4377 confuses eol decoding. */
4378 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4379 endp++;
4380 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4381 endp++;
4382 break;
4384 default: /* i.e. case coding_type_iso2022: */
4385 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4386 /* We can't skip any data. */
4387 break;
4388 if (coding->heading_ascii < 0)
4390 /* We can skip all ASCII characters at the head except for a
4391 few control codes. */
4392 while (begp < endp && (c = *begp) < 0x80
4393 && c != ISO_CODE_CR && c != ISO_CODE_SO
4394 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4395 && (!eol_conversion || c != ISO_CODE_LF))
4396 begp++;
4398 switch (coding->category_idx)
4400 case CODING_CATEGORY_IDX_ISO_8_1:
4401 case CODING_CATEGORY_IDX_ISO_8_2:
4402 /* We can skip all ASCII characters at the tail. */
4403 if (eol_conversion)
4404 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4405 else
4406 while (begp < endp && endp[-1] < 0x80) endp--;
4407 /* Do not consider LF as ascii if preceded by CR, since that
4408 confuses eol decoding. */
4409 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4410 endp++;
4411 break;
4413 case CODING_CATEGORY_IDX_ISO_7:
4414 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4416 /* We can skip all charactes at the tail except for 8-bit
4417 codes and ESC and the following 2-byte at the tail. */
4418 unsigned char *eight_bit = NULL;
4420 if (eol_conversion)
4421 while (begp < endp
4422 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4424 if (!eight_bit && c & 0x80) eight_bit = endp;
4425 endp--;
4427 else
4428 while (begp < endp
4429 && (c = endp[-1]) != ISO_CODE_ESC)
4431 if (!eight_bit && c & 0x80) eight_bit = endp;
4432 endp--;
4434 /* Do not consider LF as ascii if preceded by CR, since that
4435 confuses eol decoding. */
4436 if (begp < endp && endp < endp_orig
4437 && endp[-1] == '\r' && endp[0] == '\n')
4438 endp++;
4439 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4441 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4442 /* This is an ASCII designation sequence. We can
4443 surely skip the tail. But, if we have
4444 encountered an 8-bit code, skip only the codes
4445 after that. */
4446 endp = eight_bit ? eight_bit : endp + 2;
4447 else
4448 /* Hmmm, we can't skip the tail. */
4449 endp = endp_orig;
4451 else if (eight_bit)
4452 endp = eight_bit;
4456 *beg += begp - begp_orig;
4457 *end += endp - endp_orig;
4458 return;
4461 /* Like shrink_decoding_region but for encoding. */
4463 static void
4464 shrink_encoding_region (beg, end, coding, str)
4465 int *beg, *end;
4466 struct coding_system *coding;
4467 unsigned char *str;
4469 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4470 int eol_conversion;
4471 Lisp_Object translation_table;
4473 if (coding->type == coding_type_ccl)
4474 /* We can't skip any data. */
4475 return;
4476 else if (coding->type == coding_type_no_conversion)
4478 /* We need no conversion. */
4479 *beg = *end;
4480 return;
4483 translation_table = coding->translation_table_for_encode;
4484 if (NILP (translation_table) && !NILP (Venable_character_translation))
4485 translation_table = Vstandard_translation_table_for_encode;
4486 if (CHAR_TABLE_P (translation_table))
4488 int i;
4489 for (i = 0; i < 128; i++)
4490 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4491 break;
4492 if (i < 128)
4493 /* Some ASCII character should be tranlsated. We give up
4494 shrinking. */
4495 return;
4498 if (str)
4500 begp_orig = begp = str + *beg;
4501 endp_orig = endp = str + *end;
4503 else
4505 begp_orig = begp = BYTE_POS_ADDR (*beg);
4506 endp_orig = endp = begp + *end - *beg;
4509 eol_conversion = (coding->eol_type == CODING_EOL_CR
4510 || coding->eol_type == CODING_EOL_CRLF);
4512 /* Here, we don't have to check coding->pre_write_conversion because
4513 the caller is expected to have handled it already. */
4514 switch (coding->type)
4516 case coding_type_undecided:
4517 case coding_type_emacs_mule:
4518 case coding_type_raw_text:
4519 if (eol_conversion)
4521 while (begp < endp && *begp != '\n') begp++;
4522 while (begp < endp && endp[-1] != '\n') endp--;
4524 else
4525 begp = endp;
4526 break;
4528 case coding_type_iso2022:
4529 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4530 /* We can't skip any data. */
4531 break;
4532 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4534 unsigned char *bol = begp;
4535 while (begp < endp && *begp < 0x80)
4537 begp++;
4538 if (begp[-1] == '\n')
4539 bol = begp;
4541 begp = bol;
4542 goto label_skip_tail;
4544 /* fall down ... */
4546 default:
4547 /* We can skip all ASCII characters at the head and tail. */
4548 if (eol_conversion)
4549 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4550 else
4551 while (begp < endp && *begp < 0x80) begp++;
4552 label_skip_tail:
4553 if (eol_conversion)
4554 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4555 else
4556 while (begp < endp && *(endp - 1) < 0x80) endp--;
4557 break;
4560 *beg += begp - begp_orig;
4561 *end += endp - endp_orig;
4562 return;
/* Shrinking the conversion region has some overhead of its own, so
   it is not attempted unless the length of the region is greater
   than this value.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG..*END with shrink_encoding_region (if
   ENCODEP is nonzero) or shrink_decoding_region (otherwise), provided
   the region is longer than the threshold above.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (encodep)                                                    \
          shrink_encoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_decoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
4579 static Lisp_Object
4580 code_convert_region_unwind (dummy)
4581 Lisp_Object dummy;
4583 inhibit_pre_post_conversion = 0;
4584 return Qnil;
4587 /* Store information about all compositions in the range FROM and TO
4588 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
4589 buffer or a string, defaults to the current buffer. */
4591 void
4592 coding_save_composition (coding, from, to, obj)
4593 struct coding_system *coding;
4594 int from, to;
4595 Lisp_Object obj;
4597 Lisp_Object prop;
4598 int start, end;
4600 if (coding->composing == COMPOSITION_DISABLED)
4601 return;
4602 if (!coding->cmp_data)
4603 coding_allocate_composition_data (coding, from);
4604 if (!find_composition (from, to, &start, &end, &prop, obj)
4605 || end > to)
4606 return;
4607 if (start < from
4608 && (!find_composition (end, to, &start, &end, &prop, obj)
4609 || end > to))
4610 return;
4611 coding->composing = COMPOSITION_NO;
4614 if (COMPOSITION_VALID_P (start, end, prop))
4616 enum composition_method method = COMPOSITION_METHOD (prop);
4617 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4618 >= COMPOSITION_DATA_SIZE)
4619 coding_allocate_composition_data (coding, from);
4620 /* For relative composition, we remember start and end
4621 positions, for the other compositions, we also remember
4622 components. */
4623 CODING_ADD_COMPOSITION_START (coding, start - from, method);
4624 if (method != COMPOSITION_RELATIVE)
4626 /* We must store a*/
4627 Lisp_Object val, ch;
4629 val = COMPOSITION_COMPONENTS (prop);
4630 if (CONSP (val))
4631 while (CONSP (val))
4633 ch = XCAR (val), val = XCDR (val);
4634 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4636 else if (VECTORP (val) || STRINGP (val))
4638 int len = (VECTORP (val)
4639 ? XVECTOR (val)->size : XSTRING (val)->size);
4640 int i;
4641 for (i = 0; i < len; i++)
4643 ch = (STRINGP (val)
4644 ? Faref (val, make_number (i))
4645 : XVECTOR (val)->contents[i]);
4646 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4649 else /* INTEGERP (val) */
4650 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4652 CODING_ADD_COMPOSITION_END (coding, end - from);
4654 start = end;
4656 while (start < to
4657 && find_composition (start, to, &start, &end, &prop, obj)
4658 && end <= to);
4660 /* Make coding->cmp_data point to the first memory block. */
4661 while (coding->cmp_data->prev)
4662 coding->cmp_data = coding->cmp_data->prev;
4663 coding->cmp_data_start = 0;
4666 /* Reflect the saved information about compositions to OBJ.
4667 CODING->cmp_data points to a memory block for the informaiton. OBJ
4668 is a buffer or a string, defaults to the current buffer. */
4670 static void
4671 coding_restore_composition (coding, obj)
4672 struct coding_system *coding;
4673 Lisp_Object obj;
4675 struct composition_data *cmp_data = coding->cmp_data;
4677 if (!cmp_data)
4678 return;
4680 while (cmp_data->prev)
4681 cmp_data = cmp_data->prev;
4683 while (cmp_data)
4685 int i;
4687 for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4689 int *data = cmp_data->data + i;
4690 enum composition_method method = (enum composition_method) data[3];
4691 Lisp_Object components;
4693 if (method == COMPOSITION_RELATIVE)
4694 components = Qnil;
4695 else
4697 int len = data[0] - 4, j;
4698 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4700 for (j = 0; j < len; j++)
4701 args[j] = make_number (data[4 + j]);
4702 components = (method == COMPOSITION_WITH_ALTCHARS
4703 ? Fstring (len, args) : Fvector (len, args));
4705 compose_text (data[1], data[2], components, Qnil, obj);
4707 cmp_data = cmp_data->next;
4711 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4712 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4713 coding system CODING, and return the status code of code conversion
4714 (currently, this value has no meaning).
4716 How many characters (and bytes) are converted to how many
4717 characters (and bytes) are recorded in members of the structure
4718 CODING.
4720 If REPLACE is nonzero, we do various things as if the original text
4721 is deleted and a new text is inserted. See the comments in
4722 replace_range (insdel.c) to know what we are doing. */
/* NOTE(review): no return type is visible on this definition in this
   view; every `return' in the body yields 0.  */
4725 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4726 int from, from_byte, to, to_byte, encodep, replace;
4727 struct coding_system *coding;
4729 int len = to - from, len_byte = to_byte - from_byte;
4730 int require, inserted, inserted_byte;
4731 int head_skip, tail_skip, total_skip = 0;
4732 Lisp_Object saved_coding_symbol;
4733 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4734 int first = 1;
4735 int fake_multibyte = 0;
4736 unsigned char *src, *dst;
4737 Lisp_Object deletion;
4738 int orig_point = PT, orig_len = len;
4739 int prev_Z;
4741 deletion = Qnil;
4742 saved_coding_symbol = Qnil;
/* If point is strictly inside the region, move it to FROM and
   remember FROM as the point to restore at the end.  */
4744 if (from < PT && PT < to)
4746 TEMP_SET_PT_BOTH (from, from_byte);
4747 orig_point = from;
4750 if (replace)
4752 int saved_from = from;
/* prepare_to_modify_buffer may update FROM; if it did, recompute
   TO and the byte positions from the unchanged LEN.  */
4754 prepare_to_modify_buffer (from, to, &from);
4755 if (saved_from != from)
4757 to = from + len;
4758 if (multibyte)
4759 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4760 else
4761 from_byte = from, to_byte = to;
4762 len_byte = to_byte - from_byte;
4766 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4768 /* We must detect encoding of text and eol format. */
4770 if (from < GPT && to > GPT)
4771 move_gap_both (from, from_byte);
4772 if (coding->type == coding_type_undecided)
4774 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4775 if (coding->type == coding_type_undecided)
4776 /* It seems that the text contains only ASCII, but we
4777 should not leave it undecided because the deeper
4778 decoding routine (decode_coding) tries to detect the
4779 encodings again in vain. */
4780 coding->type = coding_type_emacs_mule;
4782 if (coding->eol_type == CODING_EOL_UNDECIDED)
4784 saved_coding_symbol = coding->symbol;
4785 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4786 if (coding->eol_type == CODING_EOL_UNDECIDED)
4787 coding->eol_type = CODING_EOL_LF;
4788 /* We had better recover the original eol format if we
4789 encounter an inconsistent eol format while decoding. */
4790 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
/* If CODING requires no conversion in this direction, just fill in
   the consumed/produced counts and return.  */
4794 if (encodep
4795 ? ! CODING_REQUIRE_ENCODING (coding)
4796 : ! CODING_REQUIRE_DECODING (coding))
4798 coding->consumed_char = len;
4799 coding->consumed = len_byte;
4800 coding->produced = len_byte;
4801 if (multibyte
4802 && ! replace
4803 /* See the comment of the member heading_ascii in coding.h. */
4804 && coding->heading_ascii < len_byte)
4806 /* We still may have to combine byte at the head and the
4807 tail of the text in the region. */
4808 if (from < GPT && GPT < to)
4809 move_gap_both (to, to_byte);
4810 len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4811 adjust_after_insert (from, from_byte, to, to_byte, len);
4812 coding->produced_char = len;
4814 else
4816 if (!replace)
4817 adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4818 coding->produced_char = len_byte;
4820 return 0;
4823 /* Now we convert the text. */
4825 /* For encoding, we must process pre-write-conversion in advance. */
4826 if (encodep
4827 && ! NILP (coding->pre_write_conversion)
4828 && SYMBOLP (coding->pre_write_conversion)
4829 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4831 /* The function in pre-write-conversion may put a new text in a
4832 new buffer. */
4833 struct buffer *prev = current_buffer;
4834 Lisp_Object new;
/* NOTE(review): COUNT is not referenced again within this block as
   visible here; the unwind protect is discarded by decrementing
   specpdl_ptr below.  */
4835 int count = specpdl_ptr - specpdl;
4837 record_unwind_protect (code_convert_region_unwind, Qnil);
4838 /* We should not call any more pre-write/post-read-conversion
4839 functions while this pre-write-conversion is running. */
4840 inhibit_pre_post_conversion = 1;
4841 call2 (coding->pre_write_conversion,
4842 make_number (from), make_number (to));
4843 inhibit_pre_post_conversion = 0;
4844 /* Discard the unwind protect. */
4845 specpdl_ptr--;
/* If the function left us in a different buffer, replace the
   region in the original buffer with the accessible text of that
   buffer, kill it, and recompute the region bounds.  */
4847 if (current_buffer != prev)
4849 len = ZV - BEGV;
4850 new = Fcurrent_buffer ();
4851 set_buffer_internal_1 (prev);
4852 del_range_2 (from, from_byte, to, to_byte, 0);
4853 TEMP_SET_PT_BOTH (from, from_byte);
4854 insert_from_buffer (XBUFFER (new), 1, len, 0);
4855 Fkill_buffer (new);
4856 if (orig_point >= to)
4857 orig_point += len - orig_len;
4858 else if (orig_point > from)
4859 orig_point = from;
4860 orig_len = len;
4861 to = from + len;
4862 from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
4863 to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
4864 len_byte = to_byte - from_byte;
4865 TEMP_SET_PT_BOTH (from, from_byte);
/* When replacing, keep a copy of the original text; it is handed to
   adjust_after_replace after the conversion.  */
4869 if (replace)
4870 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4872 if (coding->composing != COMPOSITION_DISABLED)
4874 if (encodep)
4875 coding_save_composition (coding, from, to, Fcurrent_buffer ());
4876 else
4877 coding_allocate_composition_data (coding, from);
4880 /* For conversion by CCL program and for encoding with composition
4881 handling, we can't skip any character because we may convert or
4882 compose even ASCII characters. */
4883 if (coding->type != coding_type_ccl
4884 && (!encodep || coding->cmp_data == NULL))
4886 /* Try to skip the heading and tailing ASCIIs. */
4887 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4889 if (from < GPT && GPT < to)
4890 move_gap_both (from, from_byte);
4891 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
/* The whole region was skipped and nothing else has to run on it:
   report the original lengths and return.  */
4892 if (from_byte == to_byte
4893 && (encodep || NILP (coding->post_read_conversion))
4894 && ! CODING_REQUIRE_FLUSHING (coding))
4896 coding->produced = len_byte;
4897 coding->produced_char = multibyte ? len : len_byte;
4898 if (!replace)
4899 /* We must record and adjust for this new text now. */
4900 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4901 return 0;
/* The same skip counts are applied to the character positions and to
   the byte positions below.  */
4904 head_skip = from_byte - from_byte_orig;
4905 tail_skip = to_byte_orig - to_byte;
4906 total_skip = head_skip + tail_skip;
4907 from += head_skip;
4908 to -= tail_skip;
4909 len -= total_skip; len_byte -= total_skip;
4911 if (coding->cmp_data)
4912 coding->cmp_data->char_offset = from;
4915 /* The code conversion routine can not preserve text properties for
4916 now. So, we must remove all text properties in the region.
4917 Here, we must suppress all modification hooks. */
4918 if (replace)
4920 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4921 inhibit_modification_hooks = 1;
4922 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4923 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4926 /* For conversion, we must put the gap before the text in addition to
4927 making the gap larger for efficient decoding. The required gap
4928 size starts from 2000 which is the magic number used in make_gap.
4929 But, after one batch of conversion, it will be incremented if we
4930 find that it is not enough. */
4931 require = 2000;
4933 if (GAP_SIZE < require)
4934 make_gap (require - GAP_SIZE);
4935 move_gap_both (from, from_byte);
4937 inserted = inserted_byte = 0;
/* Count the LEN_BYTE bytes of original text as part of the gap and
   take them out of the buffer size bookkeeping (see the diagram at
   the top of the loop below).  */
4939 GAP_SIZE += len_byte;
4940 ZV -= len;
4941 Z -= len;
4942 ZV_BYTE -= len_byte;
4943 Z_BYTE -= len_byte;
4945 if (GPT - BEG < BEG_UNCHANGED)
4946 BEG_UNCHANGED = GPT - BEG;
4947 if (Z - GPT < END_UNCHANGED)
4948 END_UNCHANGED = Z - GPT;
4950 for (;;)
4952 int result;
4954 /* The buffer memory is now:
4955 +--------+converted-text+---------+-------original-text------+---+
4956 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4957 |<------------------- GAP_SIZE -------------------->| */
4958 src = GAP_END_ADDR - len_byte;
4959 dst = GPT_ADDR + inserted_byte;
/* DST_BYTES is passed as 0 to the conversion routines here.  */
4961 if (encodep)
4962 result = encode_coding (coding, src, dst, len_byte, 0);
4963 else
4964 result = decode_coding (coding, src, dst, len_byte, 0);
4966 /* The buffer memory is now:
4967 +--------+-------converted-text--------+--+---original-text--+---+
4968 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4969 |<------------------- GAP_SIZE -------------------->| */
4971 if (coding->fake_multibyte)
4972 fake_multibyte = 1;
4974 if (!encodep && !multibyte)
4975 coding->produced_char = coding->produced;
4976 inserted += coding->produced_char;
4977 inserted_byte += coding->produced;
4978 len_byte -= coding->consumed;
/* The composition data block is full: allocate another one and
   resume the conversion.  */
4980 if (result == CODING_FINISH_INSUFFICIENT_CMP)
4982 coding_allocate_composition_data (coding, from + inserted);
4983 continue;
4986 src += coding->consumed;
4987 dst += coding->produced;
4989 if (result == CODING_FINISH_NORMAL)
4991 src += len_byte;
4992 break;
4994 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4996 unsigned char *pend = dst, *p = pend - inserted_byte;
4997 Lisp_Object eol_type;
4999 /* Encode LFs back to the original eol format (CR or CRLF). */
5000 if (coding->eol_type == CODING_EOL_CR)
5002 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5004 else
5006 int count = 0;
/* COUNT is the number of LFs in the converted text, i.e. the number
   of extra bytes needed to turn each of them into CRLF.  */
5008 while (p < pend) if (*p++ == '\n') count++;
5009 if (src - dst < count)
5011 /* We don't have sufficient room for encoding LFs
5012 back to CRLF. We must record converted and
5013 not-yet-converted text back to the buffer
5014 content, enlarge the gap, then record them out of
5015 the buffer contents again. */
5016 int add = len_byte + inserted_byte;
5018 GAP_SIZE -= add;
5019 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5020 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5021 make_gap (count - GAP_SIZE);
5022 GAP_SIZE += add;
5023 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5024 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5025 /* Don't forget to update SRC, DST, and PEND. */
5026 src = GAP_END_ADDR - len_byte;
5027 dst = GPT_ADDR + inserted_byte;
5028 pend = dst;
5030 inserted += count;
5031 inserted_byte += count;
5032 coding->produced += count;
/* Copy the converted text backward to its final place, putting a CR
   in front of every LF on the way.  */
5033 p = dst = pend + count;
5034 while (count)
5036 *--p = *--pend;
5037 if (*p == '\n') count--, *--p = '\r';
5041 /* Suppress eol-format conversion in the further conversion. */
5042 coding->eol_type = CODING_EOL_LF;
5044 /* Set the coding system symbol to that for Unix-like EOL. */
5045 eol_type = Fget (saved_coding_symbol, Qeol_type);
5046 if (VECTORP (eol_type)
5047 && XVECTOR (eol_type)->size == 3
5048 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5049 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5050 else
5051 coding->symbol = saved_coding_symbol;
5053 continue;
/* All source bytes have been consumed.  For a CCL coding system, run
   one more pass with CODING_MODE_LAST_BLOCK set unless it was set
   already.  */
5055 if (len_byte <= 0)
5057 if (coding->type != coding_type_ccl
5058 || coding->mode & CODING_MODE_LAST_BLOCK)
5059 break;
5060 coding->mode |= CODING_MODE_LAST_BLOCK;
5061 continue;
5063 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5065 /* The source text ends in invalid codes. Let's just
5066 make them valid buffer contents, and finish conversion. */
5067 inserted += len_byte;
5068 inserted_byte += len_byte;
5069 while (len_byte--)
5070 *dst++ = *src++;
5071 fake_multibyte = 1;
5072 break;
5074 if (result == CODING_FINISH_INTERRUPT)
5076 /* The conversion procedure was interrupted by a user. */
5077 fake_multibyte = 1;
5078 break;
5080 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5081 if (coding->consumed < 1)
5083 /* It's quite strange to require more memory without
5084 consuming any bytes. Perhaps CCL program bug. */
5085 fake_multibyte = 1;
5086 break;
5088 if (first)
5090 /* We have just done the first batch of conversion which was
5091 stopped because of insufficient gap. Let's reconsider the
5092 required gap size (i.e. SRC - DST) now.
5094 We have converted ORIG bytes (== coding->consumed) into
5095 NEW bytes (coding->produced). To convert the remaining
5096 LEN bytes, we may need REQUIRE bytes of gap, where:
5097 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5098 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5099 Here, we are sure that NEW >= ORIG. */
5100 float ratio = coding->produced - coding->consumed;
5101 ratio /= coding->consumed;
5102 require = len_byte * ratio;
5103 first = 0;
5105 if ((src - dst) < (require + 2000))
5107 /* See the comment above the previous call of make_gap. */
5108 int add = len_byte + inserted_byte;
5110 GAP_SIZE -= add;
5111 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5112 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5113 make_gap (require + 2000);
5114 GAP_SIZE += add;
5115 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5116 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5119 if (src - dst > 0) *dst = 0; /* Put an anchor. */
/* Recount the inserted characters from the converted bytes when the
   byte count cannot be taken as the character count.  */
5121 if (multibyte
5122 && (encodep
5123 || fake_multibyte
5124 || (to - from) != (to_byte - from_byte)))
5125 inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
5127 /* If we have shrunk the conversion area, adjust it now. */
5128 if (total_skip > 0)
5130 if (tail_skip > 0)
5131 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5132 inserted += total_skip; inserted_byte += total_skip;
5133 GAP_SIZE += total_skip;
5134 GPT -= head_skip; GPT_BYTE -= head_skip;
5135 ZV -= total_skip; ZV_BYTE -= total_skip;
5136 Z -= total_skip; Z_BYTE -= total_skip;
5137 from -= head_skip; from_byte -= head_skip;
5138 to += tail_skip; to_byte += tail_skip;
/* Recompute INSERTED from how much Z changed across
   adjust_after_replace.  */
5141 prev_Z = Z;
5142 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5143 inserted = Z - prev_Z;
5145 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5146 coding_restore_composition (coding, Fcurrent_buffer ());
5147 coding_free_composition_data (coding);
5149 if (! encodep && ! NILP (coding->post_read_conversion))
5151 Lisp_Object val;
/* NOTE(review): COUNT is not referenced again within this block as
   visible here.  */
5152 int count = specpdl_ptr - specpdl;
5154 if (from != PT)
5155 TEMP_SET_PT_BOTH (from, from_byte);
5156 prev_Z = Z;
5157 record_unwind_protect (code_convert_region_unwind, Qnil);
5158 /* We should not call any more pre-write/post-read-conversion
5159 functions while this post-read-conversion is running. */
5160 inhibit_pre_post_conversion = 1;
5161 val = call1 (coding->post_read_conversion, make_number (inserted));
5162 inhibit_pre_post_conversion = 0;
5163 /* Discard the unwind protect. */
5164 specpdl_ptr--;
5165 CHECK_NUMBER (val, 0);
5166 inserted += Z - prev_Z;
/* Restore point: a point that was at or after the end of the
   original region is shifted by the change in length, and one inside
   the region is put at FROM.  */
5169 if (orig_point >= from)
5171 if (orig_point >= from + orig_len)
5172 orig_point += inserted - orig_len;
5173 else
5174 orig_point = from;
5175 TEMP_SET_PT (orig_point);
5178 if (replace)
5180 signal_after_change (from, to - from, inserted);
5181 update_compositions (from, from + inserted, CHECK_BORDER);
/* Report the totals for the whole region in CODING.  */
5185 coding->consumed = to_byte - from_byte;
5186 coding->consumed_char = to - from;
5187 coding->produced = inserted_byte;
5188 coding->produced_char = inserted;
5191 return 0;
5194 Lisp_Object
5195 code_convert_string (str, coding, encodep, nocopy)
5196 Lisp_Object str;
5197 struct coding_system *coding;
5198 int encodep, nocopy;
5200 int len;
5201 char *buf;
5202 int from = 0, to = XSTRING (str)->size;
5203 int to_byte = STRING_BYTES (XSTRING (str));
5204 struct gcpro gcpro1;
5205 Lisp_Object saved_coding_symbol;
5206 int result;
5208 saved_coding_symbol = Qnil;
5209 if ((encodep && !NILP (coding->pre_write_conversion)
5210 || !encodep && !NILP (coding->post_read_conversion)))
5212 /* Since we have to call Lisp functions which assume target text
5213 is in a buffer, after setting a temporary buffer, call
5214 code_convert_region. */
5215 int count = specpdl_ptr - specpdl;
5216 struct buffer *prev = current_buffer;
5217 int multibyte = STRING_MULTIBYTE (str);
5219 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5220 record_unwind_protect (code_convert_region_unwind, Qnil);
5221 inhibit_pre_post_conversion = 1;
5222 GCPRO1 (str);
5223 temp_output_buffer_setup (" *code-converting-work*");
5224 set_buffer_internal (XBUFFER (Vstandard_output));
5225 /* We must insert the contents of STR as is without
5226 unibyte<->multibyte conversion. For that, we adjust the
5227 multibyteness of the working buffer to that of STR. */
5228 Ferase_buffer (); /* for safety */
5229 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5230 insert_from_string (str, 0, 0, to, to_byte, 0);
5231 UNGCPRO;
5232 code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
5233 /* Make a unibyte string if we are encoding, otherwise make a
5234 multibyte string. */
5235 Fset_buffer_multibyte (encodep ? Qnil : Qt);
5236 str = make_buffer_string (BEGV, ZV, 0);
5237 return unbind_to (count, str);
5240 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5242 /* See the comments in code_convert_region. */
5243 if (coding->type == coding_type_undecided)
5245 detect_coding (coding, XSTRING (str)->data, to_byte);
5246 if (coding->type == coding_type_undecided)
5247 coding->type = coding_type_emacs_mule;
5249 if (coding->eol_type == CODING_EOL_UNDECIDED)
5251 saved_coding_symbol = coding->symbol;
5252 detect_eol (coding, XSTRING (str)->data, to_byte);
5253 if (coding->eol_type == CODING_EOL_UNDECIDED)
5254 coding->eol_type = CODING_EOL_LF;
5255 /* We had better recover the original eol format if we
5256 encounter an inconsitent eol format while decoding. */
5257 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5261 if (encodep
5262 ? ! CODING_REQUIRE_ENCODING (coding)
5263 : ! CODING_REQUIRE_DECODING (coding))
5264 return (nocopy ? str : Fcopy_sequence (str));
5266 if (coding->composing != COMPOSITION_DISABLED)
5268 if (encodep)
5269 coding_save_composition (coding, from, to, str);
5270 else
5271 coding_allocate_composition_data (coding, from);
5274 /* For conversion by CCL program and for encoding with composition
5275 handling, we can't skip any character because we may convert or
5276 compose even ASCII characters. */
5277 if (coding->type != coding_type_ccl
5278 && (!encodep || coding->cmp_data == NULL))
5280 /* Try to skip the heading and tailing ASCIIs. */
5281 int from_orig = from;
5283 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5284 encodep);
5285 if (from == to_byte)
5286 return (nocopy ? str : Fcopy_sequence (str));
5288 if (coding->cmp_data)
5289 coding->cmp_data->char_offset = from;
5292 if (encodep)
5293 len = encoding_buffer_size (coding, to_byte - from);
5294 else
5295 len = decoding_buffer_size (coding, to_byte - from);
5296 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5297 GCPRO1 (str);
5298 buf = get_conversion_buffer (len);
5299 UNGCPRO;
5301 if (from > 0)
5302 bcopy (XSTRING (str)->data, buf, from);
5303 result = (encodep
5304 ? encode_coding (coding, XSTRING (str)->data + from,
5305 buf + from, to_byte - from, len)
5306 : decode_coding (coding, XSTRING (str)->data + from,
5307 buf + from, to_byte - from, len));
5308 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5310 /* We simply try to decode the whole string again but without
5311 eol-conversion this time. */
5312 coding->eol_type = CODING_EOL_LF;
5313 coding->symbol = saved_coding_symbol;
5314 coding_free_composition_data (coding);
5315 return code_convert_string (str, coding, encodep, nocopy);
5318 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5319 STRING_BYTES (XSTRING (str)) - to_byte);
5321 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5322 if (encodep)
5323 str = make_unibyte_string (buf, len + coding->produced);
5324 else
5326 int chars= (coding->fake_multibyte
5327 ? multibyte_chars_in_text (buf + from, coding->produced)
5328 : coding->produced_char);
5329 str = make_multibyte_string (buf, len + chars, len + coding->produced);
5332 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5333 coding_restore_composition (coding, str);
5335 coding_free_composition_data (coding);
5336 return str;
5340 #ifdef emacs
5341 /*** 8. Emacs Lisp library functions ***/
5343 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5344 "Return t if OBJECT is nil or a coding-system.\n\
5345 See the documentation of `make-coding-system' for information\n\
5346 about coding-system objects.")
5347 (obj)
5348 Lisp_Object obj;
5350 if (NILP (obj))
5351 return Qt;
5352 if (!SYMBOLP (obj))
5353 return Qnil;
5354 /* Get coding-spec vector for OBJ. */
5355 obj = Fget (obj, Qcoding_system);
5356 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5357 ? Qt : Qnil);
5360 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5361 Sread_non_nil_coding_system, 1, 1, 0,
5362 "Read a coding system from the minibuffer, prompting with string PROMPT.")
5363 (prompt)
5364 Lisp_Object prompt;
5366 Lisp_Object val;
5369 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5370 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5372 while (XSTRING (val)->size == 0);
5373 return (Fintern (val, Qnil));
5376 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5377 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5378 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5379 (prompt, default_coding_system)
5380 Lisp_Object prompt, default_coding_system;
5382 Lisp_Object val;
5383 if (SYMBOLP (default_coding_system))
5384 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5385 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5386 Qt, Qnil, Qcoding_system_history,
5387 default_coding_system, Qnil);
5388 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5391 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5392 1, 1, 0,
5393 "Check validity of CODING-SYSTEM.\n\
5394 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5395 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5396 The value of property should be a vector of length 5.")
5397 (coding_system)
5398 Lisp_Object coding_system;
5400 CHECK_SYMBOL (coding_system, 0);
5401 if (!NILP (Fcoding_system_p (coding_system)))
5402 return coding_system;
5403 while (1)
5404 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5407 Lisp_Object
5408 detect_coding_system (src, src_bytes, highest)
5409 unsigned char *src;
5410 int src_bytes, highest;
5412 int coding_mask, eol_type;
5413 Lisp_Object val, tmp;
5414 int dummy;
5416 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5417 eol_type = detect_eol_type (src, src_bytes, &dummy);
5418 if (eol_type == CODING_EOL_INCONSISTENT)
5419 eol_type = CODING_EOL_UNDECIDED;
5421 if (!coding_mask)
5423 val = Qundecided;
5424 if (eol_type != CODING_EOL_UNDECIDED)
5426 Lisp_Object val2;
5427 val2 = Fget (Qundecided, Qeol_type);
5428 if (VECTORP (val2))
5429 val = XVECTOR (val2)->contents[eol_type];
5431 return (highest ? val : Fcons (val, Qnil));
5434 /* At first, gather possible coding systems in VAL. */
5435 val = Qnil;
5436 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5438 Lisp_Object category_val, category_index;
5440 category_index = Fget (XCAR (tmp), Qcoding_category_index);
5441 category_val = Fsymbol_value (XCAR (tmp));
5442 if (!NILP (category_val)
5443 && NATNUMP (category_index)
5444 && (coding_mask & (1 << XFASTINT (category_index))))
5446 val = Fcons (category_val, val);
5447 if (highest)
5448 break;
5451 if (!highest)
5452 val = Fnreverse (val);
5454 /* Then, replace the elements with subsidiary coding systems. */
5455 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5457 if (eol_type != CODING_EOL_UNDECIDED
5458 && eol_type != CODING_EOL_INCONSISTENT)
5460 Lisp_Object eol;
5461 eol = Fget (XCAR (tmp), Qeol_type);
5462 if (VECTORP (eol))
5463 XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5466 return (highest ? XCAR (val) : val);
5469 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5470 2, 3, 0,
5471 "Detect coding system of the text in the region between START and END.\n\
5472 Return a list of possible coding systems ordered by priority.\n\
5474 If only ASCII characters are found, it returns a list of single element\n\
5475 `undecided' or its subsidiary coding system according to a detected\n\
5476 end-of-line format.\n\
5478 If optional argument HIGHEST is non-nil, return the coding system of\n\
5479 highest priority.")
5480 (start, end, highest)
5481 Lisp_Object start, end, highest;
5483 int from, to;
5484 int from_byte, to_byte;
5486 CHECK_NUMBER_COERCE_MARKER (start, 0);
5487 CHECK_NUMBER_COERCE_MARKER (end, 1);
5489 validate_region (&start, &end);
5490 from = XINT (start), to = XINT (end);
5491 from_byte = CHAR_TO_BYTE (from);
5492 to_byte = CHAR_TO_BYTE (to);
5494 if (from < GPT && to >= GPT)
5495 move_gap_both (to, to_byte);
5497 return detect_coding_system (BYTE_POS_ADDR (from_byte),
5498 to_byte - from_byte,
5499 !NILP (highest));
5502 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5503 1, 2, 0,
5504 "Detect coding system of the text in STRING.\n\
5505 Return a list of possible coding systems ordered by priority.\n\
5507 If only ASCII characters are found, it returns a list of single element\n\
5508 `undecided' or its subsidiary coding system according to a detected\n\
5509 end-of-line format.\n\
5511 If optional argument HIGHEST is non-nil, return the coding system of\n\
5512 highest priority.")
5513 (string, highest)
5514 Lisp_Object string, highest;
5516 CHECK_STRING (string, 0);
5518 return detect_coding_system (XSTRING (string)->data,
5519 STRING_BYTES (XSTRING (string)),
5520 !NILP (highest));
5523 Lisp_Object
5524 code_convert_region1 (start, end, coding_system, encodep)
5525 Lisp_Object start, end, coding_system;
5526 int encodep;
5528 struct coding_system coding;
5529 int from, to, len;
5531 CHECK_NUMBER_COERCE_MARKER (start, 0);
5532 CHECK_NUMBER_COERCE_MARKER (end, 1);
5533 CHECK_SYMBOL (coding_system, 2);
5535 validate_region (&start, &end);
5536 from = XFASTINT (start);
5537 to = XFASTINT (end);
5539 if (NILP (coding_system))
5540 return make_number (to - from);
5542 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5543 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5545 coding.mode |= CODING_MODE_LAST_BLOCK;
5546 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5547 &coding, encodep, 1);
5548 Vlast_coding_system_used = coding.symbol;
5549 return make_number (coding.produced_char);
5552 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5553 3, 3, "r\nzCoding system: ",
5554 "Decode the current region by specified coding system.\n\
5555 When called from a program, takes three arguments:\n\
5556 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5557 This function sets `last-coding-system-used' to the precise coding system\n\
5558 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5559 not fully specified.)\n\
5560 It returns the length of the decoded text.")
5561 (start, end, coding_system)
5562 Lisp_Object start, end, coding_system;
5564 return code_convert_region1 (start, end, coding_system, 0);
5567 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5568 3, 3, "r\nzCoding system: ",
5569 "Encode the current region by specified coding system.\n\
5570 When called from a program, takes three arguments:\n\
5571 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5572 This function sets `last-coding-system-used' to the precise coding system\n\
5573 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5574 not fully specified.)\n\
5575 It returns the length of the encoded text.")
5576 (start, end, coding_system)
5577 Lisp_Object start, end, coding_system;
5579 return code_convert_region1 (start, end, coding_system, 1);
5582 Lisp_Object
5583 code_convert_string1 (string, coding_system, nocopy, encodep)
5584 Lisp_Object string, coding_system, nocopy;
5585 int encodep;
5587 struct coding_system coding;
5589 CHECK_STRING (string, 0);
5590 CHECK_SYMBOL (coding_system, 1);
5592 if (NILP (coding_system))
5593 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5595 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5596 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5598 coding.mode |= CODING_MODE_LAST_BLOCK;
5599 string = code_convert_string (string, &coding, encodep, !NILP (nocopy));
5600 Vlast_coding_system_used = coding.symbol;
5602 return string;
5605 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5606 2, 3, 0,
5607 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5608 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5609 if the decoding operation is trivial.\n\
5610 This function sets `last-coding-system-used' to the precise coding system\n\
5611 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5612 not fully specified.)")
5613 (string, coding_system, nocopy)
5614 Lisp_Object string, coding_system, nocopy;
5616 return code_convert_string1 (string, coding_system, nocopy, 0);
5619 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5620 2, 3, 0,
5621 "Encode STRING to CODING-SYSTEM, and return the result.\n\
5622 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5623 if the encoding operation is trivial.\n\
5624 This function sets `last-coding-system-used' to the precise coding system\n\
5625 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5626 not fully specified.)")
5627 (string, coding_system, nocopy)
5628 Lisp_Object string, coding_system, nocopy;
5630 return code_convert_string1 (string, coding_system, nocopy, 1);
5633 /* Encode or decode STRING according to CODING_SYSTEM.
5634 Do not set Vlast_coding_system_used.
5636 This function is called only from macros DECODE_FILE and
5637 ENCODE_FILE, thus we ignore character composition. */
5639 Lisp_Object
5640 code_convert_string_norecord (string, coding_system, encodep)
5641 Lisp_Object string, coding_system;
5642 int encodep;
5644 struct coding_system coding;
5646 CHECK_STRING (string, 0);
5647 CHECK_SYMBOL (coding_system, 1);
5649 if (NILP (coding_system))
5650 return string;
5652 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5653 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5655 coding.composing = COMPOSITION_DISABLED;
5656 coding.mode |= CODING_MODE_LAST_BLOCK;
5657 return code_convert_string (string, &coding, encodep, Qt);
5660 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5661 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5662 Return the corresponding character.")
5663 (code)
5664 Lisp_Object code;
5666 unsigned char c1, c2, s1, s2;
5667 Lisp_Object val;
5669 CHECK_NUMBER (code, 0);
5670 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5671 if (s1 == 0)
5673 if (s2 < 0x80)
5674 XSETFASTINT (val, s2);
5675 else if (s2 >= 0xA0 || s2 <= 0xDF)
5676 XSETFASTINT (val,
5677 MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5678 else
5679 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5681 else
5683 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5684 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5685 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5686 DECODE_SJIS (s1, s2, c1, c2);
5687 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5689 return val;
5692 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5693 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5694 Return the corresponding code in SJIS.")
5695 (ch)
5696 Lisp_Object ch;
5698 int charset, c1, c2, s1, s2;
5699 Lisp_Object val;
5701 CHECK_NUMBER (ch, 0);
5702 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5703 if (charset == CHARSET_ASCII)
5705 val = ch;
5707 else if (charset == charset_jisx0208
5708 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5710 ENCODE_SJIS (c1, c2, s1, s2);
5711 XSETFASTINT (val, (s1 << 8) | s2);
5713 else if (charset == charset_katakana_jisx0201
5714 && c1 > 0x20 && c2 < 0xE0)
5716 XSETFASTINT (val, c1 | 0x80);
5718 else
5719 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5720 return val;
5723 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5724 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5725 Return the corresponding character.")
5726 (code)
5727 Lisp_Object code;
5729 int charset;
5730 unsigned char b1, b2, c1, c2;
5731 Lisp_Object val;
5733 CHECK_NUMBER (code, 0);
5734 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5735 if (b1 == 0)
5737 if (b2 >= 0x80)
5738 error ("Invalid BIG5 code: %x", XFASTINT (code));
5739 val = code;
5741 else
5743 if ((b1 < 0xA1 || b1 > 0xFE)
5744 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5745 error ("Invalid BIG5 code: %x", XFASTINT (code));
5746 DECODE_BIG5 (b1, b2, charset, c1, c2);
5747 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5749 return val;
5752 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5753 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5754 Return the corresponding character code in Big5.")
5755 (ch)
5756 Lisp_Object ch;
5758 int charset, c1, c2, b1, b2;
5759 Lisp_Object val;
5761 CHECK_NUMBER (ch, 0);
5762 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5763 if (charset == CHARSET_ASCII)
5765 val = ch;
5767 else if ((charset == charset_big5_1
5768 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5769 || (charset == charset_big5_2
5770 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5772 ENCODE_BIG5 (charset, c1, c2, b1, b2);
5773 XSETFASTINT (val, (b1 << 8) | b2);
5775 else
5776 error ("Can't encode to Big5: %d", XFASTINT (ch));
5777 return val;
5780 DEFUN ("set-terminal-coding-system-internal",
5781 Fset_terminal_coding_system_internal,
5782 Sset_terminal_coding_system_internal, 1, 1, 0, "")
5783 (coding_system)
5784 Lisp_Object coding_system;
5786 CHECK_SYMBOL (coding_system, 0);
5787 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5788 /* We had better not send unsafe characters to terminal. */
5789 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5790 /* Characer composition should be disabled. */
5791 terminal_coding.composing = COMPOSITION_DISABLED;
5792 return Qnil;
5795 DEFUN ("set-safe-terminal-coding-system-internal",
5796 Fset_safe_terminal_coding_system_internal,
5797 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5798 (coding_system)
5799 Lisp_Object coding_system;
5801 CHECK_SYMBOL (coding_system, 0);
5802 setup_coding_system (Fcheck_coding_system (coding_system),
5803 &safe_terminal_coding);
5804 /* Characer composition should be disabled. */
5805 safe_terminal_coding.composing = COMPOSITION_DISABLED;
5806 return Qnil;
5809 DEFUN ("terminal-coding-system",
5810 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5811 "Return coding system specified for terminal output.")
5814 return terminal_coding.symbol;
5817 DEFUN ("set-keyboard-coding-system-internal",
5818 Fset_keyboard_coding_system_internal,
5819 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5820 (coding_system)
5821 Lisp_Object coding_system;
5823 CHECK_SYMBOL (coding_system, 0);
5824 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5825 /* Characer composition should be disabled. */
5826 keyboard_coding.composing = COMPOSITION_DISABLED;
5827 return Qnil;
5830 DEFUN ("keyboard-coding-system",
5831 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5832 "Return coding system specified for decoding keyboard input.")
5835 return keyboard_coding.symbol;
5839 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5840 Sfind_operation_coding_system, 1, MANY, 0,
5841 "Choose a coding system for an operation based on the target name.\n\
5842 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5843 DECODING-SYSTEM is the coding system to use for decoding\n\
5844 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5845 for encoding (in case OPERATION does encoding).\n\
5847 The first argument OPERATION specifies an I/O primitive:\n\
5848 For file I/O, `insert-file-contents' or `write-region'.\n\
5849 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5850 For network I/O, `open-network-stream'.\n\
5852 The remaining arguments should be the same arguments that were passed\n\
5853 to the primitive. Depending on which primitive, one of those arguments\n\
5854 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5855 whichever argument specifies the file name is TARGET.\n\
5857 TARGET has a meaning which depends on OPERATION:\n\
5858 For file I/O, TARGET is a file name.\n\
5859 For process I/O, TARGET is a process name.\n\
5860 For network I/O, TARGET is a service name or a port number\n\
5862 This function looks up what specified for TARGET in,\n\
5863 `file-coding-system-alist', `process-coding-system-alist',\n\
5864 or `network-coding-system-alist' depending on OPERATION.\n\
5865 They may specify a coding system, a cons of coding systems,\n\
5866 or a function symbol to call.\n\
5867 In the last case, we call the function with one argument,\n\
5868 which is a list of all the arguments given to this function.")
5869 (nargs, args)
5870 int nargs;
5871 Lisp_Object *args;
5873 Lisp_Object operation, target_idx, target, val;
5874 register Lisp_Object chain;
5876 if (nargs < 2)
5877 error ("Too few arguments");
5878 operation = args[0];
5879 if (!SYMBOLP (operation)
5880 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5881 error ("Invalid first arguement");
5882 if (nargs < 1 + XINT (target_idx))
5883 error ("Too few arguments for operation: %s",
5884 XSYMBOL (operation)->name->data);
5885 target = args[XINT (target_idx) + 1];
5886 if (!(STRINGP (target)
5887 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5888 error ("Invalid %dth argument", XINT (target_idx) + 1);
5890 chain = ((EQ (operation, Qinsert_file_contents)
5891 || EQ (operation, Qwrite_region))
5892 ? Vfile_coding_system_alist
5893 : (EQ (operation, Qopen_network_stream)
5894 ? Vnetwork_coding_system_alist
5895 : Vprocess_coding_system_alist));
5896 if (NILP (chain))
5897 return Qnil;
5899 for (; CONSP (chain); chain = XCDR (chain))
5901 Lisp_Object elt;
5902 elt = XCAR (chain);
5904 if (CONSP (elt)
5905 && ((STRINGP (target)
5906 && STRINGP (XCAR (elt))
5907 && fast_string_match (XCAR (elt), target) >= 0)
5908 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5910 val = XCDR (elt);
5911 /* Here, if VAL is both a valid coding system and a valid
5912 function symbol, we return VAL as a coding system. */
5913 if (CONSP (val))
5914 return val;
5915 if (! SYMBOLP (val))
5916 return Qnil;
5917 if (! NILP (Fcoding_system_p (val)))
5918 return Fcons (val, val);
5919 if (! NILP (Ffboundp (val)))
5921 val = call1 (val, Flist (nargs, args));
5922 if (CONSP (val))
5923 return val;
5924 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5925 return Fcons (val, val);
5927 return Qnil;
5930 return Qnil;
5933 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5934 Supdate_coding_systems_internal, 0, 0, 0,
5935 "Update internal database for ISO2022 and CCL based coding systems.\n\
5936 When values of any coding categories are changed, you must\n\
5937 call this function")
5940 int i;
5942 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
5944 Lisp_Object val;
5946 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5947 if (!NILP (val))
5949 if (! coding_system_table[i])
5950 coding_system_table[i] = ((struct coding_system *)
5951 xmalloc (sizeof (struct coding_system)));
5952 setup_coding_system (val, coding_system_table[i]);
5954 else if (coding_system_table[i])
5956 xfree (coding_system_table[i]);
5957 coding_system_table[i] = NULL;
5961 return Qnil;
5964 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5965 Sset_coding_priority_internal, 0, 0, 0,
5966 "Update internal database for the current value of `coding-category-list'.\n\
5967 This function is internal use only.")
5970 int i = 0, idx;
5971 Lisp_Object val;
5973 val = Vcoding_category_list;
5975 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5977 if (! SYMBOLP (XCAR (val)))
5978 break;
5979 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
5980 if (idx >= CODING_CATEGORY_IDX_MAX)
5981 break;
5982 coding_priorities[i++] = (1 << idx);
5983 val = XCDR (val);
5985 /* If coding-category-list is valid and contains all coding
5986 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5987 the following code saves Emacs from crashing. */
5988 while (i < CODING_CATEGORY_IDX_MAX)
5989 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5991 return Qnil;
5994 #endif /* emacs */
5997 /*** 9. Post-amble ***/
5999 void
6000 init_coding ()
6002 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
6005 void
6006 init_coding_once ()
6008 int i;
6010 /* Emacs' internal format specific initialize routine. */
6011 for (i = 0; i <= 0x20; i++)
6012 emacs_code_class[i] = EMACS_control_code;
6013 emacs_code_class[0x0A] = EMACS_linefeed_code;
6014 emacs_code_class[0x0D] = EMACS_carriage_return_code;
6015 for (i = 0x21 ; i < 0x7F; i++)
6016 emacs_code_class[i] = EMACS_ascii_code;
6017 emacs_code_class[0x7F] = EMACS_control_code;
6018 for (i = 0x80; i < 0xFF; i++)
6019 emacs_code_class[i] = EMACS_invalid_code;
6020 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
6021 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
6022 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
6023 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
6025 /* ISO2022 specific initialize routine. */
6026 for (i = 0; i < 0x20; i++)
6027 iso_code_class[i] = ISO_control_code;
6028 for (i = 0x21; i < 0x7F; i++)
6029 iso_code_class[i] = ISO_graphic_plane_0;
6030 for (i = 0x80; i < 0xA0; i++)
6031 iso_code_class[i] = ISO_control_code;
6032 for (i = 0xA1; i < 0xFF; i++)
6033 iso_code_class[i] = ISO_graphic_plane_1;
6034 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
6035 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
6036 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
6037 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
6038 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
6039 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
6040 iso_code_class[ISO_CODE_ESC] = ISO_escape;
6041 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
6042 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
6043 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
6045 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
6047 setup_coding_system (Qnil, &keyboard_coding);
6048 setup_coding_system (Qnil, &terminal_coding);
6049 setup_coding_system (Qnil, &safe_terminal_coding);
6050 setup_coding_system (Qnil, &default_buffer_file_coding);
6052 bzero (coding_system_table, sizeof coding_system_table);
6054 bzero (ascii_skip_code, sizeof ascii_skip_code);
6055 for (i = 0; i < 128; i++)
6056 ascii_skip_code[i] = 1;
6058 #if defined (MSDOS) || defined (WINDOWSNT)
6059 system_eol_type = CODING_EOL_CRLF;
6060 #else
6061 system_eol_type = CODING_EOL_LF;
6062 #endif
6064 inhibit_pre_post_conversion = 0;
6067 #ifdef emacs
6069 void
6070 syms_of_coding ()
6072 Qtarget_idx = intern ("target-idx");
6073 staticpro (&Qtarget_idx);
6075 Qcoding_system_history = intern ("coding-system-history");
6076 staticpro (&Qcoding_system_history);
6077 Fset (Qcoding_system_history, Qnil);
6079 /* Target FILENAME is the first argument. */
6080 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
6081 /* Target FILENAME is the third argument. */
6082 Fput (Qwrite_region, Qtarget_idx, make_number (2));
6084 Qcall_process = intern ("call-process");
6085 staticpro (&Qcall_process);
6086 /* Target PROGRAM is the first argument. */
6087 Fput (Qcall_process, Qtarget_idx, make_number (0));
6089 Qcall_process_region = intern ("call-process-region");
6090 staticpro (&Qcall_process_region);
6091 /* Target PROGRAM is the third argument. */
6092 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
6094 Qstart_process = intern ("start-process");
6095 staticpro (&Qstart_process);
6096 /* Target PROGRAM is the third argument. */
6097 Fput (Qstart_process, Qtarget_idx, make_number (2));
6099 Qopen_network_stream = intern ("open-network-stream");
6100 staticpro (&Qopen_network_stream);
6101 /* Target SERVICE is the fourth argument. */
6102 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
6104 Qcoding_system = intern ("coding-system");
6105 staticpro (&Qcoding_system);
6107 Qeol_type = intern ("eol-type");
6108 staticpro (&Qeol_type);
6110 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
6111 staticpro (&Qbuffer_file_coding_system);
6113 Qpost_read_conversion = intern ("post-read-conversion");
6114 staticpro (&Qpost_read_conversion);
6116 Qpre_write_conversion = intern ("pre-write-conversion");
6117 staticpro (&Qpre_write_conversion);
6119 Qno_conversion = intern ("no-conversion");
6120 staticpro (&Qno_conversion);
6122 Qundecided = intern ("undecided");
6123 staticpro (&Qundecided);
6125 Qcoding_system_p = intern ("coding-system-p");
6126 staticpro (&Qcoding_system_p);
6128 Qcoding_system_error = intern ("coding-system-error");
6129 staticpro (&Qcoding_system_error);
6131 Fput (Qcoding_system_error, Qerror_conditions,
6132 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
6133 Fput (Qcoding_system_error, Qerror_message,
6134 build_string ("Invalid coding system"));
6136 Qcoding_category = intern ("coding-category");
6137 staticpro (&Qcoding_category);
6138 Qcoding_category_index = intern ("coding-category-index");
6139 staticpro (&Qcoding_category_index);
6141 Vcoding_category_table
6142 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6143 staticpro (&Vcoding_category_table);
6145 int i;
6146 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6148 XVECTOR (Vcoding_category_table)->contents[i]
6149 = intern (coding_category_name[i]);
6150 Fput (XVECTOR (Vcoding_category_table)->contents[i],
6151 Qcoding_category_index, make_number (i));
6155 Qtranslation_table = intern ("translation-table");
6156 staticpro (&Qtranslation_table);
6157 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6159 Qtranslation_table_id = intern ("translation-table-id");
6160 staticpro (&Qtranslation_table_id);
6162 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6163 staticpro (&Qtranslation_table_for_decode);
6165 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6166 staticpro (&Qtranslation_table_for_encode);
6168 Qsafe_charsets = intern ("safe-charsets");
6169 staticpro (&Qsafe_charsets);
6171 Qvalid_codes = intern ("valid-codes");
6172 staticpro (&Qvalid_codes);
6174 Qemacs_mule = intern ("emacs-mule");
6175 staticpro (&Qemacs_mule);
6177 Qraw_text = intern ("raw-text");
6178 staticpro (&Qraw_text);
6180 defsubr (&Scoding_system_p);
6181 defsubr (&Sread_coding_system);
6182 defsubr (&Sread_non_nil_coding_system);
6183 defsubr (&Scheck_coding_system);
6184 defsubr (&Sdetect_coding_region);
6185 defsubr (&Sdetect_coding_string);
6186 defsubr (&Sdecode_coding_region);
6187 defsubr (&Sencode_coding_region);
6188 defsubr (&Sdecode_coding_string);
6189 defsubr (&Sencode_coding_string);
6190 defsubr (&Sdecode_sjis_char);
6191 defsubr (&Sencode_sjis_char);
6192 defsubr (&Sdecode_big5_char);
6193 defsubr (&Sencode_big5_char);
6194 defsubr (&Sset_terminal_coding_system_internal);
6195 defsubr (&Sset_safe_terminal_coding_system_internal);
6196 defsubr (&Sterminal_coding_system);
6197 defsubr (&Sset_keyboard_coding_system_internal);
6198 defsubr (&Skeyboard_coding_system);
6199 defsubr (&Sfind_operation_coding_system);
6200 defsubr (&Supdate_coding_systems_internal);
6201 defsubr (&Sset_coding_priority_internal);
6203 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6204 "List of coding systems.\n\
6206 Do not alter the value of this variable manually. This variable should be\n\
6207 updated by the functions `make-coding-system' and\n\
6208 `define-coding-system-alias'.");
6209 Vcoding_system_list = Qnil;
6211 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6212 "Alist of coding system names.\n\
6213 Each element is one element list of coding system name.\n\
6214 This variable is given to `completing-read' as TABLE argument.\n\
6216 Do not alter the value of this variable manually. This variable should be\n\
6217 updated by the functions `make-coding-system' and\n\
6218 `define-coding-system-alias'.");
6219 Vcoding_system_alist = Qnil;
6221 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6222 "List of coding-categories (symbols) ordered by priority.");
6224 int i;
6226 Vcoding_category_list = Qnil;
6227 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6228 Vcoding_category_list
6229 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6230 Vcoding_category_list);
6233 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6234 "Specify the coding system for read operations.\n\
6235 It is useful to bind this variable with `let', but do not set it globally.\n\
6236 If the value is a coding system, it is used for decoding on read operation.\n\
6237 If not, an appropriate element is used from one of the coding system alists:\n\
6238 There are three such tables, `file-coding-system-alist',\n\
6239 `process-coding-system-alist', and `network-coding-system-alist'.");
6240 Vcoding_system_for_read = Qnil;
6242 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6243 "Specify the coding system for write operations.\n\
6244 Programs bind this variable with `let', but you should not set it globally.\n\
6245 If the value is a coding system, it is used for encoding of output,\n\
6246 when writing it to a file and when sending it to a file or subprocess.\n\
6248 If this does not specify a coding system, an appropriate element\n\
6249 is used from one of the coding system alists:\n\
6250 There are three such tables, `file-coding-system-alist',\n\
6251 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6252 For output to files, if the above procedure does not specify a coding system,\n\
6253 the value of `buffer-file-coding-system' is used.");
6254 Vcoding_system_for_write = Qnil;
6256 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6257 "Coding system used in the latest file or process I/O.");
6258 Vlast_coding_system_used = Qnil;
6260 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6261 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6262 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6263 such conversion.");
6264 inhibit_eol_conversion = 0;
6266 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6267 "Non-nil means process buffer inherits coding system of process output.\n\
6268 Bind it to t if the process output is to be treated as if it were a file\n\
6269 read from some filesystem.");
6270 inherit_process_coding_system = 0;
6272 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6273 "Alist to decide a coding system to use for a file I/O operation.\n\
6274 The format is ((PATTERN . VAL) ...),\n\
6275 where PATTERN is a regular expression matching a file name,\n\
6276 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6277 If VAL is a coding system, it is used for both decoding and encoding\n\
6278 the file contents.\n\
6279 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6280 and the cdr part is used for encoding.\n\
6281 If VAL is a function symbol, the function must return a coding system\n\
6282 or a cons of coding systems which are used as above.\n\
6284 See also the function `find-operation-coding-system'\n\
6285 and the variable `auto-coding-alist'.");
6286 Vfile_coding_system_alist = Qnil;
6288 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6289 "Alist to decide a coding system to use for a process I/O operation.\n\
6290 The format is ((PATTERN . VAL) ...),\n\
6291 where PATTERN is a regular expression matching a program name,\n\
6292 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6293 If VAL is a coding system, it is used for both decoding what received\n\
6294 from the program and encoding what sent to the program.\n\
6295 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6296 and the cdr part is used for encoding.\n\
6297 If VAL is a function symbol, the function must return a coding system\n\
6298 or a cons of coding systems which are used as above.\n\
6300 See also the function `find-operation-coding-system'.");
6301 Vprocess_coding_system_alist = Qnil;
6303 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6304 "Alist to decide a coding system to use for a network I/O operation.\n\
6305 The format is ((PATTERN . VAL) ...),\n\
6306 where PATTERN is a regular expression matching a network service name\n\
6307 or is a port number to connect to,\n\
6308 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6309 If VAL is a coding system, it is used for both decoding what received\n\
6310 from the network stream and encoding what sent to the network stream.\n\
6311 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6312 and the cdr part is used for encoding.\n\
6313 If VAL is a function symbol, the function must return a coding system\n\
6314 or a cons of coding systems which are used as above.\n\
6316 See also the function `find-operation-coding-system'.");
6317 Vnetwork_coding_system_alist = Qnil;
6319 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6320 "Coding system to use with system messages.");
6321 Vlocale_coding_system = Qnil;
6323 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6324 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6325 eol_mnemonic_unix = build_string (":");
6327 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6328 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6329 eol_mnemonic_dos = build_string ("\\");
6331 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6332 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6333 eol_mnemonic_mac = build_string ("/");
6335 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6336 "*String displayed in mode line when end-of-line format is not yet determined.");
6337 eol_mnemonic_undecided = build_string (":");
6339 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6340 "*Non-nil enables character translation while encoding and decoding.");
6341 Venable_character_translation = Qt;
6343 DEFVAR_LISP ("standard-translation-table-for-decode",
6344 &Vstandard_translation_table_for_decode,
6345 "Table for translating characters while decoding.");
6346 Vstandard_translation_table_for_decode = Qnil;
6348 DEFVAR_LISP ("standard-translation-table-for-encode",
6349 &Vstandard_translation_table_for_encode,
6350 "Table for translating characters while encoding.");
6351 Vstandard_translation_table_for_encode = Qnil;
6353 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6354 "Alist of charsets vs revision numbers.\n\
6355 While encoding, if a charset (car part of an element) is found,\n\
6356 designate it with the escape sequence identifying revision (cdr part of the element).");
6357 Vcharset_revision_alist = Qnil;
6359 DEFVAR_LISP ("default-process-coding-system",
6360 &Vdefault_process_coding_system,
6361 "Cons of coding systems used for process I/O by default.\n\
6362 The car part is used for decoding a process output,\n\
6363 the cdr part is used for encoding a text to be sent to a process.");
6364 Vdefault_process_coding_system = Qnil;
6366 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6367 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6368 This is a vector of length 256.\n\
6369 If Nth element is non-nil, the existence of code N in a file\n\
6370 \(or output of subprocess) doesn't prevent it to be detected as\n\
6371 a coding system of ISO 2022 variant which has a flag\n\
6372 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6373 or reading output of a subprocess.\n\
6374 Only 128th through 159th elements has a meaning.");
6375 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6377 DEFVAR_LISP ("select-safe-coding-system-function",
6378 &Vselect_safe_coding_system_function,
6379 "Function to call to select safe coding system for encoding a text.\n\
6381 If set, this function is called to force a user to select a proper\n\
6382 coding system which can encode the text in the case that a default\n\
6383 coding system used in each operation can't encode the text.\n\
6385 The default value is `select-safe-coding-system' (which see).");
6386 Vselect_safe_coding_system_function = Qnil;
/* Return the message text that strerror gives for ERROR_NUMBER.

   synchronize_system_messages_locale is called first, then strerror.
   If Vlocale_coding_system is non-nil, the message is wrapped in a
   Lisp string with build_string and passed, together with
   Vlocale_coding_system, to code_convert_string_norecord; the value
   returned is then the data of the resulting Lisp string instead of
   the raw strerror result.

   NOTE(review): this listing has lost the lines that held the braces
   and the last argument of the code_convert_string_norecord call (the
   embedded numbers 6393, 6400, 6403, 6405 and 6408 are absent).
   Restore them from the upstream file before compiling.

   NOTE(review): in the converted case the returned pointer refers to
   the storage of the Lisp string DEC, which is local to this
   function; presumably it stays valid only as long as that string
   does -- confirm that callers use or copy the result right away.  */
6390 char *
6391 emacs_strerror (error_number)
6392 int error_number;
6394 char *str;
/* Presumably makes the locale used for system messages current, so
   that strerror answers in it -- TODO confirm against its definition.  */
6396 synchronize_system_messages_locale ();
6397 str = strerror (error_number);
/* Convert only when a locale coding system has been configured;
   otherwise the strerror text is returned unchanged.  */
6399 if (! NILP (Vlocale_coding_system))
6401 Lisp_Object dec = code_convert_string_norecord (build_string (str),
6402 Vlocale_coding_system,
/* From here on STR points into DEC's data, not at libc's buffer.  */
6404 str = (char *) XSTRING (dec)->data;
6407 return str;
6410 #endif /* emacs */