* src/coding.c (encode_designation_at_bol): Change return value to EMACS_INT.
[emacs.git] / src / coding.c
blob8e96db1f52143dad6aafb12f30e8f33d0bdc708b
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5 2005, 2006, 2007, 2008, 2009, 2010, 2011
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8 Copyright (C) 2003
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
12 This file is part of GNU Emacs.
14 GNU Emacs is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
19 GNU Emacs is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
27 /*** TABLE OF CONTENTS ***
29 0. General comments
30 1. Preamble
31 2. Emacs' internal format (emacs-utf-8) handlers
32 3. UTF-8 handlers
33 4. UTF-16 handlers
34 5. Charset-base coding systems handlers
35 6. emacs-mule (old Emacs' internal format) handlers
36 7. ISO2022 handlers
37 8. Shift-JIS and BIG5 handlers
38 9. CCL handlers
39 10. C library functions
40 11. Emacs Lisp library functions
41 12. Postamble
45 /*** 0. General comments ***
48 CODING SYSTEM
50 A coding system is an object for an encoding mechanism that contains
51 information about how to convert byte sequences to character
52 sequences and vice versa. When we say "decode", it means converting
53 a byte sequence of a specific coding system into a character
54 sequence that is represented by Emacs' internal coding system
55 `emacs-utf-8', and when we say "encode", it means converting a
56 character sequence of emacs-utf-8 to a byte sequence of a specific
57 coding system.
59 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
60 C level, a coding system is represented by a vector of attributes
61 stored in the hash table Vcharset_hash_table. The conversion from
62 coding system symbol to attributes vector is done by looking up
63 Vcharset_hash_table by the symbol.
65 Coding systems are classified into the following types depending on
66 the encoding mechanism. Here's a brief description of the types.
68 o UTF-8
70 o UTF-16
72 o Charset-base coding system
74 A coding system defined by one or more (coded) character sets.
75 Decoding and encoding are done by a code converter defined for each
76 character set.
78 o Old Emacs internal format (emacs-mule)
80 The coding system adopted by old versions of Emacs (20 and 21).
82 o ISO2022-base coding system
84 The most famous coding system for multiple character sets. X's
85 Compound Text, various EUCs (Extended Unix Code), and coding systems
86 used in the Internet communication such as ISO-2022-JP are all
87 variants of ISO2022.
89 o SJIS (or Shift-JIS or MS-Kanji-Code)
91 A coding system to encode character sets: ASCII, JISX0201, and
92 JISX0208. Widely used for PC's in Japan. Details are described in
93 section 8.
95 o BIG5
97 A coding system to encode character sets: ASCII and Big5. Widely
98 used for Chinese (mainly in Taiwan and Hong Kong). Details are
99 described in section 8. In this file, when we write "big5" (all
100 lowercase), we mean the coding system, and when we write "Big5"
101 (capitalized), we mean the character set.
103 o CCL
105 If a user wants to decode/encode text encoded in a coding system
106 not listed above, he can supply a decoder and an encoder for it in
107 CCL (Code Conversion Language) programs. Emacs executes the CCL
108 program while decoding/encoding.
110 o Raw-text
112 A coding system for text containing raw eight-bit data. Emacs
113 treats each byte of source text as a character (except for
114 end-of-line conversion).
116 o No-conversion
118 Like raw text, but don't do end-of-line conversion.
121 END-OF-LINE FORMAT
123 How text end-of-line is encoded depends on operating system. For
124 instance, Unix's format is just one byte of LF (line-feed) code,
125 whereas DOS's format is two-byte sequence of `carriage-return' and
126 `line-feed' codes. MacOS's format is usually one byte of
127 `carriage-return'.
129 Since text character encoding and end-of-line encoding are
130 independent, any coding system described above can take any format
131 of end-of-line (except for no-conversion).
133 STRUCT CODING_SYSTEM
135 Before using a coding system for code conversion (i.e. decoding and
136 encoding), we setup a structure of type `struct coding_system'.
137 This structure keeps various information about a specific code
138 conversion (e.g. the location of source and destination data).
142 /* COMMON MACROS */
145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
147 These functions check if a byte sequence specified as a source in
148 CODING conforms to the format of XXX, and update the members of
149 DETECT_INFO.
151 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
153 Below is the template of these functions. */
155 #if 0
156 static int
157 detect_coding_XXX (coding, detect_info)
158 struct coding_system *coding;
159 struct coding_detection_info *detect_info;
161 const unsigned char *src = coding->source;
162 const unsigned char *src_end = coding->source + coding->src_bytes;
163 int multibytep = coding->src_multibyte;
164 int consumed_chars = 0;
165 int found = 0;
166 ...;
168 while (1)
170 /* Get one byte from the source. If the source is exhausted, jump
171 to no_more_source:. */
172 ONE_MORE_BYTE (c);
174 if (! __C_conforms_to_XXX___ (c))
175 break;
176 if (! __C_strongly_suggests_XXX__ (c))
177 found = CATEGORY_MASK_XXX;
179 /* The byte sequence is invalid for XXX. */
180 detect_info->rejected |= CATEGORY_MASK_XXX;
181 return 0;
183 no_more_source:
184 /* The source exhausted successfully. */
185 detect_info->found |= found;
186 return 1;
188 #endif
190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
192 These functions decode a byte sequence specified as a source by
193 CODING. The resulting multibyte text goes to a place pointed to by
194 CODING->charbuf, the length of which should not exceed
195 CODING->charbuf_size;
197 These functions set the information of original and decoded texts in
198 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
199 They also set CODING->result to one of CODING_RESULT_XXX indicating
200 how the decoding is finished.
202 Below is the template of these functions. */
204 #if 0
205 static void
206 decode_coding_XXXX (coding)
207 struct coding_system *coding;
209 const unsigned char *src = coding->source + coding->consumed;
210 const unsigned char *src_end = coding->source + coding->src_bytes;
211 /* SRC_BASE remembers the start position in source in each loop.
212 The loop will be exited when there's not enough source code, or
213 when there's no room in CHARBUF for a decoded character. */
214 const unsigned char *src_base;
215 /* A buffer to produce decoded characters. */
216 int *charbuf = coding->charbuf + coding->charbuf_used;
217 int *charbuf_end = coding->charbuf + coding->charbuf_size;
218 int multibytep = coding->src_multibyte;
220 while (1)
222 src_base = src;
223 if (charbuf < charbuf_end)
224 /* No more room to produce a decoded character. */
225 break;
226 ONE_MORE_BYTE (c);
227 /* Decode it. */
230 no_more_source:
231 if (src_base < src_end
232 && coding->mode & CODING_MODE_LAST_BLOCK)
233 /* If the source ends by partial bytes to construct a character,
234 treat them as eight-bit raw data. */
235 while (src_base < src_end && charbuf < charbuf_end)
236 *charbuf++ = *src_base++;
237 /* Remember how many bytes and characters we consumed. If the
238 source is multibyte, the bytes and chars are not identical. */
239 coding->consumed = coding->consumed_char = src_base - coding->source;
240 /* Remember how many characters we produced. */
241 coding->charbuf_used = charbuf - coding->charbuf;
243 #endif
245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
247 These functions encode SRC_BYTES length text at SOURCE of Emacs'
248 internal multibyte format by CODING. The resulting byte sequence
249 goes to a place pointed to by DESTINATION, the length of which
250 should not exceed DST_BYTES.
252 These functions set the information of original and encoded texts in
253 the members produced, produced_char, consumed, and consumed_char of
254 the structure *CODING. They also set the member result to one of
255 CODING_RESULT_XXX indicating how the encoding finished.
257 DST_BYTES zero means that source area and destination area are
258 overlapped, which means that we can produce a encoded text until it
259 reaches at the head of not-yet-encoded source text.
261 Below is a template of these functions. */
262 #if 0
263 static void
264 encode_coding_XXX (coding)
265 struct coding_system *coding;
267 int multibytep = coding->dst_multibyte;
268 int *charbuf = coding->charbuf;
269 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
270 unsigned char *dst = coding->destination + coding->produced;
271 unsigned char *dst_end = coding->destination + coding->dst_bytes;
272 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
273 int produced_chars = 0;
275 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
277 int c = *charbuf;
278 /* Encode C into DST, and increment DST. */
280 label_no_more_destination:
281 /* How many chars and bytes we produced. */
282 coding->produced_char += produced_chars;
283 coding->produced = dst - coding->destination;
285 #endif
288 /*** 1. Preamble ***/
290 #include <config.h>
291 #include <stdio.h>
292 #include <setjmp.h>
294 #include "lisp.h"
295 #include "buffer.h"
296 #include "character.h"
297 #include "charset.h"
298 #include "ccl.h"
299 #include "composite.h"
300 #include "coding.h"
301 #include "window.h"
302 #include "frame.h"
303 #include "termhooks.h"
305 Lisp_Object Vcoding_system_hash_table;
307 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
308 Lisp_Object Qunix, Qdos;
309 extern Lisp_Object Qmac; /* frame.c */
310 Lisp_Object Qbuffer_file_coding_system;
311 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
312 Lisp_Object Qdefault_char;
313 Lisp_Object Qno_conversion, Qundecided;
314 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
315 Lisp_Object Qbig, Qlittle;
316 Lisp_Object Qcoding_system_history;
317 Lisp_Object Qvalid_codes;
318 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
319 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
320 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
321 Lisp_Object QCascii_compatible_p;
323 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
324 Lisp_Object Qcall_process, Qcall_process_region;
325 Lisp_Object Qstart_process, Qopen_network_stream;
326 Lisp_Object Qtarget_idx;
328 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
329 Lisp_Object Qinterrupted, Qinsufficient_memory;
331 extern Lisp_Object Qcompletion_ignore_case;
333 /* If a symbol has this property, evaluate the value to define the
334 symbol as a coding system. */
335 static Lisp_Object Qcoding_system_define_form;
337 int coding_system_require_warning;
339 Lisp_Object Vselect_safe_coding_system_function;
341 /* Mnemonic string for each format of end-of-line. */
342 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
343 /* Mnemonic string to indicate format of end-of-line is not yet
344 decided. */
345 Lisp_Object eol_mnemonic_undecided;
347 /* Format of end-of-line decided by system. This is Qunix on
348 Unix and Mac, Qdos on DOS/Windows.
349 This has an effect only for external encoding (i.e. for output to
350 file and process), not for in-buffer or Lisp string encoding. */
351 static Lisp_Object system_eol_type;
353 #ifdef emacs
355 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
357 Lisp_Object Qcoding_system_p, Qcoding_system_error;
359 /* Coding system emacs-mule and raw-text are for converting only
360 end-of-line format. */
361 Lisp_Object Qemacs_mule, Qraw_text;
362 Lisp_Object Qutf_8_emacs;
364 /* Coding-systems are handed between Emacs Lisp programs and C internal
365 routines by the following three variables. */
366 /* Coding-system for reading files and receiving data from process. */
367 Lisp_Object Vcoding_system_for_read;
368 /* Coding-system for writing files and sending data to process. */
369 Lisp_Object Vcoding_system_for_write;
370 /* Coding-system actually used in the latest I/O. */
371 Lisp_Object Vlast_coding_system_used;
372 /* Set to non-nil when an error is detected while code conversion. */
373 Lisp_Object Vlast_code_conversion_error;
374 /* A vector of length 256 which contains information about special
375 Latin codes (especially for dealing with Microsoft codes). */
376 Lisp_Object Vlatin_extra_code_table;
378 /* Flag to inhibit code conversion of end-of-line format. */
379 int inhibit_eol_conversion;
381 /* Flag to inhibit ISO2022 escape sequence detection. */
382 int inhibit_iso_escape_detection;
384 /* Flag to inhibit detection of binary files through null bytes. */
385 int inhibit_null_byte_detection;
387 /* Flag to make buffer-file-coding-system inherit from process-coding. */
388 int inherit_process_coding_system;
390 /* Coding system to be used to encode text for terminal display when
391 terminal coding system is nil. */
392 struct coding_system safe_terminal_coding;
394 Lisp_Object Vfile_coding_system_alist;
395 Lisp_Object Vprocess_coding_system_alist;
396 Lisp_Object Vnetwork_coding_system_alist;
398 Lisp_Object Vlocale_coding_system;
400 #endif /* emacs */
402 /* Flag to tell if we look up translation table on character code
403 conversion. */
404 Lisp_Object Venable_character_translation;
405 /* Standard translation table to look up on decoding (reading). */
406 Lisp_Object Vstandard_translation_table_for_decode;
407 /* Standard translation table to look up on encoding (writing). */
408 Lisp_Object Vstandard_translation_table_for_encode;
410 Lisp_Object Qtranslation_table;
411 Lisp_Object Qtranslation_table_id;
412 Lisp_Object Qtranslation_table_for_decode;
413 Lisp_Object Qtranslation_table_for_encode;
415 /* Alist of charsets vs revision number. */
416 static Lisp_Object Vcharset_revision_table;
418 /* Default coding systems used for process I/O. */
419 Lisp_Object Vdefault_process_coding_system;
421 /* Char table for translating Quail and self-inserting input. */
422 Lisp_Object Vtranslation_table_for_input;
424 /* Two special coding systems. */
425 Lisp_Object Vsjis_coding_system;
426 Lisp_Object Vbig5_coding_system;
428 /* ISO2022 section */
430 #define CODING_ISO_INITIAL(coding, reg) \
431 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
432 coding_attr_iso_initial), \
433 reg)))
436 #define CODING_ISO_REQUEST(coding, charset_id) \
437 (((charset_id) <= (coding)->max_charset_id \
438 ? ((coding)->safe_charsets[charset_id] != 255 \
439 ? (coding)->safe_charsets[charset_id] \
440 : -1) \
441 : -1))
444 #define CODING_ISO_FLAGS(coding) \
445 ((coding)->spec.iso_2022.flags)
446 #define CODING_ISO_DESIGNATION(coding, reg) \
447 ((coding)->spec.iso_2022.current_designation[reg])
448 #define CODING_ISO_INVOCATION(coding, plane) \
449 ((coding)->spec.iso_2022.current_invocation[plane])
450 #define CODING_ISO_SINGLE_SHIFTING(coding) \
451 ((coding)->spec.iso_2022.single_shifting)
452 #define CODING_ISO_BOL(coding) \
453 ((coding)->spec.iso_2022.bol)
454 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
455 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
456 #define CODING_ISO_CMP_STATUS(coding) \
457 (&(coding)->spec.iso_2022.cmp_status)
458 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
459 ((coding)->spec.iso_2022.ctext_extended_segment_len)
460 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
461 ((coding)->spec.iso_2022.embedded_utf_8)
463 /* Control characters of ISO2022. */
464 /* code */ /* function */
465 #define ISO_CODE_LF 0x0A /* line-feed */
466 #define ISO_CODE_CR 0x0D /* carriage-return */
467 #define ISO_CODE_SO 0x0E /* shift-out */
468 #define ISO_CODE_SI 0x0F /* shift-in */
469 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
470 #define ISO_CODE_ESC 0x1B /* escape */
471 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
472 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
473 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
475 /* All code (1-byte) of ISO2022 is classified into one of the
476 followings. */
477 enum iso_code_class_type
479 ISO_control_0, /* Control codes in the range
480 0x00..0x1F and 0x7F, except for the
481 following 5 codes. */
482 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
483 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
484 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
485 ISO_escape, /* ISO_CODE_SO (0x1B) */
486 ISO_control_1, /* Control codes in the range
487 0x80..0x9F, except for the
488 following 3 codes. */
489 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
490 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
491 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
492 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
493 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
494 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
495 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
498 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
499 `iso-flags' attribute of an iso2022 coding system. */
501 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
502 instead of the correct short-form sequence (e.g. ESC $ A). */
503 #define CODING_ISO_FLAG_LONG_FORM 0x0001
505 /* If set, reset graphic planes and registers at end-of-line to the
506 initial state. */
507 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
509 /* If set, reset graphic planes and registers before any control
510 characters to the initial state. */
511 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
513 /* If set, encode by 7-bit environment. */
514 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
516 /* If set, use locking-shift function. */
517 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
519 /* If set, use single-shift function. Overwrite
520 CODING_ISO_FLAG_LOCKING_SHIFT. */
521 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
523 /* If set, use designation escape sequence. */
524 #define CODING_ISO_FLAG_DESIGNATION 0x0040
526 /* If set, produce revision number sequence. */
527 #define CODING_ISO_FLAG_REVISION 0x0080
529 /* If set, produce ISO6429's direction specifying sequence. */
530 #define CODING_ISO_FLAG_DIRECTION 0x0100
532 /* If set, assume designation states are reset at beginning of line on
533 output. */
534 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
536 /* If set, designation sequence should be placed at beginning of line
537 on output. */
538 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
540 /* If set, do not encode unsafe characters on output. */
541 #define CODING_ISO_FLAG_SAFE 0x0800
543 /* If set, extra latin codes (128..159) are accepted as a valid code
544 on input. */
545 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
547 #define CODING_ISO_FLAG_COMPOSITION 0x2000
549 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
551 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
553 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
555 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
557 /* A character to be produced on output if encoding of the original
558 character is prohibited by CODING_ISO_FLAG_SAFE. */
559 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
561 /* UTF-8 section */
562 #define CODING_UTF_8_BOM(coding) \
563 ((coding)->spec.utf_8_bom)
565 /* UTF-16 section */
566 #define CODING_UTF_16_BOM(coding) \
567 ((coding)->spec.utf_16.bom)
569 #define CODING_UTF_16_ENDIAN(coding) \
570 ((coding)->spec.utf_16.endian)
572 #define CODING_UTF_16_SURROGATE(coding) \
573 ((coding)->spec.utf_16.surrogate)
576 /* CCL section */
577 #define CODING_CCL_DECODER(coding) \
578 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
579 #define CODING_CCL_ENCODER(coding) \
580 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
581 #define CODING_CCL_VALIDS(coding) \
582 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
584 /* Index for each coding category in `coding_categories' */
586 enum coding_category
588 coding_category_iso_7,
589 coding_category_iso_7_tight,
590 coding_category_iso_8_1,
591 coding_category_iso_8_2,
592 coding_category_iso_7_else,
593 coding_category_iso_8_else,
594 coding_category_utf_8_auto,
595 coding_category_utf_8_nosig,
596 coding_category_utf_8_sig,
597 coding_category_utf_16_auto,
598 coding_category_utf_16_be,
599 coding_category_utf_16_le,
600 coding_category_utf_16_be_nosig,
601 coding_category_utf_16_le_nosig,
602 coding_category_charset,
603 coding_category_sjis,
604 coding_category_big5,
605 coding_category_ccl,
606 coding_category_emacs_mule,
607 /* All above are targets of code detection. */
608 coding_category_raw_text,
609 coding_category_undecided,
610 coding_category_max
613 /* Definitions of flag bits used in detect_coding_XXXX. */
614 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
615 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
616 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
617 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
618 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
619 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
620 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
621 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
622 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
623 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
624 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
625 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
626 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
627 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
628 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
629 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
630 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
631 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
632 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
633 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
635 /* This value is returned if detect_coding_mask () find nothing other
636 than ASCII characters. */
637 #define CATEGORY_MASK_ANY \
638 (CATEGORY_MASK_ISO_7 \
639 | CATEGORY_MASK_ISO_7_TIGHT \
640 | CATEGORY_MASK_ISO_8_1 \
641 | CATEGORY_MASK_ISO_8_2 \
642 | CATEGORY_MASK_ISO_7_ELSE \
643 | CATEGORY_MASK_ISO_8_ELSE \
644 | CATEGORY_MASK_UTF_8_AUTO \
645 | CATEGORY_MASK_UTF_8_NOSIG \
646 | CATEGORY_MASK_UTF_8_SIG \
647 | CATEGORY_MASK_UTF_16_AUTO \
648 | CATEGORY_MASK_UTF_16_BE \
649 | CATEGORY_MASK_UTF_16_LE \
650 | CATEGORY_MASK_UTF_16_BE_NOSIG \
651 | CATEGORY_MASK_UTF_16_LE_NOSIG \
652 | CATEGORY_MASK_CHARSET \
653 | CATEGORY_MASK_SJIS \
654 | CATEGORY_MASK_BIG5 \
655 | CATEGORY_MASK_CCL \
656 | CATEGORY_MASK_EMACS_MULE)
659 #define CATEGORY_MASK_ISO_7BIT \
660 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
662 #define CATEGORY_MASK_ISO_8BIT \
663 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
665 #define CATEGORY_MASK_ISO_ELSE \
666 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
668 #define CATEGORY_MASK_ISO_ESCAPE \
669 (CATEGORY_MASK_ISO_7 \
670 | CATEGORY_MASK_ISO_7_TIGHT \
671 | CATEGORY_MASK_ISO_7_ELSE \
672 | CATEGORY_MASK_ISO_8_ELSE)
674 #define CATEGORY_MASK_ISO \
675 ( CATEGORY_MASK_ISO_7BIT \
676 | CATEGORY_MASK_ISO_8BIT \
677 | CATEGORY_MASK_ISO_ELSE)
679 #define CATEGORY_MASK_UTF_16 \
680 (CATEGORY_MASK_UTF_16_AUTO \
681 | CATEGORY_MASK_UTF_16_BE \
682 | CATEGORY_MASK_UTF_16_LE \
683 | CATEGORY_MASK_UTF_16_BE_NOSIG \
684 | CATEGORY_MASK_UTF_16_LE_NOSIG)
686 #define CATEGORY_MASK_UTF_8 \
687 (CATEGORY_MASK_UTF_8_AUTO \
688 | CATEGORY_MASK_UTF_8_NOSIG \
689 | CATEGORY_MASK_UTF_8_SIG)
691 /* List of symbols `coding-category-xxx' ordered by priority. This
692 variable is exposed to Emacs Lisp. */
693 static Lisp_Object Vcoding_category_list;
695 /* Table of coding categories (Lisp symbols). This variable is for
696 internal use only. */
697 static Lisp_Object Vcoding_category_table;
699 /* Table of coding-categories ordered by priority. */
700 static enum coding_category coding_priorities[coding_category_max];
702 /* Nth element is a coding context for the coding system bound to the
703 Nth coding category. */
704 static struct coding_system coding_categories[coding_category_max];
706 /*** Commonly used macros and functions ***/
708 #ifndef min
709 #define min(a, b) ((a) < (b) ? (a) : (b))
710 #endif
711 #ifndef max
712 #define max(a, b) ((a) > (b) ? (a) : (b))
713 #endif
715 #define CODING_GET_INFO(coding, attrs, charset_list) \
716 do { \
717 (attrs) = CODING_ID_ATTRS ((coding)->id); \
718 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
719 } while (0)
722 /* Safely get one byte from the source text pointed by SRC which ends
723 at SRC_END, and set C to that byte. If there are not enough bytes
724 in the source, it jumps to `no_more_source'. If multibytep is
725 nonzero, and a multibyte character is found at SRC, set C to the
726 negative value of the character code. The caller should declare
727 and set these variables appropriately in advance:
728 src, src_end, multibytep */
730 #define ONE_MORE_BYTE(c) \
731 do { \
732 if (src == src_end) \
734 if (src_base < src) \
735 record_conversion_result \
736 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
737 goto no_more_source; \
739 c = *src++; \
740 if (multibytep && (c & 0x80)) \
742 if ((c & 0xFE) == 0xC0) \
743 c = ((c & 1) << 6) | *src++; \
744 else \
746 src--; \
747 c = - string_char (src, &src, NULL); \
748 record_conversion_result \
749 (coding, CODING_RESULT_INVALID_SRC); \
752 consumed_chars++; \
753 } while (0)
755 /* Safely get two bytes from the source text pointed by SRC which ends
756 at SRC_END, and set C1 and C2 to those bytes while skipping the
757 heading multibyte characters. If there are not enough bytes in the
758 source, it jumps to `no_more_source'. If multibytep is nonzero and
759 a multibyte character is found for C2, set C2 to the negative value
760 of the character code. The caller should declare and set these
761 variables appropriately in advance:
762 src, src_end, multibytep
763 It is intended that this macro is used in detect_coding_utf_16. */
765 #define TWO_MORE_BYTES(c1, c2) \
766 do { \
767 do { \
768 if (src == src_end) \
769 goto no_more_source; \
770 c1 = *src++; \
771 if (multibytep && (c1 & 0x80)) \
773 if ((c1 & 0xFE) == 0xC0) \
774 c1 = ((c1 & 1) << 6) | *src++; \
775 else \
777 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
778 c1 = -1; \
781 } while (c1 < 0); \
782 if (src == src_end) \
783 goto no_more_source; \
784 c2 = *src++; \
785 if (multibytep && (c2 & 0x80)) \
787 if ((c2 & 0xFE) == 0xC0) \
788 c2 = ((c2 & 1) << 6) | *src++; \
789 else \
790 c2 = -1; \
792 } while (0)
795 #define ONE_MORE_BYTE_NO_CHECK(c) \
796 do { \
797 c = *src++; \
798 if (multibytep && (c & 0x80)) \
800 if ((c & 0xFE) == 0xC0) \
801 c = ((c & 1) << 6) | *src++; \
802 else \
804 src--; \
805 c = - string_char (src, &src, NULL); \
806 record_conversion_result \
807 (coding, CODING_RESULT_INVALID_SRC); \
810 consumed_chars++; \
811 } while (0)
814 /* Store a byte C in the place pointed by DST and increment DST to the
815 next free point, and increment PRODUCED_CHARS. The caller should
816 assure that C is 0..127, and declare and set the variable `dst'
817 appropriately in advance.
821 #define EMIT_ONE_ASCII_BYTE(c) \
822 do { \
823 produced_chars++; \
824 *dst++ = (c); \
825 } while (0)
828 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
830 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
831 do { \
832 produced_chars += 2; \
833 *dst++ = (c1), *dst++ = (c2); \
834 } while (0)
837 /* Store a byte C in the place pointed by DST and increment DST to the
838 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
839 nonzero, store in an appropriate multibyte from. The caller should
840 declare and set the variables `dst' and `multibytep' appropriately
841 in advance. */
843 #define EMIT_ONE_BYTE(c) \
844 do { \
845 produced_chars++; \
846 if (multibytep) \
848 int ch = (c); \
849 if (ch >= 0x80) \
850 ch = BYTE8_TO_CHAR (ch); \
851 CHAR_STRING_ADVANCE (ch, dst); \
853 else \
854 *dst++ = (c); \
855 } while (0)
858 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
860 #define EMIT_TWO_BYTES(c1, c2) \
861 do { \
862 produced_chars += 2; \
863 if (multibytep) \
865 int ch; \
867 ch = (c1); \
868 if (ch >= 0x80) \
869 ch = BYTE8_TO_CHAR (ch); \
870 CHAR_STRING_ADVANCE (ch, dst); \
871 ch = (c2); \
872 if (ch >= 0x80) \
873 ch = BYTE8_TO_CHAR (ch); \
874 CHAR_STRING_ADVANCE (ch, dst); \
876 else \
878 *dst++ = (c1); \
879 *dst++ = (c2); \
881 } while (0)
884 #define EMIT_THREE_BYTES(c1, c2, c3) \
885 do { \
886 EMIT_ONE_BYTE (c1); \
887 EMIT_TWO_BYTES (c2, c3); \
888 } while (0)
891 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
892 do { \
893 EMIT_TWO_BYTES (c1, c2); \
894 EMIT_TWO_BYTES (c3, c4); \
895 } while (0)
898 /* Prototypes for static functions. */
899 static void record_conversion_result P_ ((struct coding_system *coding,
900 enum coding_result_code result));
901 static int detect_coding_utf_8 P_ ((struct coding_system *,
902 struct coding_detection_info *info));
903 static void decode_coding_utf_8 P_ ((struct coding_system *));
904 static int encode_coding_utf_8 P_ ((struct coding_system *));
906 static int detect_coding_utf_16 P_ ((struct coding_system *,
907 struct coding_detection_info *info));
908 static void decode_coding_utf_16 P_ ((struct coding_system *));
909 static int encode_coding_utf_16 P_ ((struct coding_system *));
911 static int detect_coding_iso_2022 P_ ((struct coding_system *,
912 struct coding_detection_info *info));
913 static void decode_coding_iso_2022 P_ ((struct coding_system *));
914 static int encode_coding_iso_2022 P_ ((struct coding_system *));
916 static int detect_coding_emacs_mule P_ ((struct coding_system *,
917 struct coding_detection_info *info));
918 static void decode_coding_emacs_mule P_ ((struct coding_system *));
919 static int encode_coding_emacs_mule P_ ((struct coding_system *));
921 static int detect_coding_sjis P_ ((struct coding_system *,
922 struct coding_detection_info *info));
923 static void decode_coding_sjis P_ ((struct coding_system *));
924 static int encode_coding_sjis P_ ((struct coding_system *));
926 static int detect_coding_big5 P_ ((struct coding_system *,
927 struct coding_detection_info *info));
928 static void decode_coding_big5 P_ ((struct coding_system *));
929 static int encode_coding_big5 P_ ((struct coding_system *));
931 static int detect_coding_ccl P_ ((struct coding_system *,
932 struct coding_detection_info *info));
933 static void decode_coding_ccl P_ ((struct coding_system *));
934 static int encode_coding_ccl P_ ((struct coding_system *));
936 static void decode_coding_raw_text P_ ((struct coding_system *));
937 static int encode_coding_raw_text P_ ((struct coding_system *));
939 static EMACS_INT coding_set_source P_ ((struct coding_system *));
940 static EMACS_INT coding_set_destination P_ ((struct coding_system *));
941 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
942 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
943 EMACS_INT, EMACS_INT));
944 static unsigned char *alloc_destination P_ ((struct coding_system *,
945 EMACS_INT, unsigned char *));
946 static void setup_iso_safe_charsets P_ ((Lisp_Object));
947 static EMACS_INT encode_designation_at_bol P_ ((struct coding_system *,
948 int *, int *, unsigned char *));
949 static int detect_eol P_ ((const unsigned char *,
950 EMACS_INT, enum coding_category));
951 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
952 static void decode_eol P_ ((struct coding_system *));
953 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
954 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *));
955 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
956 static INLINE void produce_charset P_ ((struct coding_system *, int *,
957 EMACS_INT));
958 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
959 static int decode_coding P_ ((struct coding_system *));
960 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
961 struct coding_system *,
962 int *, EMACS_INT *));
963 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
964 struct coding_system *,
965 int *, EMACS_INT *));
966 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
967 static int encode_coding P_ ((struct coding_system *));
968 static Lisp_Object make_conversion_work_buffer P_ ((int));
969 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
970 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
971 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
973 static void
974 record_conversion_result (struct coding_system *coding,
975 enum coding_result_code result)
977 coding->result = result;
978 switch (result)
980 case CODING_RESULT_INSUFFICIENT_SRC:
981 Vlast_code_conversion_error = Qinsufficient_source;
982 break;
983 case CODING_RESULT_INCONSISTENT_EOL:
984 Vlast_code_conversion_error = Qinconsistent_eol;
985 break;
986 case CODING_RESULT_INVALID_SRC:
987 Vlast_code_conversion_error = Qinvalid_source;
988 break;
989 case CODING_RESULT_INTERRUPT:
990 Vlast_code_conversion_error = Qinterrupted;
991 break;
992 case CODING_RESULT_INSUFFICIENT_MEM:
993 Vlast_code_conversion_error = Qinsufficient_memory;
994 break;
995 case CODING_RESULT_INSUFFICIENT_DST:
996 /* Don't record this error in Vlast_code_conversion_error
997 because it happens just temporarily and is resolved when the
998 whole conversion is finished. */
999 break;
1000 case CODING_RESULT_SUCCESS:
1001 break;
1002 default:
1003 Vlast_code_conversion_error = intern ("Unknown error");
1007 /* These wrapper macros are used to preserve validity of pointers into
1008 buffer text across calls to decode_char, encode_char, etc, which
1009 could cause relocation of buffers if it loads a charset map,
1010 because loading a charset map allocates large structures. */
1012 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
1013 do { \
1014 EMACS_INT offset; \
1016 charset_map_loaded = 0; \
1017 c = DECODE_CHAR (charset, code); \
1018 if (charset_map_loaded \
1019 && (offset = coding_set_source (coding))) \
1021 src += offset; \
1022 src_base += offset; \
1023 src_end += offset; \
1025 } while (0)
1027 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
1028 do { \
1029 EMACS_INT offset; \
1031 charset_map_loaded = 0; \
1032 code = ENCODE_CHAR (charset, c); \
1033 if (charset_map_loaded \
1034 && (offset = coding_set_destination (coding))) \
1036 dst += offset; \
1037 dst_end += offset; \
1039 } while (0)
1041 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
1042 do { \
1043 EMACS_INT offset; \
1045 charset_map_loaded = 0; \
1046 charset = char_charset (c, charset_list, code_return); \
1047 if (charset_map_loaded \
1048 && (offset = coding_set_destination (coding))) \
1050 dst += offset; \
1051 dst_end += offset; \
1053 } while (0)
1055 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
1056 do { \
1057 EMACS_INT offset; \
1059 charset_map_loaded = 0; \
1060 result = CHAR_CHARSET_P (c, charset); \
1061 if (charset_map_loaded \
1062 && (offset = coding_set_destination (coding))) \
1064 dst += offset; \
1065 dst_end += offset; \
1067 } while (0)
1070 /* If there are at least BYTES length of room at dst, allocate memory
1071 for coding->destination and update dst and dst_end. We don't have
1072 to take care of coding->source which will be relocated. It is
1073 handled by calling coding_set_source in encode_coding. */
1075 #define ASSURE_DESTINATION(bytes) \
1076 do { \
1077 if (dst + (bytes) >= dst_end) \
1079 int more_bytes = charbuf_end - charbuf + (bytes); \
1081 dst = alloc_destination (coding, more_bytes, dst); \
1082 dst_end = coding->destination + coding->dst_bytes; \
1084 } while (0)
1087 /* Store multibyte form of the character C in P, and advance P to the
1088 end of the multibyte form. This is like CHAR_STRING_ADVANCE but it
1089 never calls MAYBE_UNIFY_CHAR. */
1091 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) \
1092 do { \
1093 if ((c) <= MAX_1_BYTE_CHAR) \
1094 *(p)++ = (c); \
1095 else if ((c) <= MAX_2_BYTE_CHAR) \
1096 *(p)++ = (0xC0 | ((c) >> 6)), \
1097 *(p)++ = (0x80 | ((c) & 0x3F)); \
1098 else if ((c) <= MAX_3_BYTE_CHAR) \
1099 *(p)++ = (0xE0 | ((c) >> 12)), \
1100 *(p)++ = (0x80 | (((c) >> 6) & 0x3F)), \
1101 *(p)++ = (0x80 | ((c) & 0x3F)); \
1102 else if ((c) <= MAX_4_BYTE_CHAR) \
1103 *(p)++ = (0xF0 | (c >> 18)), \
1104 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
1105 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
1106 *(p)++ = (0x80 | (c & 0x3F)); \
1107 else if ((c) <= MAX_5_BYTE_CHAR) \
1108 *(p)++ = 0xF8, \
1109 *(p)++ = (0x80 | ((c >> 18) & 0x0F)), \
1110 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
1111 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
1112 *(p)++ = (0x80 | (c & 0x3F)); \
1113 else \
1114 (p) += BYTE8_STRING ((c) - 0x3FFF80, p); \
1115 } while (0)
1118 /* Return the character code of character whose multibyte form is at
1119 P, and advance P to the end of the multibyte form. This is like
1120 STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR. */
1122 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) \
1123 (!((p)[0] & 0x80) \
1124 ? *(p)++ \
1125 : ! ((p)[0] & 0x20) \
1126 ? ((p) += 2, \
1127 ((((p)[-2] & 0x1F) << 6) \
1128 | ((p)[-1] & 0x3F) \
1129 | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0))) \
1130 : ! ((p)[0] & 0x10) \
1131 ? ((p) += 3, \
1132 ((((p)[-3] & 0x0F) << 12) \
1133 | (((p)[-2] & 0x3F) << 6) \
1134 | ((p)[-1] & 0x3F))) \
1135 : ! ((p)[0] & 0x08) \
1136 ? ((p) += 4, \
1137 ((((p)[-4] & 0xF) << 18) \
1138 | (((p)[-3] & 0x3F) << 12) \
1139 | (((p)[-2] & 0x3F) << 6) \
1140 | ((p)[-1] & 0x3F))) \
1141 : ((p) += 5, \
1142 ((((p)[-4] & 0x3F) << 18) \
1143 | (((p)[-3] & 0x3F) << 12) \
1144 | (((p)[-2] & 0x3F) << 6) \
1145 | ((p)[-1] & 0x3F))))
1148 /* Update coding->source from coding->src_object, and return how many
1149 bytes coding->source was changed. */
1151 static EMACS_INT
1152 coding_set_source (coding)
1153 struct coding_system *coding;
1155 const unsigned char *orig = coding->source;
1157 if (BUFFERP (coding->src_object))
1159 struct buffer *buf = XBUFFER (coding->src_object);
1161 if (coding->src_pos < 0)
1162 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1163 else
1164 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1166 else if (STRINGP (coding->src_object))
1168 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1170 else
1171 /* Otherwise, the source is C string and is never relocated
1172 automatically. Thus we don't have to update anything. */
1175 return coding->source - orig;
1178 /* Update coding->destination from coding->dst_object, and return how
1179 many bytes coding->destination was changed. */
1181 static EMACS_INT
1182 coding_set_destination (coding)
1183 struct coding_system *coding;
1185 const unsigned char *orig = coding->destination;
1187 if (BUFFERP (coding->dst_object))
1189 if (coding->src_pos < 0)
1191 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1192 coding->dst_bytes = (GAP_END_ADDR
1193 - (coding->src_bytes - coding->consumed)
1194 - coding->destination);
1196 else
1198 /* We are sure that coding->dst_pos_byte is before the gap
1199 of the buffer. */
1200 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1201 + coding->dst_pos_byte - BEG_BYTE);
1202 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1203 - coding->destination);
1206 else
1207 /* Otherwise, the destination is C string and is never relocated
1208 automatically. Thus we don't have to update anything. */
1211 return coding->destination - orig;
1215 static void
1216 coding_alloc_by_realloc (coding, bytes)
1217 struct coding_system *coding;
1218 EMACS_INT bytes;
1220 coding->destination = (unsigned char *) xrealloc (coding->destination,
1221 coding->dst_bytes + bytes);
1222 coding->dst_bytes += bytes;
1225 static void
1226 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1227 struct coding_system *coding;
1228 EMACS_INT gap_head_used, bytes;
1230 if (EQ (coding->src_object, coding->dst_object))
1232 /* The gap may contain the produced data at the head and not-yet
1233 consumed data at the tail. To preserve those data, we at
1234 first make the gap size to zero, then increase the gap
1235 size. */
1236 EMACS_INT add = GAP_SIZE;
1238 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1239 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1240 make_gap (bytes);
1241 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1242 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1244 else
1246 Lisp_Object this_buffer;
1248 this_buffer = Fcurrent_buffer ();
1249 set_buffer_internal (XBUFFER (coding->dst_object));
1250 make_gap (bytes);
1251 set_buffer_internal (XBUFFER (this_buffer));
1256 static unsigned char *
1257 alloc_destination (coding, nbytes, dst)
1258 struct coding_system *coding;
1259 EMACS_INT nbytes;
1260 unsigned char *dst;
1262 EMACS_INT offset = dst - coding->destination;
1264 if (BUFFERP (coding->dst_object))
1266 struct buffer *buf = XBUFFER (coding->dst_object);
1268 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1270 else
1271 coding_alloc_by_realloc (coding, nbytes);
1272 coding_set_destination (coding);
1273 dst = coding->destination + offset;
1274 return dst;
1277 /** Macros for annotations. */
1279 /* An annotation data is stored in the array coding->charbuf in this
1280 format:
1281 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1282 LENGTH is the number of elements in the annotation.
1283 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1284 NCHARS is the number of characters in the text annotated.
1286 The format of the following elements depend on ANNOTATION_MASK.
1288 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1289 follows:
1290 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1292 NBYTES is the number of bytes specified in the header part of
1293 old-style emacs-mule encoding, or 0 for the other kind of
1294 composition.
1296 METHOD is one of enum composition_method.
1298 Optional COMPOSITION-COMPONENTS are characters and composition
1299 rules.
1301 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1302 follows.
1304 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1305 recover from an invalid annotation, and should be skipped by
1306 produce_annotation. */
1308 /* Maximum length of the header of annotation data. */
1309 #define MAX_ANNOTATION_LENGTH 5
1311 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1312 do { \
1313 *(buf)++ = -(len); \
1314 *(buf)++ = (mask); \
1315 *(buf)++ = (nchars); \
1316 coding->annotated = 1; \
1317 } while (0);
1319 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1320 do { \
1321 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1322 *buf++ = nbytes; \
1323 *buf++ = method; \
1324 } while (0)
1327 #define ADD_CHARSET_DATA(buf, nchars, id) \
1328 do { \
1329 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1330 *buf++ = id; \
1331 } while (0)
1334 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1339 /*** 3. UTF-8 ***/
1341 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1342 Check if a text is encoded in UTF-8. If it is, return 1, else
1343 return 0. */
1345 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1346 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1347 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1348 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1349 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1350 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1352 #define UTF_BOM 0xFEFF
1353 #define UTF_8_BOM_1 0xEF
1354 #define UTF_8_BOM_2 0xBB
1355 #define UTF_8_BOM_3 0xBF
1357 static int
1358 detect_coding_utf_8 (coding, detect_info)
1359 struct coding_system *coding;
1360 struct coding_detection_info *detect_info;
1362 const unsigned char *src = coding->source, *src_base;
1363 const unsigned char *src_end = coding->source + coding->src_bytes;
1364 int multibytep = coding->src_multibyte;
1365 int consumed_chars = 0;
1366 int bom_found = 0;
1367 int found = 0;
1369 detect_info->checked |= CATEGORY_MASK_UTF_8;
1370 /* A coding system of this category is always ASCII compatible. */
1371 src += coding->head_ascii;
1373 while (1)
1375 int c, c1, c2, c3, c4;
1377 src_base = src;
1378 ONE_MORE_BYTE (c);
1379 if (c < 0 || UTF_8_1_OCTET_P (c))
1380 continue;
1381 ONE_MORE_BYTE (c1);
1382 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1383 break;
1384 if (UTF_8_2_OCTET_LEADING_P (c))
1386 found = 1;
1387 continue;
1389 ONE_MORE_BYTE (c2);
1390 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1391 break;
1392 if (UTF_8_3_OCTET_LEADING_P (c))
1394 found = 1;
1395 if (src_base == coding->source
1396 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1397 bom_found = 1;
1398 continue;
1400 ONE_MORE_BYTE (c3);
1401 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1402 break;
1403 if (UTF_8_4_OCTET_LEADING_P (c))
1405 found = 1;
1406 continue;
1408 ONE_MORE_BYTE (c4);
1409 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1410 break;
1411 if (UTF_8_5_OCTET_LEADING_P (c))
1413 found = 1;
1414 continue;
1416 break;
1418 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1419 return 0;
1421 no_more_source:
1422 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1424 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1425 return 0;
1427 if (bom_found)
1429 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1430 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1432 else
1434 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1435 if (found)
1436 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1438 return 1;
1442 static void
1443 decode_coding_utf_8 (coding)
1444 struct coding_system *coding;
1446 const unsigned char *src = coding->source + coding->consumed;
1447 const unsigned char *src_end = coding->source + coding->src_bytes;
1448 const unsigned char *src_base;
1449 int *charbuf = coding->charbuf + coding->charbuf_used;
1450 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1451 int consumed_chars = 0, consumed_chars_base = 0;
1452 int multibytep = coding->src_multibyte;
1453 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1454 Lisp_Object attr, charset_list;
1455 int eol_crlf =
1456 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1457 int byte_after_cr = -1;
1459 CODING_GET_INFO (coding, attr, charset_list);
1461 if (bom != utf_without_bom)
1463 int c1, c2, c3;
1465 src_base = src;
1466 ONE_MORE_BYTE (c1);
1467 if (! UTF_8_3_OCTET_LEADING_P (c1))
1468 src = src_base;
1469 else
1471 ONE_MORE_BYTE (c2);
1472 if (! UTF_8_EXTRA_OCTET_P (c2))
1473 src = src_base;
1474 else
1476 ONE_MORE_BYTE (c3);
1477 if (! UTF_8_EXTRA_OCTET_P (c3))
1478 src = src_base;
1479 else
1481 if ((c1 != UTF_8_BOM_1)
1482 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1483 src = src_base;
1484 else
1485 CODING_UTF_8_BOM (coding) = utf_without_bom;
1490 CODING_UTF_8_BOM (coding) = utf_without_bom;
1494 while (1)
1496 int c, c1, c2, c3, c4, c5;
1498 src_base = src;
1499 consumed_chars_base = consumed_chars;
1501 if (charbuf >= charbuf_end)
1503 if (byte_after_cr >= 0)
1504 src_base--;
1505 break;
1508 if (byte_after_cr >= 0)
1509 c1 = byte_after_cr, byte_after_cr = -1;
1510 else
1511 ONE_MORE_BYTE (c1);
1512 if (c1 < 0)
1514 c = - c1;
1516 else if (UTF_8_1_OCTET_P(c1))
1518 if (eol_crlf && c1 == '\r')
1519 ONE_MORE_BYTE (byte_after_cr);
1520 c = c1;
1522 else
1524 ONE_MORE_BYTE (c2);
1525 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1526 goto invalid_code;
1527 if (UTF_8_2_OCTET_LEADING_P (c1))
1529 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1530 /* Reject overlong sequences here and below. Encoders
1531 producing them are incorrect, they can be misleading,
1532 and they mess up read/write invariance. */
1533 if (c < 128)
1534 goto invalid_code;
1536 else
1538 ONE_MORE_BYTE (c3);
1539 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1540 goto invalid_code;
1541 if (UTF_8_3_OCTET_LEADING_P (c1))
1543 c = (((c1 & 0xF) << 12)
1544 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1545 if (c < 0x800
1546 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1547 goto invalid_code;
1549 else
1551 ONE_MORE_BYTE (c4);
1552 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1553 goto invalid_code;
1554 if (UTF_8_4_OCTET_LEADING_P (c1))
1556 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1557 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1558 if (c < 0x10000)
1559 goto invalid_code;
1561 else
1563 ONE_MORE_BYTE (c5);
1564 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1565 goto invalid_code;
1566 if (UTF_8_5_OCTET_LEADING_P (c1))
1568 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1569 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1570 | (c5 & 0x3F));
1571 if ((c > MAX_CHAR) || (c < 0x200000))
1572 goto invalid_code;
1574 else
1575 goto invalid_code;
1581 *charbuf++ = c;
1582 continue;
1584 invalid_code:
1585 src = src_base;
1586 consumed_chars = consumed_chars_base;
1587 ONE_MORE_BYTE (c);
1588 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1589 coding->errors++;
1592 no_more_source:
1593 coding->consumed_char += consumed_chars_base;
1594 coding->consumed = src_base - coding->source;
1595 coding->charbuf_used = charbuf - coding->charbuf;
1599 static int
1600 encode_coding_utf_8 (coding)
1601 struct coding_system *coding;
1603 int multibytep = coding->dst_multibyte;
1604 int *charbuf = coding->charbuf;
1605 int *charbuf_end = charbuf + coding->charbuf_used;
1606 unsigned char *dst = coding->destination + coding->produced;
1607 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1608 int produced_chars = 0;
1609 int c;
1611 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1613 ASSURE_DESTINATION (3);
1614 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1615 CODING_UTF_8_BOM (coding) = utf_without_bom;
1618 if (multibytep)
1620 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1622 while (charbuf < charbuf_end)
1624 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1626 ASSURE_DESTINATION (safe_room);
1627 c = *charbuf++;
1628 if (CHAR_BYTE8_P (c))
1630 c = CHAR_TO_BYTE8 (c);
1631 EMIT_ONE_BYTE (c);
1633 else
1635 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1636 for (p = str; p < pend; p++)
1637 EMIT_ONE_BYTE (*p);
1641 else
1643 int safe_room = MAX_MULTIBYTE_LENGTH;
1645 while (charbuf < charbuf_end)
1647 ASSURE_DESTINATION (safe_room);
1648 c = *charbuf++;
1649 if (CHAR_BYTE8_P (c))
1650 *dst++ = CHAR_TO_BYTE8 (c);
1651 else
1652 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1653 produced_chars++;
1656 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1657 coding->produced_char += produced_chars;
1658 coding->produced = dst - coding->destination;
1659 return 0;
1663 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1664 Check if a text is encoded in one of UTF-16 based coding systems.
1665 If it is, return 1, else return 0. */
1667 #define UTF_16_HIGH_SURROGATE_P(val) \
1668 (((val) & 0xFC00) == 0xD800)
1670 #define UTF_16_LOW_SURROGATE_P(val) \
1671 (((val) & 0xFC00) == 0xDC00)
1673 #define UTF_16_INVALID_P(val) \
1674 (((val) == 0xFFFE) \
1675 || ((val) == 0xFFFF) \
1676 || UTF_16_LOW_SURROGATE_P (val))
1679 static int
1680 detect_coding_utf_16 (coding, detect_info)
1681 struct coding_system *coding;
1682 struct coding_detection_info *detect_info;
1684 const unsigned char *src = coding->source, *src_base = src;
1685 const unsigned char *src_end = coding->source + coding->src_bytes;
1686 int multibytep = coding->src_multibyte;
1687 int consumed_chars = 0;
1688 int c1, c2;
1690 detect_info->checked |= CATEGORY_MASK_UTF_16;
1691 if (coding->mode & CODING_MODE_LAST_BLOCK
1692 && (coding->src_chars & 1))
1694 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1695 return 0;
1698 TWO_MORE_BYTES (c1, c2);
1699 if ((c1 == 0xFF) && (c2 == 0xFE))
1701 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1702 | CATEGORY_MASK_UTF_16_AUTO);
1703 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1704 | CATEGORY_MASK_UTF_16_BE_NOSIG
1705 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1707 else if ((c1 == 0xFE) && (c2 == 0xFF))
1709 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1710 | CATEGORY_MASK_UTF_16_AUTO);
1711 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1712 | CATEGORY_MASK_UTF_16_BE_NOSIG
1713 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1715 else if (c2 < 0)
1717 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1718 return 0;
1720 else
1722 /* We check the dispersion of Eth and Oth bytes where E is even and
1723 O is odd. If both are high, we assume binary data.*/
1724 unsigned char e[256], o[256];
1725 unsigned e_num = 1, o_num = 1;
1727 memset (e, 0, 256);
1728 memset (o, 0, 256);
1729 e[c1] = 1;
1730 o[c2] = 1;
1732 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1733 |CATEGORY_MASK_UTF_16_BE
1734 | CATEGORY_MASK_UTF_16_LE);
1736 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1737 != CATEGORY_MASK_UTF_16)
1739 TWO_MORE_BYTES (c1, c2);
1740 if (c2 < 0)
1741 break;
1742 if (! e[c1])
1744 e[c1] = 1;
1745 e_num++;
1746 if (e_num >= 128)
1747 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1749 if (! o[c2])
1751 o[c2] = 1;
1752 o_num++;
1753 if (o_num >= 128)
1754 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1757 return 0;
1760 no_more_source:
1761 return 1;
1764 static void
1765 decode_coding_utf_16 (coding)
1766 struct coding_system *coding;
1768 const unsigned char *src = coding->source + coding->consumed;
1769 const unsigned char *src_end = coding->source + coding->src_bytes;
1770 const unsigned char *src_base;
1771 int *charbuf = coding->charbuf + coding->charbuf_used;
1772 /* We may produces at most 3 chars in one loop. */
1773 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1774 int consumed_chars = 0, consumed_chars_base = 0;
1775 int multibytep = coding->src_multibyte;
1776 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1777 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1778 int surrogate = CODING_UTF_16_SURROGATE (coding);
1779 Lisp_Object attr, charset_list;
1780 int eol_crlf =
1781 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1782 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1784 CODING_GET_INFO (coding, attr, charset_list);
1786 if (bom == utf_with_bom)
1788 int c, c1, c2;
1790 src_base = src;
1791 ONE_MORE_BYTE (c1);
1792 ONE_MORE_BYTE (c2);
1793 c = (c1 << 8) | c2;
1795 if (endian == utf_16_big_endian
1796 ? c != 0xFEFF : c != 0xFFFE)
1798 /* The first two bytes are not BOM. Treat them as bytes
1799 for a normal character. */
1800 src = src_base;
1801 coding->errors++;
1803 CODING_UTF_16_BOM (coding) = utf_without_bom;
1805 else if (bom == utf_detect_bom)
1807 /* We have already tried to detect BOM and failed in
1808 detect_coding. */
1809 CODING_UTF_16_BOM (coding) = utf_without_bom;
1812 while (1)
1814 int c, c1, c2;
1816 src_base = src;
1817 consumed_chars_base = consumed_chars;
1819 if (charbuf >= charbuf_end)
1821 if (byte_after_cr1 >= 0)
1822 src_base -= 2;
1823 break;
1826 if (byte_after_cr1 >= 0)
1827 c1 = byte_after_cr1, byte_after_cr1 = -1;
1828 else
1829 ONE_MORE_BYTE (c1);
1830 if (c1 < 0)
1832 *charbuf++ = -c1;
1833 continue;
1835 if (byte_after_cr2 >= 0)
1836 c2 = byte_after_cr2, byte_after_cr2 = -1;
1837 else
1838 ONE_MORE_BYTE (c2);
1839 if (c2 < 0)
1841 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1842 *charbuf++ = -c2;
1843 continue;
1845 c = (endian == utf_16_big_endian
1846 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1848 if (surrogate)
1850 if (! UTF_16_LOW_SURROGATE_P (c))
1852 if (endian == utf_16_big_endian)
1853 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1854 else
1855 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1856 *charbuf++ = c1;
1857 *charbuf++ = c2;
1858 coding->errors++;
1859 if (UTF_16_HIGH_SURROGATE_P (c))
1860 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1861 else
1862 *charbuf++ = c;
1864 else
1866 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1867 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1868 *charbuf++ = 0x10000 + c;
1871 else
1873 if (UTF_16_HIGH_SURROGATE_P (c))
1874 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1875 else
1877 if (eol_crlf && c == '\r')
1879 ONE_MORE_BYTE (byte_after_cr1);
1880 ONE_MORE_BYTE (byte_after_cr2);
1882 *charbuf++ = c;
1887 no_more_source:
1888 coding->consumed_char += consumed_chars_base;
1889 coding->consumed = src_base - coding->source;
1890 coding->charbuf_used = charbuf - coding->charbuf;
1893 static int
1894 encode_coding_utf_16 (coding)
1895 struct coding_system *coding;
1897 int multibytep = coding->dst_multibyte;
1898 int *charbuf = coding->charbuf;
1899 int *charbuf_end = charbuf + coding->charbuf_used;
1900 unsigned char *dst = coding->destination + coding->produced;
1901 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1902 int safe_room = 8;
1903 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1904 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1905 int produced_chars = 0;
1906 Lisp_Object attrs, charset_list;
1907 int c;
1909 CODING_GET_INFO (coding, attrs, charset_list);
1911 if (bom != utf_without_bom)
1913 ASSURE_DESTINATION (safe_room);
1914 if (big_endian)
1915 EMIT_TWO_BYTES (0xFE, 0xFF);
1916 else
1917 EMIT_TWO_BYTES (0xFF, 0xFE);
1918 CODING_UTF_16_BOM (coding) = utf_without_bom;
1921 while (charbuf < charbuf_end)
1923 ASSURE_DESTINATION (safe_room);
1924 c = *charbuf++;
1925 if (c > MAX_UNICODE_CHAR)
1926 c = coding->default_char;
1928 if (c < 0x10000)
1930 if (big_endian)
1931 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1932 else
1933 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1935 else
1937 int c1, c2;
1939 c -= 0x10000;
1940 c1 = (c >> 10) + 0xD800;
1941 c2 = (c & 0x3FF) + 0xDC00;
1942 if (big_endian)
1943 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1944 else
1945 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1948 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1949 coding->produced = dst - coding->destination;
1950 coding->produced_char += produced_chars;
1951 return 0;
1955 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1957 /* Emacs' internal format for representation of multiple character
1958 sets is a kind of multi-byte encoding, i.e. characters are
1959 represented by variable-length sequences of one-byte codes.
1961 ASCII characters and control characters (e.g. `tab', `newline') are
1962 represented by one-byte sequences which are their ASCII codes, in
1963 the range 0x00 through 0x7F.
1965 8-bit characters of the range 0x80..0x9F are represented by
1966 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1967 code + 0x20).
1969 8-bit characters of the range 0xA0..0xFF are represented by
1970 one-byte sequences which are their 8-bit code.
1972 The other characters are represented by a sequence of `base
1973 leading-code', optional `extended leading-code', and one or two
1974 `position-code's. The length of the sequence is determined by the
1975 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1976 whereas extended leading-code and position-code take the range 0xA0
1977 through 0xFF. See `charset.h' for more details about leading-code
1978 and position-code.
1980 --- CODE RANGE of Emacs' internal format ---
1981 character set range
1982 ------------- -----
1983 ascii 0x00..0x7F
1984 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1985 eight-bit-graphic 0xA0..0xBF
1986 ELSE 0x81..0x9D + [0xA0..0xFF]+
1987 ---------------------------------------------
1989 As this is the internal character representation, the format is
1990 usually not used externally (i.e. in a file or in a data sent to a
1991 process). But, it is possible to have a text externally in this
1992 format (i.e. by encoding by the coding system `emacs-mule').
1994 In that case, a sequence of one-byte codes has a slightly different
1995 form.
1997 At first, all characters in eight-bit-control are represented by
1998 one-byte sequences which are their 8-bit code.
2000 Next, character composition data are represented by the byte
2001 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
2002 where,
2003 METHOD is 0xF2 plus one of composition method (enum
2004 composition_method),
2006 BYTES is 0xA0 plus a byte length of this composition data,
2008 CHARS is 0xA0 plus a number of characters composed by this
2009 data,
2011 COMPONENTs are characters of multibyte form or composition
2012 rules encoded by two-byte of ASCII codes.
2014 In addition, for backward compatibility, the following formats are
2015 also recognized as composition data on decoding.
2017 0x80 MSEQ ...
2018 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
2020 Here,
2021 MSEQ is a multibyte form but in these special format:
2022 ASCII: 0xA0 ASCII_CODE+0x80,
2023 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
2024 RULE is a one byte code of the range 0xA0..0xF0 that
2025 represents a composition rule.
2028 char emacs_mule_bytes[256];
2031 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2032 Check if a text is encoded in `emacs-mule'. If it is, return 1,
2033 else return 0. */
2035 static int
2036 detect_coding_emacs_mule (coding, detect_info)
2037 struct coding_system *coding;
2038 struct coding_detection_info *detect_info;
2040 const unsigned char *src = coding->source, *src_base;
2041 const unsigned char *src_end = coding->source + coding->src_bytes;
2042 int multibytep = coding->src_multibyte;
2043 int consumed_chars = 0;
2044 int c;
2045 int found = 0;
2047 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
2048 /* A coding system of this category is always ASCII compatible. */
2049 src += coding->head_ascii;
2051 while (1)
2053 src_base = src;
2054 ONE_MORE_BYTE (c);
2055 if (c < 0)
2056 continue;
2057 if (c == 0x80)
2059 /* Perhaps the start of composite character. We simply skip
2060 it because analyzing it is too heavy for detecting. But,
2061 at least, we check that the composite character
2062 constitutes of more than 4 bytes. */
2063 const unsigned char *src_base;
2065 repeat:
2066 src_base = src;
2069 ONE_MORE_BYTE (c);
2071 while (c >= 0xA0);
2073 if (src - src_base <= 4)
2074 break;
2075 found = CATEGORY_MASK_EMACS_MULE;
2076 if (c == 0x80)
2077 goto repeat;
2080 if (c < 0x80)
2082 if (c < 0x20
2083 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2084 break;
2086 else
2088 int more_bytes = emacs_mule_bytes[c] - 1;
2090 while (more_bytes > 0)
2092 ONE_MORE_BYTE (c);
2093 if (c < 0xA0)
2095 src--; /* Unread the last byte. */
2096 break;
2098 more_bytes--;
2100 if (more_bytes != 0)
2101 break;
2102 found = CATEGORY_MASK_EMACS_MULE;
2105 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2106 return 0;
2108 no_more_source:
2109 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2111 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2112 return 0;
2114 detect_info->found |= found;
2115 return 1;
2119 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2120 character. If CMP_STATUS indicates that we must expect MSEQ or
2121 RULE described above, decode it and return the negative value of
2122 the decoded character or rule. If an invalid byte is found, return
2123 -1. If SRC is too short, return -2. */
2126 emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status)
2127 struct coding_system *coding;
2128 const unsigned char *src;
2129 int *nbytes, *nchars, *id;
2130 struct composition_status *cmp_status;
2132 const unsigned char *src_end = coding->source + coding->src_bytes;
2133 const unsigned char *src_base = src;
2134 int multibytep = coding->src_multibyte;
2135 int charset_id;
2136 unsigned code;
2137 int c;
2138 int consumed_chars = 0;
2139 int mseq_found = 0;
2141 ONE_MORE_BYTE (c);
2142 if (c < 0)
2144 c = -c;
2145 charset_id = emacs_mule_charset[0];
2147 else
2149 if (c >= 0xA0)
2151 if (cmp_status->state != COMPOSING_NO
2152 && cmp_status->old_form)
2154 if (cmp_status->state == COMPOSING_CHAR)
2156 if (c == 0xA0)
2158 ONE_MORE_BYTE (c);
2159 c -= 0x80;
2160 if (c < 0)
2161 goto invalid_code;
2163 else
2164 c -= 0x20;
2165 mseq_found = 1;
2167 else
2169 *nbytes = src - src_base;
2170 *nchars = consumed_chars;
2171 return -c;
2174 else
2175 goto invalid_code;
2178 switch (emacs_mule_bytes[c])
2180 case 2:
2181 if ((charset_id = emacs_mule_charset[c]) < 0)
2182 goto invalid_code;
2183 ONE_MORE_BYTE (c);
2184 if (c < 0xA0)
2185 goto invalid_code;
2186 code = c & 0x7F;
2187 break;
2189 case 3:
2190 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2191 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2193 ONE_MORE_BYTE (c);
2194 if (c < 0xA0 || (charset_id = emacs_mule_charset[c]) < 0)
2195 goto invalid_code;
2196 ONE_MORE_BYTE (c);
2197 if (c < 0xA0)
2198 goto invalid_code;
2199 code = c & 0x7F;
2201 else
2203 if ((charset_id = emacs_mule_charset[c]) < 0)
2204 goto invalid_code;
2205 ONE_MORE_BYTE (c);
2206 if (c < 0xA0)
2207 goto invalid_code;
2208 code = (c & 0x7F) << 8;
2209 ONE_MORE_BYTE (c);
2210 if (c < 0xA0)
2211 goto invalid_code;
2212 code |= c & 0x7F;
2214 break;
2216 case 4:
2217 ONE_MORE_BYTE (c);
2218 if (c < 0 || (charset_id = emacs_mule_charset[c]) < 0)
2219 goto invalid_code;
2220 ONE_MORE_BYTE (c);
2221 if (c < 0xA0)
2222 goto invalid_code;
2223 code = (c & 0x7F) << 8;
2224 ONE_MORE_BYTE (c);
2225 if (c < 0xA0)
2226 goto invalid_code;
2227 code |= c & 0x7F;
2228 break;
2230 case 1:
2231 code = c;
2232 charset_id = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2233 break;
2235 default:
2236 abort ();
2238 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2239 CHARSET_FROM_ID (charset_id), code, c);
2240 if (c < 0)
2241 goto invalid_code;
2243 *nbytes = src - src_base;
2244 *nchars = consumed_chars;
2245 if (id)
2246 *id = charset_id;
2247 return (mseq_found ? -c : c);
2249 no_more_source:
2250 return -2;
2252 invalid_code:
2253 return -1;
2257 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2259 /* Handle these composition sequence ('|': the end of header elements,
2260 BYTES and CHARS >= 0xA0):
2262 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2263 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2264 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2266 and these old form:
2268 (4) relative composition: 0x80 | MSEQ ... MSEQ
2269 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2271 When the starter 0x80 and the following header elements are found,
2272 this annotation header is produced.
2274 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2276 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2277 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2279 Then, upon reading the following elements, these codes are produced
2280 until the composition end is found:
2282 (1) CHAR ... CHAR
2283 (2) ALT ... ALT CHAR ... CHAR
2284 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2285 (4) CHAR ... CHAR
2286 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2288 When the composition end is found, LENGTH and NCHARS in the
2289 annotation header is updated as below:
2291 (1) LENGTH: unchanged, NCHARS: unchanged
2292 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2293 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2294 (4) LENGTH: unchanged, NCHARS: number of CHARs
2295 (5) LENGTH: unchanged, NCHARS: number of CHARs
2297 If an error is found while composing, the annotation header is
2298 changed to the original composition header (plus filler -1s) as
2299 below:
2301 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2302 (5) [ 0x80 0xFF -1 -1- -1 ]
2304 and the sequence [ -2 DECODED-RULE ] is changed to the original
2305 byte sequence as below:
2306 o the original byte sequence is B: [ B -1 ]
2307 o the original byte sequence is B1 B2: [ B1 B2 ]
2309 Most of the routines are implemented by macros because many
2310 variables and labels in the caller decode_coding_emacs_mule must be
2311 accessible, and they are usually called just once (thus doesn't
2312 increase the size of compiled object). */
2314 /* Decode a composition rule represented by C as a component of
2315 composition sequence of Emacs 20 style. Set RULE to the decoded
2316 rule. */
2318 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2319 do { \
2320 int gref, nref; \
2322 c -= 0xA0; \
2323 if (c < 0 || c >= 81) \
2324 goto invalid_code; \
2325 gref = c / 9, nref = c % 9; \
2326 if (gref == 4) gref = 10; \
2327 if (nref == 4) nref = 10; \
2328 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2329 } while (0)
2332 /* Decode a composition rule represented by C and the following byte
2333 at SRC as a component of composition sequence of Emacs 21 style.
2334 Set RULE to the decoded rule. */
2336 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2337 do { \
2338 int gref, nref; \
2340 gref = c - 0x20; \
2341 if (gref < 0 || gref >= 81) \
2342 goto invalid_code; \
2343 ONE_MORE_BYTE (c); \
2344 nref = c - 0x20; \
2345 if (nref < 0 || nref >= 81) \
2346 goto invalid_code; \
2347 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2348 } while (0)
2351 /* Start of Emacs 21 style format. The first three bytes at SRC are
2352 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2353 byte length of this composition information, CHARS is the number of
2354 characters composed by this composition. */
2356 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2357 do { \
2358 enum composition_method method = c - 0xF2; \
2359 int *charbuf_base = charbuf; \
2360 int nbytes, nchars; \
2362 ONE_MORE_BYTE (c); \
2363 if (c < 0) \
2364 goto invalid_code; \
2365 nbytes = c - 0xA0; \
2366 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2367 goto invalid_code; \
2368 ONE_MORE_BYTE (c); \
2369 nchars = c - 0xA0; \
2370 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2371 goto invalid_code; \
2372 cmp_status->old_form = 0; \
2373 cmp_status->method = method; \
2374 if (method == COMPOSITION_RELATIVE) \
2375 cmp_status->state = COMPOSING_CHAR; \
2376 else \
2377 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2378 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2379 cmp_status->nchars = nchars; \
2380 cmp_status->ncomps = nbytes - 4; \
2381 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2382 } while (0)
2385 /* Start of Emacs 20 style format for relative composition. */
2387 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2388 do { \
2389 cmp_status->old_form = 1; \
2390 cmp_status->method = COMPOSITION_RELATIVE; \
2391 cmp_status->state = COMPOSING_CHAR; \
2392 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2393 cmp_status->nchars = cmp_status->ncomps = 0; \
2394 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2395 } while (0)
2398 /* Start of Emacs 20 style format for rule-base composition. */
2400 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2401 do { \
2402 cmp_status->old_form = 1; \
2403 cmp_status->method = COMPOSITION_WITH_RULE; \
2404 cmp_status->state = COMPOSING_CHAR; \
2405 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2406 cmp_status->nchars = cmp_status->ncomps = 0; \
2407 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2408 } while (0)
2411 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2412 do { \
2413 const unsigned char *current_src = src; \
2415 ONE_MORE_BYTE (c); \
2416 if (c < 0) \
2417 goto invalid_code; \
2418 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2419 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2420 DECODE_EMACS_MULE_21_COMPOSITION (); \
2421 else if (c < 0xA0) \
2422 goto invalid_code; \
2423 else if (c < 0xC0) \
2425 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2426 /* Re-read C as a composition component. */ \
2427 src = current_src; \
2429 else if (c == 0xFF) \
2430 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2431 else \
2432 goto invalid_code; \
2433 } while (0)
2435 #define EMACS_MULE_COMPOSITION_END() \
2436 do { \
2437 int idx = - cmp_status->length; \
2439 if (cmp_status->old_form) \
2440 charbuf[idx + 2] = cmp_status->nchars; \
2441 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2442 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2443 cmp_status->state = COMPOSING_NO; \
2444 } while (0)
2447 static int
2448 emacs_mule_finish_composition (charbuf, cmp_status)
2449 int *charbuf;
2450 struct composition_status *cmp_status;
2452 int idx = - cmp_status->length;
2453 int new_chars;
2455 if (cmp_status->old_form && cmp_status->nchars > 0)
2457 charbuf[idx + 2] = cmp_status->nchars;
2458 new_chars = 0;
2459 if (cmp_status->method == COMPOSITION_WITH_RULE
2460 && cmp_status->state == COMPOSING_CHAR)
2462 /* The last rule was invalid. */
2463 int rule = charbuf[-1] + 0xA0;
2465 charbuf[-2] = BYTE8_TO_CHAR (rule);
2466 charbuf[-1] = -1;
2467 new_chars = 1;
2470 else
2472 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2474 if (cmp_status->method == COMPOSITION_WITH_RULE)
2476 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2477 charbuf[idx++] = -3;
2478 charbuf[idx++] = 0;
2479 new_chars = 1;
2481 else
2483 int nchars = charbuf[idx + 1] + 0xA0;
2484 int nbytes = charbuf[idx + 2] + 0xA0;
2486 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2487 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2488 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2489 charbuf[idx++] = -1;
2490 new_chars = 4;
2493 cmp_status->state = COMPOSING_NO;
2494 return new_chars;
2497 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2498 do { \
2499 if (cmp_status->state != COMPOSING_NO) \
2500 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2501 } while (0)
2504 static void
2505 decode_coding_emacs_mule (coding)
2506 struct coding_system *coding;
2508 const unsigned char *src = coding->source + coding->consumed;
2509 const unsigned char *src_end = coding->source + coding->src_bytes;
2510 const unsigned char *src_base;
2511 int *charbuf = coding->charbuf + coding->charbuf_used;
2512 /* We may produce two annotations (charset and composition) in one
2513 loop and one more charset annotation at the end. */
2514 int *charbuf_end
2515 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2516 int consumed_chars = 0, consumed_chars_base;
2517 int multibytep = coding->src_multibyte;
2518 Lisp_Object attrs, charset_list;
2519 int char_offset = coding->produced_char;
2520 int last_offset = char_offset;
2521 int last_id = charset_ascii;
2522 int eol_crlf =
2523 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2524 int byte_after_cr = -1;
2525 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2527 CODING_GET_INFO (coding, attrs, charset_list);
2529 if (cmp_status->state != COMPOSING_NO)
2531 int i;
2533 for (i = 0; i < cmp_status->length; i++)
2534 *charbuf++ = cmp_status->carryover[i];
2535 coding->annotated = 1;
2538 while (1)
2540 int c, id;
2542 src_base = src;
2543 consumed_chars_base = consumed_chars;
2545 if (charbuf >= charbuf_end)
2547 if (byte_after_cr >= 0)
2548 src_base--;
2549 break;
2552 if (byte_after_cr >= 0)
2553 c = byte_after_cr, byte_after_cr = -1;
2554 else
2555 ONE_MORE_BYTE (c);
2557 if (c < 0 || c == 0x80)
2559 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2560 if (c < 0)
2562 *charbuf++ = -c;
2563 char_offset++;
2565 else
2566 DECODE_EMACS_MULE_COMPOSITION_START ();
2567 continue;
2570 if (c < 0x80)
2572 if (eol_crlf && c == '\r')
2573 ONE_MORE_BYTE (byte_after_cr);
2574 id = charset_ascii;
2575 if (cmp_status->state != COMPOSING_NO)
2577 if (cmp_status->old_form)
2578 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2579 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2580 cmp_status->ncomps--;
2583 else
2585 int nchars, nbytes;
2586 /* emacs_mule_char can load a charset map from a file, which
2587 allocates a large structure and might cause buffer text
2588 to be relocated as result. Thus, we need to remember the
2589 original pointer to buffer text, and fix up all related
2590 pointers after the call. */
2591 const unsigned char *orig = coding->source;
2592 EMACS_INT offset;
2594 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2595 cmp_status);
2596 offset = coding->source - orig;
2597 if (offset)
2599 src += offset;
2600 src_base += offset;
2601 src_end += offset;
2603 if (c < 0)
2605 if (c == -1)
2606 goto invalid_code;
2607 if (c == -2)
2608 break;
2610 src = src_base + nbytes;
2611 consumed_chars = consumed_chars_base + nchars;
2612 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2613 cmp_status->ncomps -= nchars;
2616 /* Now if C >= 0, we found a normally encoded character, if C <
2617 0, we found an old-style composition component character or
2618 rule. */
2620 if (cmp_status->state == COMPOSING_NO)
2622 if (last_id != id)
2624 if (last_id != charset_ascii)
2625 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2626 last_id);
2627 last_id = id;
2628 last_offset = char_offset;
2630 *charbuf++ = c;
2631 char_offset++;
2633 else if (cmp_status->state == COMPOSING_CHAR)
2635 if (cmp_status->old_form)
2637 if (c >= 0)
2639 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2640 *charbuf++ = c;
2641 char_offset++;
2643 else
2645 *charbuf++ = -c;
2646 cmp_status->nchars++;
2647 cmp_status->length++;
2648 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2649 EMACS_MULE_COMPOSITION_END ();
2650 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2651 cmp_status->state = COMPOSING_RULE;
2654 else
2656 *charbuf++ = c;
2657 cmp_status->length++;
2658 cmp_status->nchars--;
2659 if (cmp_status->nchars == 0)
2660 EMACS_MULE_COMPOSITION_END ();
2663 else if (cmp_status->state == COMPOSING_RULE)
2665 int rule;
2667 if (c >= 0)
2669 EMACS_MULE_COMPOSITION_END ();
2670 *charbuf++ = c;
2671 char_offset++;
2673 else
2675 c = -c;
2676 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2677 if (rule < 0)
2678 goto invalid_code;
2679 *charbuf++ = -2;
2680 *charbuf++ = rule;
2681 cmp_status->length += 2;
2682 cmp_status->state = COMPOSING_CHAR;
2685 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2687 *charbuf++ = c;
2688 cmp_status->length++;
2689 if (cmp_status->ncomps == 0)
2690 cmp_status->state = COMPOSING_CHAR;
2691 else if (cmp_status->ncomps > 0)
2693 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2694 cmp_status->state = COMPOSING_COMPONENT_RULE;
2696 else
2697 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2699 else /* COMPOSING_COMPONENT_RULE */
2701 int rule;
2703 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2704 if (rule < 0)
2705 goto invalid_code;
2706 *charbuf++ = -2;
2707 *charbuf++ = rule;
2708 cmp_status->length += 2;
2709 cmp_status->ncomps--;
2710 if (cmp_status->ncomps > 0)
2711 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2712 else
2713 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2715 continue;
2717 retry:
2718 src = src_base;
2719 consumed_chars = consumed_chars_base;
2720 continue;
2722 invalid_code:
2723 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2724 src = src_base;
2725 consumed_chars = consumed_chars_base;
2726 ONE_MORE_BYTE (c);
2727 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2728 char_offset++;
2729 coding->errors++;
2732 no_more_source:
2733 if (cmp_status->state != COMPOSING_NO)
2735 if (coding->mode & CODING_MODE_LAST_BLOCK)
2736 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2737 else
2739 int i;
2741 charbuf -= cmp_status->length;
2742 for (i = 0; i < cmp_status->length; i++)
2743 cmp_status->carryover[i] = charbuf[i];
2746 if (last_id != charset_ascii)
2747 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2748 coding->consumed_char += consumed_chars_base;
2749 coding->consumed = src_base - coding->source;
2750 coding->charbuf_used = charbuf - coding->charbuf;
2754 #define EMACS_MULE_LEADING_CODES(id, codes) \
2755 do { \
2756 if (id < 0xA0) \
2757 codes[0] = id, codes[1] = 0; \
2758 else if (id < 0xE0) \
2759 codes[0] = 0x9A, codes[1] = id; \
2760 else if (id < 0xF0) \
2761 codes[0] = 0x9B, codes[1] = id; \
2762 else if (id < 0xF5) \
2763 codes[0] = 0x9C, codes[1] = id; \
2764 else \
2765 codes[0] = 0x9D, codes[1] = id; \
2766 } while (0);
2769 static int
2770 encode_coding_emacs_mule (coding)
2771 struct coding_system *coding;
2773 int multibytep = coding->dst_multibyte;
2774 int *charbuf = coding->charbuf;
2775 int *charbuf_end = charbuf + coding->charbuf_used;
2776 unsigned char *dst = coding->destination + coding->produced;
2777 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2778 int safe_room = 8;
2779 int produced_chars = 0;
2780 Lisp_Object attrs, charset_list;
2781 int c;
2782 int preferred_charset_id = -1;
2784 CODING_GET_INFO (coding, attrs, charset_list);
2785 if (! EQ (charset_list, Vemacs_mule_charset_list))
2787 CODING_ATTR_CHARSET_LIST (attrs)
2788 = charset_list = Vemacs_mule_charset_list;
2791 while (charbuf < charbuf_end)
2793 ASSURE_DESTINATION (safe_room);
2794 c = *charbuf++;
2796 if (c < 0)
2798 /* Handle an annotation. */
2799 switch (*charbuf)
2801 case CODING_ANNOTATE_COMPOSITION_MASK:
2802 /* Not yet implemented. */
2803 break;
2804 case CODING_ANNOTATE_CHARSET_MASK:
2805 preferred_charset_id = charbuf[3];
2806 if (preferred_charset_id >= 0
2807 && NILP (Fmemq (make_number (preferred_charset_id),
2808 charset_list)))
2809 preferred_charset_id = -1;
2810 break;
2811 default:
2812 abort ();
2814 charbuf += -c - 1;
2815 continue;
2818 if (ASCII_CHAR_P (c))
2819 EMIT_ONE_ASCII_BYTE (c);
2820 else if (CHAR_BYTE8_P (c))
2822 c = CHAR_TO_BYTE8 (c);
2823 EMIT_ONE_BYTE (c);
2825 else
2827 struct charset *charset;
2828 unsigned code;
2829 int dimension;
2830 int emacs_mule_id;
2831 unsigned char leading_codes[2];
2833 if (preferred_charset_id >= 0)
2835 int result;
2837 charset = CHARSET_FROM_ID (preferred_charset_id);
2838 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2839 if (result)
2840 code = ENCODE_CHAR (charset, c);
2841 else
2842 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2843 &code, charset);
2845 else
2846 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2847 &code, charset);
2848 if (! charset)
2850 c = coding->default_char;
2851 if (ASCII_CHAR_P (c))
2853 EMIT_ONE_ASCII_BYTE (c);
2854 continue;
2856 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2857 &code, charset);
2859 dimension = CHARSET_DIMENSION (charset);
2860 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2861 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2862 EMIT_ONE_BYTE (leading_codes[0]);
2863 if (leading_codes[1])
2864 EMIT_ONE_BYTE (leading_codes[1]);
2865 if (dimension == 1)
2866 EMIT_ONE_BYTE (code | 0x80);
2867 else
2869 code |= 0x8080;
2870 EMIT_ONE_BYTE (code >> 8);
2871 EMIT_ONE_BYTE (code & 0xFF);
2875 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2876 coding->produced_char += produced_chars;
2877 coding->produced = dst - coding->destination;
2878 return 0;
2882 /*** 7. ISO2022 handlers ***/
2884 /* The following note describes the coding system ISO2022 briefly.
2885 Since the intention of this note is to help understand the
2886 functions in this file, some parts are NOT ACCURATE or are OVERLY
2887 SIMPLIFIED. For thorough understanding, please refer to the
2888 original document of ISO2022. This is equivalent to the standard
2889 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2891 ISO2022 provides many mechanisms to encode several character sets
2892 in 7-bit and 8-bit environments. For 7-bit environments, all text
2893 is encoded using bytes less than 128. This may make the encoded
2894 text a little bit longer, but the text passes more easily through
2895 several types of gateway, some of which strip off the MSB (Most
2896 Significant Bit).
2898 There are two kinds of character sets: control character sets and
2899 graphic character sets. The former contain control characters such
2900 as `newline' and `escape' to provide control functions (control
2901 functions are also provided by escape sequences). The latter
2902 contain graphic characters such as 'A' and '-'. Emacs recognizes
2903 two control character sets and many graphic character sets.
2905 Graphic character sets are classified into one of the following
2906 four classes, according to the number of bytes (DIMENSION) and
2907 number of characters in one dimension (CHARS) of the set:
2908 - DIMENSION1_CHARS94
2909 - DIMENSION1_CHARS96
2910 - DIMENSION2_CHARS94
2911 - DIMENSION2_CHARS96
2913 In addition, each character set is assigned an identification tag,
2914 unique for each set, called the "final character" (denoted as <F>
2915 hereafter). The <F> of each character set is decided by ECMA(*)
2916 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2917 (0x30..0x3F are for private use only).
2919 Note (*): ECMA = European Computer Manufacturers Association
2921 Here are examples of graphic character sets [NAME(<F>)]:
2922 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2923 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2924 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2925 o DIMENSION2_CHARS96 -- none for the moment
2927 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2928 C0 [0x00..0x1F] -- control character plane 0
2929 GL [0x20..0x7F] -- graphic character plane 0
2930 C1 [0x80..0x9F] -- control character plane 1
2931 GR [0xA0..0xFF] -- graphic character plane 1
2933 A control character set is directly designated and invoked to C0 or
2934 C1 by an escape sequence. The most common case is that:
2935 - ISO646's control character set is designated/invoked to C0, and
2936 - ISO6429's control character set is designated/invoked to C1,
2937 and usually these designations/invocations are omitted in encoded
2938 text. In a 7-bit environment, only C0 can be used, and a control
2939 character for C1 is encoded by an appropriate escape sequence to
2940 fit into the environment. All control characters for C1 are
2941 defined to have corresponding escape sequences.
2943 A graphic character set is at first designated to one of four
2944 graphic registers (G0 through G3), then these graphic registers are
2945 invoked to GL or GR. These designations and invocations can be
2946 done independently. The most common case is that G0 is invoked to
2947 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2948 these invocations and designations are omitted in encoded text.
2949 In a 7-bit environment, only GL can be used.
2951 When a graphic character set of CHARS94 is invoked to GL, codes
2952 0x20 and 0x7F of the GL area work as control characters SPACE and
2953 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2954 be used.
2956 There are two ways of invocation: locking-shift and single-shift.
2957 With locking-shift, the invocation lasts until the next different
2958 invocation, whereas with single-shift, the invocation affects the
2959 following character only and doesn't affect the locking-shift
2960 state. Invocations are done by the following control characters or
2961 escape sequences:
2963 ----------------------------------------------------------------------
2964 abbrev function cntrl escape seq description
2965 ----------------------------------------------------------------------
2966 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2967 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2968 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2969 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2970 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2971 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2972 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2973 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2974 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2975 ----------------------------------------------------------------------
2976 (*) These are not used by any known coding system.
2978 Control characters for these functions are defined by macros
2979 ISO_CODE_XXX in `coding.h'.
2981 Designations are done by the following escape sequences:
2982 ----------------------------------------------------------------------
2983 escape sequence description
2984 ----------------------------------------------------------------------
2985 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2986 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2987 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2988 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2989 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2990 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2991 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2992 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2993 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2994 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2995 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2996 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2997 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2998 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2999 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3000 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3001 ----------------------------------------------------------------------
3003 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3004 of dimension 1, chars 94, and final character <F>, etc...
3006 Note (*): Although these designations are not allowed in ISO2022,
3007 Emacs accepts them on decoding, and produces them on encoding
3008 CHARS96 character sets in a coding system which is characterized as
3009 7-bit environment, non-locking-shift, and non-single-shift.
3011 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3012 '(' must be omitted. We refer to this as "short-form" hereafter.
3014 Now you may notice that there are a lot of ways of encoding the
3015 same multilingual text in ISO2022. Actually, there exist many
3016 coding systems such as Compound Text (used in X11's inter client
3017 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
3018 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
3019 localized platforms), and all of these are variants of ISO2022.
3021 In addition to the above, Emacs handles two more kinds of escape
3022 sequences: ISO6429's direction specification and Emacs' private
3023 sequence for specifying character composition.
3025 ISO6429's direction specification takes the following form:
3026 o CSI ']' -- end of the current direction
3027 o CSI '0' ']' -- end of the current direction
3028 o CSI '1' ']' -- start of left-to-right text
3029 o CSI '2' ']' -- start of right-to-left text
3030 The control character CSI (0x9B: control sequence introducer) is
3031 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3033 Character composition specification takes the following form:
3034 o ESC '0' -- start relative composition
3035 o ESC '1' -- end composition
3036 o ESC '2' -- start rule-base composition (*)
3037 o ESC '3' -- start relative composition with alternate chars (**)
3038 o ESC '4' -- start rule-base composition with alternate chars (**)
3039 Since these are not standard escape sequences of any ISO standard,
3040 the use of them with these meanings is restricted to Emacs only.
3042 (*) This form is used only in Emacs 20.7 and older versions,
3043 but newer versions can safely decode it.
3044 (**) This form is used only in Emacs 21.1 and newer versions,
3045 and older versions can't decode it.
3047 Here's a list of example usages of these composition escape
3048 sequences (categorized by `enum composition_method').
3050 COMPOSITION_RELATIVE:
3051 ESC 0 CHAR [ CHAR ] ESC 1
3052 COMPOSITION_WITH_RULE:
3053 ESC 2 CHAR [ RULE CHAR ] ESC 1
3054 COMPOSITION_WITH_ALTCHARS:
3055 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
3056 COMPOSITION_WITH_RULE_ALTCHARS:
3057 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
3059 enum iso_code_class_type iso_code_class[256];
3061 #define SAFE_CHARSET_P(coding, id) \
3062 ((id) <= (coding)->max_charset_id \
3063 && (coding)->safe_charsets[id] != 255)
3066 #define SHIFT_OUT_OK(category) \
3067 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
3069 static void
3070 setup_iso_safe_charsets (attrs)
3071 Lisp_Object attrs;
3073 Lisp_Object charset_list, safe_charsets;
3074 Lisp_Object request;
3075 Lisp_Object reg_usage;
3076 Lisp_Object tail;
3077 int reg94, reg96;
3078 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
3079 int max_charset_id;
3081 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3082 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
3083 && ! EQ (charset_list, Viso_2022_charset_list))
3085 CODING_ATTR_CHARSET_LIST (attrs)
3086 = charset_list = Viso_2022_charset_list;
3087 ASET (attrs, coding_attr_safe_charsets, Qnil);
3090 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
3091 return;
3093 max_charset_id = 0;
3094 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3096 int id = XINT (XCAR (tail));
3097 if (max_charset_id < id)
3098 max_charset_id = id;
3101 safe_charsets = make_uninit_string (max_charset_id + 1);
3102 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
3103 request = AREF (attrs, coding_attr_iso_request);
3104 reg_usage = AREF (attrs, coding_attr_iso_usage);
3105 reg94 = XINT (XCAR (reg_usage));
3106 reg96 = XINT (XCDR (reg_usage));
3108 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3110 Lisp_Object id;
3111 Lisp_Object reg;
3112 struct charset *charset;
3114 id = XCAR (tail);
3115 charset = CHARSET_FROM_ID (XINT (id));
3116 reg = Fcdr (Fassq (id, request));
3117 if (! NILP (reg))
3118 SSET (safe_charsets, XINT (id), XINT (reg));
3119 else if (charset->iso_chars_96)
3121 if (reg96 < 4)
3122 SSET (safe_charsets, XINT (id), reg96);
3124 else
3126 if (reg94 < 4)
3127 SSET (safe_charsets, XINT (id), reg94);
3130 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3134 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3135 Check if a text is encoded in one of ISO-2022 based coding systems.
3136 If it is, return 1, else return 0. */
3138 static int
3139 detect_coding_iso_2022 (coding, detect_info)
3140 struct coding_system *coding;
3141 struct coding_detection_info *detect_info;
3143 const unsigned char *src = coding->source, *src_base = src;
3144 const unsigned char *src_end = coding->source + coding->src_bytes;
3145 int multibytep = coding->src_multibyte;
3146 int single_shifting = 0;
3147 int id;
3148 int c, c1;
3149 int consumed_chars = 0;
3150 int i;
3151 int rejected = 0;
3152 int found = 0;
3153 int composition_count = -1;
3155 detect_info->checked |= CATEGORY_MASK_ISO;
3157 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3159 struct coding_system *this = &(coding_categories[i]);
3160 Lisp_Object attrs, val;
3162 if (this->id < 0)
3163 continue;
3164 attrs = CODING_ID_ATTRS (this->id);
3165 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3166 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3167 setup_iso_safe_charsets (attrs);
3168 val = CODING_ATTR_SAFE_CHARSETS (attrs);
3169 this->max_charset_id = SCHARS (val) - 1;
3170 this->safe_charsets = SDATA (val);
3173 /* A coding system of this category is always ASCII compatible. */
3174 src += coding->head_ascii;
3176 while (rejected != CATEGORY_MASK_ISO)
3178 src_base = src;
3179 ONE_MORE_BYTE (c);
3180 switch (c)
3182 case ISO_CODE_ESC:
3183 if (inhibit_iso_escape_detection)
3184 break;
3185 single_shifting = 0;
3186 ONE_MORE_BYTE (c);
3187 if (c >= '(' && c <= '/')
3189 /* Designation sequence for a charset of dimension 1. */
3190 ONE_MORE_BYTE (c1);
3191 if (c1 < ' ' || c1 >= 0x80
3192 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3193 /* Invalid designation sequence. Just ignore. */
3194 break;
3196 else if (c == '$')
3198 /* Designation sequence for a charset of dimension 2. */
3199 ONE_MORE_BYTE (c);
3200 if (c >= '@' && c <= 'B')
3201 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3202 id = iso_charset_table[1][0][c];
3203 else if (c >= '(' && c <= '/')
3205 ONE_MORE_BYTE (c1);
3206 if (c1 < ' ' || c1 >= 0x80
3207 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3208 /* Invalid designation sequence. Just ignore. */
3209 break;
3211 else
3212 /* Invalid designation sequence. Just ignore it. */
3213 break;
3215 else if (c == 'N' || c == 'O')
3217 /* ESC <Fe> for SS2 or SS3. */
3218 single_shifting = 1;
3219 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3220 break;
3222 else if (c == '1')
3224 /* End of composition. */
3225 if (composition_count < 0
3226 || composition_count > MAX_COMPOSITION_COMPONENTS)
3227 /* Invalid */
3228 break;
3229 composition_count = -1;
3230 found |= CATEGORY_MASK_ISO;
3232 else if (c >= '0' && c <= '4')
3234 /* ESC <Fp> for start/end composition. */
3235 composition_count = 0;
3236 break;
3238 else
3240 /* Invalid escape sequence. Just ignore it. */
3241 break;
3244 /* We found a valid designation sequence for CHARSET. */
3245 rejected |= CATEGORY_MASK_ISO_8BIT;
3246 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3247 id))
3248 found |= CATEGORY_MASK_ISO_7;
3249 else
3250 rejected |= CATEGORY_MASK_ISO_7;
3251 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3252 id))
3253 found |= CATEGORY_MASK_ISO_7_TIGHT;
3254 else
3255 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3256 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3257 id))
3258 found |= CATEGORY_MASK_ISO_7_ELSE;
3259 else
3260 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3261 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3262 id))
3263 found |= CATEGORY_MASK_ISO_8_ELSE;
3264 else
3265 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3266 break;
3268 case ISO_CODE_SO:
3269 case ISO_CODE_SI:
3270 /* Locking shift out/in. */
3271 if (inhibit_iso_escape_detection)
3272 break;
3273 single_shifting = 0;
3274 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3275 break;
3277 case ISO_CODE_CSI:
3278 /* Control sequence introducer. */
3279 single_shifting = 0;
3280 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3281 found |= CATEGORY_MASK_ISO_8_ELSE;
3282 goto check_extra_latin;
3284 case ISO_CODE_SS2:
3285 case ISO_CODE_SS3:
3286 /* Single shift. */
3287 if (inhibit_iso_escape_detection)
3288 break;
3289 single_shifting = 0;
3290 rejected |= CATEGORY_MASK_ISO_7BIT;
3291 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3292 & CODING_ISO_FLAG_SINGLE_SHIFT)
3293 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3294 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3295 & CODING_ISO_FLAG_SINGLE_SHIFT)
3296 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3297 if (single_shifting)
3298 break;
3299 goto check_extra_latin;
3301 default:
3302 if (c < 0)
3303 continue;
3304 if (c < 0x80)
3306 if (composition_count >= 0)
3307 composition_count++;
3308 single_shifting = 0;
3309 break;
3311 if (c >= 0xA0)
3313 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3314 found |= CATEGORY_MASK_ISO_8_1;
3315 /* Check the length of succeeding codes of the range
3316 0xA0..0FF. If the byte length is even, we include
3317 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3318 only when we are not single shifting. */
3319 if (! single_shifting
3320 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3322 int i = 1;
3323 while (src < src_end)
3325 src_base = src;
3326 ONE_MORE_BYTE (c);
3327 if (c < 0xA0)
3329 src = src_base;
3330 break;
3332 i++;
3335 if (i & 1 && src < src_end)
3337 rejected |= CATEGORY_MASK_ISO_8_2;
3338 if (composition_count >= 0)
3339 composition_count += i;
3341 else
3343 found |= CATEGORY_MASK_ISO_8_2;
3344 if (composition_count >= 0)
3345 composition_count += i / 2;
3348 break;
3350 check_extra_latin:
3351 single_shifting = 0;
3352 if (! VECTORP (Vlatin_extra_code_table)
3353 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3355 rejected = CATEGORY_MASK_ISO;
3356 break;
3358 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3359 & CODING_ISO_FLAG_LATIN_EXTRA)
3360 found |= CATEGORY_MASK_ISO_8_1;
3361 else
3362 rejected |= CATEGORY_MASK_ISO_8_1;
3363 rejected |= CATEGORY_MASK_ISO_8_2;
3366 detect_info->rejected |= CATEGORY_MASK_ISO;
3367 return 0;
3369 no_more_source:
3370 detect_info->rejected |= rejected;
3371 detect_info->found |= (found & ~rejected);
3372 return 1;
3376 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3377 escape sequence should be kept. */
3378 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3379 do { \
3380 int id, prev; \
3382 if (final < '0' || final >= 128 \
3383 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3384 || !SAFE_CHARSET_P (coding, id)) \
3386 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3387 chars_96 = -1; \
3388 break; \
3390 prev = CODING_ISO_DESIGNATION (coding, reg); \
3391 if (id == charset_jisx0201_roman) \
3393 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3394 id = charset_ascii; \
3396 else if (id == charset_jisx0208_1978) \
3398 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3399 id = charset_jisx0208; \
3401 CODING_ISO_DESIGNATION (coding, reg) = id; \
3402 /* If there was an invalid designation to REG previously, and this \
3403 designation is ASCII to REG, we should keep this designation \
3404 sequence. */ \
3405 if (prev == -2 && id == charset_ascii) \
3406 chars_96 = -1; \
3407 } while (0)
3410 /* Handle these composition sequence (ALT: alternate char):
3412 (1) relative composition: ESC 0 CHAR ... ESC 1
3413 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3414 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3415 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3417 When the start sequence (ESC 0/2/3/4) is found, this annotation
3418 header is produced.
3420 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3422 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3423 produced until the end sequence (ESC 1) is found:
3425 (1) CHAR ... CHAR
3426 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3427 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3428 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3430 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3431 annotation header is updated as below:
3433 (1) LENGTH: unchanged, NCHARS: number of CHARs
3434 (2) LENGTH: unchanged, NCHARS: number of CHARs
3435 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3436 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3438 If an error is found while composing, the annotation header is
3439 changed to:
3441 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3443 and the sequence [ -2 DECODED-RULE ] is changed to the original
3444 byte sequence as below:
3445 o the original byte sequence is B: [ B -1 ]
3446 o the original byte sequence is B1 B2: [ B1 B2 ]
3447 and the sequence [ -1 -1 ] is changed to the original byte
3448 sequence:
3449 [ ESC '0' ]
3452 /* Decode a composition rule C1 and maybe one more byte from the
3453 source, and set RULE to the encoded composition rule, NBYTES to the
3454 length of the composition rule. If the rule is invalid, set RULE
3455 to some negative value. */
3457 #define DECODE_COMPOSITION_RULE(rule, nbytes) \
3458 do { \
3459 rule = c1 - 32; \
3460 if (rule < 0) \
3461 break; \
3462 if (rule < 81) /* old format (before ver.21) */ \
3464 int gref = (rule) / 9; \
3465 int nref = (rule) % 9; \
3466 if (gref == 4) gref = 10; \
3467 if (nref == 4) nref = 10; \
3468 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3469 nbytes = 1; \
3471 else /* new format (after ver.21) */ \
3473 int c; \
3475 ONE_MORE_BYTE (c); \
3476 rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32); \
3477 if (rule >= 0) \
3478 rule += 0x100; /* to destinguish it from the old format */ \
3479 nbytes = 2; \
3481 } while (0)
3483 #define ENCODE_COMPOSITION_RULE(rule) \
3484 do { \
3485 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3487 if (rule < 0x100) /* old format */ \
3489 if (gref == 10) gref = 4; \
3490 if (nref == 10) nref = 4; \
3491 charbuf[idx] = 32 + gref * 9 + nref; \
3492 charbuf[idx + 1] = -1; \
3493 new_chars++; \
3495 else /* new format */ \
3497 charbuf[idx] = 32 + 81 + gref; \
3498 charbuf[idx + 1] = 32 + nref; \
3499 new_chars += 2; \
3501 } while (0)
3503 /* Finish the current composition as invalid. */
3505 static int finish_composition P_ ((int *, struct composition_status *));
3507 static int
3508 finish_composition (charbuf, cmp_status)
3509 int *charbuf;
3510 struct composition_status *cmp_status;
3512 int idx = - cmp_status->length;
3513 int new_chars;
3515 /* Recover the original ESC sequence */
3516 charbuf[idx++] = ISO_CODE_ESC;
3517 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3518 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3519 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3520 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3521 : '4');
3522 charbuf[idx++] = -2;
3523 charbuf[idx++] = 0;
3524 charbuf[idx++] = -1;
3525 new_chars = cmp_status->nchars;
3526 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3527 for (; idx < 0; idx++)
3529 int elt = charbuf[idx];
3531 if (elt == -2)
3533 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3534 idx++;
3536 else if (elt == -1)
3538 charbuf[idx++] = ISO_CODE_ESC;
3539 charbuf[idx] = '0';
3540 new_chars += 2;
3543 cmp_status->state = COMPOSING_NO;
3544 return new_chars;
3547 /* If characters are under composition, finish the composition. */
3548 #define MAYBE_FINISH_COMPOSITION() \
3549 do { \
3550 if (cmp_status->state != COMPOSING_NO) \
3551 char_offset += finish_composition (charbuf, cmp_status); \
3552 } while (0)
3554 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3556 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3557 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3558 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3559 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3561 Produce this annotation sequence now:
3563 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3566 #define DECODE_COMPOSITION_START(c1) \
3567 do { \
3568 if (c1 == '0' \
3569 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3570 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3571 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3572 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3574 *charbuf++ = -1; \
3575 *charbuf++= -1; \
3576 cmp_status->state = COMPOSING_CHAR; \
3577 cmp_status->length += 2; \
3579 else \
3581 MAYBE_FINISH_COMPOSITION (); \
3582 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3583 : c1 == '2' ? COMPOSITION_WITH_RULE \
3584 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3585 : COMPOSITION_WITH_RULE_ALTCHARS); \
3586 cmp_status->state \
3587 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3588 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3589 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3590 cmp_status->nchars = cmp_status->ncomps = 0; \
3591 coding->annotated = 1; \
3593 } while (0)
3596 /* Handle composition end sequence ESC 1. */
3598 #define DECODE_COMPOSITION_END() \
3599 do { \
3600 if (cmp_status->nchars == 0 \
3601 || ((cmp_status->state == COMPOSING_CHAR) \
3602 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3604 MAYBE_FINISH_COMPOSITION (); \
3605 goto invalid_code; \
3607 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3608 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3609 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3610 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3611 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3612 char_offset += cmp_status->nchars; \
3613 cmp_status->state = COMPOSING_NO; \
3614 } while (0)
3616 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3618 #define STORE_COMPOSITION_RULE(rule) \
3619 do { \
3620 *charbuf++ = -2; \
3621 *charbuf++ = rule; \
3622 cmp_status->length += 2; \
3623 cmp_status->state--; \
3624 } while (0)
3626 /* Store a composed char or a component char C in charbuf, and update
3627 cmp_status. */
3629 #define STORE_COMPOSITION_CHAR(c) \
3630 do { \
3631 *charbuf++ = (c); \
3632 cmp_status->length++; \
3633 if (cmp_status->state == COMPOSING_CHAR) \
3634 cmp_status->nchars++; \
3635 else \
3636 cmp_status->ncomps++; \
3637 if (cmp_status->method == COMPOSITION_WITH_RULE \
3638 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3639 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3640 cmp_status->state++; \
3641 } while (0)
3644 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3646 static void
3647 decode_coding_iso_2022 (coding)
3648 struct coding_system *coding;
3650 const unsigned char *src = coding->source + coding->consumed;
3651 const unsigned char *src_end = coding->source + coding->src_bytes;
3652 const unsigned char *src_base;
3653 int *charbuf = coding->charbuf + coding->charbuf_used;
3654 /* We may produce two annotations (charset and composition) in one
3655 loop and one more charset annotation at the end. */
3656 int *charbuf_end
3657 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3658 int consumed_chars = 0, consumed_chars_base;
3659 int multibytep = coding->src_multibyte;
3660 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3661 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3662 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3663 int charset_id_2, charset_id_3;
3664 struct charset *charset;
3665 int c;
3666 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3667 Lisp_Object attrs, charset_list;
3668 int char_offset = coding->produced_char;
3669 int last_offset = char_offset;
3670 int last_id = charset_ascii;
3671 int eol_crlf =
3672 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3673 int byte_after_cr = -1;
3674 int i;
3676 CODING_GET_INFO (coding, attrs, charset_list);
3677 setup_iso_safe_charsets (attrs);
3678 /* Charset list may have been changed. */
3679 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3680 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3682 if (cmp_status->state != COMPOSING_NO)
3684 for (i = 0; i < cmp_status->length; i++)
3685 *charbuf++ = cmp_status->carryover[i];
3686 coding->annotated = 1;
3689 while (1)
3691 int c1, c2, c3;
3693 src_base = src;
3694 consumed_chars_base = consumed_chars;
3696 if (charbuf >= charbuf_end)
3698 if (byte_after_cr >= 0)
3699 src_base--;
3700 break;
3703 if (byte_after_cr >= 0)
3704 c1 = byte_after_cr, byte_after_cr = -1;
3705 else
3706 ONE_MORE_BYTE (c1);
3707 if (c1 < 0)
3708 goto invalid_code;
3710 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3712 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3713 char_offset++;
3714 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3715 continue;
3718 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3720 if (c1 == ISO_CODE_ESC)
3722 if (src + 1 >= src_end)
3723 goto no_more_source;
3724 *charbuf++ = ISO_CODE_ESC;
3725 char_offset++;
3726 if (src[0] == '%' && src[1] == '@')
3728 src += 2;
3729 consumed_chars += 2;
3730 char_offset += 2;
3731 /* We are sure charbuf can contain two more chars. */
3732 *charbuf++ = '%';
3733 *charbuf++ = '@';
3734 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3737 else
3739 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3740 char_offset++;
3742 continue;
3745 if ((cmp_status->state == COMPOSING_RULE
3746 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3747 && c1 != ISO_CODE_ESC)
3749 int rule, nbytes;
3751 DECODE_COMPOSITION_RULE (rule, nbytes);
3752 if (rule < 0)
3753 goto invalid_code;
3754 STORE_COMPOSITION_RULE (rule);
3755 continue;
3758 /* We produce at most one character. */
3759 switch (iso_code_class [c1])
3761 case ISO_0x20_or_0x7F:
3762 if (charset_id_0 < 0
3763 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3764 /* This is SPACE or DEL. */
3765 charset = CHARSET_FROM_ID (charset_ascii);
3766 else
3767 charset = CHARSET_FROM_ID (charset_id_0);
3768 break;
3770 case ISO_graphic_plane_0:
3771 if (charset_id_0 < 0)
3772 charset = CHARSET_FROM_ID (charset_ascii);
3773 else
3774 charset = CHARSET_FROM_ID (charset_id_0);
3775 break;
3777 case ISO_0xA0_or_0xFF:
3778 if (charset_id_1 < 0
3779 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3780 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3781 goto invalid_code;
3782 /* This is a graphic character, we fall down ... */
3784 case ISO_graphic_plane_1:
3785 if (charset_id_1 < 0)
3786 goto invalid_code;
3787 charset = CHARSET_FROM_ID (charset_id_1);
3788 break;
3790 case ISO_control_0:
3791 if (eol_crlf && c1 == '\r')
3792 ONE_MORE_BYTE (byte_after_cr);
3793 MAYBE_FINISH_COMPOSITION ();
3794 charset = CHARSET_FROM_ID (charset_ascii);
3795 break;
3797 case ISO_control_1:
3798 goto invalid_code;
3800 case ISO_shift_out:
3801 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3802 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3803 goto invalid_code;
3804 CODING_ISO_INVOCATION (coding, 0) = 1;
3805 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3806 continue;
3808 case ISO_shift_in:
3809 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3810 goto invalid_code;
3811 CODING_ISO_INVOCATION (coding, 0) = 0;
3812 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3813 continue;
3815 case ISO_single_shift_2_7:
3816 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3817 goto invalid_code;
3818 case ISO_single_shift_2:
3819 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3820 goto invalid_code;
3821 /* SS2 is handled as an escape sequence of ESC 'N' */
3822 c1 = 'N';
3823 goto label_escape_sequence;
3825 case ISO_single_shift_3:
3826 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3827 goto invalid_code;
3828 /* SS2 is handled as an escape sequence of ESC 'O' */
3829 c1 = 'O';
3830 goto label_escape_sequence;
3832 case ISO_control_sequence_introducer:
3833 /* CSI is handled as an escape sequence of ESC '[' ... */
3834 c1 = '[';
3835 goto label_escape_sequence;
3837 case ISO_escape:
3838 ONE_MORE_BYTE (c1);
3839 label_escape_sequence:
3840 /* Escape sequences handled here are invocation,
3841 designation, direction specification, and character
3842 composition specification. */
3843 switch (c1)
3845 case '&': /* revision of following character set */
3846 ONE_MORE_BYTE (c1);
3847 if (!(c1 >= '@' && c1 <= '~'))
3848 goto invalid_code;
3849 ONE_MORE_BYTE (c1);
3850 if (c1 != ISO_CODE_ESC)
3851 goto invalid_code;
3852 ONE_MORE_BYTE (c1);
3853 goto label_escape_sequence;
3855 case '$': /* designation of 2-byte character set */
3856 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3857 goto invalid_code;
3859 int reg, chars96;
3861 ONE_MORE_BYTE (c1);
3862 if (c1 >= '@' && c1 <= 'B')
3863 { /* designation of JISX0208.1978, GB2312.1980,
3864 or JISX0208.1980 */
3865 reg = 0, chars96 = 0;
3867 else if (c1 >= 0x28 && c1 <= 0x2B)
3868 { /* designation of DIMENSION2_CHARS94 character set */
3869 reg = c1 - 0x28, chars96 = 0;
3870 ONE_MORE_BYTE (c1);
3872 else if (c1 >= 0x2C && c1 <= 0x2F)
3873 { /* designation of DIMENSION2_CHARS96 character set */
3874 reg = c1 - 0x2C, chars96 = 1;
3875 ONE_MORE_BYTE (c1);
3877 else
3878 goto invalid_code;
3879 DECODE_DESIGNATION (reg, 2, chars96, c1);
3880 /* We must update these variables now. */
3881 if (reg == 0)
3882 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3883 else if (reg == 1)
3884 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3885 if (chars96 < 0)
3886 goto invalid_code;
3888 continue;
3890 case 'n': /* invocation of locking-shift-2 */
3891 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3892 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3893 goto invalid_code;
3894 CODING_ISO_INVOCATION (coding, 0) = 2;
3895 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3896 continue;
3898 case 'o': /* invocation of locking-shift-3 */
3899 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3900 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3901 goto invalid_code;
3902 CODING_ISO_INVOCATION (coding, 0) = 3;
3903 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3904 continue;
3906 case 'N': /* invocation of single-shift-2 */
3907 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3908 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3909 goto invalid_code;
3910 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3911 if (charset_id_2 < 0)
3912 charset = CHARSET_FROM_ID (charset_ascii);
3913 else
3914 charset = CHARSET_FROM_ID (charset_id_2);
3915 ONE_MORE_BYTE (c1);
3916 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3917 goto invalid_code;
3918 break;
3920 case 'O': /* invocation of single-shift-3 */
3921 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3922 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3923 goto invalid_code;
3924 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3925 if (charset_id_3 < 0)
3926 charset = CHARSET_FROM_ID (charset_ascii);
3927 else
3928 charset = CHARSET_FROM_ID (charset_id_3);
3929 ONE_MORE_BYTE (c1);
3930 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3931 goto invalid_code;
3932 break;
3934 case '0': case '2': case '3': case '4': /* start composition */
3935 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3936 goto invalid_code;
3937 if (last_id != charset_ascii)
3939 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3940 last_id = charset_ascii;
3941 last_offset = char_offset;
3943 DECODE_COMPOSITION_START (c1);
3944 continue;
3946 case '1': /* end composition */
3947 if (cmp_status->state == COMPOSING_NO)
3948 goto invalid_code;
3949 DECODE_COMPOSITION_END ();
3950 continue;
3952 case '[': /* specification of direction */
3953 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3954 goto invalid_code;
3955 /* For the moment, nested direction is not supported.
3956 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3957 left-to-right, and nonzero means right-to-left. */
3958 ONE_MORE_BYTE (c1);
3959 switch (c1)
3961 case ']': /* end of the current direction */
3962 coding->mode &= ~CODING_MODE_DIRECTION;
3964 case '0': /* end of the current direction */
3965 case '1': /* start of left-to-right direction */
3966 ONE_MORE_BYTE (c1);
3967 if (c1 == ']')
3968 coding->mode &= ~CODING_MODE_DIRECTION;
3969 else
3970 goto invalid_code;
3971 break;
3973 case '2': /* start of right-to-left direction */
3974 ONE_MORE_BYTE (c1);
3975 if (c1 == ']')
3976 coding->mode |= CODING_MODE_DIRECTION;
3977 else
3978 goto invalid_code;
3979 break;
3981 default:
3982 goto invalid_code;
3984 continue;
3986 case '%':
3987 ONE_MORE_BYTE (c1);
3988 if (c1 == '/')
3990 /* CTEXT extended segment:
3991 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3992 We keep these bytes as is for the moment.
3993 They may be decoded by post-read-conversion. */
3994 int dim, M, L;
3995 int size;
3997 ONE_MORE_BYTE (dim);
3998 if (dim < '0' || dim > '4')
3999 goto invalid_code;
4000 ONE_MORE_BYTE (M);
4001 if (M < 128)
4002 goto invalid_code;
4003 ONE_MORE_BYTE (L);
4004 if (L < 128)
4005 goto invalid_code;
4006 size = ((M - 128) * 128) + (L - 128);
4007 if (charbuf + 6 > charbuf_end)
4008 goto break_loop;
4009 *charbuf++ = ISO_CODE_ESC;
4010 *charbuf++ = '%';
4011 *charbuf++ = '/';
4012 *charbuf++ = dim;
4013 *charbuf++ = BYTE8_TO_CHAR (M);
4014 *charbuf++ = BYTE8_TO_CHAR (L);
4015 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
4017 else if (c1 == 'G')
4019 /* XFree86 extension for embedding UTF-8 in CTEXT:
4020 ESC % G --UTF-8-BYTES-- ESC % @
4021 We keep these bytes as is for the moment.
4022 They may be decoded by post-read-conversion. */
4023 if (charbuf + 3 > charbuf_end)
4024 goto break_loop;
4025 *charbuf++ = ISO_CODE_ESC;
4026 *charbuf++ = '%';
4027 *charbuf++ = 'G';
4028 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
4030 else
4031 goto invalid_code;
4032 continue;
4033 break;
4035 default:
4036 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
4037 goto invalid_code;
4039 int reg, chars96;
4041 if (c1 >= 0x28 && c1 <= 0x2B)
4042 { /* designation of DIMENSION1_CHARS94 character set */
4043 reg = c1 - 0x28, chars96 = 0;
4044 ONE_MORE_BYTE (c1);
4046 else if (c1 >= 0x2C && c1 <= 0x2F)
4047 { /* designation of DIMENSION1_CHARS96 character set */
4048 reg = c1 - 0x2C, chars96 = 1;
4049 ONE_MORE_BYTE (c1);
4051 else
4052 goto invalid_code;
4053 DECODE_DESIGNATION (reg, 1, chars96, c1);
4054 /* We must update these variables now. */
4055 if (reg == 0)
4056 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
4057 else if (reg == 1)
4058 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
4059 if (chars96 < 0)
4060 goto invalid_code;
4062 continue;
4066 if (cmp_status->state == COMPOSING_NO
4067 && charset->id != charset_ascii
4068 && last_id != charset->id)
4070 if (last_id != charset_ascii)
4071 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4072 last_id = charset->id;
4073 last_offset = char_offset;
4076 /* Now we know CHARSET and 1st position code C1 of a character.
4077 Produce a decoded character while getting 2nd and 3rd
4078 position codes C2, C3 if necessary. */
4079 if (CHARSET_DIMENSION (charset) > 1)
4081 ONE_MORE_BYTE (c2);
4082 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
4083 || ((c1 & 0x80) != (c2 & 0x80)))
4084 /* C2 is not in a valid range. */
4085 goto invalid_code;
4086 if (CHARSET_DIMENSION (charset) == 2)
4087 c1 = (c1 << 8) | c2;
4088 else
4090 ONE_MORE_BYTE (c3);
4091 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
4092 || ((c1 & 0x80) != (c3 & 0x80)))
4093 /* C3 is not in a valid range. */
4094 goto invalid_code;
4095 c1 = (c1 << 16) | (c2 << 8) | c2;
4098 c1 &= 0x7F7F7F;
4099 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4100 if (c < 0)
4102 MAYBE_FINISH_COMPOSITION ();
4103 for (; src_base < src; src_base++, char_offset++)
4105 if (ASCII_BYTE_P (*src_base))
4106 *charbuf++ = *src_base;
4107 else
4108 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4111 else if (cmp_status->state == COMPOSING_NO)
4113 *charbuf++ = c;
4114 char_offset++;
4116 else if ((cmp_status->state == COMPOSING_CHAR
4117 ? cmp_status->nchars
4118 : cmp_status->ncomps)
4119 >= MAX_COMPOSITION_COMPONENTS)
4121 /* Too long composition. */
4122 MAYBE_FINISH_COMPOSITION ();
4123 *charbuf++ = c;
4124 char_offset++;
4126 else
4127 STORE_COMPOSITION_CHAR (c);
4128 continue;
4130 invalid_code:
4131 MAYBE_FINISH_COMPOSITION ();
4132 src = src_base;
4133 consumed_chars = consumed_chars_base;
4134 ONE_MORE_BYTE (c);
4135 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4136 char_offset++;
4137 coding->errors++;
4138 continue;
4140 break_loop:
4141 break;
4144 no_more_source:
4145 if (cmp_status->state != COMPOSING_NO)
4147 if (coding->mode & CODING_MODE_LAST_BLOCK)
4148 MAYBE_FINISH_COMPOSITION ();
4149 else
4151 charbuf -= cmp_status->length;
4152 for (i = 0; i < cmp_status->length; i++)
4153 cmp_status->carryover[i] = charbuf[i];
4156 else if (last_id != charset_ascii)
4157 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4158 coding->consumed_char += consumed_chars_base;
4159 coding->consumed = src_base - coding->source;
4160 coding->charbuf_used = charbuf - coding->charbuf;
4164 /* ISO2022 encoding stuff. */
4167 It is not enough to say just "ISO2022" on encoding, we have to
4168 specify more details. In Emacs, each coding system of ISO2022
4169 variant has the following specifications:
4170 1. Initial designation to G0 thru G3.
4171 2. Allows short-form designation?
4172 3. ASCII should be designated to G0 before control characters?
4173 4. ASCII should be designated to G0 at end of line?
4174 5. 7-bit environment or 8-bit environment?
4175 6. Use locking-shift?
4176 7. Use Single-shift?
4177 And the following two are only for Japanese:
4178 8. Use ASCII in place of JIS0201-1976-Roman?
4179 9. Use JISX0208-1983 in place of JISX0208-1978?
4180 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4181 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
4182 details.
4185 /* Produce codes (escape sequence) for designating CHARSET to graphic
4186 register REG at DST, and increment DST. If <final-char> of CHARSET is
4187 '@', 'A', or 'B' and the coding system CODING allows, produce
4188 designation sequence of short-form. */
4190 #define ENCODE_DESIGNATION(charset, reg, coding) \
4191 do { \
4192 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
4193 char *intermediate_char_94 = "()*+"; \
4194 char *intermediate_char_96 = ",-./"; \
4195 int revision = -1; \
4196 int c; \
4198 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
4199 revision = CHARSET_ISO_REVISION (charset); \
4201 if (revision >= 0) \
4203 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
4204 EMIT_ONE_BYTE ('@' + revision); \
4206 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
4207 if (CHARSET_DIMENSION (charset) == 1) \
4209 if (! CHARSET_ISO_CHARS_96 (charset)) \
4210 c = intermediate_char_94[reg]; \
4211 else \
4212 c = intermediate_char_96[reg]; \
4213 EMIT_ONE_ASCII_BYTE (c); \
4215 else \
4217 EMIT_ONE_ASCII_BYTE ('$'); \
4218 if (! CHARSET_ISO_CHARS_96 (charset)) \
4220 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
4221 || reg != 0 \
4222 || final_char < '@' || final_char > 'B') \
4223 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
4225 else \
4226 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
4228 EMIT_ONE_ASCII_BYTE (final_char); \
4230 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
4231 } while (0)
4234 /* The following two macros produce codes (control character or escape
4235 sequence) for ISO2022 single-shift functions (single-shift-2 and
4236 single-shift-3). */
4238 #define ENCODE_SINGLE_SHIFT_2 \
4239 do { \
4240 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4241 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4242 else \
4243 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4244 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4245 } while (0)
4248 #define ENCODE_SINGLE_SHIFT_3 \
4249 do { \
4250 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4251 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4252 else \
4253 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4254 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4255 } while (0)
4258 /* The following four macros produce codes (control character or
4259 escape sequence) for ISO2022 locking-shift functions (shift-in,
4260 shift-out, locking-shift-2, and locking-shift-3). */
4262 #define ENCODE_SHIFT_IN \
4263 do { \
4264 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4265 CODING_ISO_INVOCATION (coding, 0) = 0; \
4266 } while (0)
4269 #define ENCODE_SHIFT_OUT \
4270 do { \
4271 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4272 CODING_ISO_INVOCATION (coding, 0) = 1; \
4273 } while (0)
4276 #define ENCODE_LOCKING_SHIFT_2 \
4277 do { \
4278 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4279 CODING_ISO_INVOCATION (coding, 0) = 2; \
4280 } while (0)
4283 #define ENCODE_LOCKING_SHIFT_3 \
4284 do { \
4285 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4286 CODING_ISO_INVOCATION (coding, 0) = 3; \
4287 } while (0)
4290 /* Produce codes for a DIMENSION1 character whose character set is
4291 CHARSET and whose position-code is C1. Designation and invocation
4292 sequences are also produced in advance if necessary. */
4294 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4295 do { \
4296 int id = CHARSET_ID (charset); \
4298 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4299 && id == charset_ascii) \
4301 id = charset_jisx0201_roman; \
4302 charset = CHARSET_FROM_ID (id); \
4305 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4307 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4308 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4309 else \
4310 EMIT_ONE_BYTE (c1 | 0x80); \
4311 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4312 break; \
4314 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4316 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4317 break; \
4319 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4321 EMIT_ONE_BYTE (c1 | 0x80); \
4322 break; \
4324 else \
4325 /* Since CHARSET is not yet invoked to any graphic planes, we \
4326 must invoke it, or, at first, designate it to some graphic \
4327 register. Then repeat the loop to actually produce the \
4328 character. */ \
4329 dst = encode_invocation_designation (charset, coding, dst, \
4330 &produced_chars); \
4331 } while (1)
4334 /* Produce codes for a DIMENSION2 character whose character set is
4335 CHARSET and whose position-codes are C1 and C2. Designation and
4336 invocation codes are also produced in advance if necessary. */
4338 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4339 do { \
4340 int id = CHARSET_ID (charset); \
4342 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4343 && id == charset_jisx0208) \
4345 id = charset_jisx0208_1978; \
4346 charset = CHARSET_FROM_ID (id); \
4349 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4351 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4352 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4353 else \
4354 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4355 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4356 break; \
4358 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4360 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4361 break; \
4363 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4365 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4366 break; \
4368 else \
4369 /* Since CHARSET is not yet invoked to any graphic planes, we \
4370 must invoke it, or, at first, designate it to some graphic \
4371 register. Then repeat the loop to actually produce the \
4372 character. */ \
4373 dst = encode_invocation_designation (charset, coding, dst, \
4374 &produced_chars); \
4375 } while (1)
4378 #define ENCODE_ISO_CHARACTER(charset, c) \
4379 do { \
4380 int code; \
4381 CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code); \
4383 if (CHARSET_DIMENSION (charset) == 1) \
4384 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4385 else \
4386 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4387 } while (0)
4390 /* Produce designation and invocation codes at a place pointed by DST
4391 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4392 Return new DST. */
4394 unsigned char *
4395 encode_invocation_designation (charset, coding, dst, p_nchars)
4396 struct charset *charset;
4397 struct coding_system *coding;
4398 unsigned char *dst;
4399 int *p_nchars;
4401 int multibytep = coding->dst_multibyte;
4402 int produced_chars = *p_nchars;
4403 int reg; /* graphic register number */
4404 int id = CHARSET_ID (charset);
4406 /* At first, check designations. */
4407 for (reg = 0; reg < 4; reg++)
4408 if (id == CODING_ISO_DESIGNATION (coding, reg))
4409 break;
4411 if (reg >= 4)
4413 /* CHARSET is not yet designated to any graphic registers. */
4414 /* At first check the requested designation. */
4415 reg = CODING_ISO_REQUEST (coding, id);
4416 if (reg < 0)
4417 /* Since CHARSET requests no special designation, designate it
4418 to graphic register 0. */
4419 reg = 0;
4421 ENCODE_DESIGNATION (charset, reg, coding);
4424 if (CODING_ISO_INVOCATION (coding, 0) != reg
4425 && CODING_ISO_INVOCATION (coding, 1) != reg)
4427 /* Since the graphic register REG is not invoked to any graphic
4428 planes, invoke it to graphic plane 0. */
4429 switch (reg)
4431 case 0: /* graphic register 0 */
4432 ENCODE_SHIFT_IN;
4433 break;
4435 case 1: /* graphic register 1 */
4436 ENCODE_SHIFT_OUT;
4437 break;
4439 case 2: /* graphic register 2 */
4440 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4441 ENCODE_SINGLE_SHIFT_2;
4442 else
4443 ENCODE_LOCKING_SHIFT_2;
4444 break;
4446 case 3: /* graphic register 3 */
4447 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4448 ENCODE_SINGLE_SHIFT_3;
4449 else
4450 ENCODE_LOCKING_SHIFT_3;
4451 break;
4455 *p_nchars = produced_chars;
4456 return dst;
4459 /* The following three macros produce codes for indicating direction
4460 of text. */
4461 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
4462 do { \
4463 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
4464 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
4465 else \
4466 EMIT_ONE_BYTE (ISO_CODE_CSI); \
4467 } while (0)
4470 #define ENCODE_DIRECTION_R2L() \
4471 do { \
4472 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
4473 EMIT_TWO_ASCII_BYTES ('2', ']'); \
4474 } while (0)
4477 #define ENCODE_DIRECTION_L2R() \
4478 do { \
4479 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
4480 EMIT_TWO_ASCII_BYTES ('0', ']'); \
4481 } while (0)
4484 /* Produce codes for designation and invocation to reset the graphic
4485 planes and registers to initial state. */
4486 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4487 do { \
4488 int reg; \
4489 struct charset *charset; \
4491 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4492 ENCODE_SHIFT_IN; \
4493 for (reg = 0; reg < 4; reg++) \
4494 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4495 && (CODING_ISO_DESIGNATION (coding, reg) \
4496 != CODING_ISO_INITIAL (coding, reg))) \
4498 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4499 ENCODE_DESIGNATION (charset, reg, coding); \
4501 } while (0)
4504 /* Produce designation sequences of charsets in the line started from
4505 CHARBUF to a place pointed by DST, and return the number of
4506 produced bytes. DST should not directly point a buffer text area
4507 which may be relocated by char_charset call.
4509 If the current block ends before any end-of-line, we may fail to
4510 find all the necessary designations. */
4512 static EMACS_INT
4513 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
4514 struct coding_system *coding;
4515 int *charbuf, *charbuf_end;
4516 unsigned char *dst;
4518 unsigned char *orig = dst;
4519 struct charset *charset;
4520 /* Table of charsets to be designated to each graphic register. */
4521 int r[4];
4522 int c, found = 0, reg;
4523 int produced_chars = 0;
4524 int multibytep = coding->dst_multibyte;
4525 Lisp_Object attrs;
4526 Lisp_Object charset_list;
4528 attrs = CODING_ID_ATTRS (coding->id);
4529 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4530 if (EQ (charset_list, Qiso_2022))
4531 charset_list = Viso_2022_charset_list;
4533 for (reg = 0; reg < 4; reg++)
4534 r[reg] = -1;
4536 while (charbuf < charbuf_end && found < 4)
4538 int id;
4540 c = *charbuf++;
4541 if (c == '\n')
4542 break;
4543 charset = char_charset (c, charset_list, NULL);
4544 id = CHARSET_ID (charset);
4545 reg = CODING_ISO_REQUEST (coding, id);
4546 if (reg >= 0 && r[reg] < 0)
4548 found++;
4549 r[reg] = id;
4553 if (found)
4555 for (reg = 0; reg < 4; reg++)
4556 if (r[reg] >= 0
4557 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4558 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4561 return dst - orig;
4564 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4566 static int
4567 encode_coding_iso_2022 (coding)
4568 struct coding_system *coding;
4570 int multibytep = coding->dst_multibyte;
4571 int *charbuf = coding->charbuf;
4572 int *charbuf_end = charbuf + coding->charbuf_used;
4573 unsigned char *dst = coding->destination + coding->produced;
4574 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4575 int safe_room = 16;
4576 int bol_designation
4577 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4578 && CODING_ISO_BOL (coding));
4579 int produced_chars = 0;
4580 Lisp_Object attrs, eol_type, charset_list;
4581 int ascii_compatible;
4582 int c;
4583 int preferred_charset_id = -1;
4585 CODING_GET_INFO (coding, attrs, charset_list);
4586 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4587 if (VECTORP (eol_type))
4588 eol_type = Qunix;
4590 setup_iso_safe_charsets (attrs);
4591 /* Charset list may have been changed. */
4592 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4593 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4595 ascii_compatible
4596 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4597 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4598 | CODING_ISO_FLAG_LOCKING_SHIFT)));
4600 while (charbuf < charbuf_end)
4602 ASSURE_DESTINATION (safe_room);
4604 if (bol_designation)
4606 /* We have to produce designation sequences if any now. */
4607 unsigned char desig_buf[16];
4608 int nbytes;
4609 EMACS_INT offset;
4611 charset_map_loaded = 0;
4612 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4613 desig_buf);
4614 if (charset_map_loaded
4615 && (offset = coding_set_destination (coding)))
4617 dst += offset;
4618 dst_end += offset;
4620 memcpy (dst, desig_buf, nbytes);
4621 dst += nbytes;
4622 /* We are sure that designation sequences are all ASCII bytes. */
4623 produced_chars += nbytes;
4624 bol_designation = 0;
4625 ASSURE_DESTINATION (safe_room);
4628 c = *charbuf++;
4630 if (c < 0)
4632 /* Handle an annotation. */
4633 switch (*charbuf)
4635 case CODING_ANNOTATE_COMPOSITION_MASK:
4636 /* Not yet implemented. */
4637 break;
4638 case CODING_ANNOTATE_CHARSET_MASK:
4639 preferred_charset_id = charbuf[2];
4640 if (preferred_charset_id >= 0
4641 && NILP (Fmemq (make_number (preferred_charset_id),
4642 charset_list)))
4643 preferred_charset_id = -1;
4644 break;
4645 default:
4646 abort ();
4648 charbuf += -c - 1;
4649 continue;
4652 /* Now encode the character C. */
4653 if (c < 0x20 || c == 0x7F)
4655 if (c == '\n'
4656 || (c == '\r' && EQ (eol_type, Qmac)))
4658 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4659 ENCODE_RESET_PLANE_AND_REGISTER ();
4660 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4662 int i;
4664 for (i = 0; i < 4; i++)
4665 CODING_ISO_DESIGNATION (coding, i)
4666 = CODING_ISO_INITIAL (coding, i);
4668 bol_designation
4669 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4671 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4672 ENCODE_RESET_PLANE_AND_REGISTER ();
4673 EMIT_ONE_ASCII_BYTE (c);
4675 else if (ASCII_CHAR_P (c))
4677 if (ascii_compatible)
4678 EMIT_ONE_ASCII_BYTE (c);
4679 else
4681 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4682 ENCODE_ISO_CHARACTER (charset, c);
4685 else if (CHAR_BYTE8_P (c))
4687 c = CHAR_TO_BYTE8 (c);
4688 EMIT_ONE_BYTE (c);
4690 else
4692 struct charset *charset;
4694 if (preferred_charset_id >= 0)
4696 int result;
4698 charset = CHARSET_FROM_ID (preferred_charset_id);
4699 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4700 if (! result)
4701 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4702 NULL, charset);
4704 else
4705 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4706 NULL, charset);
4707 if (!charset)
4709 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4711 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4712 charset = CHARSET_FROM_ID (charset_ascii);
4714 else
4716 c = coding->default_char;
4717 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4718 charset_list, NULL, charset);
4721 ENCODE_ISO_CHARACTER (charset, c);
4725 if (coding->mode & CODING_MODE_LAST_BLOCK
4726 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4728 ASSURE_DESTINATION (safe_room);
4729 ENCODE_RESET_PLANE_AND_REGISTER ();
4731 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4732 CODING_ISO_BOL (coding) = bol_designation;
4733 coding->produced_char += produced_chars;
4734 coding->produced = dst - coding->destination;
4735 return 0;
4739 /*** 8,9. SJIS and BIG5 handlers ***/
4741 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4742 quite widely. So, for the moment, Emacs supports them in the bare
4743 C code. But, in the future, they may be supported only by CCL. */
4745 /* SJIS is a coding system encoding three character sets: ASCII, right
4746 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4747 as is. A character of charset katakana-jisx0201 is encoded by
4748 "position-code + 0x80". A character of charset japanese-jisx0208
4749 is encoded in 2-byte but two position-codes are divided and shifted
4750 so that it fit in the range below.
4752 --- CODE RANGE of SJIS ---
4753 (character set) (range)
4754 ASCII 0x00 .. 0x7F
4755 KATAKANA-JISX0201 0xA0 .. 0xDF
4756 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4757 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4758 -------------------------------
4762 /* BIG5 is a coding system encoding two character sets: ASCII and
4763 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4764 character set and is encoded in two-byte.
4766 --- CODE RANGE of BIG5 ---
4767 (character set) (range)
4768 ASCII 0x00 .. 0x7F
4769 Big5 (1st byte) 0xA1 .. 0xFE
4770 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4771 --------------------------
4775 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4776 Check if a text is encoded in SJIS. If it is, return
4777 CATEGORY_MASK_SJIS, else return 0. */
4779 static int
4780 detect_coding_sjis (coding, detect_info)
4781 struct coding_system *coding;
4782 struct coding_detection_info *detect_info;
4784 const unsigned char *src = coding->source, *src_base;
4785 const unsigned char *src_end = coding->source + coding->src_bytes;
4786 int multibytep = coding->src_multibyte;
4787 int consumed_chars = 0;
4788 int found = 0;
4789 int c;
4790 Lisp_Object attrs, charset_list;
4791 int max_first_byte_of_2_byte_code;
4793 CODING_GET_INFO (coding, attrs, charset_list);
4794 max_first_byte_of_2_byte_code
4795 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4797 detect_info->checked |= CATEGORY_MASK_SJIS;
4798 /* A coding system of this category is always ASCII compatible. */
4799 src += coding->head_ascii;
4801 while (1)
4803 src_base = src;
4804 ONE_MORE_BYTE (c);
4805 if (c < 0x80)
4806 continue;
4807 if ((c >= 0x81 && c <= 0x9F)
4808 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4810 ONE_MORE_BYTE (c);
4811 if (c < 0x40 || c == 0x7F || c > 0xFC)
4812 break;
4813 found = CATEGORY_MASK_SJIS;
4815 else if (c >= 0xA0 && c < 0xE0)
4816 found = CATEGORY_MASK_SJIS;
4817 else
4818 break;
4820 detect_info->rejected |= CATEGORY_MASK_SJIS;
4821 return 0;
4823 no_more_source:
4824 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4826 detect_info->rejected |= CATEGORY_MASK_SJIS;
4827 return 0;
4829 detect_info->found |= found;
4830 return 1;
4833 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4834 Check if a text is encoded in BIG5. If it is, return
4835 CATEGORY_MASK_BIG5, else return 0. */
4837 static int
4838 detect_coding_big5 (coding, detect_info)
4839 struct coding_system *coding;
4840 struct coding_detection_info *detect_info;
4842 const unsigned char *src = coding->source, *src_base;
4843 const unsigned char *src_end = coding->source + coding->src_bytes;
4844 int multibytep = coding->src_multibyte;
4845 int consumed_chars = 0;
4846 int found = 0;
4847 int c;
4849 detect_info->checked |= CATEGORY_MASK_BIG5;
4850 /* A coding system of this category is always ASCII compatible. */
4851 src += coding->head_ascii;
4853 while (1)
4855 src_base = src;
4856 ONE_MORE_BYTE (c);
4857 if (c < 0x80)
4858 continue;
4859 if (c >= 0xA1)
4861 ONE_MORE_BYTE (c);
4862 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4863 return 0;
4864 found = CATEGORY_MASK_BIG5;
4866 else
4867 break;
4869 detect_info->rejected |= CATEGORY_MASK_BIG5;
4870 return 0;
4872 no_more_source:
4873 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4875 detect_info->rejected |= CATEGORY_MASK_BIG5;
4876 return 0;
4878 detect_info->found |= found;
4879 return 1;
4882 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4883 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
4885 static void
4886 decode_coding_sjis (coding)
4887 struct coding_system *coding;
4889 const unsigned char *src = coding->source + coding->consumed;
4890 const unsigned char *src_end = coding->source + coding->src_bytes;
4891 const unsigned char *src_base;
4892 int *charbuf = coding->charbuf + coding->charbuf_used;
4893 /* We may produce one charset annotation in one loop and one more at
4894 the end. */
4895 int *charbuf_end
4896 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4897 int consumed_chars = 0, consumed_chars_base;
4898 int multibytep = coding->src_multibyte;
4899 struct charset *charset_roman, *charset_kanji, *charset_kana;
4900 struct charset *charset_kanji2;
4901 Lisp_Object attrs, charset_list, val;
4902 int char_offset = coding->produced_char;
4903 int last_offset = char_offset;
4904 int last_id = charset_ascii;
4905 int eol_crlf =
4906 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4907 int byte_after_cr = -1;
4909 CODING_GET_INFO (coding, attrs, charset_list);
4911 val = charset_list;
4912 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4913 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4914 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4915 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4917 while (1)
4919 int c, c1;
4920 struct charset *charset;
4922 src_base = src;
4923 consumed_chars_base = consumed_chars;
4925 if (charbuf >= charbuf_end)
4927 if (byte_after_cr >= 0)
4928 src_base--;
4929 break;
4932 if (byte_after_cr >= 0)
4933 c = byte_after_cr, byte_after_cr = -1;
4934 else
4935 ONE_MORE_BYTE (c);
4936 if (c < 0)
4937 goto invalid_code;
4938 if (c < 0x80)
4940 if (eol_crlf && c == '\r')
4941 ONE_MORE_BYTE (byte_after_cr);
4942 charset = charset_roman;
4944 else if (c == 0x80 || c == 0xA0)
4945 goto invalid_code;
4946 else if (c >= 0xA1 && c <= 0xDF)
4948 /* SJIS -> JISX0201-Kana */
4949 c &= 0x7F;
4950 charset = charset_kana;
4952 else if (c <= 0xEF)
4954 /* SJIS -> JISX0208 */
4955 ONE_MORE_BYTE (c1);
4956 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4957 goto invalid_code;
4958 c = (c << 8) | c1;
4959 SJIS_TO_JIS (c);
4960 charset = charset_kanji;
4962 else if (c <= 0xFC && charset_kanji2)
4964 /* SJIS -> JISX0213-2 */
4965 ONE_MORE_BYTE (c1);
4966 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4967 goto invalid_code;
4968 c = (c << 8) | c1;
4969 SJIS_TO_JIS2 (c);
4970 charset = charset_kanji2;
4972 else
4973 goto invalid_code;
4974 if (charset->id != charset_ascii
4975 && last_id != charset->id)
4977 if (last_id != charset_ascii)
4978 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4979 last_id = charset->id;
4980 last_offset = char_offset;
4982 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4983 *charbuf++ = c;
4984 char_offset++;
4985 continue;
4987 invalid_code:
4988 src = src_base;
4989 consumed_chars = consumed_chars_base;
4990 ONE_MORE_BYTE (c);
4991 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4992 char_offset++;
4993 coding->errors++;
4996 no_more_source:
4997 if (last_id != charset_ascii)
4998 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4999 coding->consumed_char += consumed_chars_base;
5000 coding->consumed = src_base - coding->source;
5001 coding->charbuf_used = charbuf - coding->charbuf;
5004 static void
5005 decode_coding_big5 (coding)
5006 struct coding_system *coding;
5008 const unsigned char *src = coding->source + coding->consumed;
5009 const unsigned char *src_end = coding->source + coding->src_bytes;
5010 const unsigned char *src_base;
5011 int *charbuf = coding->charbuf + coding->charbuf_used;
5012 /* We may produce one charset annotation in one loop and one more at
5013 the end. */
5014 int *charbuf_end
5015 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5016 int consumed_chars = 0, consumed_chars_base;
5017 int multibytep = coding->src_multibyte;
5018 struct charset *charset_roman, *charset_big5;
5019 Lisp_Object attrs, charset_list, val;
5020 int char_offset = coding->produced_char;
5021 int last_offset = char_offset;
5022 int last_id = charset_ascii;
5023 int eol_crlf =
5024 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5025 int byte_after_cr = -1;
5027 CODING_GET_INFO (coding, attrs, charset_list);
5028 val = charset_list;
5029 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5030 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5032 while (1)
5034 int c, c1;
5035 struct charset *charset;
5037 src_base = src;
5038 consumed_chars_base = consumed_chars;
5040 if (charbuf >= charbuf_end)
5042 if (byte_after_cr >= 0)
5043 src_base--;
5044 break;
5047 if (byte_after_cr >= 0)
5048 c = byte_after_cr, byte_after_cr = -1;
5049 else
5050 ONE_MORE_BYTE (c);
5052 if (c < 0)
5053 goto invalid_code;
5054 if (c < 0x80)
5056 if (eol_crlf && c == '\r')
5057 ONE_MORE_BYTE (byte_after_cr);
5058 charset = charset_roman;
5060 else
5062 /* BIG5 -> Big5 */
5063 if (c < 0xA1 || c > 0xFE)
5064 goto invalid_code;
5065 ONE_MORE_BYTE (c1);
5066 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
5067 goto invalid_code;
5068 c = c << 8 | c1;
5069 charset = charset_big5;
5071 if (charset->id != charset_ascii
5072 && last_id != charset->id)
5074 if (last_id != charset_ascii)
5075 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5076 last_id = charset->id;
5077 last_offset = char_offset;
5079 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
5080 *charbuf++ = c;
5081 char_offset++;
5082 continue;
5084 invalid_code:
5085 src = src_base;
5086 consumed_chars = consumed_chars_base;
5087 ONE_MORE_BYTE (c);
5088 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
5089 char_offset++;
5090 coding->errors++;
5093 no_more_source:
5094 if (last_id != charset_ascii)
5095 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5096 coding->consumed_char += consumed_chars_base;
5097 coding->consumed = src_base - coding->source;
5098 coding->charbuf_used = charbuf - coding->charbuf;
5101 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
5102 This function can encode charsets `ascii', `katakana-jisx0201',
5103 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
5104 are sure that all these charsets are registered as official charset
5105 (i.e. do not have extended leading-codes). Characters of other
5106 charsets are produced without any encoding. If SJIS_P is 1, encode
5107 SJIS text, else encode BIG5 text. */
5109 static int
5110 encode_coding_sjis (coding)
5111 struct coding_system *coding;
5113 int multibytep = coding->dst_multibyte;
5114 int *charbuf = coding->charbuf;
5115 int *charbuf_end = charbuf + coding->charbuf_used;
5116 unsigned char *dst = coding->destination + coding->produced;
5117 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5118 int safe_room = 4;
5119 int produced_chars = 0;
5120 Lisp_Object attrs, charset_list, val;
5121 int ascii_compatible;
5122 struct charset *charset_roman, *charset_kanji, *charset_kana;
5123 struct charset *charset_kanji2;
5124 int c;
5126 CODING_GET_INFO (coding, attrs, charset_list);
5127 val = charset_list;
5128 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5129 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5130 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5131 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
5133 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5135 while (charbuf < charbuf_end)
5137 ASSURE_DESTINATION (safe_room);
5138 c = *charbuf++;
5139 /* Now encode the character C. */
5140 if (ASCII_CHAR_P (c) && ascii_compatible)
5141 EMIT_ONE_ASCII_BYTE (c);
5142 else if (CHAR_BYTE8_P (c))
5144 c = CHAR_TO_BYTE8 (c);
5145 EMIT_ONE_BYTE (c);
5147 else
5149 unsigned code;
5150 struct charset *charset;
5151 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5152 &code, charset);
5154 if (!charset)
5156 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5158 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5159 charset = CHARSET_FROM_ID (charset_ascii);
5161 else
5163 c = coding->default_char;
5164 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5165 charset_list, &code, charset);
5168 if (code == CHARSET_INVALID_CODE (charset))
5169 abort ();
5170 if (charset == charset_kanji)
5172 int c1, c2;
5173 JIS_TO_SJIS (code);
5174 c1 = code >> 8, c2 = code & 0xFF;
5175 EMIT_TWO_BYTES (c1, c2);
5177 else if (charset == charset_kana)
5178 EMIT_ONE_BYTE (code | 0x80);
5179 else if (charset_kanji2 && charset == charset_kanji2)
5181 int c1, c2;
5183 c1 = code >> 8;
5184 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5185 || c1 == 0x28
5186 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5188 JIS_TO_SJIS2 (code);
5189 c1 = code >> 8, c2 = code & 0xFF;
5190 EMIT_TWO_BYTES (c1, c2);
5192 else
5193 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5195 else
5196 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5199 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5200 coding->produced_char += produced_chars;
5201 coding->produced = dst - coding->destination;
5202 return 0;
5205 static int
5206 encode_coding_big5 (coding)
5207 struct coding_system *coding;
5209 int multibytep = coding->dst_multibyte;
5210 int *charbuf = coding->charbuf;
5211 int *charbuf_end = charbuf + coding->charbuf_used;
5212 unsigned char *dst = coding->destination + coding->produced;
5213 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5214 int safe_room = 4;
5215 int produced_chars = 0;
5216 Lisp_Object attrs, charset_list, val;
5217 int ascii_compatible;
5218 struct charset *charset_roman, *charset_big5;
5219 int c;
5221 CODING_GET_INFO (coding, attrs, charset_list);
5222 val = charset_list;
5223 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5224 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5225 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5227 while (charbuf < charbuf_end)
5229 ASSURE_DESTINATION (safe_room);
5230 c = *charbuf++;
5231 /* Now encode the character C. */
5232 if (ASCII_CHAR_P (c) && ascii_compatible)
5233 EMIT_ONE_ASCII_BYTE (c);
5234 else if (CHAR_BYTE8_P (c))
5236 c = CHAR_TO_BYTE8 (c);
5237 EMIT_ONE_BYTE (c);
5239 else
5241 unsigned code;
5242 struct charset *charset;
5243 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5244 &code, charset);
5246 if (! charset)
5248 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5250 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5251 charset = CHARSET_FROM_ID (charset_ascii);
5253 else
5255 c = coding->default_char;
5256 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5257 charset_list, &code, charset);
5260 if (code == CHARSET_INVALID_CODE (charset))
5261 abort ();
5262 if (charset == charset_big5)
5264 int c1, c2;
5266 c1 = code >> 8, c2 = code & 0xFF;
5267 EMIT_TWO_BYTES (c1, c2);
5269 else
5270 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5273 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5274 coding->produced_char += produced_chars;
5275 coding->produced = dst - coding->destination;
5276 return 0;
5280 /*** 10. CCL handlers ***/
5282 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5283 Check if a text is encoded in a coding system of which
5284 encoder/decoder are written in CCL program. If it is, return
5285 CATEGORY_MASK_CCL, else return 0. */
5287 static int
5288 detect_coding_ccl (coding, detect_info)
5289 struct coding_system *coding;
5290 struct coding_detection_info *detect_info;
5292 const unsigned char *src = coding->source, *src_base;
5293 const unsigned char *src_end = coding->source + coding->src_bytes;
5294 int multibytep = coding->src_multibyte;
5295 int consumed_chars = 0;
5296 int found = 0;
5297 unsigned char *valids;
5298 int head_ascii = coding->head_ascii;
5299 Lisp_Object attrs;
5301 detect_info->checked |= CATEGORY_MASK_CCL;
5303 coding = &coding_categories[coding_category_ccl];
5304 valids = CODING_CCL_VALIDS (coding);
5305 attrs = CODING_ID_ATTRS (coding->id);
5306 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5307 src += head_ascii;
5309 while (1)
5311 int c;
5313 src_base = src;
5314 ONE_MORE_BYTE (c);
5315 if (c < 0 || ! valids[c])
5316 break;
5317 if ((valids[c] > 1))
5318 found = CATEGORY_MASK_CCL;
5320 detect_info->rejected |= CATEGORY_MASK_CCL;
5321 return 0;
5323 no_more_source:
5324 detect_info->found |= found;
5325 return 1;
5328 static void
5329 decode_coding_ccl (coding)
5330 struct coding_system *coding;
5332 const unsigned char *src = coding->source + coding->consumed;
5333 const unsigned char *src_end = coding->source + coding->src_bytes;
5334 int *charbuf = coding->charbuf + coding->charbuf_used;
5335 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5336 int consumed_chars = 0;
5337 int multibytep = coding->src_multibyte;
5338 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5339 int source_charbuf[1024];
5340 int source_byteidx[1025];
5341 Lisp_Object attrs, charset_list;
5343 CODING_GET_INFO (coding, attrs, charset_list);
5345 while (1)
5347 const unsigned char *p = src;
5348 int i = 0;
5350 if (multibytep)
5352 while (i < 1024 && p < src_end)
5354 source_byteidx[i] = p - src;
5355 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5357 source_byteidx[i] = p - src;
5359 else
5360 while (i < 1024 && p < src_end)
5361 source_charbuf[i++] = *p++;
5363 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5364 ccl->last_block = 1;
5365 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5366 charset_list);
5367 charbuf += ccl->produced;
5368 if (multibytep)
5369 src += source_byteidx[ccl->consumed];
5370 else
5371 src += ccl->consumed;
5372 consumed_chars += ccl->consumed;
5373 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5374 break;
5377 switch (ccl->status)
5379 case CCL_STAT_SUSPEND_BY_SRC:
5380 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5381 break;
5382 case CCL_STAT_SUSPEND_BY_DST:
5383 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5384 break;
5385 case CCL_STAT_QUIT:
5386 case CCL_STAT_INVALID_CMD:
5387 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5388 break;
5389 default:
5390 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5391 break;
5393 coding->consumed_char += consumed_chars;
5394 coding->consumed = src - coding->source;
5395 coding->charbuf_used = charbuf - coding->charbuf;
5398 static int
5399 encode_coding_ccl (coding)
5400 struct coding_system *coding;
5402 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5403 int multibytep = coding->dst_multibyte;
5404 int *charbuf = coding->charbuf;
5405 int *charbuf_end = charbuf + coding->charbuf_used;
5406 unsigned char *dst = coding->destination + coding->produced;
5407 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5408 int destination_charbuf[1024];
5409 int i, produced_chars = 0;
5410 Lisp_Object attrs, charset_list;
5412 CODING_GET_INFO (coding, attrs, charset_list);
5413 if (coding->consumed_char == coding->src_chars
5414 && coding->mode & CODING_MODE_LAST_BLOCK)
5415 ccl->last_block = 1;
5417 while (charbuf < charbuf_end)
5419 ccl_driver (ccl, charbuf, destination_charbuf,
5420 charbuf_end - charbuf, 1024, charset_list);
5421 if (multibytep)
5423 ASSURE_DESTINATION (ccl->produced * 2);
5424 for (i = 0; i < ccl->produced; i++)
5425 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5427 else
5429 ASSURE_DESTINATION (ccl->produced);
5430 for (i = 0; i < ccl->produced; i++)
5431 *dst++ = destination_charbuf[i] & 0xFF;
5432 produced_chars += ccl->produced;
5434 charbuf += ccl->consumed;
5435 if (ccl->status == CCL_STAT_QUIT
5436 || ccl->status == CCL_STAT_INVALID_CMD)
5437 break;
5440 switch (ccl->status)
5442 case CCL_STAT_SUSPEND_BY_SRC:
5443 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5444 break;
5445 case CCL_STAT_SUSPEND_BY_DST:
5446 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5447 break;
5448 case CCL_STAT_QUIT:
5449 case CCL_STAT_INVALID_CMD:
5450 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5451 break;
5452 default:
5453 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5454 break;
5457 coding->produced_char += produced_chars;
5458 coding->produced = dst - coding->destination;
5459 return 0;
5464 /*** 10, 11. no-conversion handlers ***/
5466 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5468 static void
5469 decode_coding_raw_text (coding)
5470 struct coding_system *coding;
5472 int eol_crlf =
5473 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5475 coding->chars_at_source = 1;
5476 coding->consumed_char = coding->src_chars;
5477 coding->consumed = coding->src_bytes;
5478 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
5480 coding->consumed_char--;
5481 coding->consumed--;
5482 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5484 else
5485 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5488 static int
5489 encode_coding_raw_text (coding)
5490 struct coding_system *coding;
5492 int multibytep = coding->dst_multibyte;
5493 int *charbuf = coding->charbuf;
5494 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5495 unsigned char *dst = coding->destination + coding->produced;
5496 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5497 int produced_chars = 0;
5498 int c;
5500 if (multibytep)
5502 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5504 if (coding->src_multibyte)
5505 while (charbuf < charbuf_end)
5507 ASSURE_DESTINATION (safe_room);
5508 c = *charbuf++;
5509 if (ASCII_CHAR_P (c))
5510 EMIT_ONE_ASCII_BYTE (c);
5511 else if (CHAR_BYTE8_P (c))
5513 c = CHAR_TO_BYTE8 (c);
5514 EMIT_ONE_BYTE (c);
5516 else
5518 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5520 CHAR_STRING_ADVANCE (c, p1);
5521 while (p0 < p1)
5523 EMIT_ONE_BYTE (*p0);
5524 p0++;
5528 else
5529 while (charbuf < charbuf_end)
5531 ASSURE_DESTINATION (safe_room);
5532 c = *charbuf++;
5533 EMIT_ONE_BYTE (c);
5536 else
5538 if (coding->src_multibyte)
5540 int safe_room = MAX_MULTIBYTE_LENGTH;
5542 while (charbuf < charbuf_end)
5544 ASSURE_DESTINATION (safe_room);
5545 c = *charbuf++;
5546 if (ASCII_CHAR_P (c))
5547 *dst++ = c;
5548 else if (CHAR_BYTE8_P (c))
5549 *dst++ = CHAR_TO_BYTE8 (c);
5550 else
5551 CHAR_STRING_ADVANCE (c, dst);
5554 else
5556 ASSURE_DESTINATION (charbuf_end - charbuf);
5557 while (charbuf < charbuf_end && dst < dst_end)
5558 *dst++ = *charbuf++;
5560 produced_chars = dst - (coding->destination + coding->produced);
5562 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5563 coding->produced_char += produced_chars;
5564 coding->produced = dst - coding->destination;
5565 return 0;
5568 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5569 Check if a text is encoded in a charset-based coding system. If it
5570 is, return 1, else return 0. */
5572 static int
5573 detect_coding_charset (coding, detect_info)
5574 struct coding_system *coding;
5575 struct coding_detection_info *detect_info;
5577 const unsigned char *src = coding->source, *src_base;
5578 const unsigned char *src_end = coding->source + coding->src_bytes;
5579 int multibytep = coding->src_multibyte;
5580 int consumed_chars = 0;
5581 Lisp_Object attrs, valids, name;
5582 int found = 0;
5583 int head_ascii = coding->head_ascii;
5584 int check_latin_extra = 0;
5586 detect_info->checked |= CATEGORY_MASK_CHARSET;
5588 coding = &coding_categories[coding_category_charset];
5589 attrs = CODING_ID_ATTRS (coding->id);
5590 valids = AREF (attrs, coding_attr_charset_valids);
5591 name = CODING_ID_NAME (coding->id);
5592 if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5593 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5594 || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5595 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5596 check_latin_extra = 1;
5598 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5599 src += head_ascii;
5601 while (1)
5603 int c;
5604 Lisp_Object val;
5605 struct charset *charset;
5606 int dim, idx;
5608 src_base = src;
5609 ONE_MORE_BYTE (c);
5610 if (c < 0)
5611 continue;
5612 val = AREF (valids, c);
5613 if (NILP (val))
5614 break;
5615 if (c >= 0x80)
5617 if (c < 0xA0
5618 && check_latin_extra
5619 && (!VECTORP (Vlatin_extra_code_table)
5620 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5621 break;
5622 found = CATEGORY_MASK_CHARSET;
5624 if (INTEGERP (val))
5626 charset = CHARSET_FROM_ID (XFASTINT (val));
5627 dim = CHARSET_DIMENSION (charset);
5628 for (idx = 1; idx < dim; idx++)
5630 if (src == src_end)
5631 goto too_short;
5632 ONE_MORE_BYTE (c);
5633 if (c < charset->code_space[(dim - 1 - idx) * 2]
5634 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5635 break;
5637 if (idx < dim)
5638 break;
5640 else
5642 idx = 1;
5643 for (; CONSP (val); val = XCDR (val))
5645 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5646 dim = CHARSET_DIMENSION (charset);
5647 while (idx < dim)
5649 if (src == src_end)
5650 goto too_short;
5651 ONE_MORE_BYTE (c);
5652 if (c < charset->code_space[(dim - 1 - idx) * 4]
5653 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5654 break;
5655 idx++;
5657 if (idx == dim)
5659 val = Qnil;
5660 break;
5663 if (CONSP (val))
5664 break;
5667 too_short:
5668 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5669 return 0;
5671 no_more_source:
5672 detect_info->found |= found;
5673 return 1;
5676 static void
5677 decode_coding_charset (coding)
5678 struct coding_system *coding;
5680 const unsigned char *src = coding->source + coding->consumed;
5681 const unsigned char *src_end = coding->source + coding->src_bytes;
5682 const unsigned char *src_base;
5683 int *charbuf = coding->charbuf + coding->charbuf_used;
5684 /* We may produce one charset annotation in one loop and one more at
5685 the end. */
5686 int *charbuf_end
5687 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5688 int consumed_chars = 0, consumed_chars_base;
5689 int multibytep = coding->src_multibyte;
5690 Lisp_Object attrs, charset_list, valids;
5691 int char_offset = coding->produced_char;
5692 int last_offset = char_offset;
5693 int last_id = charset_ascii;
5694 int eol_crlf =
5695 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5696 int byte_after_cr = -1;
5698 CODING_GET_INFO (coding, attrs, charset_list);
5699 valids = AREF (attrs, coding_attr_charset_valids);
5701 while (1)
5703 int c;
5704 Lisp_Object val;
5705 struct charset *charset;
5706 int dim;
5707 int len = 1;
5708 unsigned code;
5710 src_base = src;
5711 consumed_chars_base = consumed_chars;
5713 if (charbuf >= charbuf_end)
5715 if (byte_after_cr >= 0)
5716 src_base--;
5717 break;
5720 if (byte_after_cr >= 0)
5722 c = byte_after_cr;
5723 byte_after_cr = -1;
5725 else
5727 ONE_MORE_BYTE (c);
5728 if (eol_crlf && c == '\r')
5729 ONE_MORE_BYTE (byte_after_cr);
5731 if (c < 0)
5732 goto invalid_code;
5733 code = c;
5735 val = AREF (valids, c);
5736 if (! INTEGERP (val) && ! CONSP (val))
5737 goto invalid_code;
5738 if (INTEGERP (val))
5740 charset = CHARSET_FROM_ID (XFASTINT (val));
5741 dim = CHARSET_DIMENSION (charset);
5742 while (len < dim)
5744 ONE_MORE_BYTE (c);
5745 code = (code << 8) | c;
5746 len++;
5748 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5749 charset, code, c);
5751 else
5753 /* VAL is a list of charset IDs. It is assured that the
5754 list is sorted by charset dimensions (smaller one
5755 comes first). */
5756 while (CONSP (val))
5758 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5759 dim = CHARSET_DIMENSION (charset);
5760 while (len < dim)
5762 ONE_MORE_BYTE (c);
5763 code = (code << 8) | c;
5764 len++;
5766 CODING_DECODE_CHAR (coding, src, src_base,
5767 src_end, charset, code, c);
5768 if (c >= 0)
5769 break;
5770 val = XCDR (val);
5773 if (c < 0)
5774 goto invalid_code;
5775 if (charset->id != charset_ascii
5776 && last_id != charset->id)
5778 if (last_id != charset_ascii)
5779 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5780 last_id = charset->id;
5781 last_offset = char_offset;
5784 *charbuf++ = c;
5785 char_offset++;
5786 continue;
5788 invalid_code:
5789 src = src_base;
5790 consumed_chars = consumed_chars_base;
5791 ONE_MORE_BYTE (c);
5792 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5793 char_offset++;
5794 coding->errors++;
5797 no_more_source:
5798 if (last_id != charset_ascii)
5799 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5800 coding->consumed_char += consumed_chars_base;
5801 coding->consumed = src_base - coding->source;
5802 coding->charbuf_used = charbuf - coding->charbuf;
5805 static int
5806 encode_coding_charset (coding)
5807 struct coding_system *coding;
5809 int multibytep = coding->dst_multibyte;
5810 int *charbuf = coding->charbuf;
5811 int *charbuf_end = charbuf + coding->charbuf_used;
5812 unsigned char *dst = coding->destination + coding->produced;
5813 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5814 int safe_room = MAX_MULTIBYTE_LENGTH;
5815 int produced_chars = 0;
5816 Lisp_Object attrs, charset_list;
5817 int ascii_compatible;
5818 int c;
5820 CODING_GET_INFO (coding, attrs, charset_list);
5821 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5823 while (charbuf < charbuf_end)
5825 struct charset *charset;
5826 unsigned code;
5828 ASSURE_DESTINATION (safe_room);
5829 c = *charbuf++;
5830 if (ascii_compatible && ASCII_CHAR_P (c))
5831 EMIT_ONE_ASCII_BYTE (c);
5832 else if (CHAR_BYTE8_P (c))
5834 c = CHAR_TO_BYTE8 (c);
5835 EMIT_ONE_BYTE (c);
5837 else
5839 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5840 &code, charset);
5842 if (charset)
5844 if (CHARSET_DIMENSION (charset) == 1)
5845 EMIT_ONE_BYTE (code);
5846 else if (CHARSET_DIMENSION (charset) == 2)
5847 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5848 else if (CHARSET_DIMENSION (charset) == 3)
5849 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5850 else
5851 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5852 (code >> 8) & 0xFF, code & 0xFF);
5854 else
5856 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5857 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5858 else
5859 c = coding->default_char;
5860 EMIT_ONE_BYTE (c);
5865 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5866 coding->produced_char += produced_chars;
5867 coding->produced = dst - coding->destination;
5868 return 0;
5872 /*** 7. C library functions ***/
5874 /* Setup coding context CODING from information about CODING_SYSTEM.
5875 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5876 CODING_SYSTEM is invalid, signal an error. */
5878 void
5879 setup_coding_system (coding_system, coding)
5880 Lisp_Object coding_system;
5881 struct coding_system *coding;
5883 Lisp_Object attrs;
5884 Lisp_Object eol_type;
5885 Lisp_Object coding_type;
5886 Lisp_Object val;
5888 if (NILP (coding_system))
5889 coding_system = Qundecided;
5891 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5893 attrs = CODING_ID_ATTRS (coding->id);
5894 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5896 coding->mode = 0;
5897 coding->head_ascii = -1;
5898 if (VECTORP (eol_type))
5899 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5900 | CODING_REQUIRE_DETECTION_MASK);
5901 else if (! EQ (eol_type, Qunix))
5902 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5903 | CODING_REQUIRE_ENCODING_MASK);
5904 else
5905 coding->common_flags = 0;
5906 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5907 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5908 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5909 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5910 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5911 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5913 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5914 coding->max_charset_id = SCHARS (val) - 1;
5915 coding->safe_charsets = SDATA (val);
5916 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5917 coding->carryover_bytes = 0;
5919 coding_type = CODING_ATTR_TYPE (attrs);
5920 if (EQ (coding_type, Qundecided))
5922 coding->detector = NULL;
5923 coding->decoder = decode_coding_raw_text;
5924 coding->encoder = encode_coding_raw_text;
5925 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5927 else if (EQ (coding_type, Qiso_2022))
5929 int i;
5930 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5932 /* Invoke graphic register 0 to plane 0. */
5933 CODING_ISO_INVOCATION (coding, 0) = 0;
5934 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5935 CODING_ISO_INVOCATION (coding, 1)
5936 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5937 /* Setup the initial status of designation. */
5938 for (i = 0; i < 4; i++)
5939 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5940 /* Not single shifting initially. */
5941 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5942 /* Beginning of buffer should also be regarded as bol. */
5943 CODING_ISO_BOL (coding) = 1;
5944 coding->detector = detect_coding_iso_2022;
5945 coding->decoder = decode_coding_iso_2022;
5946 coding->encoder = encode_coding_iso_2022;
5947 if (flags & CODING_ISO_FLAG_SAFE)
5948 coding->mode |= CODING_MODE_SAFE_ENCODING;
5949 coding->common_flags
5950 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5951 | CODING_REQUIRE_FLUSHING_MASK);
5952 if (flags & CODING_ISO_FLAG_COMPOSITION)
5953 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5954 if (flags & CODING_ISO_FLAG_DESIGNATION)
5955 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5956 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5958 setup_iso_safe_charsets (attrs);
5959 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5960 coding->max_charset_id = SCHARS (val) - 1;
5961 coding->safe_charsets = SDATA (val);
5963 CODING_ISO_FLAGS (coding) = flags;
5964 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5965 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5966 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5967 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5969 else if (EQ (coding_type, Qcharset))
5971 coding->detector = detect_coding_charset;
5972 coding->decoder = decode_coding_charset;
5973 coding->encoder = encode_coding_charset;
5974 coding->common_flags
5975 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5977 else if (EQ (coding_type, Qutf_8))
5979 val = AREF (attrs, coding_attr_utf_bom);
5980 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5981 : EQ (val, Qt) ? utf_with_bom
5982 : utf_without_bom);
5983 coding->detector = detect_coding_utf_8;
5984 coding->decoder = decode_coding_utf_8;
5985 coding->encoder = encode_coding_utf_8;
5986 coding->common_flags
5987 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5988 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5989 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5991 else if (EQ (coding_type, Qutf_16))
5993 val = AREF (attrs, coding_attr_utf_bom);
5994 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5995 : EQ (val, Qt) ? utf_with_bom
5996 : utf_without_bom);
5997 val = AREF (attrs, coding_attr_utf_16_endian);
5998 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5999 : utf_16_little_endian);
6000 CODING_UTF_16_SURROGATE (coding) = 0;
6001 coding->detector = detect_coding_utf_16;
6002 coding->decoder = decode_coding_utf_16;
6003 coding->encoder = encode_coding_utf_16;
6004 coding->common_flags
6005 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
6006 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
6007 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
6009 else if (EQ (coding_type, Qccl))
6011 coding->detector = detect_coding_ccl;
6012 coding->decoder = decode_coding_ccl;
6013 coding->encoder = encode_coding_ccl;
6014 coding->common_flags
6015 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
6016 | CODING_REQUIRE_FLUSHING_MASK);
6018 else if (EQ (coding_type, Qemacs_mule))
6020 coding->detector = detect_coding_emacs_mule;
6021 coding->decoder = decode_coding_emacs_mule;
6022 coding->encoder = encode_coding_emacs_mule;
6023 coding->common_flags
6024 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
6025 coding->spec.emacs_mule.full_support = 1;
6026 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
6027 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
6029 Lisp_Object tail, safe_charsets;
6030 int max_charset_id = 0;
6032 for (tail = Vemacs_mule_charset_list; CONSP (tail);
6033 tail = XCDR (tail))
6034 if (max_charset_id < XFASTINT (XCAR (tail)))
6035 max_charset_id = XFASTINT (XCAR (tail));
6036 safe_charsets = make_uninit_string (max_charset_id + 1);
6037 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
6038 for (tail = Vemacs_mule_charset_list; CONSP (tail);
6039 tail = XCDR (tail))
6040 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
6041 coding->max_charset_id = max_charset_id;
6042 coding->safe_charsets = SDATA (safe_charsets);
6043 coding->spec.emacs_mule.full_support = 1;
6045 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
6046 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
6048 else if (EQ (coding_type, Qshift_jis))
6050 coding->detector = detect_coding_sjis;
6051 coding->decoder = decode_coding_sjis;
6052 coding->encoder = encode_coding_sjis;
6053 coding->common_flags
6054 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
6056 else if (EQ (coding_type, Qbig5))
6058 coding->detector = detect_coding_big5;
6059 coding->decoder = decode_coding_big5;
6060 coding->encoder = encode_coding_big5;
6061 coding->common_flags
6062 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
6064 else /* EQ (coding_type, Qraw_text) */
6066 coding->detector = NULL;
6067 coding->decoder = decode_coding_raw_text;
6068 coding->encoder = encode_coding_raw_text;
6069 if (! EQ (eol_type, Qunix))
6071 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
6072 if (! VECTORP (eol_type))
6073 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
6078 return;
6081 /* Return a list of charsets supported by CODING. */
6083 Lisp_Object
6084 coding_charset_list (coding)
6085 struct coding_system *coding;
6087 Lisp_Object attrs, charset_list;
6089 CODING_GET_INFO (coding, attrs, charset_list);
6090 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
6092 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
6094 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
6095 charset_list = Viso_2022_charset_list;
6097 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6099 charset_list = Vemacs_mule_charset_list;
6101 return charset_list;
6105 /* Return a list of charsets supported by CODING-SYSTEM. */
6107 Lisp_Object
6108 coding_system_charset_list (coding_system)
6109 Lisp_Object coding_system;
6111 int id;
6112 Lisp_Object attrs, charset_list;
6114 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
6115 attrs = CODING_ID_ATTRS (id);
6117 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
6119 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
6121 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
6122 charset_list = Viso_2022_charset_list;
6123 else
6124 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6126 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6128 charset_list = Vemacs_mule_charset_list;
6130 else
6132 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6134 return charset_list;
6138 /* Return raw-text or one of its subsidiaries that has the same
6139 eol_type as CODING-SYSTEM. */
6141 Lisp_Object
6142 raw_text_coding_system (coding_system)
6143 Lisp_Object coding_system;
6145 Lisp_Object spec, attrs;
6146 Lisp_Object eol_type, raw_text_eol_type;
6148 if (NILP (coding_system))
6149 return Qraw_text;
6150 spec = CODING_SYSTEM_SPEC (coding_system);
6151 attrs = AREF (spec, 0);
6153 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6154 return coding_system;
6156 eol_type = AREF (spec, 2);
6157 if (VECTORP (eol_type))
6158 return Qraw_text;
6159 spec = CODING_SYSTEM_SPEC (Qraw_text);
6160 raw_text_eol_type = AREF (spec, 2);
6161 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6162 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6163 : AREF (raw_text_eol_type, 2));
6167 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6168 the subsidiary that has the same eol-spec as PARENT (if it is not
6169 nil and specifies end-of-line format) or the system's setting
6170 (system_eol_type). */
6172 Lisp_Object
6173 coding_inherit_eol_type (coding_system, parent)
6174 Lisp_Object coding_system, parent;
6176 Lisp_Object spec, eol_type;
6178 if (NILP (coding_system))
6179 coding_system = Qraw_text;
6180 spec = CODING_SYSTEM_SPEC (coding_system);
6181 eol_type = AREF (spec, 2);
6182 if (VECTORP (eol_type))
6184 Lisp_Object parent_eol_type;
6186 if (! NILP (parent))
6188 Lisp_Object parent_spec;
6190 parent_spec = CODING_SYSTEM_SPEC (parent);
6191 parent_eol_type = AREF (parent_spec, 2);
6192 if (VECTORP (parent_eol_type))
6193 parent_eol_type = system_eol_type;
6195 else
6196 parent_eol_type = system_eol_type;
6197 if (EQ (parent_eol_type, Qunix))
6198 coding_system = AREF (eol_type, 0);
6199 else if (EQ (parent_eol_type, Qdos))
6200 coding_system = AREF (eol_type, 1);
6201 else if (EQ (parent_eol_type, Qmac))
6202 coding_system = AREF (eol_type, 2);
6204 return coding_system;
6208 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6209 decided for writing to a process. If not, complement them, and
6210 return a new coding system. */
6212 Lisp_Object
6213 complement_process_encoding_system (coding_system)
6214 Lisp_Object coding_system;
6216 Lisp_Object coding_base = Qnil, eol_base = Qnil;
6217 Lisp_Object spec, attrs;
6218 int i;
6220 for (i = 0; i < 3; i++)
6222 if (i == 1)
6223 coding_system = CDR_SAFE (Vdefault_process_coding_system);
6224 else if (i == 2)
6225 coding_system = preferred_coding_system ();
6226 spec = CODING_SYSTEM_SPEC (coding_system);
6227 if (NILP (spec))
6228 continue;
6229 attrs = AREF (spec, 0);
6230 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6231 coding_base = CODING_ATTR_BASE_NAME (attrs);
6232 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6233 eol_base = coding_system;
6234 if (! NILP (coding_base) && ! NILP (eol_base))
6235 break;
6238 if (i > 0)
6239 /* The original CODING_SYSTEM didn't specify text-conversion or
6240 eol-conversion. Be sure that we return a fully complemented
6241 coding system. */
6242 coding_system = coding_inherit_eol_type (coding_base, eol_base);
6243 return coding_system;
6247 /* Emacs has a mechanism to automatically detect a coding system if it
6248 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6249 it's impossible to distinguish some coding systems accurately
6250 because they use the same range of codes. So, at first, coding
6251 systems are categorized into 7, those are:
6253 o coding-category-emacs-mule
6255 The category for a coding system which has the same code range
6256 as Emacs' internal format. Assigned the coding-system (Lisp
6257 symbol) `emacs-mule' by default.
6259 o coding-category-sjis
6261 The category for a coding system which has the same code range
6262 as SJIS. Assigned the coding-system (Lisp
6263 symbol) `japanese-shift-jis' by default.
6265 o coding-category-iso-7
6267 The category for a coding system which has the same code range
6268 as ISO2022 of 7-bit environment. This doesn't use any locking
6269 shift and single shift functions. This can encode/decode all
6270 charsets. Assigned the coding-system (Lisp symbol)
6271 `iso-2022-7bit' by default.
6273 o coding-category-iso-7-tight
6275 Same as coding-category-iso-7 except that this can
6276 encode/decode only the specified charsets.
6278 o coding-category-iso-8-1
6280 The category for a coding system which has the same code range
6281 as ISO2022 of 8-bit environment and graphic plane 1 used only
6282 for DIMENSION1 charset. This doesn't use any locking shift
6283 and single shift functions. Assigned the coding-system (Lisp
6284 symbol) `iso-latin-1' by default.
6286 o coding-category-iso-8-2
6288 The category for a coding system which has the same code range
6289 as ISO2022 of 8-bit environment and graphic plane 1 used only
6290 for DIMENSION2 charset. This doesn't use any locking shift
6291 and single shift functions. Assigned the coding-system (Lisp
6292 symbol) `japanese-iso-8bit' by default.
6294 o coding-category-iso-7-else
6296 The category for a coding system which has the same code range
6297 as ISO2022 of 7-bit environment but uses locking shift or
6298 single shift functions. Assigned the coding-system (Lisp
6299 symbol) `iso-2022-7bit-lock' by default.
6301 o coding-category-iso-8-else
6303 The category for a coding system which has the same code range
6304 as ISO2022 of 8-bit environment but uses locking shift or
6305 single shift functions. Assigned the coding-system (Lisp
6306 symbol) `iso-2022-8bit-ss2' by default.
6308 o coding-category-big5
6310 The category for a coding system which has the same code range
6311 as BIG5. Assigned the coding-system (Lisp symbol)
6312 `cn-big5' by default.
6314 o coding-category-utf-8
6316 The category for a coding system which has the same code range
6317 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6318 symbol) `utf-8' by default.
6320 o coding-category-utf-16-be
6322 The category for a coding system in which a text has an
6323 Unicode signature (cf. Unicode Standard) in the order of BIG
6324 endian at the head. Assigned the coding-system (Lisp symbol)
6325 `utf-16-be' by default.
6327 o coding-category-utf-16-le
6329 The category for a coding system in which a text has an
6330 Unicode signature (cf. Unicode Standard) in the order of
6331 LITTLE endian at the head. Assigned the coding-system (Lisp
6332 symbol) `utf-16-le' by default.
6334 o coding-category-ccl
6336 The category for a coding system of which encoder/decoder is
6337 written in CCL programs. The default value is nil, i.e., no
6338 coding system is assigned.
6340 o coding-category-binary
6342 The category for a coding system not categorized in any of the
6343 above. Assigned the coding-system (Lisp symbol)
6344 `no-conversion' by default.
6346 Each of them is a Lisp symbol and the value is an actual
6347 `coding-system's (this is also a Lisp symbol) assigned by a user.
6348 What Emacs does actually is to detect a category of coding system.
6349 Then, it uses a `coding-system' assigned to it. If Emacs can't
6350 decide only one possible category, it selects a category of the
6351 highest priority. Priorities of categories are also specified by a
6352 user in a Lisp variable `coding-category-list'.
6356 #define EOL_SEEN_NONE 0
6357 #define EOL_SEEN_LF 1
6358 #define EOL_SEEN_CR 2
6359 #define EOL_SEEN_CRLF 4
6361 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6362 SOURCE is encoded. If CATEGORY is one of
6363 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6364 two-byte, else they are encoded by one-byte.
6366 Return one of EOL_SEEN_XXX. */
6368 #define MAX_EOL_CHECK_COUNT 3
6370 static int
6371 detect_eol (source, src_bytes, category)
6372 const unsigned char *source;
6373 EMACS_INT src_bytes;
6374 enum coding_category category;
6376 const unsigned char *src = source, *src_end = src + src_bytes;
6377 unsigned char c;
6378 int total = 0;
6379 int eol_seen = EOL_SEEN_NONE;
6381 if ((1 << category) & CATEGORY_MASK_UTF_16)
6383 int msb, lsb;
6385 msb = category == (coding_category_utf_16_le
6386 | coding_category_utf_16_le_nosig);
6387 lsb = 1 - msb;
6389 while (src + 1 < src_end)
6391 c = src[lsb];
6392 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6394 int this_eol;
6396 if (c == '\n')
6397 this_eol = EOL_SEEN_LF;
6398 else if (src + 3 >= src_end
6399 || src[msb + 2] != 0
6400 || src[lsb + 2] != '\n')
6401 this_eol = EOL_SEEN_CR;
6402 else
6404 this_eol = EOL_SEEN_CRLF;
6405 src += 2;
6408 if (eol_seen == EOL_SEEN_NONE)
6409 /* This is the first end-of-line. */
6410 eol_seen = this_eol;
6411 else if (eol_seen != this_eol)
6413 /* The found type is different from what found before.
6414 Allow for stray ^M characters in DOS EOL files. */
6415 if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6416 || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6417 eol_seen = EOL_SEEN_CRLF;
6418 else
6420 eol_seen = EOL_SEEN_LF;
6421 break;
6424 if (++total == MAX_EOL_CHECK_COUNT)
6425 break;
6427 src += 2;
6430 else
6432 while (src < src_end)
6434 c = *src++;
6435 if (c == '\n' || c == '\r')
6437 int this_eol;
6439 if (c == '\n')
6440 this_eol = EOL_SEEN_LF;
6441 else if (src >= src_end || *src != '\n')
6442 this_eol = EOL_SEEN_CR;
6443 else
6444 this_eol = EOL_SEEN_CRLF, src++;
6446 if (eol_seen == EOL_SEEN_NONE)
6447 /* This is the first end-of-line. */
6448 eol_seen = this_eol;
6449 else if (eol_seen != this_eol)
6451 /* The found type is different from what found before.
6452 Allow for stray ^M characters in DOS EOL files. */
6453 if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6454 || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6455 eol_seen = EOL_SEEN_CRLF;
6456 else
6458 eol_seen = EOL_SEEN_LF;
6459 break;
6462 if (++total == MAX_EOL_CHECK_COUNT)
6463 break;
6467 return eol_seen;
6471 static Lisp_Object
6472 adjust_coding_eol_type (coding, eol_seen)
6473 struct coding_system *coding;
6474 int eol_seen;
6476 Lisp_Object eol_type;
6478 eol_type = CODING_ID_EOL_TYPE (coding->id);
6479 if (eol_seen & EOL_SEEN_LF)
6481 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6482 eol_type = Qunix;
6484 else if (eol_seen & EOL_SEEN_CRLF)
6486 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6487 eol_type = Qdos;
6489 else if (eol_seen & EOL_SEEN_CR)
6491 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6492 eol_type = Qmac;
6494 return eol_type;
6497 /* Detect how a text specified in CODING is encoded. If a coding
6498 system is detected, update fields of CODING by the detected coding
6499 system. */
6501 void
6502 detect_coding (coding)
6503 struct coding_system *coding;
6505 const unsigned char *src, *src_end;
6506 int saved_mode = coding->mode;
6508 coding->consumed = coding->consumed_char = 0;
6509 coding->produced = coding->produced_char = 0;
6510 coding_set_source (coding);
6512 src_end = coding->source + coding->src_bytes;
6513 coding->head_ascii = 0;
6515 /* If we have not yet decided the text encoding type, detect it
6516 now. */
6517 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6519 int c, i;
6520 struct coding_detection_info detect_info;
6521 int null_byte_found = 0, eight_bit_found = 0;
6523 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6524 for (src = coding->source; src < src_end; src++)
6526 c = *src;
6527 if (c & 0x80)
6529 eight_bit_found = 1;
6530 if (null_byte_found)
6531 break;
6533 else if (c < 0x20)
6535 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6536 && ! inhibit_iso_escape_detection
6537 && ! detect_info.checked)
6539 if (detect_coding_iso_2022 (coding, &detect_info))
6541 /* We have scanned the whole data. */
6542 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6544 /* We didn't find an 8-bit code. We may
6545 have found a null-byte, but it's very
6546 rare that a binary file confirm to
6547 ISO-2022. */
6548 src = src_end;
6549 coding->head_ascii = src - coding->source;
6551 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6552 break;
6555 else if (! c && !inhibit_null_byte_detection)
6557 null_byte_found = 1;
6558 if (eight_bit_found)
6559 break;
6561 if (! eight_bit_found)
6562 coding->head_ascii++;
6564 else if (! eight_bit_found)
6565 coding->head_ascii++;
6568 if (null_byte_found || eight_bit_found
6569 || coding->head_ascii < coding->src_bytes
6570 || detect_info.found)
6572 enum coding_category category;
6573 struct coding_system *this;
6575 if (coding->head_ascii == coding->src_bytes)
6576 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6577 for (i = 0; i < coding_category_raw_text; i++)
6579 category = coding_priorities[i];
6580 this = coding_categories + category;
6581 if (detect_info.found & (1 << category))
6582 break;
6584 else
6586 if (null_byte_found)
6588 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6589 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6591 for (i = 0; i < coding_category_raw_text; i++)
6593 category = coding_priorities[i];
6594 this = coding_categories + category;
6595 if (this->id < 0)
6597 /* No coding system of this category is defined. */
6598 detect_info.rejected |= (1 << category);
6600 else if (category >= coding_category_raw_text)
6601 continue;
6602 else if (detect_info.checked & (1 << category))
6604 if (detect_info.found & (1 << category))
6605 break;
6607 else if ((*(this->detector)) (coding, &detect_info)
6608 && detect_info.found & (1 << category))
6610 if (category == coding_category_utf_16_auto)
6612 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6613 category = coding_category_utf_16_le;
6614 else
6615 category = coding_category_utf_16_be;
6617 break;
6622 if (i < coding_category_raw_text)
6623 setup_coding_system (CODING_ID_NAME (this->id), coding);
6624 else if (null_byte_found)
6625 setup_coding_system (Qno_conversion, coding);
6626 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6627 == CATEGORY_MASK_ANY)
6628 setup_coding_system (Qraw_text, coding);
6629 else if (detect_info.rejected)
6630 for (i = 0; i < coding_category_raw_text; i++)
6631 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6633 this = coding_categories + coding_priorities[i];
6634 setup_coding_system (CODING_ID_NAME (this->id), coding);
6635 break;
6639 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6640 == coding_category_utf_8_auto)
6642 Lisp_Object coding_systems;
6643 struct coding_detection_info detect_info;
6645 coding_systems
6646 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6647 detect_info.found = detect_info.rejected = 0;
6648 coding->head_ascii = 0;
6649 if (CONSP (coding_systems)
6650 && detect_coding_utf_8 (coding, &detect_info))
6652 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6653 setup_coding_system (XCAR (coding_systems), coding);
6654 else
6655 setup_coding_system (XCDR (coding_systems), coding);
6658 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6659 == coding_category_utf_16_auto)
6661 Lisp_Object coding_systems;
6662 struct coding_detection_info detect_info;
6664 coding_systems
6665 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6666 detect_info.found = detect_info.rejected = 0;
6667 coding->head_ascii = 0;
6668 if (CONSP (coding_systems)
6669 && detect_coding_utf_16 (coding, &detect_info))
6671 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6672 setup_coding_system (XCAR (coding_systems), coding);
6673 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6674 setup_coding_system (XCDR (coding_systems), coding);
6677 coding->mode = saved_mode;
6681 static void
6682 decode_eol (coding)
6683 struct coding_system *coding;
6685 Lisp_Object eol_type;
6686 unsigned char *p, *pbeg, *pend;
6688 eol_type = CODING_ID_EOL_TYPE (coding->id);
6689 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6690 return;
6692 if (NILP (coding->dst_object))
6693 pbeg = coding->destination;
6694 else
6695 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6696 pend = pbeg + coding->produced;
6698 if (VECTORP (eol_type))
6700 int eol_seen = EOL_SEEN_NONE;
6702 for (p = pbeg; p < pend; p++)
6704 if (*p == '\n')
6705 eol_seen |= EOL_SEEN_LF;
6706 else if (*p == '\r')
6708 if (p + 1 < pend && *(p + 1) == '\n')
6710 eol_seen |= EOL_SEEN_CRLF;
6711 p++;
6713 else
6714 eol_seen |= EOL_SEEN_CR;
6717 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6718 if ((eol_seen & EOL_SEEN_CRLF) != 0
6719 && (eol_seen & EOL_SEEN_CR) != 0
6720 && (eol_seen & EOL_SEEN_LF) == 0)
6721 eol_seen = EOL_SEEN_CRLF;
6722 else if (eol_seen != EOL_SEEN_NONE
6723 && eol_seen != EOL_SEEN_LF
6724 && eol_seen != EOL_SEEN_CRLF
6725 && eol_seen != EOL_SEEN_CR)
6726 eol_seen = EOL_SEEN_LF;
6727 if (eol_seen != EOL_SEEN_NONE)
6728 eol_type = adjust_coding_eol_type (coding, eol_seen);
6731 if (EQ (eol_type, Qmac))
6733 for (p = pbeg; p < pend; p++)
6734 if (*p == '\r')
6735 *p = '\n';
6737 else if (EQ (eol_type, Qdos))
6739 int n = 0;
6741 if (NILP (coding->dst_object))
6743 /* Start deleting '\r' from the tail to minimize the memory
6744 movement. */
6745 for (p = pend - 2; p >= pbeg; p--)
6746 if (*p == '\r')
6748 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6749 n++;
6752 else
6754 int pos_byte = coding->dst_pos_byte;
6755 int pos = coding->dst_pos;
6756 int pos_end = pos + coding->produced_char - 1;
6758 while (pos < pos_end)
6760 p = BYTE_POS_ADDR (pos_byte);
6761 if (*p == '\r' && p[1] == '\n')
6763 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6764 n++;
6765 pos_end--;
6767 pos++;
6768 if (coding->dst_multibyte)
6769 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6770 else
6771 pos_byte++;
6774 coding->produced -= n;
6775 coding->produced_char -= n;
6780 /* Return a translation table (or list of them) from coding system
6781 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6782 decoding (ENCODEP is zero). */
6784 static Lisp_Object
6785 get_translation_table (attrs, encodep, max_lookup)
6786 Lisp_Object attrs;
6787 int encodep, *max_lookup;
6789 Lisp_Object standard, translation_table;
6790 Lisp_Object val;
6792 if (NILP (Venable_character_translation))
6794 if (max_lookup)
6795 *max_lookup = 0;
6796 return Qnil;
6798 if (encodep)
6799 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6800 standard = Vstandard_translation_table_for_encode;
6801 else
6802 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6803 standard = Vstandard_translation_table_for_decode;
6804 if (NILP (translation_table))
6805 translation_table = standard;
6806 else
6808 if (SYMBOLP (translation_table))
6809 translation_table = Fget (translation_table, Qtranslation_table);
6810 else if (CONSP (translation_table))
6812 translation_table = Fcopy_sequence (translation_table);
6813 for (val = translation_table; CONSP (val); val = XCDR (val))
6814 if (SYMBOLP (XCAR (val)))
6815 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6817 if (CHAR_TABLE_P (standard))
6819 if (CONSP (translation_table))
6820 translation_table = nconc2 (translation_table,
6821 Fcons (standard, Qnil));
6822 else
6823 translation_table = Fcons (translation_table,
6824 Fcons (standard, Qnil));
6828 if (max_lookup)
6830 *max_lookup = 1;
6831 if (CHAR_TABLE_P (translation_table)
6832 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6834 val = XCHAR_TABLE (translation_table)->extras[1];
6835 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6836 *max_lookup = XFASTINT (val);
6838 else if (CONSP (translation_table))
6840 Lisp_Object tail, val;
6842 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6843 if (CHAR_TABLE_P (XCAR (tail))
6844 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6846 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6847 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6848 *max_lookup = XFASTINT (val);
6852 return translation_table;
6855 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6856 do { \
6857 trans = Qnil; \
6858 if (CHAR_TABLE_P (table)) \
6860 trans = CHAR_TABLE_REF (table, c); \
6861 if (CHARACTERP (trans)) \
6862 c = XFASTINT (trans), trans = Qnil; \
6864 else if (CONSP (table)) \
6866 Lisp_Object tail; \
6868 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6869 if (CHAR_TABLE_P (XCAR (tail))) \
6871 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6872 if (CHARACTERP (trans)) \
6873 c = XFASTINT (trans), trans = Qnil; \
6874 else if (! NILP (trans)) \
6875 break; \
6878 } while (0)
6881 /* Return a translation of character(s) at BUF according to TRANS.
6882 TRANS is TO-CHAR or ((FROM . TO) ...) where
6883 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6884 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6885 translation is found, and Qnil if not found..
6886 If BUF is too short to lookup characters in FROM, return Qt. */
6888 static Lisp_Object
6889 get_translation (trans, buf, buf_end)
6890 Lisp_Object trans;
6891 int *buf, *buf_end;
6894 if (INTEGERP (trans))
6895 return trans;
6896 for (; CONSP (trans); trans = XCDR (trans))
6898 Lisp_Object val = XCAR (trans);
6899 Lisp_Object from = XCAR (val);
6900 int len = ASIZE (from);
6901 int i;
6903 for (i = 0; i < len; i++)
6905 if (buf + i == buf_end)
6906 return Qt;
6907 if (XINT (AREF (from, i)) != buf[i])
6908 break;
6910 if (i == len)
6911 return val;
6913 return Qnil;
6917 static int
6918 produce_chars (coding, translation_table, last_block)
6919 struct coding_system *coding;
6920 Lisp_Object translation_table;
6921 int last_block;
6923 unsigned char *dst = coding->destination + coding->produced;
6924 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6925 EMACS_INT produced;
6926 EMACS_INT produced_chars = 0;
6927 int carryover = 0;
6929 if (! coding->chars_at_source)
6931 /* Source characters are in coding->charbuf. */
6932 int *buf = coding->charbuf;
6933 int *buf_end = buf + coding->charbuf_used;
6935 if (EQ (coding->src_object, coding->dst_object))
6937 coding_set_source (coding);
6938 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6941 while (buf < buf_end)
6943 int c = *buf, i;
6945 if (c >= 0)
6947 int from_nchars = 1, to_nchars = 1;
6948 Lisp_Object trans = Qnil;
6950 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6951 if (! NILP (trans))
6953 trans = get_translation (trans, buf, buf_end);
6954 if (INTEGERP (trans))
6955 c = XINT (trans);
6956 else if (CONSP (trans))
6958 from_nchars = ASIZE (XCAR (trans));
6959 trans = XCDR (trans);
6960 if (INTEGERP (trans))
6961 c = XINT (trans);
6962 else
6964 to_nchars = ASIZE (trans);
6965 c = XINT (AREF (trans, 0));
6968 else if (EQ (trans, Qt) && ! last_block)
6969 break;
6972 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6974 dst = alloc_destination (coding,
6975 buf_end - buf
6976 + MAX_MULTIBYTE_LENGTH * to_nchars,
6977 dst);
6978 if (EQ (coding->src_object, coding->dst_object))
6980 coding_set_source (coding);
6981 dst_end = (((unsigned char *) coding->source)
6982 + coding->consumed);
6984 else
6985 dst_end = coding->destination + coding->dst_bytes;
6988 for (i = 0; i < to_nchars; i++)
6990 if (i > 0)
6991 c = XINT (AREF (trans, i));
6992 if (coding->dst_multibyte
6993 || ! CHAR_BYTE8_P (c))
6994 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6995 else
6996 *dst++ = CHAR_TO_BYTE8 (c);
6998 produced_chars += to_nchars;
6999 buf += from_nchars;
7001 else
7002 /* This is an annotation datum. (-C) is the length. */
7003 buf += -c;
7005 carryover = buf_end - buf;
7007 else
7009 /* Source characters are at coding->source. */
7010 const unsigned char *src = coding->source;
7011 const unsigned char *src_end = src + coding->consumed;
7013 if (EQ (coding->dst_object, coding->src_object))
7014 dst_end = (unsigned char *) src;
7015 if (coding->src_multibyte != coding->dst_multibyte)
7017 if (coding->src_multibyte)
7019 int multibytep = 1;
7020 EMACS_INT consumed_chars = 0;
7022 while (1)
7024 const unsigned char *src_base = src;
7025 int c;
7027 ONE_MORE_BYTE (c);
7028 if (dst == dst_end)
7030 if (EQ (coding->src_object, coding->dst_object))
7031 dst_end = (unsigned char *) src;
7032 if (dst == dst_end)
7034 EMACS_INT offset = src - coding->source;
7036 dst = alloc_destination (coding, src_end - src + 1,
7037 dst);
7038 dst_end = coding->destination + coding->dst_bytes;
7039 coding_set_source (coding);
7040 src = coding->source + offset;
7041 src_end = coding->source + coding->src_bytes;
7042 if (EQ (coding->src_object, coding->dst_object))
7043 dst_end = (unsigned char *) src;
7046 *dst++ = c;
7047 produced_chars++;
7049 no_more_source:
7052 else
7053 while (src < src_end)
7055 int multibytep = 1;
7056 int c = *src++;
7058 if (dst >= dst_end - 1)
7060 if (EQ (coding->src_object, coding->dst_object))
7061 dst_end = (unsigned char *) src;
7062 if (dst >= dst_end - 1)
7064 EMACS_INT offset = src - coding->source;
7065 EMACS_INT more_bytes;
7067 if (EQ (coding->src_object, coding->dst_object))
7068 more_bytes = ((src_end - src) / 2) + 2;
7069 else
7070 more_bytes = src_end - src + 2;
7071 dst = alloc_destination (coding, more_bytes, dst);
7072 dst_end = coding->destination + coding->dst_bytes;
7073 coding_set_source (coding);
7074 src = coding->source + offset;
7075 src_end = coding->source + coding->src_bytes;
7076 if (EQ (coding->src_object, coding->dst_object))
7077 dst_end = (unsigned char *) src;
7080 EMIT_ONE_BYTE (c);
7083 else
7085 if (!EQ (coding->src_object, coding->dst_object))
7087 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
7089 if (require > 0)
7091 EMACS_INT offset = src - coding->source;
7093 dst = alloc_destination (coding, require, dst);
7094 coding_set_source (coding);
7095 src = coding->source + offset;
7096 src_end = coding->source + coding->src_bytes;
7099 produced_chars = coding->consumed_char;
7100 while (src < src_end)
7101 *dst++ = *src++;
7105 produced = dst - (coding->destination + coding->produced);
7106 if (BUFFERP (coding->dst_object) && produced_chars > 0)
7107 insert_from_gap (produced_chars, produced);
7108 coding->produced += produced;
7109 coding->produced_char += produced_chars;
7110 return carryover;
7113 /* Compose text in CODING->object according to the annotation data at
7114 CHARBUF. CHARBUF is an array:
7115 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7118 static INLINE void
7119 produce_composition (coding, charbuf, pos)
7120 struct coding_system *coding;
7121 int *charbuf;
7122 EMACS_INT pos;
7124 int len;
7125 EMACS_INT to;
7126 enum composition_method method;
7127 Lisp_Object components;
7129 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7130 to = pos + charbuf[2];
7131 method = (enum composition_method) (charbuf[4]);
7133 if (method == COMPOSITION_RELATIVE)
7134 components = Qnil;
7135 else
7137 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7138 int i, j;
7140 if (method == COMPOSITION_WITH_RULE)
7141 len = charbuf[2] * 3 - 2;
7142 charbuf += MAX_ANNOTATION_LENGTH;
7143 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7144 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7146 if (charbuf[i] >= 0)
7147 args[j] = make_number (charbuf[i]);
7148 else
7150 i++;
7151 args[j] = make_number (charbuf[i] % 0x100);
7154 components = (i == j ? Fstring (j, args) : Fvector (j, args));
7156 compose_text (pos, to, components, Qnil, coding->dst_object);
7160 /* Put `charset' property on text in CODING->object according to
7161 the annotation data at CHARBUF. CHARBUF is an array:
7162 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7165 static INLINE void
7166 produce_charset (coding, charbuf, pos)
7167 struct coding_system *coding;
7168 int *charbuf;
7169 EMACS_INT pos;
7171 EMACS_INT from = pos - charbuf[2];
7172 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7174 Fput_text_property (make_number (from), make_number (pos),
7175 Qcharset, CHARSET_NAME (charset),
7176 coding->dst_object);
7180 #define CHARBUF_SIZE 0x4000
7182 #define ALLOC_CONVERSION_WORK_AREA(coding) \
7183 do { \
7184 int size = CHARBUF_SIZE; \
7186 coding->charbuf = NULL; \
7187 while (size > 1024) \
7189 coding->charbuf = (int *) alloca (sizeof (int) * size); \
7190 if (coding->charbuf) \
7191 break; \
7192 size >>= 1; \
7194 if (! coding->charbuf) \
7196 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
7197 return coding->result; \
7199 coding->charbuf_size = size; \
7200 } while (0)
7203 static void
7204 produce_annotation (coding, pos)
7205 struct coding_system *coding;
7206 EMACS_INT pos;
7208 int *charbuf = coding->charbuf;
7209 int *charbuf_end = charbuf + coding->charbuf_used;
7211 if (NILP (coding->dst_object))
7212 return;
7214 while (charbuf < charbuf_end)
7216 if (*charbuf >= 0)
7217 pos++, charbuf++;
7218 else
7220 int len = -*charbuf;
7222 if (len > 2)
7223 switch (charbuf[1])
7225 case CODING_ANNOTATE_COMPOSITION_MASK:
7226 produce_composition (coding, charbuf, pos);
7227 break;
7228 case CODING_ANNOTATE_CHARSET_MASK:
7229 produce_charset (coding, charbuf, pos);
7230 break;
7232 charbuf += len;
7237 /* Decode the data at CODING->src_object into CODING->dst_object.
7238 CODING->src_object is a buffer, a string, or nil.
7239 CODING->dst_object is a buffer.
7241 If CODING->src_object is a buffer, it must be the current buffer.
7242 In this case, if CODING->src_pos is positive, it is a position of
7243 the source text in the buffer, otherwise, the source text is in the
7244 gap area of the buffer, and CODING->src_pos specifies the offset of
7245 the text from GPT (which must be the same as PT). If this is the
7246 same buffer as CODING->dst_object, CODING->src_pos must be
7247 negative.
7249 If CODING->src_object is a string, CODING->src_pos is an index to
7250 that string.
7252 If CODING->src_object is nil, CODING->source must already point to
7253 the non-relocatable memory area. In this case, CODING->src_pos is
7254 an offset from CODING->source.
7256 The decoded data is inserted at the current point of the buffer
7257 CODING->dst_object.
7260 static int
7261 decode_coding (coding)
7262 struct coding_system *coding;
7264 Lisp_Object attrs;
7265 Lisp_Object undo_list;
7266 Lisp_Object translation_table;
7267 struct ccl_spec cclspec;
7268 int carryover;
7269 int i;
7271 if (BUFFERP (coding->src_object)
7272 && coding->src_pos > 0
7273 && coding->src_pos < GPT
7274 && coding->src_pos + coding->src_chars > GPT)
7275 move_gap_both (coding->src_pos, coding->src_pos_byte);
7277 undo_list = Qt;
7278 if (BUFFERP (coding->dst_object))
7280 if (current_buffer != XBUFFER (coding->dst_object))
7281 set_buffer_internal (XBUFFER (coding->dst_object));
7282 if (GPT != PT)
7283 move_gap_both (PT, PT_BYTE);
7284 undo_list = current_buffer->undo_list;
7285 current_buffer->undo_list = Qt;
7288 coding->consumed = coding->consumed_char = 0;
7289 coding->produced = coding->produced_char = 0;
7290 coding->chars_at_source = 0;
7291 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7292 coding->errors = 0;
7294 ALLOC_CONVERSION_WORK_AREA (coding);
7296 attrs = CODING_ID_ATTRS (coding->id);
7297 translation_table = get_translation_table (attrs, 0, NULL);
7299 carryover = 0;
7300 if (coding->decoder == decode_coding_ccl)
7302 coding->spec.ccl = &cclspec;
7303 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7307 EMACS_INT pos = coding->dst_pos + coding->produced_char;
7309 coding_set_source (coding);
7310 coding->annotated = 0;
7311 coding->charbuf_used = carryover;
7312 (*(coding->decoder)) (coding);
7313 coding_set_destination (coding);
7314 carryover = produce_chars (coding, translation_table, 0);
7315 if (coding->annotated)
7316 produce_annotation (coding, pos);
7317 for (i = 0; i < carryover; i++)
7318 coding->charbuf[i]
7319 = coding->charbuf[coding->charbuf_used - carryover + i];
7321 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7322 || (coding->consumed < coding->src_bytes
7323 && (coding->result == CODING_RESULT_SUCCESS
7324 || coding->result == CODING_RESULT_INVALID_SRC)));
7326 if (carryover > 0)
7328 coding_set_destination (coding);
7329 coding->charbuf_used = carryover;
7330 produce_chars (coding, translation_table, 1);
7333 coding->carryover_bytes = 0;
7334 if (coding->consumed < coding->src_bytes)
7336 int nbytes = coding->src_bytes - coding->consumed;
7337 const unsigned char *src;
7339 coding_set_source (coding);
7340 coding_set_destination (coding);
7341 src = coding->source + coding->consumed;
7343 if (coding->mode & CODING_MODE_LAST_BLOCK)
7345 /* Flush out unprocessed data as binary chars. We are sure
7346 that the number of data is less than the size of
7347 coding->charbuf. */
7348 coding->charbuf_used = 0;
7349 coding->chars_at_source = 0;
7351 while (nbytes-- > 0)
7353 int c = *src++;
7355 if (c & 0x80)
7356 c = BYTE8_TO_CHAR (c);
7357 coding->charbuf[coding->charbuf_used++] = c;
7359 produce_chars (coding, Qnil, 1);
7361 else
7363 /* Record unprocessed bytes in coding->carryover. We are
7364 sure that the number of data is less than the size of
7365 coding->carryover. */
7366 unsigned char *p = coding->carryover;
7368 if (nbytes > sizeof coding->carryover)
7369 nbytes = sizeof coding->carryover;
7370 coding->carryover_bytes = nbytes;
7371 while (nbytes-- > 0)
7372 *p++ = *src++;
7374 coding->consumed = coding->src_bytes;
7377 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7378 && !inhibit_eol_conversion)
7379 decode_eol (coding);
7380 if (BUFFERP (coding->dst_object))
7382 current_buffer->undo_list = undo_list;
7383 record_insert (coding->dst_pos, coding->produced_char);
7385 return coding->result;
7389 /* Extract an annotation datum from a composition starting at POS and
7390 ending before LIMIT of CODING->src_object (buffer or string), store
7391 the data in BUF, set *STOP to a starting position of the next
7392 composition (if any) or to LIMIT, and return the address of the
7393 next element of BUF.
7395 If such an annotation is not found, set *STOP to a starting
7396 position of a composition after POS (if any) or to LIMIT, and
7397 return BUF. */
7399 static INLINE int *
7400 handle_composition_annotation (pos, limit, coding, buf, stop)
7401 EMACS_INT pos, limit;
7402 struct coding_system *coding;
7403 int *buf;
7404 EMACS_INT *stop;
7406 EMACS_INT start, end;
7407 Lisp_Object prop;
7409 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7410 || end > limit)
7411 *stop = limit;
7412 else if (start > pos)
7413 *stop = start;
7414 else
7416 if (start == pos)
7418 /* We found a composition. Store the corresponding
7419 annotation data in BUF. */
7420 int *head = buf;
7421 enum composition_method method = COMPOSITION_METHOD (prop);
7422 int nchars = COMPOSITION_LENGTH (prop);
7424 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7425 if (method != COMPOSITION_RELATIVE)
7427 Lisp_Object components;
7428 int len, i, i_byte;
7430 components = COMPOSITION_COMPONENTS (prop);
7431 if (VECTORP (components))
7433 len = XVECTOR_SIZE (components);
7434 for (i = 0; i < len; i++)
7435 *buf++ = XINT (AREF (components, i));
7437 else if (STRINGP (components))
7439 len = SCHARS (components);
7440 i = i_byte = 0;
7441 while (i < len)
7443 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7444 buf++;
7447 else if (INTEGERP (components))
7449 len = 1;
7450 *buf++ = XINT (components);
7452 else if (CONSP (components))
7454 for (len = 0; CONSP (components);
7455 len++, components = XCDR (components))
7456 *buf++ = XINT (XCAR (components));
7458 else
7459 abort ();
7460 *head -= len;
7464 if (find_composition (end, limit, &start, &end, &prop,
7465 coding->src_object)
7466 && end <= limit)
7467 *stop = start;
7468 else
7469 *stop = limit;
7471 return buf;
7475 /* Extract an annotation datum from a text property `charset' at POS of
7476 CODING->src_object (buffer of string), store the data in BUF, set
7477 *STOP to the position where the value of `charset' property changes
7478 (limiting by LIMIT), and return the address of the next element of
7479 BUF.
7481 If the property value is nil, set *STOP to the position where the
7482 property value is non-nil (limiting by LIMIT), and return BUF. */
7484 static INLINE int *
7485 handle_charset_annotation (pos, limit, coding, buf, stop)
7486 EMACS_INT pos, limit;
7487 struct coding_system *coding;
7488 int *buf;
7489 EMACS_INT *stop;
7491 Lisp_Object val, next;
7492 int id;
7494 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7495 if (! NILP (val) && CHARSETP (val))
7496 id = XINT (CHARSET_SYMBOL_ID (val));
7497 else
7498 id = -1;
7499 ADD_CHARSET_DATA (buf, 0, id);
7500 next = Fnext_single_property_change (make_number (pos), Qcharset,
7501 coding->src_object,
7502 make_number (limit));
7503 *stop = XINT (next);
7504 return buf;
7508 static void
7509 consume_chars (coding, translation_table, max_lookup)
7510 struct coding_system *coding;
7511 Lisp_Object translation_table;
7512 int max_lookup;
7514 int *buf = coding->charbuf;
7515 int *buf_end = coding->charbuf + coding->charbuf_size;
7516 const unsigned char *src = coding->source + coding->consumed;
7517 const unsigned char *src_end = coding->source + coding->src_bytes;
7518 EMACS_INT pos = coding->src_pos + coding->consumed_char;
7519 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7520 int multibytep = coding->src_multibyte;
7521 Lisp_Object eol_type;
7522 int c;
7523 EMACS_INT stop, stop_composition, stop_charset;
7524 int *lookup_buf = NULL;
7526 if (! NILP (translation_table))
7527 lookup_buf = alloca (sizeof (int) * max_lookup);
7529 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7530 if (VECTORP (eol_type))
7531 eol_type = Qunix;
7533 /* Note: composition handling is not yet implemented. */
7534 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7536 if (NILP (coding->src_object))
7537 stop = stop_composition = stop_charset = end_pos;
7538 else
7540 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7541 stop = stop_composition = pos;
7542 else
7543 stop = stop_composition = end_pos;
7544 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7545 stop = stop_charset = pos;
7546 else
7547 stop_charset = end_pos;
7550 /* Compensate for CRLF and conversion. */
7551 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7552 while (buf < buf_end)
7554 Lisp_Object trans;
7556 if (pos == stop)
7558 if (pos == end_pos)
7559 break;
7560 if (pos == stop_composition)
7561 buf = handle_composition_annotation (pos, end_pos, coding,
7562 buf, &stop_composition);
7563 if (pos == stop_charset)
7564 buf = handle_charset_annotation (pos, end_pos, coding,
7565 buf, &stop_charset);
7566 stop = (stop_composition < stop_charset
7567 ? stop_composition : stop_charset);
7570 if (! multibytep)
7572 EMACS_INT bytes;
7574 if (coding->encoder == encode_coding_raw_text
7575 || coding->encoder == encode_coding_ccl)
7576 c = *src++, pos++;
7577 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7578 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7579 else
7580 c = BYTE8_TO_CHAR (*src), src++, pos++;
7582 else
7583 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7584 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7585 c = '\n';
7586 if (! EQ (eol_type, Qunix))
7588 if (c == '\n')
7590 if (EQ (eol_type, Qdos))
7591 *buf++ = '\r';
7592 else
7593 c = '\r';
7597 trans = Qnil;
7598 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7599 if (NILP (trans))
7600 *buf++ = c;
7601 else
7603 int from_nchars = 1, to_nchars = 1;
7604 int *lookup_buf_end;
7605 const unsigned char *p = src;
7606 int i;
7608 lookup_buf[0] = c;
7609 for (i = 1; i < max_lookup && p < src_end; i++)
7610 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7611 lookup_buf_end = lookup_buf + i;
7612 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7613 if (INTEGERP (trans))
7614 c = XINT (trans);
7615 else if (CONSP (trans))
7617 from_nchars = ASIZE (XCAR (trans));
7618 trans = XCDR (trans);
7619 if (INTEGERP (trans))
7620 c = XINT (trans);
7621 else
7623 to_nchars = ASIZE (trans);
7624 if (buf + to_nchars > buf_end)
7625 break;
7626 c = XINT (AREF (trans, 0));
7629 else
7630 break;
7631 *buf++ = c;
7632 for (i = 1; i < to_nchars; i++)
7633 *buf++ = XINT (AREF (trans, i));
7634 for (i = 1; i < from_nchars; i++, pos++)
7635 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7639 coding->consumed = src - coding->source;
7640 coding->consumed_char = pos - coding->src_pos;
7641 coding->charbuf_used = buf - coding->charbuf;
7642 coding->chars_at_source = 0;
7646 /* Encode the text at CODING->src_object into CODING->dst_object.
7647 CODING->src_object is a buffer or a string.
7648 CODING->dst_object is a buffer or nil.
7650 If CODING->src_object is a buffer, it must be the current buffer.
7651 In this case, if CODING->src_pos is positive, it is a position of
7652 the source text in the buffer, otherwise. the source text is in the
7653 gap area of the buffer, and coding->src_pos specifies the offset of
7654 the text from GPT (which must be the same as PT). If this is the
7655 same buffer as CODING->dst_object, CODING->src_pos must be
7656 negative and CODING should not have `pre-write-conversion'.
7658 If CODING->src_object is a string, CODING should not have
7659 `pre-write-conversion'.
7661 If CODING->dst_object is a buffer, the encoded data is inserted at
7662 the current point of that buffer.
7664 If CODING->dst_object is nil, the encoded data is placed at the
7665 memory area specified by CODING->destination. */
7667 static int
7668 encode_coding (coding)
7669 struct coding_system *coding;
7671 Lisp_Object attrs;
7672 Lisp_Object translation_table;
7673 int max_lookup;
7674 struct ccl_spec cclspec;
7676 attrs = CODING_ID_ATTRS (coding->id);
7677 if (coding->encoder == encode_coding_raw_text)
7678 translation_table = Qnil, max_lookup = 0;
7679 else
7680 translation_table = get_translation_table (attrs, 1, &max_lookup);
7682 if (BUFFERP (coding->dst_object))
7684 set_buffer_internal (XBUFFER (coding->dst_object));
7685 coding->dst_multibyte
7686 = ! NILP (current_buffer->enable_multibyte_characters);
7689 coding->consumed = coding->consumed_char = 0;
7690 coding->produced = coding->produced_char = 0;
7691 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7692 coding->errors = 0;
7694 ALLOC_CONVERSION_WORK_AREA (coding);
7696 if (coding->encoder == encode_coding_ccl)
7698 coding->spec.ccl = &cclspec;
7699 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7701 do {
7702 coding_set_source (coding);
7703 consume_chars (coding, translation_table, max_lookup);
7704 coding_set_destination (coding);
7705 (*(coding->encoder)) (coding);
7706 } while (coding->consumed_char < coding->src_chars);
7708 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7709 insert_from_gap (coding->produced_char, coding->produced);
7711 return (coding->result);
7715 /* Name (or base name) of work buffer for code conversion. */
7716 static Lisp_Object Vcode_conversion_workbuf_name;
7718 /* A working buffer used by the top level conversion. Once it is
7719 created, it is never destroyed. It has the name
7720 Vcode_conversion_workbuf_name. The other working buffers are
7721 destroyed after the use is finished, and their names are modified
7722 versions of Vcode_conversion_workbuf_name. */
7723 static Lisp_Object Vcode_conversion_reused_workbuf;
7725 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
7726 static int reused_workbuf_in_use;
7729 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7730 multibyteness of returning buffer. */
7732 static Lisp_Object
7733 make_conversion_work_buffer (multibyte)
7734 int multibyte;
7736 Lisp_Object name, workbuf;
7737 struct buffer *current;
7739 if (reused_workbuf_in_use++)
7741 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7742 workbuf = Fget_buffer_create (name);
7744 else
7746 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7747 Vcode_conversion_reused_workbuf
7748 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7749 workbuf = Vcode_conversion_reused_workbuf;
7751 current = current_buffer;
7752 set_buffer_internal (XBUFFER (workbuf));
7753 /* We can't allow modification hooks to run in the work buffer. For
7754 instance, directory_files_internal assumes that file decoding
7755 doesn't compile new regexps. */
7756 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7757 Ferase_buffer ();
7758 current_buffer->undo_list = Qt;
7759 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7760 set_buffer_internal (current);
7761 return workbuf;
7765 static Lisp_Object
7766 code_conversion_restore (arg)
7767 Lisp_Object arg;
7769 Lisp_Object current, workbuf;
7770 struct gcpro gcpro1;
7772 GCPRO1 (arg);
7773 current = XCAR (arg);
7774 workbuf = XCDR (arg);
7775 if (! NILP (workbuf))
7777 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7778 reused_workbuf_in_use = 0;
7779 else if (! NILP (Fbuffer_live_p (workbuf)))
7780 Fkill_buffer (workbuf);
7782 set_buffer_internal (XBUFFER (current));
7783 UNGCPRO;
7784 return Qnil;
7787 Lisp_Object
7788 code_conversion_save (with_work_buf, multibyte)
7789 int with_work_buf, multibyte;
7791 Lisp_Object workbuf = Qnil;
7793 if (with_work_buf)
7794 workbuf = make_conversion_work_buffer (multibyte);
7795 record_unwind_protect (code_conversion_restore,
7796 Fcons (Fcurrent_buffer (), workbuf));
7797 return workbuf;
7801 decode_coding_gap (coding, chars, bytes)
7802 struct coding_system *coding;
7803 EMACS_INT chars, bytes;
7805 int count = specpdl_ptr - specpdl;
7806 Lisp_Object attrs;
7808 code_conversion_save (0, 0);
7810 coding->src_object = Fcurrent_buffer ();
7811 coding->src_chars = chars;
7812 coding->src_bytes = bytes;
7813 coding->src_pos = -chars;
7814 coding->src_pos_byte = -bytes;
7815 coding->src_multibyte = chars < bytes;
7816 coding->dst_object = coding->src_object;
7817 coding->dst_pos = PT;
7818 coding->dst_pos_byte = PT_BYTE;
7819 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7821 if (CODING_REQUIRE_DETECTION (coding))
7822 detect_coding (coding);
7824 coding->mode |= CODING_MODE_LAST_BLOCK;
7825 current_buffer->text->inhibit_shrinking = 1;
7826 decode_coding (coding);
7827 current_buffer->text->inhibit_shrinking = 0;
7829 attrs = CODING_ID_ATTRS (coding->id);
7830 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7832 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7833 Lisp_Object val;
7835 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7836 val = call1 (CODING_ATTR_POST_READ (attrs),
7837 make_number (coding->produced_char));
7838 CHECK_NATNUM (val);
7839 coding->produced_char += Z - prev_Z;
7840 coding->produced += Z_BYTE - prev_Z_BYTE;
7843 unbind_to (count, Qnil);
7844 return coding->result;
7848 encode_coding_gap (coding, chars, bytes)
7849 struct coding_system *coding;
7850 EMACS_INT chars, bytes;
7852 int count = specpdl_ptr - specpdl;
7854 code_conversion_save (0, 0);
7856 coding->src_object = Fcurrent_buffer ();
7857 coding->src_chars = chars;
7858 coding->src_bytes = bytes;
7859 coding->src_pos = -chars;
7860 coding->src_pos_byte = -bytes;
7861 coding->src_multibyte = chars < bytes;
7862 coding->dst_object = coding->src_object;
7863 coding->dst_pos = PT;
7864 coding->dst_pos_byte = PT_BYTE;
7866 encode_coding (coding);
7868 unbind_to (count, Qnil);
7869 return coding->result;
7873 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7874 SRC_OBJECT into DST_OBJECT by coding context CODING.
7876 SRC_OBJECT is a buffer, a string, or Qnil.
7878 If it is a buffer, the text is at point of the buffer. FROM and TO
7879 are positions in the buffer.
7881 If it is a string, the text is at the beginning of the string.
7882 FROM and TO are indices to the string.
7884 If it is nil, the text is at coding->source. FROM and TO are
7885 indices to coding->source.
7887 DST_OBJECT is a buffer, Qt, or Qnil.
7889 If it is a buffer, the decoded text is inserted at point of the
7890 buffer. If the buffer is the same as SRC_OBJECT, the source text
7891 is deleted.
7893 If it is Qt, a string is made from the decoded text, and
7894 set in CODING->dst_object.
7896 If it is Qnil, the decoded text is stored at CODING->destination.
7897 The caller must allocate CODING->dst_bytes bytes at
7898 CODING->destination by xmalloc. If the decoded text is longer than
7899 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7902 void
7903 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7904 dst_object)
7905 struct coding_system *coding;
7906 Lisp_Object src_object;
7907 EMACS_INT from, from_byte, to, to_byte;
7908 Lisp_Object dst_object;
7910 int count = specpdl_ptr - specpdl;
7911 unsigned char *destination;
7912 EMACS_INT dst_bytes;
7913 EMACS_INT chars = to - from;
7914 EMACS_INT bytes = to_byte - from_byte;
7915 Lisp_Object attrs;
7916 int saved_pt = -1, saved_pt_byte;
7917 int need_marker_adjustment = 0;
7918 Lisp_Object old_deactivate_mark;
7920 old_deactivate_mark = Vdeactivate_mark;
7922 if (NILP (dst_object))
7924 destination = coding->destination;
7925 dst_bytes = coding->dst_bytes;
7928 coding->src_object = src_object;
7929 coding->src_chars = chars;
7930 coding->src_bytes = bytes;
7931 coding->src_multibyte = chars < bytes;
7933 if (STRINGP (src_object))
7935 coding->src_pos = from;
7936 coding->src_pos_byte = from_byte;
7938 else if (BUFFERP (src_object))
7940 set_buffer_internal (XBUFFER (src_object));
7941 if (from != GPT)
7942 move_gap_both (from, from_byte);
7943 if (EQ (src_object, dst_object))
7945 struct Lisp_Marker *tail;
7947 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7949 tail->need_adjustment
7950 = tail->charpos == (tail->insertion_type ? from : to);
7951 need_marker_adjustment |= tail->need_adjustment;
7953 saved_pt = PT, saved_pt_byte = PT_BYTE;
7954 TEMP_SET_PT_BOTH (from, from_byte);
7955 current_buffer->text->inhibit_shrinking = 1;
7956 del_range_both (from, from_byte, to, to_byte, 1);
7957 coding->src_pos = -chars;
7958 coding->src_pos_byte = -bytes;
7960 else
7962 coding->src_pos = from;
7963 coding->src_pos_byte = from_byte;
7967 if (CODING_REQUIRE_DETECTION (coding))
7968 detect_coding (coding);
7969 attrs = CODING_ID_ATTRS (coding->id);
7971 if (EQ (dst_object, Qt)
7972 || (! NILP (CODING_ATTR_POST_READ (attrs))
7973 && NILP (dst_object)))
7975 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7976 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7977 coding->dst_pos = BEG;
7978 coding->dst_pos_byte = BEG_BYTE;
7980 else if (BUFFERP (dst_object))
7982 code_conversion_save (0, 0);
7983 coding->dst_object = dst_object;
7984 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7985 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7986 coding->dst_multibyte
7987 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7989 else
7991 code_conversion_save (0, 0);
7992 coding->dst_object = Qnil;
7993 /* Most callers presume this will return a multibyte result, and they
7994 won't use `binary' or `raw-text' anyway, so let's not worry about
7995 CODING_FOR_UNIBYTE. */
7996 coding->dst_multibyte = 1;
7999 decode_coding (coding);
8001 if (BUFFERP (coding->dst_object))
8002 set_buffer_internal (XBUFFER (coding->dst_object));
8004 if (! NILP (CODING_ATTR_POST_READ (attrs)))
8006 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8007 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8008 Lisp_Object val;
8010 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8011 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8012 old_deactivate_mark);
8013 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8014 make_number (coding->produced_char));
8015 UNGCPRO;
8016 CHECK_NATNUM (val);
8017 coding->produced_char += Z - prev_Z;
8018 coding->produced += Z_BYTE - prev_Z_BYTE;
8021 if (EQ (dst_object, Qt))
8023 coding->dst_object = Fbuffer_string ();
8025 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8027 set_buffer_internal (XBUFFER (coding->dst_object));
8028 if (dst_bytes < coding->produced)
8030 destination = xrealloc (destination, coding->produced);
8031 if (! destination)
8033 record_conversion_result (coding,
8034 CODING_RESULT_INSUFFICIENT_MEM);
8035 unbind_to (count, Qnil);
8036 return;
8038 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8039 move_gap_both (BEGV, BEGV_BYTE);
8040 bcopy (BEGV_ADDR, destination, coding->produced);
8041 coding->destination = destination;
8045 if (saved_pt >= 0)
8047 /* This is the case of:
8048 (BUFFERP (src_object) && EQ (src_object, dst_object))
8049 As we have moved PT while replacing the original buffer
8050 contents, we must recover it now. */
8051 set_buffer_internal (XBUFFER (src_object));
8052 current_buffer->text->inhibit_shrinking = 0;
8053 if (saved_pt < from)
8054 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8055 else if (saved_pt < from + chars)
8056 TEMP_SET_PT_BOTH (from, from_byte);
8057 else if (! NILP (current_buffer->enable_multibyte_characters))
8058 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8059 saved_pt_byte + (coding->produced - bytes));
8060 else
8061 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8062 saved_pt_byte + (coding->produced - bytes));
8064 if (need_marker_adjustment)
8066 struct Lisp_Marker *tail;
8068 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8069 if (tail->need_adjustment)
8071 tail->need_adjustment = 0;
8072 if (tail->insertion_type)
8074 tail->bytepos = from_byte;
8075 tail->charpos = from;
8077 else
8079 tail->bytepos = from_byte + coding->produced;
8080 tail->charpos
8081 = (NILP (current_buffer->enable_multibyte_characters)
8082 ? tail->bytepos : from + coding->produced_char);
8088 Vdeactivate_mark = old_deactivate_mark;
8089 unbind_to (count, coding->dst_object);
8093 void
8094 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
8095 dst_object)
8096 struct coding_system *coding;
8097 Lisp_Object src_object;
8098 EMACS_INT from, from_byte, to, to_byte;
8099 Lisp_Object dst_object;
8101 int count = specpdl_ptr - specpdl;
8102 EMACS_INT chars = to - from;
8103 EMACS_INT bytes = to_byte - from_byte;
8104 Lisp_Object attrs;
8105 int saved_pt = -1, saved_pt_byte;
8106 int need_marker_adjustment = 0;
8107 int kill_src_buffer = 0;
8108 Lisp_Object old_deactivate_mark;
8110 old_deactivate_mark = Vdeactivate_mark;
8112 coding->src_object = src_object;
8113 coding->src_chars = chars;
8114 coding->src_bytes = bytes;
8115 coding->src_multibyte = chars < bytes;
8117 attrs = CODING_ID_ATTRS (coding->id);
8119 if (EQ (src_object, dst_object))
8121 struct Lisp_Marker *tail;
8123 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8125 tail->need_adjustment
8126 = tail->charpos == (tail->insertion_type ? from : to);
8127 need_marker_adjustment |= tail->need_adjustment;
8131 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8133 coding->src_object = code_conversion_save (1, coding->src_multibyte);
8134 set_buffer_internal (XBUFFER (coding->src_object));
8135 if (STRINGP (src_object))
8136 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8137 else if (BUFFERP (src_object))
8138 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8139 else
8140 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
8142 if (EQ (src_object, dst_object))
8144 set_buffer_internal (XBUFFER (src_object));
8145 saved_pt = PT, saved_pt_byte = PT_BYTE;
8146 del_range_both (from, from_byte, to, to_byte, 1);
8147 set_buffer_internal (XBUFFER (coding->src_object));
8151 Lisp_Object args[3];
8152 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8154 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8155 old_deactivate_mark);
8156 args[0] = CODING_ATTR_PRE_WRITE (attrs);
8157 args[1] = make_number (BEG);
8158 args[2] = make_number (Z);
8159 safe_call (3, args);
8160 UNGCPRO;
8162 if (XBUFFER (coding->src_object) != current_buffer)
8163 kill_src_buffer = 1;
8164 coding->src_object = Fcurrent_buffer ();
8165 if (BEG != GPT)
8166 move_gap_both (BEG, BEG_BYTE);
8167 coding->src_chars = Z - BEG;
8168 coding->src_bytes = Z_BYTE - BEG_BYTE;
8169 coding->src_pos = BEG;
8170 coding->src_pos_byte = BEG_BYTE;
8171 coding->src_multibyte = Z < Z_BYTE;
8173 else if (STRINGP (src_object))
8175 code_conversion_save (0, 0);
8176 coding->src_pos = from;
8177 coding->src_pos_byte = from_byte;
8179 else if (BUFFERP (src_object))
8181 code_conversion_save (0, 0);
8182 set_buffer_internal (XBUFFER (src_object));
8183 if (EQ (src_object, dst_object))
8185 saved_pt = PT, saved_pt_byte = PT_BYTE;
8186 coding->src_object = del_range_1 (from, to, 1, 1);
8187 coding->src_pos = 0;
8188 coding->src_pos_byte = 0;
8190 else
8192 if (from < GPT && to >= GPT)
8193 move_gap_both (from, from_byte);
8194 coding->src_pos = from;
8195 coding->src_pos_byte = from_byte;
8198 else
8199 code_conversion_save (0, 0);
8201 if (BUFFERP (dst_object))
8203 coding->dst_object = dst_object;
8204 if (EQ (src_object, dst_object))
8206 coding->dst_pos = from;
8207 coding->dst_pos_byte = from_byte;
8209 else
8211 struct buffer *current = current_buffer;
8213 set_buffer_temp (XBUFFER (dst_object));
8214 coding->dst_pos = PT;
8215 coding->dst_pos_byte = PT_BYTE;
8216 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8217 set_buffer_temp (current);
8219 coding->dst_multibyte
8220 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
8222 else if (EQ (dst_object, Qt))
8224 coding->dst_object = Qnil;
8225 coding->dst_bytes = coding->src_chars;
8226 if (coding->dst_bytes == 0)
8227 coding->dst_bytes = 1;
8228 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
8229 coding->dst_multibyte = 0;
8231 else
8233 coding->dst_object = Qnil;
8234 coding->dst_multibyte = 0;
8237 encode_coding (coding);
8239 if (EQ (dst_object, Qt))
8241 if (BUFFERP (coding->dst_object))
8242 coding->dst_object = Fbuffer_string ();
8243 else
8245 coding->dst_object
8246 = make_unibyte_string ((char *) coding->destination,
8247 coding->produced);
8248 xfree (coding->destination);
8252 if (saved_pt >= 0)
8254 /* This is the case of:
8255 (BUFFERP (src_object) && EQ (src_object, dst_object))
8256 As we have moved PT while replacing the original buffer
8257 contents, we must recover it now. */
8258 set_buffer_internal (XBUFFER (src_object));
8259 if (saved_pt < from)
8260 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8261 else if (saved_pt < from + chars)
8262 TEMP_SET_PT_BOTH (from, from_byte);
8263 else if (! NILP (current_buffer->enable_multibyte_characters))
8264 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8265 saved_pt_byte + (coding->produced - bytes));
8266 else
8267 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8268 saved_pt_byte + (coding->produced - bytes));
8270 if (need_marker_adjustment)
8272 struct Lisp_Marker *tail;
8274 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8275 if (tail->need_adjustment)
8277 tail->need_adjustment = 0;
8278 if (tail->insertion_type)
8280 tail->bytepos = from_byte;
8281 tail->charpos = from;
8283 else
8285 tail->bytepos = from_byte + coding->produced;
8286 tail->charpos
8287 = (NILP (current_buffer->enable_multibyte_characters)
8288 ? tail->bytepos : from + coding->produced_char);
8294 if (kill_src_buffer)
8295 Fkill_buffer (coding->src_object);
8297 Vdeactivate_mark = old_deactivate_mark;
8298 unbind_to (count, Qnil);
8302 Lisp_Object
8303 preferred_coding_system ()
8305 int id = coding_categories[coding_priorities[0]].id;
8307 return CODING_ID_NAME (id);
8311 #ifdef emacs
8312 /*** 8. Emacs Lisp library functions ***/
8314 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8315 doc: /* Return t if OBJECT is nil or a coding-system.
8316 See the documentation of `define-coding-system' for information
8317 about coding-system objects. */)
8318 (object)
8319 Lisp_Object object;
8321 if (NILP (object)
8322 || CODING_SYSTEM_ID (object) >= 0)
8323 return Qt;
8324 if (! SYMBOLP (object)
8325 || NILP (Fget (object, Qcoding_system_define_form)))
8326 return Qnil;
8327 return Qt;
8330 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8331 Sread_non_nil_coding_system, 1, 1, 0,
8332 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8333 (prompt)
8334 Lisp_Object prompt;
8336 Lisp_Object val;
8339 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8340 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8342 while (SCHARS (val) == 0);
8343 return (Fintern (val, Qnil));
8346 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8347 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8348 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8349 Ignores case when completing coding systems (all Emacs coding systems
8350 are lower-case). */)
8351 (prompt, default_coding_system)
8352 Lisp_Object prompt, default_coding_system;
8354 Lisp_Object val;
8355 int count = SPECPDL_INDEX ();
8357 if (SYMBOLP (default_coding_system))
8358 default_coding_system = SYMBOL_NAME (default_coding_system);
8359 specbind (Qcompletion_ignore_case, Qt);
8360 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8361 Qt, Qnil, Qcoding_system_history,
8362 default_coding_system, Qnil);
8363 unbind_to (count, Qnil);
8364 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8367 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8368 1, 1, 0,
8369 doc: /* Check validity of CODING-SYSTEM.
8370 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8371 It is valid if it is nil or a symbol defined as a coding system by the
8372 function `define-coding-system'. */)
8373 (coding_system)
8374 Lisp_Object coding_system;
8376 Lisp_Object define_form;
8378 define_form = Fget (coding_system, Qcoding_system_define_form);
8379 if (! NILP (define_form))
8381 Fput (coding_system, Qcoding_system_define_form, Qnil);
8382 safe_eval (define_form);
8384 if (!NILP (Fcoding_system_p (coding_system)))
8385 return coding_system;
8386 xsignal1 (Qcoding_system_error, coding_system);
8390 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8391 HIGHEST is nonzero, return the coding system of the highest
8392 priority among the detected coding systems. Otherwise return a
8393 list of detected coding systems sorted by their priorities. If
8394 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8395 multibyte form but contains only ASCII and eight-bit chars.
8396 Otherwise, the bytes are raw bytes.
8398 CODING-SYSTEM controls the detection as below:
8400 If it is nil, detect both text-format and eol-format. If the
8401 text-format part of CODING-SYSTEM is already specified
8402 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8403 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8404 detect only text-format. */
8406 Lisp_Object
8407 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
8408 coding_system)
8409 const unsigned char *src;
8410 EMACS_INT src_chars, src_bytes;
8411 int highest;
8412 int multibytep;
8413 Lisp_Object coding_system;
8415 const unsigned char *src_end = src + src_bytes;
8416 Lisp_Object attrs, eol_type;
8417 Lisp_Object val = Qnil;
8418 struct coding_system coding;
8419 int id;
8420 struct coding_detection_info detect_info;
8421 enum coding_category base_category;
8422 int null_byte_found = 0, eight_bit_found = 0;
8424 if (NILP (coding_system))
8425 coding_system = Qundecided;
8426 setup_coding_system (coding_system, &coding);
8427 attrs = CODING_ID_ATTRS (coding.id);
8428 eol_type = CODING_ID_EOL_TYPE (coding.id);
8429 coding_system = CODING_ATTR_BASE_NAME (attrs);
8431 coding.source = src;
8432 coding.src_chars = src_chars;
8433 coding.src_bytes = src_bytes;
8434 coding.src_multibyte = multibytep;
8435 coding.consumed = 0;
8436 coding.mode |= CODING_MODE_LAST_BLOCK;
8437 coding.head_ascii = 0;
8439 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8441 /* At first, detect text-format if necessary. */
8442 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8443 if (base_category == coding_category_undecided)
8445 enum coding_category category;
8446 struct coding_system *this;
8447 int c, i;
8449 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8450 for (; src < src_end; src++)
8452 c = *src;
8453 if (c & 0x80)
8455 eight_bit_found = 1;
8456 if (null_byte_found)
8457 break;
8459 else if (c < 0x20)
8461 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8462 && ! inhibit_iso_escape_detection
8463 && ! detect_info.checked)
8465 if (detect_coding_iso_2022 (&coding, &detect_info))
8467 /* We have scanned the whole data. */
8468 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8470 /* We didn't find an 8-bit code. We may
8471 have found a null-byte, but it's very
8472 rare that a binary file confirm to
8473 ISO-2022. */
8474 src = src_end;
8475 coding.head_ascii = src - coding.source;
8477 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8478 break;
8481 else if (! c && !inhibit_null_byte_detection)
8483 null_byte_found = 1;
8484 if (eight_bit_found)
8485 break;
8487 if (! eight_bit_found)
8488 coding.head_ascii++;
8490 else if (! eight_bit_found)
8491 coding.head_ascii++;
8494 if (null_byte_found || eight_bit_found
8495 || coding.head_ascii < coding.src_bytes
8496 || detect_info.found)
8498 if (coding.head_ascii == coding.src_bytes)
8499 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8500 for (i = 0; i < coding_category_raw_text; i++)
8502 category = coding_priorities[i];
8503 this = coding_categories + category;
8504 if (detect_info.found & (1 << category))
8505 break;
8507 else
8509 if (null_byte_found)
8511 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8512 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8514 for (i = 0; i < coding_category_raw_text; i++)
8516 category = coding_priorities[i];
8517 this = coding_categories + category;
8519 if (this->id < 0)
8521 /* No coding system of this category is defined. */
8522 detect_info.rejected |= (1 << category);
8524 else if (category >= coding_category_raw_text)
8525 continue;
8526 else if (detect_info.checked & (1 << category))
8528 if (highest
8529 && (detect_info.found & (1 << category)))
8530 break;
8532 else if ((*(this->detector)) (&coding, &detect_info)
8533 && highest
8534 && (detect_info.found & (1 << category)))
8536 if (category == coding_category_utf_16_auto)
8538 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8539 category = coding_category_utf_16_le;
8540 else
8541 category = coding_category_utf_16_be;
8543 break;
8549 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8550 || null_byte_found)
8552 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8553 id = CODING_SYSTEM_ID (Qno_conversion);
8554 val = Fcons (make_number (id), Qnil);
8556 else if (! detect_info.rejected && ! detect_info.found)
8558 detect_info.found = CATEGORY_MASK_ANY;
8559 id = coding_categories[coding_category_undecided].id;
8560 val = Fcons (make_number (id), Qnil);
8562 else if (highest)
8564 if (detect_info.found)
8566 detect_info.found = 1 << category;
8567 val = Fcons (make_number (this->id), Qnil);
8569 else
8570 for (i = 0; i < coding_category_raw_text; i++)
8571 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8573 detect_info.found = 1 << coding_priorities[i];
8574 id = coding_categories[coding_priorities[i]].id;
8575 val = Fcons (make_number (id), Qnil);
8576 break;
8579 else
8581 int mask = detect_info.rejected | detect_info.found;
8582 int found = 0;
8584 for (i = coding_category_raw_text - 1; i >= 0; i--)
8586 category = coding_priorities[i];
8587 if (! (mask & (1 << category)))
8589 found |= 1 << category;
8590 id = coding_categories[category].id;
8591 if (id >= 0)
8592 val = Fcons (make_number (id), val);
8595 for (i = coding_category_raw_text - 1; i >= 0; i--)
8597 category = coding_priorities[i];
8598 if (detect_info.found & (1 << category))
8600 id = coding_categories[category].id;
8601 val = Fcons (make_number (id), val);
8604 detect_info.found |= found;
8607 else if (base_category == coding_category_utf_8_auto)
8609 if (detect_coding_utf_8 (&coding, &detect_info))
8611 struct coding_system *this;
8613 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8614 this = coding_categories + coding_category_utf_8_sig;
8615 else
8616 this = coding_categories + coding_category_utf_8_nosig;
8617 val = Fcons (make_number (this->id), Qnil);
8620 else if (base_category == coding_category_utf_16_auto)
8622 if (detect_coding_utf_16 (&coding, &detect_info))
8624 struct coding_system *this;
8626 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8627 this = coding_categories + coding_category_utf_16_le;
8628 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8629 this = coding_categories + coding_category_utf_16_be;
8630 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8631 this = coding_categories + coding_category_utf_16_be_nosig;
8632 else
8633 this = coding_categories + coding_category_utf_16_le_nosig;
8634 val = Fcons (make_number (this->id), Qnil);
8637 else
8639 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8640 val = Fcons (make_number (coding.id), Qnil);
8643 /* Then, detect eol-format if necessary. */
8645 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8646 Lisp_Object tail;
8648 if (VECTORP (eol_type))
8650 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8652 if (null_byte_found)
8653 normal_eol = EOL_SEEN_LF;
8654 else
8655 normal_eol = detect_eol (coding.source, src_bytes,
8656 coding_category_raw_text);
8658 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8659 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8660 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8661 coding_category_utf_16_be);
8662 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8663 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8664 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8665 coding_category_utf_16_le);
8667 else
8669 if (EQ (eol_type, Qunix))
8670 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8671 else if (EQ (eol_type, Qdos))
8672 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8673 else
8674 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8677 for (tail = val; CONSP (tail); tail = XCDR (tail))
8679 enum coding_category category;
8680 int this_eol;
8682 id = XINT (XCAR (tail));
8683 attrs = CODING_ID_ATTRS (id);
8684 category = XINT (CODING_ATTR_CATEGORY (attrs));
8685 eol_type = CODING_ID_EOL_TYPE (id);
8686 if (VECTORP (eol_type))
8688 if (category == coding_category_utf_16_be
8689 || category == coding_category_utf_16_be_nosig)
8690 this_eol = utf_16_be_eol;
8691 else if (category == coding_category_utf_16_le
8692 || category == coding_category_utf_16_le_nosig)
8693 this_eol = utf_16_le_eol;
8694 else
8695 this_eol = normal_eol;
8697 if (this_eol == EOL_SEEN_LF)
8698 XSETCAR (tail, AREF (eol_type, 0));
8699 else if (this_eol == EOL_SEEN_CRLF)
8700 XSETCAR (tail, AREF (eol_type, 1));
8701 else if (this_eol == EOL_SEEN_CR)
8702 XSETCAR (tail, AREF (eol_type, 2));
8703 else
8704 XSETCAR (tail, CODING_ID_NAME (id));
8706 else
8707 XSETCAR (tail, CODING_ID_NAME (id));
8711 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8715 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8716 2, 3, 0,
8717 doc: /* Detect coding system of the text in the region between START and END.
8718 Return a list of possible coding systems ordered by priority.
8719 The coding systems to try and their priorities follows what
8720 the function `coding-system-priority-list' (which see) returns.
8722 If only ASCII characters are found (except for such ISO-2022 control
8723 characters as ESC), it returns a list of single element `undecided'
8724 or its subsidiary coding system according to a detected end-of-line
8725 format.
8727 If optional argument HIGHEST is non-nil, return the coding system of
8728 highest priority. */)
8729 (start, end, highest)
8730 Lisp_Object start, end, highest;
8732 int from, to;
8733 int from_byte, to_byte;
8735 CHECK_NUMBER_COERCE_MARKER (start);
8736 CHECK_NUMBER_COERCE_MARKER (end);
8738 validate_region (&start, &end);
8739 from = XINT (start), to = XINT (end);
8740 from_byte = CHAR_TO_BYTE (from);
8741 to_byte = CHAR_TO_BYTE (to);
8743 if (from < GPT && to >= GPT)
8744 move_gap_both (to, to_byte);
8746 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8747 to - from, to_byte - from_byte,
8748 !NILP (highest),
8749 !NILP (current_buffer
8750 ->enable_multibyte_characters),
8751 Qnil);
8754 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8755 1, 2, 0,
8756 doc: /* Detect coding system of the text in STRING.
8757 Return a list of possible coding systems ordered by priority.
8758 The coding systems to try and their priorities follows what
8759 the function `coding-system-priority-list' (which see) returns.
8761 If only ASCII characters are found (except for such ISO-2022 control
8762 characters as ESC), it returns a list of single element `undecided'
8763 or its subsidiary coding system according to a detected end-of-line
8764 format.
8766 If optional argument HIGHEST is non-nil, return the coding system of
8767 highest priority. */)
8768 (string, highest)
8769 Lisp_Object string, highest;
8771 CHECK_STRING (string);
8773 return detect_coding_system (SDATA (string),
8774 SCHARS (string), SBYTES (string),
8775 !NILP (highest), STRING_MULTIBYTE (string),
8776 Qnil);
8780 static INLINE int
8781 char_encodable_p (c, attrs)
8782 int c;
8783 Lisp_Object attrs;
8785 Lisp_Object tail;
8786 struct charset *charset;
8787 Lisp_Object translation_table;
8789 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8790 if (! NILP (translation_table))
8791 c = translate_char (translation_table, c);
8792 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8793 CONSP (tail); tail = XCDR (tail))
8795 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8796 if (CHAR_CHARSET_P (c, charset))
8797 break;
8799 return (! NILP (tail));
8803 /* Return a list of coding systems that safely encode the text between
8804 START and END. If EXCLUDE is non-nil, it is a list of coding
8805 systems not to check. The returned list doesn't contain any such
8806 coding systems. In any case, if the text contains only ASCII or is
8807 unibyte, return t. */
8809 DEFUN ("find-coding-systems-region-internal",
8810 Ffind_coding_systems_region_internal,
8811 Sfind_coding_systems_region_internal, 2, 3, 0,
8812 doc: /* Internal use only. */)
8813 (start, end, exclude)
8814 Lisp_Object start, end, exclude;
8816 Lisp_Object coding_attrs_list, safe_codings;
8817 EMACS_INT start_byte, end_byte;
8818 const unsigned char *p, *pbeg, *pend;
8819 int c;
8820 Lisp_Object tail, elt, work_table;
8822 if (STRINGP (start))
8824 if (!STRING_MULTIBYTE (start)
8825 || SCHARS (start) == SBYTES (start))
8826 return Qt;
8827 start_byte = 0;
8828 end_byte = SBYTES (start);
8830 else
8832 CHECK_NUMBER_COERCE_MARKER (start);
8833 CHECK_NUMBER_COERCE_MARKER (end);
8834 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8835 args_out_of_range (start, end);
8836 if (NILP (current_buffer->enable_multibyte_characters))
8837 return Qt;
8838 start_byte = CHAR_TO_BYTE (XINT (start));
8839 end_byte = CHAR_TO_BYTE (XINT (end));
8840 if (XINT (end) - XINT (start) == end_byte - start_byte)
8841 return Qt;
8843 if (XINT (start) < GPT && XINT (end) > GPT)
8845 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8846 move_gap_both (XINT (start), start_byte);
8847 else
8848 move_gap_both (XINT (end), end_byte);
8852 coding_attrs_list = Qnil;
8853 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8854 if (NILP (exclude)
8855 || NILP (Fmemq (XCAR (tail), exclude)))
8857 Lisp_Object attrs;
8859 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8860 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8861 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8863 ASET (attrs, coding_attr_trans_tbl,
8864 get_translation_table (attrs, 1, NULL));
8865 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8869 if (STRINGP (start))
8870 p = pbeg = SDATA (start);
8871 else
8872 p = pbeg = BYTE_POS_ADDR (start_byte);
8873 pend = p + (end_byte - start_byte);
8875 while (p < pend && ASCII_BYTE_P (*p)) p++;
8876 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8878 work_table = Fmake_char_table (Qnil, Qnil);
8879 while (p < pend)
8881 if (ASCII_BYTE_P (*p))
8882 p++;
8883 else
8885 c = STRING_CHAR_ADVANCE (p);
8886 if (!NILP (char_table_ref (work_table, c)))
8887 /* This character was already checked. Ignore it. */
8888 continue;
8890 charset_map_loaded = 0;
8891 for (tail = coding_attrs_list; CONSP (tail);)
8893 elt = XCAR (tail);
8894 if (NILP (elt))
8895 tail = XCDR (tail);
8896 else if (char_encodable_p (c, elt))
8897 tail = XCDR (tail);
8898 else if (CONSP (XCDR (tail)))
8900 XSETCAR (tail, XCAR (XCDR (tail)));
8901 XSETCDR (tail, XCDR (XCDR (tail)));
8903 else
8905 XSETCAR (tail, Qnil);
8906 tail = XCDR (tail);
8909 if (charset_map_loaded)
8911 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8913 if (STRINGP (start))
8914 pbeg = SDATA (start);
8915 else
8916 pbeg = BYTE_POS_ADDR (start_byte);
8917 p = pbeg + p_offset;
8918 pend = pbeg + pend_offset;
8920 char_table_set (work_table, c, Qt);
8924 safe_codings = list2 (Qraw_text, Qno_conversion);
8925 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8926 if (! NILP (XCAR (tail)))
8927 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8929 return safe_codings;
8933 DEFUN ("unencodable-char-position", Funencodable_char_position,
8934 Sunencodable_char_position, 3, 5, 0,
8935 doc: /*
8936 Return position of first un-encodable character in a region.
8937 START and END specify the region and CODING-SYSTEM specifies the
8938 encoding to check. Return nil if CODING-SYSTEM does encode the region.
8940 If optional 4th argument COUNT is non-nil, it specifies at most how
8941 many un-encodable characters to search. In this case, the value is a
8942 list of positions.
8944 If optional 5th argument STRING is non-nil, it is a string to search
8945 for un-encodable characters. In that case, START and END are indexes
8946 to the string. */)
8947 (start, end, coding_system, count, string)
8948 Lisp_Object start, end, coding_system, count, string;
8950 int n;
8951 struct coding_system coding;
8952 Lisp_Object attrs, charset_list, translation_table;
8953 Lisp_Object positions;
8954 int from, to;
8955 const unsigned char *p, *stop, *pend;
8956 int ascii_compatible;
8958 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8959 attrs = CODING_ID_ATTRS (coding.id);
8960 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8961 return Qnil;
8962 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8963 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8964 translation_table = get_translation_table (attrs, 1, NULL);
8966 if (NILP (string))
8968 validate_region (&start, &end);
8969 from = XINT (start);
8970 to = XINT (end);
8971 if (NILP (current_buffer->enable_multibyte_characters)
8972 || (ascii_compatible
8973 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8974 return Qnil;
8975 p = CHAR_POS_ADDR (from);
8976 pend = CHAR_POS_ADDR (to);
8977 if (from < GPT && to >= GPT)
8978 stop = GPT_ADDR;
8979 else
8980 stop = pend;
8982 else
8984 CHECK_STRING (string);
8985 CHECK_NATNUM (start);
8986 CHECK_NATNUM (end);
8987 from = XINT (start);
8988 to = XINT (end);
8989 if (from > to
8990 || to > SCHARS (string))
8991 args_out_of_range_3 (string, start, end);
8992 if (! STRING_MULTIBYTE (string))
8993 return Qnil;
8994 p = SDATA (string) + string_char_to_byte (string, from);
8995 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8996 if (ascii_compatible && (to - from) == (pend - p))
8997 return Qnil;
9000 if (NILP (count))
9001 n = 1;
9002 else
9004 CHECK_NATNUM (count);
9005 n = XINT (count);
9008 positions = Qnil;
9009 while (1)
9011 int c;
9013 if (ascii_compatible)
9014 while (p < stop && ASCII_BYTE_P (*p))
9015 p++, from++;
9016 if (p >= stop)
9018 if (p >= pend)
9019 break;
9020 stop = pend;
9021 p = GAP_END_ADDR;
9024 c = STRING_CHAR_ADVANCE (p);
9025 if (! (ASCII_CHAR_P (c) && ascii_compatible)
9026 && ! char_charset (translate_char (translation_table, c),
9027 charset_list, NULL))
9029 positions = Fcons (make_number (from), positions);
9030 n--;
9031 if (n == 0)
9032 break;
9035 from++;
9038 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9042 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9043 Scheck_coding_systems_region, 3, 3, 0,
9044 doc: /* Check if the region is encodable by coding systems.
9046 START and END are buffer positions specifying the region.
9047 CODING-SYSTEM-LIST is a list of coding systems to check.
9049 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9050 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9051 whole region, POS0, POS1, ... are buffer positions where non-encodable
9052 characters are found.
9054 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9055 value is nil.
9057 START may be a string. In that case, check if the string is
9058 encodable, and the value contains indices to the string instead of
9059 buffer positions. END is ignored.
9061 If the current buffer (or START if it is a string) is unibyte, the value
9062 is nil. */)
9063 (start, end, coding_system_list)
9064 Lisp_Object start, end, coding_system_list;
9066 Lisp_Object list;
9067 EMACS_INT start_byte, end_byte;
9068 int pos;
9069 const unsigned char *p, *pbeg, *pend;
9070 int c;
9071 Lisp_Object tail, elt, attrs;
9073 if (STRINGP (start))
9075 if (!STRING_MULTIBYTE (start)
9076 || SCHARS (start) == SBYTES (start))
9077 return Qnil;
9078 start_byte = 0;
9079 end_byte = SBYTES (start);
9080 pos = 0;
9082 else
9084 CHECK_NUMBER_COERCE_MARKER (start);
9085 CHECK_NUMBER_COERCE_MARKER (end);
9086 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9087 args_out_of_range (start, end);
9088 if (NILP (current_buffer->enable_multibyte_characters))
9089 return Qnil;
9090 start_byte = CHAR_TO_BYTE (XINT (start));
9091 end_byte = CHAR_TO_BYTE (XINT (end));
9092 if (XINT (end) - XINT (start) == end_byte - start_byte)
9093 return Qnil;
9095 if (XINT (start) < GPT && XINT (end) > GPT)
9097 if ((GPT - XINT (start)) < (XINT (end) - GPT))
9098 move_gap_both (XINT (start), start_byte);
9099 else
9100 move_gap_both (XINT (end), end_byte);
9102 pos = XINT (start);
9105 list = Qnil;
9106 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9108 elt = XCAR (tail);
9109 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9110 ASET (attrs, coding_attr_trans_tbl,
9111 get_translation_table (attrs, 1, NULL));
9112 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
9115 if (STRINGP (start))
9116 p = pbeg = SDATA (start);
9117 else
9118 p = pbeg = BYTE_POS_ADDR (start_byte);
9119 pend = p + (end_byte - start_byte);
9121 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9122 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9124 while (p < pend)
9126 if (ASCII_BYTE_P (*p))
9127 p++;
9128 else
9130 c = STRING_CHAR_ADVANCE (p);
9132 charset_map_loaded = 0;
9133 for (tail = list; CONSP (tail); tail = XCDR (tail))
9135 elt = XCDR (XCAR (tail));
9136 if (! char_encodable_p (c, XCAR (elt)))
9137 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9139 if (charset_map_loaded)
9141 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
9143 if (STRINGP (start))
9144 pbeg = SDATA (start);
9145 else
9146 pbeg = BYTE_POS_ADDR (start_byte);
9147 p = pbeg + p_offset;
9148 pend = pbeg + pend_offset;
9151 pos++;
9154 tail = list;
9155 list = Qnil;
9156 for (; CONSP (tail); tail = XCDR (tail))
9158 elt = XCAR (tail);
9159 if (CONSP (XCDR (XCDR (elt))))
9160 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9161 list);
9164 return list;
9168 Lisp_Object
9169 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
9170 Lisp_Object start, end, coding_system, dst_object;
9171 int encodep, norecord;
9173 struct coding_system coding;
9174 EMACS_INT from, from_byte, to, to_byte;
9175 Lisp_Object src_object;
9177 CHECK_NUMBER_COERCE_MARKER (start);
9178 CHECK_NUMBER_COERCE_MARKER (end);
9179 if (NILP (coding_system))
9180 coding_system = Qno_conversion;
9181 else
9182 CHECK_CODING_SYSTEM (coding_system);
9183 src_object = Fcurrent_buffer ();
9184 if (NILP (dst_object))
9185 dst_object = src_object;
9186 else if (! EQ (dst_object, Qt))
9187 CHECK_BUFFER (dst_object);
9189 validate_region (&start, &end);
9190 from = XFASTINT (start);
9191 from_byte = CHAR_TO_BYTE (from);
9192 to = XFASTINT (end);
9193 to_byte = CHAR_TO_BYTE (to);
9195 setup_coding_system (coding_system, &coding);
9196 coding.mode |= CODING_MODE_LAST_BLOCK;
9198 if (encodep)
9199 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9200 dst_object);
9201 else
9202 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9203 dst_object);
9204 if (! norecord)
9205 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9207 return (BUFFERP (dst_object)
9208 ? make_number (coding.produced_char)
9209 : coding.dst_object);
9213 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9214 3, 4, "r\nzCoding system: ",
9215 doc: /* Decode the current region from the specified coding system.
9216 When called from a program, takes four arguments:
9217 START, END, CODING-SYSTEM, and DESTINATION.
9218 START and END are buffer positions.
9220 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9221 If nil, the region between START and END is replaced by the decoded text.
9222 If buffer, the decoded text is inserted in that buffer after point (point
9223 does not move).
9224 In those cases, the length of the decoded text is returned.
9225 If DESTINATION is t, the decoded text is returned.
9227 This function sets `last-coding-system-used' to the precise coding system
9228 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9229 not fully specified.) */)
9230 (start, end, coding_system, destination)
9231 Lisp_Object start, end, coding_system, destination;
9233 return code_convert_region (start, end, coding_system, destination, 0, 0);
9236 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9237 3, 4, "r\nzCoding system: ",
9238 doc: /* Encode the current region by specified coding system.
9239 When called from a program, takes four arguments:
9240 START, END, CODING-SYSTEM and DESTINATION.
9241 START and END are buffer positions.
9243 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9244 If nil, the region between START and END is replace by the encoded text.
9245 If buffer, the encoded text is inserted in that buffer after point (point
9246 does not move).
9247 In those cases, the length of the encoded text is returned.
9248 If DESTINATION is t, the encoded text is returned.
9250 This function sets `last-coding-system-used' to the precise coding system
9251 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9252 not fully specified.) */)
9253 (start, end, coding_system, destination)
9254 Lisp_Object start, end, coding_system, destination;
9256 return code_convert_region (start, end, coding_system, destination, 1, 0);
9259 Lisp_Object
9260 code_convert_string (string, coding_system, dst_object,
9261 encodep, nocopy, norecord)
9262 Lisp_Object string, coding_system, dst_object;
9263 int encodep, nocopy, norecord;
9265 struct coding_system coding;
9266 EMACS_INT chars, bytes;
9268 CHECK_STRING (string);
9269 if (NILP (coding_system))
9271 if (! norecord)
9272 Vlast_coding_system_used = Qno_conversion;
9273 if (NILP (dst_object))
9274 return (nocopy ? Fcopy_sequence (string) : string);
9277 if (NILP (coding_system))
9278 coding_system = Qno_conversion;
9279 else
9280 CHECK_CODING_SYSTEM (coding_system);
9281 if (NILP (dst_object))
9282 dst_object = Qt;
9283 else if (! EQ (dst_object, Qt))
9284 CHECK_BUFFER (dst_object);
9286 setup_coding_system (coding_system, &coding);
9287 coding.mode |= CODING_MODE_LAST_BLOCK;
9288 chars = SCHARS (string);
9289 bytes = SBYTES (string);
9290 if (encodep)
9291 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9292 else
9293 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9294 if (! norecord)
9295 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9297 return (BUFFERP (dst_object)
9298 ? make_number (coding.produced_char)
9299 : coding.dst_object);
9303 /* Encode or decode STRING according to CODING_SYSTEM.
9304 Do not set Vlast_coding_system_used.
9306 This function is called only from macros DECODE_FILE and
9307 ENCODE_FILE, thus we ignore character composition. */
9309 Lisp_Object
9310 code_convert_string_norecord (string, coding_system, encodep)
9311 Lisp_Object string, coding_system;
9312 int encodep;
9314 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9318 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9319 2, 4, 0,
9320 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9322 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9323 if the decoding operation is trivial.
9325 Optional fourth arg BUFFER non-nil means that the decoded text is
9326 inserted in that buffer after point (point does not move). In this
9327 case, the return value is the length of the decoded text.
9329 This function sets `last-coding-system-used' to the precise coding system
9330 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9331 not fully specified.) */)
9332 (string, coding_system, nocopy, buffer)
9333 Lisp_Object string, coding_system, nocopy, buffer;
9335 return code_convert_string (string, coding_system, buffer,
9336 0, ! NILP (nocopy), 0);
9339 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9340 2, 4, 0,
9341 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9343 Optional third arg NOCOPY non-nil means it is OK to return STRING
9344 itself if the encoding operation is trivial.
9346 Optional fourth arg BUFFER non-nil means that the encoded text is
9347 inserted in that buffer after point (point does not move). In this
9348 case, the return value is the length of the encoded text.
9350 This function sets `last-coding-system-used' to the precise coding system
9351 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9352 not fully specified.) */)
9353 (string, coding_system, nocopy, buffer)
9354 Lisp_Object string, coding_system, nocopy, buffer;
9356 return code_convert_string (string, coding_system, buffer,
9357 1, ! NILP (nocopy), 1);
9361 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9362 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9363 Return the corresponding character. */)
9364 (code)
9365 Lisp_Object code;
9367 Lisp_Object spec, attrs, val;
9368 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9369 int c;
9371 CHECK_NATNUM (code);
9372 c = XFASTINT (code);
9373 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9374 attrs = AREF (spec, 0);
9376 if (ASCII_BYTE_P (c)
9377 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9378 return code;
9380 val = CODING_ATTR_CHARSET_LIST (attrs);
9381 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9382 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9383 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9385 if (c <= 0x7F)
9386 charset = charset_roman;
9387 else if (c >= 0xA0 && c < 0xDF)
9389 charset = charset_kana;
9390 c -= 0x80;
9392 else
9394 int s1 = c >> 8, s2 = c & 0xFF;
9396 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9397 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9398 error ("Invalid code: %d", code);
9399 SJIS_TO_JIS (c);
9400 charset = charset_kanji;
9402 c = DECODE_CHAR (charset, c);
9403 if (c < 0)
9404 error ("Invalid code: %d", code);
9405 return make_number (c);
9409 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9410 doc: /* Encode a Japanese character CH to shift_jis encoding.
9411 Return the corresponding code in SJIS. */)
9412 (ch)
9413 Lisp_Object ch;
9415 Lisp_Object spec, attrs, charset_list;
9416 int c;
9417 struct charset *charset;
9418 unsigned code;
9420 CHECK_CHARACTER (ch);
9421 c = XFASTINT (ch);
9422 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9423 attrs = AREF (spec, 0);
9425 if (ASCII_CHAR_P (c)
9426 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9427 return ch;
9429 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9430 charset = char_charset (c, charset_list, &code);
9431 if (code == CHARSET_INVALID_CODE (charset))
9432 error ("Can't encode by shift_jis encoding: %d", c);
9433 JIS_TO_SJIS (code);
9435 return make_number (code);
9438 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9439 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9440 Return the corresponding character. */)
9441 (code)
9442 Lisp_Object code;
9444 Lisp_Object spec, attrs, val;
9445 struct charset *charset_roman, *charset_big5, *charset;
9446 int c;
9448 CHECK_NATNUM (code);
9449 c = XFASTINT (code);
9450 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9451 attrs = AREF (spec, 0);
9453 if (ASCII_BYTE_P (c)
9454 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9455 return code;
9457 val = CODING_ATTR_CHARSET_LIST (attrs);
9458 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9459 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9461 if (c <= 0x7F)
9462 charset = charset_roman;
9463 else
9465 int b1 = c >> 8, b2 = c & 0x7F;
9466 if (b1 < 0xA1 || b1 > 0xFE
9467 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9468 error ("Invalid code: %d", code);
9469 charset = charset_big5;
9471 c = DECODE_CHAR (charset, (unsigned )c);
9472 if (c < 0)
9473 error ("Invalid code: %d", code);
9474 return make_number (c);
9477 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9478 doc: /* Encode the Big5 character CH to BIG5 coding system.
9479 Return the corresponding character code in Big5. */)
9480 (ch)
9481 Lisp_Object ch;
9483 Lisp_Object spec, attrs, charset_list;
9484 struct charset *charset;
9485 int c;
9486 unsigned code;
9488 CHECK_CHARACTER (ch);
9489 c = XFASTINT (ch);
9490 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9491 attrs = AREF (spec, 0);
9492 if (ASCII_CHAR_P (c)
9493 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9494 return ch;
9496 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9497 charset = char_charset (c, charset_list, &code);
9498 if (code == CHARSET_INVALID_CODE (charset))
9499 error ("Can't encode by Big5 encoding: %d", c);
9501 return make_number (code);
9505 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9506 Sset_terminal_coding_system_internal, 1, 2, 0,
9507 doc: /* Internal use only. */)
9508 (coding_system, terminal)
9509 Lisp_Object coding_system;
9510 Lisp_Object terminal;
9512 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9513 CHECK_SYMBOL (coding_system);
9514 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9515 /* We had better not send unsafe characters to terminal. */
9516 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9517 /* Character composition should be disabled. */
9518 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9519 terminal_coding->src_multibyte = 1;
9520 terminal_coding->dst_multibyte = 0;
9521 return Qnil;
9524 DEFUN ("set-safe-terminal-coding-system-internal",
9525 Fset_safe_terminal_coding_system_internal,
9526 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9527 doc: /* Internal use only. */)
9528 (coding_system)
9529 Lisp_Object coding_system;
9531 CHECK_SYMBOL (coding_system);
9532 setup_coding_system (Fcheck_coding_system (coding_system),
9533 &safe_terminal_coding);
9534 /* Character composition should be disabled. */
9535 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9536 safe_terminal_coding.src_multibyte = 1;
9537 safe_terminal_coding.dst_multibyte = 0;
9538 return Qnil;
9541 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9542 Sterminal_coding_system, 0, 1, 0,
9543 doc: /* Return coding system specified for terminal output on the given terminal.
9544 TERMINAL may be a terminal object, a frame, or nil for the selected
9545 frame's terminal device. */)
9546 (terminal)
9547 Lisp_Object terminal;
9549 struct coding_system *terminal_coding
9550 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9551 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9553 /* For backward compatibility, return nil if it is `undecided'. */
9554 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9557 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9558 Sset_keyboard_coding_system_internal, 1, 2, 0,
9559 doc: /* Internal use only. */)
9560 (coding_system, terminal)
9561 Lisp_Object coding_system;
9562 Lisp_Object terminal;
9564 struct terminal *t = get_terminal (terminal, 1);
9565 CHECK_SYMBOL (coding_system);
9566 if (NILP (coding_system))
9567 coding_system = Qno_conversion;
9568 else
9569 Fcheck_coding_system (coding_system);
9570 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9571 /* Character composition should be disabled. */
9572 TERMINAL_KEYBOARD_CODING (t)->common_flags
9573 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9574 return Qnil;
9577 DEFUN ("keyboard-coding-system",
9578 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9579 doc: /* Return coding system specified for decoding keyboard input. */)
9580 (terminal)
9581 Lisp_Object terminal;
9583 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9584 (get_terminal (terminal, 1))->id);
9588 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9589 Sfind_operation_coding_system, 1, MANY, 0,
9590 doc: /* Choose a coding system for an operation based on the target name.
9591 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9592 DECODING-SYSTEM is the coding system to use for decoding
9593 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9594 for encoding (in case OPERATION does encoding).
9596 The first argument OPERATION specifies an I/O primitive:
9597 For file I/O, `insert-file-contents' or `write-region'.
9598 For process I/O, `call-process', `call-process-region', or `start-process'.
9599 For network I/O, `open-network-stream'.
9601 The remaining arguments should be the same arguments that were passed
9602 to the primitive. Depending on which primitive, one of those arguments
9603 is selected as the TARGET. For example, if OPERATION does file I/O,
9604 whichever argument specifies the file name is TARGET.
9606 TARGET has a meaning which depends on OPERATION:
9607 For file I/O, TARGET is a file name (except for the special case below).
9608 For process I/O, TARGET is a process name.
9609 For network I/O, TARGET is a service name or a port number.
9611 This function looks up what is specified for TARGET in
9612 `file-coding-system-alist', `process-coding-system-alist',
9613 or `network-coding-system-alist' depending on OPERATION.
9614 They may specify a coding system, a cons of coding systems,
9615 or a function symbol to call.
9616 In the last case, we call the function with one argument,
9617 which is a list of all the arguments given to this function.
9618 If the function can't decide a coding system, it can return
9619 `undecided' so that the normal code-detection is performed.
9621 If OPERATION is `insert-file-contents', the argument corresponding to
9622 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9623 file name to look up, and BUFFER is a buffer that contains the file's
9624 contents (not yet decoded). If `file-coding-system-alist' specifies a
9625 function to call for FILENAME, that function should examine the
9626 contents of BUFFER instead of reading the file.
9628 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9629 (nargs, args)
9630 int nargs;
9631 Lisp_Object *args;
9633 Lisp_Object operation, target_idx, target, val;
9634 register Lisp_Object chain;
9636 if (nargs < 2)
9637 error ("Too few arguments");
9638 operation = args[0];
9639 if (!SYMBOLP (operation)
9640 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9641 error ("Invalid first argument");
9642 if (nargs < 1 + XINT (target_idx))
9643 error ("Too few arguments for operation: %s",
9644 SDATA (SYMBOL_NAME (operation)));
9645 target = args[XINT (target_idx) + 1];
9646 if (!(STRINGP (target)
9647 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9648 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9649 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9650 error ("Invalid %dth argument", XINT (target_idx) + 1);
9651 if (CONSP (target))
9652 target = XCAR (target);
9654 chain = ((EQ (operation, Qinsert_file_contents)
9655 || EQ (operation, Qwrite_region))
9656 ? Vfile_coding_system_alist
9657 : (EQ (operation, Qopen_network_stream)
9658 ? Vnetwork_coding_system_alist
9659 : Vprocess_coding_system_alist));
9660 if (NILP (chain))
9661 return Qnil;
9663 for (; CONSP (chain); chain = XCDR (chain))
9665 Lisp_Object elt;
9667 elt = XCAR (chain);
9668 if (CONSP (elt)
9669 && ((STRINGP (target)
9670 && STRINGP (XCAR (elt))
9671 && fast_string_match (XCAR (elt), target) >= 0)
9672 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9674 val = XCDR (elt);
9675 /* Here, if VAL is both a valid coding system and a valid
9676 function symbol, we return VAL as a coding system. */
9677 if (CONSP (val))
9678 return val;
9679 if (! SYMBOLP (val))
9680 return Qnil;
9681 if (! NILP (Fcoding_system_p (val)))
9682 return Fcons (val, val);
9683 if (! NILP (Ffboundp (val)))
9685 /* We use call1 rather than safe_call1
9686 so as to get bug reports about functions called here
9687 which don't handle the current interface. */
9688 val = call1 (val, Flist (nargs, args));
9689 if (CONSP (val))
9690 return val;
9691 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9692 return Fcons (val, val);
9694 return Qnil;
9697 return Qnil;
9700 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9701 Sset_coding_system_priority, 0, MANY, 0,
9702 doc: /* Assign higher priority to the coding systems given as arguments.
9703 If multiple coding systems belong to the same category,
9704 all but the first one are ignored.
9706 usage: (set-coding-system-priority &rest coding-systems) */)
9707 (nargs, args)
9708 int nargs;
9709 Lisp_Object *args;
9711 int i, j;
9712 int changed[coding_category_max];
9713 enum coding_category priorities[coding_category_max];
9715 bzero (changed, sizeof changed);
9717 for (i = j = 0; i < nargs; i++)
9719 enum coding_category category;
9720 Lisp_Object spec, attrs;
9722 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9723 attrs = AREF (spec, 0);
9724 category = XINT (CODING_ATTR_CATEGORY (attrs));
9725 if (changed[category])
9726 /* Ignore this coding system because a coding system of the
9727 same category already had a higher priority. */
9728 continue;
9729 changed[category] = 1;
9730 priorities[j++] = category;
9731 if (coding_categories[category].id >= 0
9732 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9733 setup_coding_system (args[i], &coding_categories[category]);
9734 Fset (AREF (Vcoding_category_table, category), args[i]);
9737 /* Now we have decided top J priorities. Reflect the order of the
9738 original priorities to the remaining priorities. */
9740 for (i = j, j = 0; i < coding_category_max; i++, j++)
9742 while (j < coding_category_max
9743 && changed[coding_priorities[j]])
9744 j++;
9745 if (j == coding_category_max)
9746 abort ();
9747 priorities[i] = coding_priorities[j];
9750 bcopy (priorities, coding_priorities, sizeof priorities);
9752 /* Update `coding-category-list'. */
9753 Vcoding_category_list = Qnil;
9754 for (i = coding_category_max - 1; i >= 0; i--)
9755 Vcoding_category_list
9756 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9757 Vcoding_category_list);
9759 return Qnil;
9762 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9763 Scoding_system_priority_list, 0, 1, 0,
9764 doc: /* Return a list of coding systems ordered by their priorities.
9765 The list contains a subset of coding systems; i.e. coding systems
9766 assigned to each coding category (see `coding-category-list').
9768 HIGHESTP non-nil means just return the highest priority one. */)
9769 (highestp)
9770 Lisp_Object highestp;
9772 int i;
9773 Lisp_Object val;
9775 for (i = 0, val = Qnil; i < coding_category_max; i++)
9777 enum coding_category category = coding_priorities[i];
9778 int id = coding_categories[category].id;
9779 Lisp_Object attrs;
9781 if (id < 0)
9782 continue;
9783 attrs = CODING_ID_ATTRS (id);
9784 if (! NILP (highestp))
9785 return CODING_ATTR_BASE_NAME (attrs);
9786 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9788 return Fnreverse (val);
9791 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9793 static Lisp_Object
9794 make_subsidiaries (base)
9795 Lisp_Object base;
9797 Lisp_Object subsidiaries;
9798 int base_name_len = SBYTES (SYMBOL_NAME (base));
9799 char *buf = (char *) alloca (base_name_len + 6);
9800 int i;
9802 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
9803 subsidiaries = Fmake_vector (make_number (3), Qnil);
9804 for (i = 0; i < 3; i++)
9806 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9807 ASET (subsidiaries, i, intern (buf));
9809 return subsidiaries;
9813 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9814 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9815 doc: /* For internal use only.
9816 usage: (define-coding-system-internal ...) */)
9817 (nargs, args)
9818 int nargs;
9819 Lisp_Object *args;
9821 Lisp_Object name;
9822 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9823 Lisp_Object attrs; /* Vector of attributes. */
9824 Lisp_Object eol_type;
9825 Lisp_Object aliases;
9826 Lisp_Object coding_type, charset_list, safe_charsets;
9827 enum coding_category category;
9828 Lisp_Object tail, val;
9829 int max_charset_id = 0;
9830 int i;
9832 if (nargs < coding_arg_max)
9833 goto short_args;
9835 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9837 name = args[coding_arg_name];
9838 CHECK_SYMBOL (name);
9839 CODING_ATTR_BASE_NAME (attrs) = name;
9841 val = args[coding_arg_mnemonic];
9842 if (! STRINGP (val))
9843 CHECK_CHARACTER (val);
9844 CODING_ATTR_MNEMONIC (attrs) = val;
9846 coding_type = args[coding_arg_coding_type];
9847 CHECK_SYMBOL (coding_type);
9848 CODING_ATTR_TYPE (attrs) = coding_type;
9850 charset_list = args[coding_arg_charset_list];
9851 if (SYMBOLP (charset_list))
9853 if (EQ (charset_list, Qiso_2022))
9855 if (! EQ (coding_type, Qiso_2022))
9856 error ("Invalid charset-list");
9857 charset_list = Viso_2022_charset_list;
9859 else if (EQ (charset_list, Qemacs_mule))
9861 if (! EQ (coding_type, Qemacs_mule))
9862 error ("Invalid charset-list");
9863 charset_list = Vemacs_mule_charset_list;
9865 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9866 if (max_charset_id < XFASTINT (XCAR (tail)))
9867 max_charset_id = XFASTINT (XCAR (tail));
9869 else
9871 charset_list = Fcopy_sequence (charset_list);
9872 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9874 struct charset *charset;
9876 val = XCAR (tail);
9877 CHECK_CHARSET_GET_CHARSET (val, charset);
9878 if (EQ (coding_type, Qiso_2022)
9879 ? CHARSET_ISO_FINAL (charset) < 0
9880 : EQ (coding_type, Qemacs_mule)
9881 ? CHARSET_EMACS_MULE_ID (charset) < 0
9882 : 0)
9883 error ("Can't handle charset `%s'",
9884 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9886 XSETCAR (tail, make_number (charset->id));
9887 if (max_charset_id < charset->id)
9888 max_charset_id = charset->id;
9891 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9893 safe_charsets = make_uninit_string (max_charset_id + 1);
9894 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9895 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9896 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9897 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9899 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9901 val = args[coding_arg_decode_translation_table];
9902 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9903 CHECK_SYMBOL (val);
9904 CODING_ATTR_DECODE_TBL (attrs) = val;
9906 val = args[coding_arg_encode_translation_table];
9907 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9908 CHECK_SYMBOL (val);
9909 CODING_ATTR_ENCODE_TBL (attrs) = val;
9911 val = args[coding_arg_post_read_conversion];
9912 CHECK_SYMBOL (val);
9913 CODING_ATTR_POST_READ (attrs) = val;
9915 val = args[coding_arg_pre_write_conversion];
9916 CHECK_SYMBOL (val);
9917 CODING_ATTR_PRE_WRITE (attrs) = val;
9919 val = args[coding_arg_default_char];
9920 if (NILP (val))
9921 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9922 else
9924 CHECK_CHARACTER (val);
9925 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9928 val = args[coding_arg_for_unibyte];
9929 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9931 val = args[coding_arg_plist];
9932 CHECK_LIST (val);
9933 CODING_ATTR_PLIST (attrs) = val;
9935 if (EQ (coding_type, Qcharset))
9937 /* Generate a lisp vector of 256 elements. Each element is nil,
9938 integer, or a list of charset IDs.
9940 If Nth element is nil, the byte code N is invalid in this
9941 coding system.
9943 If Nth element is a number NUM, N is the first byte of a
9944 charset whose ID is NUM.
9946 If Nth element is a list of charset IDs, N is the first byte
9947 of one of them. The list is sorted by dimensions of the
9948 charsets. A charset of smaller dimension comes first. */
9949 val = Fmake_vector (make_number (256), Qnil);
9951 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9953 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9954 int dim = CHARSET_DIMENSION (charset);
9955 int idx = (dim - 1) * 4;
9957 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9958 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9960 for (i = charset->code_space[idx];
9961 i <= charset->code_space[idx + 1]; i++)
9963 Lisp_Object tmp, tmp2;
9964 int dim2;
9966 tmp = AREF (val, i);
9967 if (NILP (tmp))
9968 tmp = XCAR (tail);
9969 else if (NUMBERP (tmp))
9971 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9972 if (dim < dim2)
9973 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9974 else
9975 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9977 else
9979 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9981 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9982 if (dim < dim2)
9983 break;
9985 if (NILP (tmp2))
9986 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9987 else
9989 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9990 XSETCAR (tmp2, XCAR (tail));
9993 ASET (val, i, tmp);
9996 ASET (attrs, coding_attr_charset_valids, val);
9997 category = coding_category_charset;
9999 else if (EQ (coding_type, Qccl))
10001 Lisp_Object valids;
10003 if (nargs < coding_arg_ccl_max)
10004 goto short_args;
10006 val = args[coding_arg_ccl_decoder];
10007 CHECK_CCL_PROGRAM (val);
10008 if (VECTORP (val))
10009 val = Fcopy_sequence (val);
10010 ASET (attrs, coding_attr_ccl_decoder, val);
10012 val = args[coding_arg_ccl_encoder];
10013 CHECK_CCL_PROGRAM (val);
10014 if (VECTORP (val))
10015 val = Fcopy_sequence (val);
10016 ASET (attrs, coding_attr_ccl_encoder, val);
10018 val = args[coding_arg_ccl_valids];
10019 valids = Fmake_string (make_number (256), make_number (0));
10020 for (tail = val; !NILP (tail); tail = Fcdr (tail))
10022 int from, to;
10024 val = Fcar (tail);
10025 if (INTEGERP (val))
10027 from = to = XINT (val);
10028 if (from < 0 || from > 255)
10029 args_out_of_range_3 (val, make_number (0), make_number (255));
10031 else
10033 CHECK_CONS (val);
10034 CHECK_NATNUM_CAR (val);
10035 CHECK_NATNUM_CDR (val);
10036 from = XINT (XCAR (val));
10037 if (from > 255)
10038 args_out_of_range_3 (XCAR (val),
10039 make_number (0), make_number (255));
10040 to = XINT (XCDR (val));
10041 if (to < from || to > 255)
10042 args_out_of_range_3 (XCDR (val),
10043 XCAR (val), make_number (255));
10045 for (i = from; i <= to; i++)
10046 SSET (valids, i, 1);
10048 ASET (attrs, coding_attr_ccl_valids, valids);
10050 category = coding_category_ccl;
10052 else if (EQ (coding_type, Qutf_16))
10054 Lisp_Object bom, endian;
10056 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
10058 if (nargs < coding_arg_utf16_max)
10059 goto short_args;
10061 bom = args[coding_arg_utf16_bom];
10062 if (! NILP (bom) && ! EQ (bom, Qt))
10064 CHECK_CONS (bom);
10065 val = XCAR (bom);
10066 CHECK_CODING_SYSTEM (val);
10067 val = XCDR (bom);
10068 CHECK_CODING_SYSTEM (val);
10070 ASET (attrs, coding_attr_utf_bom, bom);
10072 endian = args[coding_arg_utf16_endian];
10073 CHECK_SYMBOL (endian);
10074 if (NILP (endian))
10075 endian = Qbig;
10076 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10077 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10078 ASET (attrs, coding_attr_utf_16_endian, endian);
10080 category = (CONSP (bom)
10081 ? coding_category_utf_16_auto
10082 : NILP (bom)
10083 ? (EQ (endian, Qbig)
10084 ? coding_category_utf_16_be_nosig
10085 : coding_category_utf_16_le_nosig)
10086 : (EQ (endian, Qbig)
10087 ? coding_category_utf_16_be
10088 : coding_category_utf_16_le));
10090 else if (EQ (coding_type, Qiso_2022))
10092 Lisp_Object initial, reg_usage, request, flags;
10093 int i;
10095 if (nargs < coding_arg_iso2022_max)
10096 goto short_args;
10098 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10099 CHECK_VECTOR (initial);
10100 for (i = 0; i < 4; i++)
10102 val = Faref (initial, make_number (i));
10103 if (! NILP (val))
10105 struct charset *charset;
10107 CHECK_CHARSET_GET_CHARSET (val, charset);
10108 ASET (initial, i, make_number (CHARSET_ID (charset)));
10109 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10110 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10112 else
10113 ASET (initial, i, make_number (-1));
10116 reg_usage = args[coding_arg_iso2022_reg_usage];
10117 CHECK_CONS (reg_usage);
10118 CHECK_NUMBER_CAR (reg_usage);
10119 CHECK_NUMBER_CDR (reg_usage);
10121 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10122 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
10124 int id;
10125 Lisp_Object tmp;
10127 val = Fcar (tail);
10128 CHECK_CONS (val);
10129 tmp = XCAR (val);
10130 CHECK_CHARSET_GET_ID (tmp, id);
10131 CHECK_NATNUM_CDR (val);
10132 if (XINT (XCDR (val)) >= 4)
10133 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
10134 XSETCAR (val, make_number (id));
10137 flags = args[coding_arg_iso2022_flags];
10138 CHECK_NATNUM (flags);
10139 i = XINT (flags);
10140 if (EQ (args[coding_arg_charset_list], Qiso_2022))
10141 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
10143 ASET (attrs, coding_attr_iso_initial, initial);
10144 ASET (attrs, coding_attr_iso_usage, reg_usage);
10145 ASET (attrs, coding_attr_iso_request, request);
10146 ASET (attrs, coding_attr_iso_flags, flags);
10147 setup_iso_safe_charsets (attrs);
10149 if (i & CODING_ISO_FLAG_SEVEN_BITS)
10150 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10151 | CODING_ISO_FLAG_SINGLE_SHIFT))
10152 ? coding_category_iso_7_else
10153 : EQ (args[coding_arg_charset_list], Qiso_2022)
10154 ? coding_category_iso_7
10155 : coding_category_iso_7_tight);
10156 else
10158 int id = XINT (AREF (initial, 1));
10160 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10161 || EQ (args[coding_arg_charset_list], Qiso_2022)
10162 || id < 0)
10163 ? coding_category_iso_8_else
10164 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10165 ? coding_category_iso_8_1
10166 : coding_category_iso_8_2);
10168 if (category != coding_category_iso_8_1
10169 && category != coding_category_iso_8_2)
10170 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
10172 else if (EQ (coding_type, Qemacs_mule))
10174 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10175 ASET (attrs, coding_attr_emacs_mule_full, Qt);
10176 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10177 category = coding_category_emacs_mule;
10179 else if (EQ (coding_type, Qshift_jis))
10182 struct charset *charset;
10184 if (XINT (Flength (charset_list)) != 3
10185 && XINT (Flength (charset_list)) != 4)
10186 error ("There should be three or four charsets");
10188 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10189 if (CHARSET_DIMENSION (charset) != 1)
10190 error ("Dimension of charset %s is not one",
10191 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10192 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10193 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10195 charset_list = XCDR (charset_list);
10196 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10197 if (CHARSET_DIMENSION (charset) != 1)
10198 error ("Dimension of charset %s is not one",
10199 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10201 charset_list = XCDR (charset_list);
10202 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10203 if (CHARSET_DIMENSION (charset) != 2)
10204 error ("Dimension of charset %s is not two",
10205 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10207 charset_list = XCDR (charset_list);
10208 if (! NILP (charset_list))
10210 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10211 if (CHARSET_DIMENSION (charset) != 2)
10212 error ("Dimension of charset %s is not two",
10213 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10216 category = coding_category_sjis;
10217 Vsjis_coding_system = name;
10219 else if (EQ (coding_type, Qbig5))
10221 struct charset *charset;
10223 if (XINT (Flength (charset_list)) != 2)
10224 error ("There should be just two charsets");
10226 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10227 if (CHARSET_DIMENSION (charset) != 1)
10228 error ("Dimension of charset %s is not one",
10229 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10230 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10231 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10233 charset_list = XCDR (charset_list);
10234 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10235 if (CHARSET_DIMENSION (charset) != 2)
10236 error ("Dimension of charset %s is not two",
10237 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10239 category = coding_category_big5;
10240 Vbig5_coding_system = name;
10242 else if (EQ (coding_type, Qraw_text))
10244 category = coding_category_raw_text;
10245 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10247 else if (EQ (coding_type, Qutf_8))
10249 Lisp_Object bom;
10251 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10253 if (nargs < coding_arg_utf8_max)
10254 goto short_args;
10256 bom = args[coding_arg_utf8_bom];
10257 if (! NILP (bom) && ! EQ (bom, Qt))
10259 CHECK_CONS (bom);
10260 val = XCAR (bom);
10261 CHECK_CODING_SYSTEM (val);
10262 val = XCDR (bom);
10263 CHECK_CODING_SYSTEM (val);
10265 ASET (attrs, coding_attr_utf_bom, bom);
10267 category = (CONSP (bom) ? coding_category_utf_8_auto
10268 : NILP (bom) ? coding_category_utf_8_nosig
10269 : coding_category_utf_8_sig);
10271 else if (EQ (coding_type, Qundecided))
10272 category = coding_category_undecided;
10273 else
10274 error ("Invalid coding system type: %s",
10275 SDATA (SYMBOL_NAME (coding_type)));
10277 CODING_ATTR_CATEGORY (attrs) = make_number (category);
10278 CODING_ATTR_PLIST (attrs)
10279 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10280 CODING_ATTR_PLIST (attrs)));
10281 CODING_ATTR_PLIST (attrs)
10282 = Fcons (QCascii_compatible_p,
10283 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10284 CODING_ATTR_PLIST (attrs)));
10286 eol_type = args[coding_arg_eol_type];
10287 if (! NILP (eol_type)
10288 && ! EQ (eol_type, Qunix)
10289 && ! EQ (eol_type, Qdos)
10290 && ! EQ (eol_type, Qmac))
10291 error ("Invalid eol-type");
10293 aliases = Fcons (name, Qnil);
10295 if (NILP (eol_type))
10297 eol_type = make_subsidiaries (name);
10298 for (i = 0; i < 3; i++)
10300 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10302 this_name = AREF (eol_type, i);
10303 this_aliases = Fcons (this_name, Qnil);
10304 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10305 this_spec = Fmake_vector (make_number (3), attrs);
10306 ASET (this_spec, 1, this_aliases);
10307 ASET (this_spec, 2, this_eol_type);
10308 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10309 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10310 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10311 if (NILP (val))
10312 Vcoding_system_alist
10313 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10314 Vcoding_system_alist);
10318 spec_vec = Fmake_vector (make_number (3), attrs);
10319 ASET (spec_vec, 1, aliases);
10320 ASET (spec_vec, 2, eol_type);
10322 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10323 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10324 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10325 if (NILP (val))
10326 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10327 Vcoding_system_alist);
10330 int id = coding_categories[category].id;
10332 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10333 setup_coding_system (name, &coding_categories[category]);
10336 return Qnil;
10338 short_args:
10339 return Fsignal (Qwrong_number_of_arguments,
10340 Fcons (intern ("define-coding-system-internal"),
10341 make_number (nargs)));
10345 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10346 3, 3, 0,
10347 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10348 (coding_system, prop, val)
10349 Lisp_Object coding_system, prop, val;
10351 Lisp_Object spec, attrs;
10353 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10354 attrs = AREF (spec, 0);
10355 if (EQ (prop, QCmnemonic))
10357 if (! STRINGP (val))
10358 CHECK_CHARACTER (val);
10359 CODING_ATTR_MNEMONIC (attrs) = val;
10361 else if (EQ (prop, QCdefault_char))
10363 if (NILP (val))
10364 val = make_number (' ');
10365 else
10366 CHECK_CHARACTER (val);
10367 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10369 else if (EQ (prop, QCdecode_translation_table))
10371 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10372 CHECK_SYMBOL (val);
10373 CODING_ATTR_DECODE_TBL (attrs) = val;
10375 else if (EQ (prop, QCencode_translation_table))
10377 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10378 CHECK_SYMBOL (val);
10379 CODING_ATTR_ENCODE_TBL (attrs) = val;
10381 else if (EQ (prop, QCpost_read_conversion))
10383 CHECK_SYMBOL (val);
10384 CODING_ATTR_POST_READ (attrs) = val;
10386 else if (EQ (prop, QCpre_write_conversion))
10388 CHECK_SYMBOL (val);
10389 CODING_ATTR_PRE_WRITE (attrs) = val;
10391 else if (EQ (prop, QCascii_compatible_p))
10393 CODING_ATTR_ASCII_COMPAT (attrs) = val;
10396 CODING_ATTR_PLIST (attrs)
10397 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10398 return val;
10402 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10403 Sdefine_coding_system_alias, 2, 2, 0,
10404 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10405 (alias, coding_system)
10406 Lisp_Object alias, coding_system;
10408 Lisp_Object spec, aliases, eol_type, val;
10410 CHECK_SYMBOL (alias);
10411 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10412 aliases = AREF (spec, 1);
10413 /* ALIASES should be a list of length more than zero, and the first
10414 element is a base coding system. Append ALIAS at the tail of the
10415 list. */
10416 while (!NILP (XCDR (aliases)))
10417 aliases = XCDR (aliases);
10418 XSETCDR (aliases, Fcons (alias, Qnil));
10420 eol_type = AREF (spec, 2);
10421 if (VECTORP (eol_type))
10423 Lisp_Object subsidiaries;
10424 int i;
10426 subsidiaries = make_subsidiaries (alias);
10427 for (i = 0; i < 3; i++)
10428 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10429 AREF (eol_type, i));
10432 Fputhash (alias, spec, Vcoding_system_hash_table);
10433 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10434 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10435 if (NILP (val))
10436 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10437 Vcoding_system_alist);
10439 return Qnil;
10442 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10443 1, 1, 0,
10444 doc: /* Return the base of CODING-SYSTEM.
10445 Any alias or subsidiary coding system is not a base coding system. */)
10446 (coding_system)
10447 Lisp_Object coding_system;
10449 Lisp_Object spec, attrs;
10451 if (NILP (coding_system))
10452 return (Qno_conversion);
10453 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10454 attrs = AREF (spec, 0);
10455 return CODING_ATTR_BASE_NAME (attrs);
10458 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10459 1, 1, 0,
10460 doc: "Return the property list of CODING-SYSTEM.")
10461 (coding_system)
10462 Lisp_Object coding_system;
10464 Lisp_Object spec, attrs;
10466 if (NILP (coding_system))
10467 coding_system = Qno_conversion;
10468 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10469 attrs = AREF (spec, 0);
10470 return CODING_ATTR_PLIST (attrs);
10474 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10475 1, 1, 0,
10476 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10477 (coding_system)
10478 Lisp_Object coding_system;
10480 Lisp_Object spec;
10482 if (NILP (coding_system))
10483 coding_system = Qno_conversion;
10484 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10485 return AREF (spec, 1);
10488 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10489 Scoding_system_eol_type, 1, 1, 0,
10490 doc: /* Return eol-type of CODING-SYSTEM.
10491 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10493 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10494 and CR respectively.
10496 A vector value indicates that a format of end-of-line should be
10497 detected automatically. Nth element of the vector is the subsidiary
10498 coding system whose eol-type is N. */)
10499 (coding_system)
10500 Lisp_Object coding_system;
10502 Lisp_Object spec, eol_type;
10503 int n;
10505 if (NILP (coding_system))
10506 coding_system = Qno_conversion;
10507 if (! CODING_SYSTEM_P (coding_system))
10508 return Qnil;
10509 spec = CODING_SYSTEM_SPEC (coding_system);
10510 eol_type = AREF (spec, 2);
10511 if (VECTORP (eol_type))
10512 return Fcopy_sequence (eol_type);
10513 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10514 return make_number (n);
10517 #endif /* emacs */
10520 /*** 9. Post-amble ***/
10522 void
10523 init_coding_once ()
10525 int i;
10527 for (i = 0; i < coding_category_max; i++)
10529 coding_categories[i].id = -1;
10530 coding_priorities[i] = i;
10533 /* ISO2022 specific initialize routine. */
10534 for (i = 0; i < 0x20; i++)
10535 iso_code_class[i] = ISO_control_0;
10536 for (i = 0x21; i < 0x7F; i++)
10537 iso_code_class[i] = ISO_graphic_plane_0;
10538 for (i = 0x80; i < 0xA0; i++)
10539 iso_code_class[i] = ISO_control_1;
10540 for (i = 0xA1; i < 0xFF; i++)
10541 iso_code_class[i] = ISO_graphic_plane_1;
10542 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10543 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10544 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10545 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10546 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10547 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10548 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10549 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10550 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10552 for (i = 0; i < 256; i++)
10554 emacs_mule_bytes[i] = 1;
10556 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10557 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10558 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10559 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10562 #ifdef emacs
10564 void
10565 syms_of_coding ()
10567 staticpro (&Vcoding_system_hash_table);
10569 Lisp_Object args[2];
10570 args[0] = QCtest;
10571 args[1] = Qeq;
10572 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10575 staticpro (&Vsjis_coding_system);
10576 Vsjis_coding_system = Qnil;
10578 staticpro (&Vbig5_coding_system);
10579 Vbig5_coding_system = Qnil;
10581 staticpro (&Vcode_conversion_reused_workbuf);
10582 Vcode_conversion_reused_workbuf = Qnil;
10584 staticpro (&Vcode_conversion_workbuf_name);
10585 Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10587 reused_workbuf_in_use = 0;
10589 DEFSYM (Qcharset, "charset");
10590 DEFSYM (Qtarget_idx, "target-idx");
10591 DEFSYM (Qcoding_system_history, "coding-system-history");
10592 Fset (Qcoding_system_history, Qnil);
10594 /* Target FILENAME is the first argument. */
10595 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10596 /* Target FILENAME is the third argument. */
10597 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10599 DEFSYM (Qcall_process, "call-process");
10600 /* Target PROGRAM is the first argument. */
10601 Fput (Qcall_process, Qtarget_idx, make_number (0));
10603 DEFSYM (Qcall_process_region, "call-process-region");
10604 /* Target PROGRAM is the third argument. */
10605 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10607 DEFSYM (Qstart_process, "start-process");
10608 /* Target PROGRAM is the third argument. */
10609 Fput (Qstart_process, Qtarget_idx, make_number (2));
10611 DEFSYM (Qopen_network_stream, "open-network-stream");
10612 /* Target SERVICE is the fourth argument. */
10613 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10615 DEFSYM (Qcoding_system, "coding-system");
10616 DEFSYM (Qcoding_aliases, "coding-aliases");
10618 DEFSYM (Qeol_type, "eol-type");
10619 DEFSYM (Qunix, "unix");
10620 DEFSYM (Qdos, "dos");
10622 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10623 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10624 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10625 DEFSYM (Qdefault_char, "default-char");
10626 DEFSYM (Qundecided, "undecided");
10627 DEFSYM (Qno_conversion, "no-conversion");
10628 DEFSYM (Qraw_text, "raw-text");
10630 DEFSYM (Qiso_2022, "iso-2022");
10632 DEFSYM (Qutf_8, "utf-8");
10633 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10635 DEFSYM (Qutf_16, "utf-16");
10636 DEFSYM (Qbig, "big");
10637 DEFSYM (Qlittle, "little");
10639 DEFSYM (Qshift_jis, "shift-jis");
10640 DEFSYM (Qbig5, "big5");
10642 DEFSYM (Qcoding_system_p, "coding-system-p");
10644 DEFSYM (Qcoding_system_error, "coding-system-error");
10645 Fput (Qcoding_system_error, Qerror_conditions,
10646 pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10647 Fput (Qcoding_system_error, Qerror_message,
10648 make_pure_c_string ("Invalid coding system"));
10650 /* Intern this now in case it isn't already done.
10651 Setting this variable twice is harmless.
10652 But don't staticpro it here--that is done in alloc.c. */
10653 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10655 DEFSYM (Qtranslation_table, "translation-table");
10656 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10657 DEFSYM (Qtranslation_table_id, "translation-table-id");
10658 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10659 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10661 DEFSYM (Qvalid_codes, "valid-codes");
10663 DEFSYM (Qemacs_mule, "emacs-mule");
10665 DEFSYM (QCcategory, ":category");
10666 DEFSYM (QCmnemonic, ":mnemonic");
10667 DEFSYM (QCdefault_char, ":default-char");
10668 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10669 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10670 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10671 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10672 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10674 Vcoding_category_table
10675 = Fmake_vector (make_number (coding_category_max), Qnil);
10676 staticpro (&Vcoding_category_table);
10677 /* Followings are target of code detection. */
10678 ASET (Vcoding_category_table, coding_category_iso_7,
10679 intern_c_string ("coding-category-iso-7"));
10680 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10681 intern_c_string ("coding-category-iso-7-tight"));
10682 ASET (Vcoding_category_table, coding_category_iso_8_1,
10683 intern_c_string ("coding-category-iso-8-1"));
10684 ASET (Vcoding_category_table, coding_category_iso_8_2,
10685 intern_c_string ("coding-category-iso-8-2"));
10686 ASET (Vcoding_category_table, coding_category_iso_7_else,
10687 intern_c_string ("coding-category-iso-7-else"));
10688 ASET (Vcoding_category_table, coding_category_iso_8_else,
10689 intern_c_string ("coding-category-iso-8-else"));
10690 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10691 intern_c_string ("coding-category-utf-8-auto"));
10692 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10693 intern_c_string ("coding-category-utf-8"));
10694 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10695 intern_c_string ("coding-category-utf-8-sig"));
10696 ASET (Vcoding_category_table, coding_category_utf_16_be,
10697 intern_c_string ("coding-category-utf-16-be"));
10698 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10699 intern_c_string ("coding-category-utf-16-auto"));
10700 ASET (Vcoding_category_table, coding_category_utf_16_le,
10701 intern_c_string ("coding-category-utf-16-le"));
10702 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10703 intern_c_string ("coding-category-utf-16-be-nosig"));
10704 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10705 intern_c_string ("coding-category-utf-16-le-nosig"));
10706 ASET (Vcoding_category_table, coding_category_charset,
10707 intern_c_string ("coding-category-charset"));
10708 ASET (Vcoding_category_table, coding_category_sjis,
10709 intern_c_string ("coding-category-sjis"));
10710 ASET (Vcoding_category_table, coding_category_big5,
10711 intern_c_string ("coding-category-big5"));
10712 ASET (Vcoding_category_table, coding_category_ccl,
10713 intern_c_string ("coding-category-ccl"));
10714 ASET (Vcoding_category_table, coding_category_emacs_mule,
10715 intern_c_string ("coding-category-emacs-mule"));
10716 /* Followings are NOT target of code detection. */
10717 ASET (Vcoding_category_table, coding_category_raw_text,
10718 intern_c_string ("coding-category-raw-text"));
10719 ASET (Vcoding_category_table, coding_category_undecided,
10720 intern_c_string ("coding-category-undecided"));
10722 DEFSYM (Qinsufficient_source, "insufficient-source");
10723 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10724 DEFSYM (Qinvalid_source, "invalid-source");
10725 DEFSYM (Qinterrupted, "interrupted");
10726 DEFSYM (Qinsufficient_memory, "insufficient-memory");
10727 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10729 defsubr (&Scoding_system_p);
10730 defsubr (&Sread_coding_system);
10731 defsubr (&Sread_non_nil_coding_system);
10732 defsubr (&Scheck_coding_system);
10733 defsubr (&Sdetect_coding_region);
10734 defsubr (&Sdetect_coding_string);
10735 defsubr (&Sfind_coding_systems_region_internal);
10736 defsubr (&Sunencodable_char_position);
10737 defsubr (&Scheck_coding_systems_region);
10738 defsubr (&Sdecode_coding_region);
10739 defsubr (&Sencode_coding_region);
10740 defsubr (&Sdecode_coding_string);
10741 defsubr (&Sencode_coding_string);
10742 defsubr (&Sdecode_sjis_char);
10743 defsubr (&Sencode_sjis_char);
10744 defsubr (&Sdecode_big5_char);
10745 defsubr (&Sencode_big5_char);
10746 defsubr (&Sset_terminal_coding_system_internal);
10747 defsubr (&Sset_safe_terminal_coding_system_internal);
10748 defsubr (&Sterminal_coding_system);
10749 defsubr (&Sset_keyboard_coding_system_internal);
10750 defsubr (&Skeyboard_coding_system);
10751 defsubr (&Sfind_operation_coding_system);
10752 defsubr (&Sset_coding_system_priority);
10753 defsubr (&Sdefine_coding_system_internal);
10754 defsubr (&Sdefine_coding_system_alias);
10755 defsubr (&Scoding_system_put);
10756 defsubr (&Scoding_system_base);
10757 defsubr (&Scoding_system_plist);
10758 defsubr (&Scoding_system_aliases);
10759 defsubr (&Scoding_system_eol_type);
10760 defsubr (&Scoding_system_priority_list);
10762 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10763 doc: /* List of coding systems.
10765 Do not alter the value of this variable manually. This variable should be
10766 updated by the functions `define-coding-system' and
10767 `define-coding-system-alias'. */);
10768 Vcoding_system_list = Qnil;
10770 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10771 doc: /* Alist of coding system names.
10772 Each element is one element list of coding system name.
10773 This variable is given to `completing-read' as COLLECTION argument.
10775 Do not alter the value of this variable manually. This variable should be
10776 updated by the functions `make-coding-system' and
10777 `define-coding-system-alias'. */);
10778 Vcoding_system_alist = Qnil;
10780 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10781 doc: /* List of coding-categories (symbols) ordered by priority.
10783 On detecting a coding system, Emacs tries code detection algorithms
10784 associated with each coding-category one by one in this order. When
10785 one algorithm agrees with a byte sequence of source text, the coding
10786 system bound to the corresponding coding-category is selected.
10788 Don't modify this variable directly, but use `set-coding-priority'. */);
10790 int i;
10792 Vcoding_category_list = Qnil;
10793 for (i = coding_category_max - 1; i >= 0; i--)
10794 Vcoding_category_list
10795 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10796 Vcoding_category_list);
10799 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10800 doc: /* Specify the coding system for read operations.
10801 It is useful to bind this variable with `let', but do not set it globally.
10802 If the value is a coding system, it is used for decoding on read operation.
10803 If not, an appropriate element is used from one of the coding system alists.
10804 There are three such tables: `file-coding-system-alist',
10805 `process-coding-system-alist', and `network-coding-system-alist'. */);
10806 Vcoding_system_for_read = Qnil;
10808 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10809 doc: /* Specify the coding system for write operations.
10810 Programs bind this variable with `let', but you should not set it globally.
10811 If the value is a coding system, it is used for encoding of output,
10812 when writing it to a file and when sending it to a file or subprocess.
10814 If this does not specify a coding system, an appropriate element
10815 is used from one of the coding system alists.
10816 There are three such tables: `file-coding-system-alist',
10817 `process-coding-system-alist', and `network-coding-system-alist'.
10818 For output to files, if the above procedure does not specify a coding system,
10819 the value of `buffer-file-coding-system' is used. */);
10820 Vcoding_system_for_write = Qnil;
10822 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10823 doc: /*
10824 Coding system used in the latest file or process I/O. */);
10825 Vlast_coding_system_used = Qnil;
10827 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10828 doc: /*
10829 Error status of the last code conversion.
10831 When an error was detected in the last code conversion, this variable
10832 is set to one of the following symbols.
10833 `insufficient-source'
10834 `inconsistent-eol'
10835 `invalid-source'
10836 `interrupted'
10837 `insufficient-memory'
10838 When no error was detected, the value doesn't change. So, to check
10839 the error status of a code conversion by this variable, you must
10840 explicitly set this variable to nil before performing code
10841 conversion. */);
10842 Vlast_code_conversion_error = Qnil;
10844 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10845 doc: /*
10846 *Non-nil means always inhibit code conversion of end-of-line format.
10847 See info node `Coding Systems' and info node `Text and Binary' concerning
10848 such conversion. */);
10849 inhibit_eol_conversion = 0;
10851 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10852 doc: /*
10853 Non-nil means process buffer inherits coding system of process output.
10854 Bind it to t if the process output is to be treated as if it were a file
10855 read from some filesystem. */);
10856 inherit_process_coding_system = 0;
10858 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10859 doc: /*
10860 Alist to decide a coding system to use for a file I/O operation.
10861 The format is ((PATTERN . VAL) ...),
10862 where PATTERN is a regular expression matching a file name,
10863 VAL is a coding system, a cons of coding systems, or a function symbol.
10864 If VAL is a coding system, it is used for both decoding and encoding
10865 the file contents.
10866 If VAL is a cons of coding systems, the car part is used for decoding,
10867 and the cdr part is used for encoding.
10868 If VAL is a function symbol, the function must return a coding system
10869 or a cons of coding systems which are used as above. The function is
10870 called with an argument that is a list of the arguments with which
10871 `find-operation-coding-system' was called. If the function can't decide
10872 a coding system, it can return `undecided' so that the normal
10873 code-detection is performed.
10875 See also the function `find-operation-coding-system'
10876 and the variable `auto-coding-alist'. */);
10877 Vfile_coding_system_alist = Qnil;
10879 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10880 doc: /*
10881 Alist to decide a coding system to use for a process I/O operation.
10882 The format is ((PATTERN . VAL) ...),
10883 where PATTERN is a regular expression matching a program name,
10884 VAL is a coding system, a cons of coding systems, or a function symbol.
10885 If VAL is a coding system, it is used for both decoding what received
10886 from the program and encoding what sent to the program.
10887 If VAL is a cons of coding systems, the car part is used for decoding,
10888 and the cdr part is used for encoding.
10889 If VAL is a function symbol, the function must return a coding system
10890 or a cons of coding systems which are used as above.
10892 See also the function `find-operation-coding-system'. */);
10893 Vprocess_coding_system_alist = Qnil;
10895 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10896 doc: /*
10897 Alist to decide a coding system to use for a network I/O operation.
10898 The format is ((PATTERN . VAL) ...),
10899 where PATTERN is a regular expression matching a network service name
10900 or is a port number to connect to,
10901 VAL is a coding system, a cons of coding systems, or a function symbol.
10902 If VAL is a coding system, it is used for both decoding what received
10903 from the network stream and encoding what sent to the network stream.
10904 If VAL is a cons of coding systems, the car part is used for decoding,
10905 and the cdr part is used for encoding.
10906 If VAL is a function symbol, the function must return a coding system
10907 or a cons of coding systems which are used as above.
10909 See also the function `find-operation-coding-system'. */);
10910 Vnetwork_coding_system_alist = Qnil;
10912 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10913 doc: /* Coding system to use with system messages.
10914 Also used for decoding keyboard input on X Window system. */);
10915 Vlocale_coding_system = Qnil;
10917 /* The eol mnemonics are reset in startup.el system-dependently. */
10918 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10919 doc: /*
10920 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
10921 eol_mnemonic_unix = make_pure_c_string (":");
10923 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10924 doc: /*
10925 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
10926 eol_mnemonic_dos = make_pure_c_string ("\\");
10928 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10929 doc: /*
10930 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
10931 eol_mnemonic_mac = make_pure_c_string ("/");
10933 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10934 doc: /*
10935 *String displayed in mode line when end-of-line format is not yet determined. */);
10936 eol_mnemonic_undecided = make_pure_c_string (":");
10938 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10939 doc: /*
10940 *Non-nil enables character translation while encoding and decoding. */);
10941 Venable_character_translation = Qt;
10943 DEFVAR_LISP ("standard-translation-table-for-decode",
10944 &Vstandard_translation_table_for_decode,
10945 doc: /* Table for translating characters while decoding. */);
10946 Vstandard_translation_table_for_decode = Qnil;
10948 DEFVAR_LISP ("standard-translation-table-for-encode",
10949 &Vstandard_translation_table_for_encode,
10950 doc: /* Table for translating characters while encoding. */);
10951 Vstandard_translation_table_for_encode = Qnil;
10953 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10954 doc: /* Alist of charsets vs revision numbers.
10955 While encoding, if a charset (car part of an element) is found,
10956 designate it with the escape sequence identifying revision (cdr part
10957 of the element). */);
10958 Vcharset_revision_table = Qnil;
10960 DEFVAR_LISP ("default-process-coding-system",
10961 &Vdefault_process_coding_system,
10962 doc: /* Cons of coding systems used for process I/O by default.
10963 The car part is used for decoding a process output,
10964 the cdr part is used for encoding a text to be sent to a process. */);
10965 Vdefault_process_coding_system = Qnil;
10967 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10968 doc: /*
10969 Table of extra Latin codes in the range 128..159 (inclusive).
10970 This is a vector of length 256.
10971 If Nth element is non-nil, the existence of code N in a file
10972 \(or output of subprocess) doesn't prevent it to be detected as
10973 a coding system of ISO 2022 variant which has a flag
10974 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10975 or reading output of a subprocess.
10976 Only 128th through 159th elements have a meaning. */);
10977 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10979 DEFVAR_LISP ("select-safe-coding-system-function",
10980 &Vselect_safe_coding_system_function,
10981 doc: /*
10982 Function to call to select safe coding system for encoding a text.
10984 If set, this function is called to force a user to select a proper
10985 coding system which can encode the text in the case that a default
10986 coding system used in each operation can't encode the text. The
10987 function should take care that the buffer is not modified while
10988 the coding system is being selected.
10990 The default value is `select-safe-coding-system' (which see). */);
10991 Vselect_safe_coding_system_function = Qnil;
10993 DEFVAR_BOOL ("coding-system-require-warning",
10994 &coding_system_require_warning,
10995 doc: /* Internal use only.
10996 If non-nil, on writing a file, `select-safe-coding-system-function' is
10997 called even if `coding-system-for-write' is non-nil. The command
10998 `universal-coding-system-argument' binds this variable to t temporarily. */);
10999 coding_system_require_warning = 0;
11002 DEFVAR_BOOL ("inhibit-iso-escape-detection",
11003 &inhibit_iso_escape_detection,
11004 doc: /*
11005 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11007 When Emacs reads text, it tries to detect how the text is encoded.
11008 This code detection is sensitive to escape sequences. If Emacs sees
11009 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11010 of the ISO2022 encodings, and decodes text by the corresponding coding
11011 system (e.g. `iso-2022-7bit').
11013 However, there may be a case that you want to read escape sequences in
11014 a file as is. In such a case, you can set this variable to non-nil.
11015 Then the code detection will ignore any escape sequences, and no text is
11016 detected as encoded in some ISO-2022 encoding. The result is that all
11017 escape sequences become visible in a buffer.
11019 The default value is nil, and it is strongly recommended not to change
11020 it. That is because many Emacs Lisp source files that contain
11021 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11022 in Emacs's distribution, and they won't be decoded correctly on
11023 reading if you suppress escape sequence detection.
11025 The other way to read escape sequences in a file without decoding is
11026 to explicitly specify some coding system that doesn't use ISO-2022
11027 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
11028 inhibit_iso_escape_detection = 0;
11030 DEFVAR_BOOL ("inhibit-null-byte-detection",
11031 &inhibit_null_byte_detection,
11032 doc: /* If non-nil, Emacs ignores null bytes on code detection.
11033 By default, Emacs treats it as binary data, and does not attempt to
11034 decode it. The effect is as if you specified `no-conversion' for
11035 reading that text.
11037 Set this to non-nil when a regular text happens to include null bytes.
11038 Examples are Index nodes of Info files and null-byte delimited output
11039 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
11040 decode text as usual. */);
11041 inhibit_null_byte_detection = 0;
11043 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
11044 doc: /* Char table for translating self-inserting characters.
11045 This is applied to the result of input methods, not their input.
11046 See also `keyboard-translate-table'.
11048 Use of this variable for character code unification was rendered
11049 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11050 internal character representation. */);
11051 Vtranslation_table_for_input = Qnil;
11054 Lisp_Object args[coding_arg_max];
11055 Lisp_Object plist[16];
11056 int i;
11058 for (i = 0; i < coding_arg_max; i++)
11059 args[i] = Qnil;
11061 plist[0] = intern_c_string (":name");
11062 plist[1] = args[coding_arg_name] = Qno_conversion;
11063 plist[2] = intern_c_string (":mnemonic");
11064 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11065 plist[4] = intern_c_string (":coding-type");
11066 plist[5] = args[coding_arg_coding_type] = Qraw_text;
11067 plist[6] = intern_c_string (":ascii-compatible-p");
11068 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11069 plist[8] = intern_c_string (":default-char");
11070 plist[9] = args[coding_arg_default_char] = make_number (0);
11071 plist[10] = intern_c_string (":for-unibyte");
11072 plist[11] = args[coding_arg_for_unibyte] = Qt;
11073 plist[12] = intern_c_string (":docstring");
11074 plist[13] = make_pure_c_string ("Do no conversion.\n\
11076 When you visit a file with this coding, the file is read into a\n\
11077 unibyte buffer as is, thus each byte of a file is treated as a\n\
11078 character.");
11079 plist[14] = intern_c_string (":eol-type");
11080 plist[15] = args[coding_arg_eol_type] = Qunix;
11081 args[coding_arg_plist] = Flist (16, plist);
11082 Fdefine_coding_system_internal (coding_arg_max, args);
11084 plist[1] = args[coding_arg_name] = Qundecided;
11085 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11086 plist[5] = args[coding_arg_coding_type] = Qundecided;
11087 /* This is already set.
11088 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11089 plist[8] = intern_c_string (":charset-list");
11090 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11091 plist[11] = args[coding_arg_for_unibyte] = Qnil;
11092 plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11093 plist[15] = args[coding_arg_eol_type] = Qnil;
11094 args[coding_arg_plist] = Flist (16, plist);
11095 Fdefine_coding_system_internal (coding_arg_max, args);
11098 setup_coding_system (Qno_conversion, &safe_terminal_coding);
11101 int i;
11103 for (i = 0; i < coding_category_max; i++)
11104 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11106 #if defined (MSDOS) || defined (WINDOWSNT)
11107 system_eol_type = Qdos;
11108 #else
11109 system_eol_type = Qunix;
11110 #endif
11111 staticpro (&system_eol_type);
11114 char *
11115 emacs_strerror (error_number)
11116 int error_number;
11118 char *str;
11120 synchronize_system_messages_locale ();
11121 str = strerror (error_number);
11123 if (! NILP (Vlocale_coding_system))
11125 Lisp_Object dec = code_convert_string_norecord (build_string (str),
11126 Vlocale_coding_system,
11128 str = (char *) SDATA (dec);
11131 return str;
11134 #endif /* emacs */
11136 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
11137 (do not change this comment) */