(calc-mode): Replace `copy-list' with `copy-sequence'.
[emacs.git] / src / coding.c
blobd374ca8b476bb1987805b1ab75c5ec64423e56db
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5 2005, 2006, 2007, 2008, 2009
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8 Copyright (C) 2003
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
12 This file is part of GNU Emacs.
14 GNU Emacs is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
19 GNU Emacs is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
27 /*** TABLE OF CONTENTS ***
29 0. General comments
30 1. Preamble
31 2. Emacs' internal format (emacs-utf-8) handlers
32 3. UTF-8 handlers
33 4. UTF-16 handlers
34 5. Charset-base coding systems handlers
35 6. emacs-mule (old Emacs' internal format) handlers
36 7. ISO2022 handlers
37 8. Shift-JIS and BIG5 handlers
38 9. CCL handlers
39 10. C library functions
40 11. Emacs Lisp library functions
41 12. Postamble
45 /*** 0. General comments ***
48 CODING SYSTEM
50 A coding system is an object for an encoding mechanism that contains
51 information about how to convert byte sequences to character
52 sequences and vice versa. When we say "decode", it means converting
53 a byte sequence of a specific coding system into a character
54 sequence that is represented by Emacs' internal coding system
55 `emacs-utf-8', and when we say "encode", it means converting a
56 character sequence of emacs-utf-8 to a byte sequence of a specific
57 coding system.
59 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
60 C level, a coding system is represented by a vector of attributes
61 stored in the hash table Vcharset_hash_table. The conversion from
62 coding system symbol to attributes vector is done by looking up
63 Vcharset_hash_table by the symbol.
65 Coding systems are classified into the following types depending on
66 the encoding mechanism. Here's a brief description of the types.
68 o UTF-8
70 o UTF-16
72 o Charset-base coding system
74 A coding system defined by one or more (coded) character sets.
75 Decoding and encoding are done by a code converter defined for each
76 character set.
78 o Old Emacs internal format (emacs-mule)
80 The coding system adopted by old versions of Emacs (20 and 21).
82 o ISO2022-base coding system
84 The most famous coding system for multiple character sets. X's
85 Compound Text, various EUCs (Extended Unix Code), and coding systems
86 used in the Internet communication such as ISO-2022-JP are all
87 variants of ISO2022.
89 o SJIS (or Shift-JIS or MS-Kanji-Code)
91 A coding system to encode character sets: ASCII, JISX0201, and
92 JISX0208. Widely used for PC's in Japan. Details are described in
93 section 8.
95 o BIG5
97 A coding system to encode character sets: ASCII and Big5. Widely
98 used for Chinese (mainly in Taiwan and Hong Kong). Details are
99 described in section 8. In this file, when we write "big5" (all
100 lowercase), we mean the coding system, and when we write "Big5"
101 (capitalized), we mean the character set.
103 o CCL
105 If a user wants to decode/encode text encoded in a coding system
106 not listed above, he can supply a decoder and an encoder for it in
107 CCL (Code Conversion Language) programs. Emacs executes the CCL
108 program while decoding/encoding.
110 o Raw-text
112 A coding system for text containing raw eight-bit data. Emacs
113 treats each byte of source text as a character (except for
114 end-of-line conversion).
116 o No-conversion
118 Like raw text, but don't do end-of-line conversion.
121 END-OF-LINE FORMAT
123 How text end-of-line is encoded depends on operating system. For
124 instance, Unix's format is just one byte of LF (line-feed) code,
125 whereas DOS's format is two-byte sequence of `carriage-return' and
126 `line-feed' codes. MacOS's format is usually one byte of
127 `carriage-return'.
129 Since text character encoding and end-of-line encoding are
130 independent, any coding system described above can take any format
131 of end-of-line (except for no-conversion).
133 STRUCT CODING_SYSTEM
135 Before using a coding system for code conversion (i.e. decoding and
136 encoding), we setup a structure of type `struct coding_system'.
137 This structure keeps various information about a specific code
138 conversion (e.g. the location of source and destination data).
142 /* COMMON MACROS */
145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
147 These functions check if a byte sequence specified as a source in
148 CODING conforms to the format of XXX, and update the members of
149 DETECT_INFO.
151 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
153 Below is the template of these functions. */
155 #if 0
156 static int
157 detect_coding_XXX (coding, detect_info)
158 struct coding_system *coding;
159 struct coding_detection_info *detect_info;
161 const unsigned char *src = coding->source;
162 const unsigned char *src_end = coding->source + coding->src_bytes;
163 int multibytep = coding->src_multibyte;
164 int consumed_chars = 0;
165 int found = 0;
166 ...;
168 while (1)
170 /* Get one byte from the source. If the souce is exausted, jump
171 to no_more_source:. */
172 ONE_MORE_BYTE (c);
174 if (! __C_conforms_to_XXX___ (c))
175 break;
176 if (! __C_strongly_suggests_XXX__ (c))
177 found = CATEGORY_MASK_XXX;
179 /* The byte sequence is invalid for XXX. */
180 detect_info->rejected |= CATEGORY_MASK_XXX;
181 return 0;
183 no_more_source:
184 /* The source exausted successfully. */
185 detect_info->found |= found;
186 return 1;
188 #endif
190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
192 These functions decode a byte sequence specified as a source by
193 CODING. The resulting multibyte text goes to a place pointed to by
194 CODING->charbuf, the length of which should not exceed
195 CODING->charbuf_size;
197 These functions set the information of original and decoded texts in
198 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
199 They also set CODING->result to one of CODING_RESULT_XXX indicating
200 how the decoding is finished.
202 Below is the template of these functions. */
204 #if 0
205 static void
206 decode_coding_XXXX (coding)
207 struct coding_system *coding;
209 const unsigned char *src = coding->source + coding->consumed;
210 const unsigned char *src_end = coding->source + coding->src_bytes;
211 /* SRC_BASE remembers the start position in source in each loop.
212 The loop will be exited when there's not enough source code, or
213 when there's no room in CHARBUF for a decoded character. */
214 const unsigned char *src_base;
215 /* A buffer to produce decoded characters. */
216 int *charbuf = coding->charbuf + coding->charbuf_used;
217 int *charbuf_end = coding->charbuf + coding->charbuf_size;
218 int multibytep = coding->src_multibyte;
220 while (1)
222 src_base = src;
223 if (charbuf < charbuf_end)
224 /* No more room to produce a decoded character. */
225 break;
226 ONE_MORE_BYTE (c);
227 /* Decode it. */
230 no_more_source:
231 if (src_base < src_end
232 && coding->mode & CODING_MODE_LAST_BLOCK)
233 /* If the source ends by partial bytes to construct a character,
234 treat them as eight-bit raw data. */
235 while (src_base < src_end && charbuf < charbuf_end)
236 *charbuf++ = *src_base++;
237 /* Remember how many bytes and characters we consumed. If the
238 source is multibyte, the bytes and chars are not identical. */
239 coding->consumed = coding->consumed_char = src_base - coding->source;
240 /* Remember how many characters we produced. */
241 coding->charbuf_used = charbuf - coding->charbuf;
243 #endif
245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
247 These functions encode SRC_BYTES length text at SOURCE of Emacs'
248 internal multibyte format by CODING. The resulting byte sequence
249 goes to a place pointed to by DESTINATION, the length of which
250 should not exceed DST_BYTES.
252 These functions set the information of original and encoded texts in
253 the members produced, produced_char, consumed, and consumed_char of
254 the structure *CODING. They also set the member result to one of
255 CODING_RESULT_XXX indicating how the encoding finished.
257 DST_BYTES zero means that source area and destination area are
258 overlapped, which means that we can produce a encoded text until it
259 reaches at the head of not-yet-encoded source text.
261 Below is a template of these functions. */
262 #if 0
263 static void
264 encode_coding_XXX (coding)
265 struct coding_system *coding;
267 int multibytep = coding->dst_multibyte;
268 int *charbuf = coding->charbuf;
269 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
270 unsigned char *dst = coding->destination + coding->produced;
271 unsigned char *dst_end = coding->destination + coding->dst_bytes;
272 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
273 int produced_chars = 0;
275 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
277 int c = *charbuf;
278 /* Encode C into DST, and increment DST. */
280 label_no_more_destination:
281 /* How many chars and bytes we produced. */
282 coding->produced_char += produced_chars;
283 coding->produced = dst - coding->destination;
285 #endif
288 /*** 1. Preamble ***/
290 #include <config.h>
291 #include <stdio.h>
293 #include "lisp.h"
294 #include "buffer.h"
295 #include "character.h"
296 #include "charset.h"
297 #include "ccl.h"
298 #include "composite.h"
299 #include "coding.h"
300 #include "window.h"
301 #include "frame.h"
302 #include "termhooks.h"
304 Lisp_Object Vcoding_system_hash_table;
306 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
307 Lisp_Object Qunix, Qdos;
308 extern Lisp_Object Qmac; /* frame.c */
309 Lisp_Object Qbuffer_file_coding_system;
310 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
311 Lisp_Object Qdefault_char;
312 Lisp_Object Qno_conversion, Qundecided;
313 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
314 Lisp_Object Qbig, Qlittle;
315 Lisp_Object Qcoding_system_history;
316 Lisp_Object Qvalid_codes;
317 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
318 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
319 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
320 Lisp_Object QCascii_compatible_p;
322 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
323 Lisp_Object Qcall_process, Qcall_process_region;
324 Lisp_Object Qstart_process, Qopen_network_stream;
325 Lisp_Object Qtarget_idx;
327 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
328 Lisp_Object Qinterrupted, Qinsufficient_memory;
330 extern Lisp_Object Qcompletion_ignore_case;
332 /* If a symbol has this property, evaluate the value to define the
333 symbol as a coding system. */
334 static Lisp_Object Qcoding_system_define_form;
336 int coding_system_require_warning;
338 Lisp_Object Vselect_safe_coding_system_function;
340 /* Mnemonic string for each format of end-of-line. */
341 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
342 /* Mnemonic string to indicate format of end-of-line is not yet
343 decided. */
344 Lisp_Object eol_mnemonic_undecided;
346 /* Format of end-of-line decided by system. This is Qunix on
347 Unix and Mac, Qdos on DOS/Windows.
348 This has an effect only for external encoding (i.e. for output to
349 file and process), not for in-buffer or Lisp string encoding. */
350 static Lisp_Object system_eol_type;
352 #ifdef emacs
354 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
356 Lisp_Object Qcoding_system_p, Qcoding_system_error;
358 /* Coding system emacs-mule and raw-text are for converting only
359 end-of-line format. */
360 Lisp_Object Qemacs_mule, Qraw_text;
361 Lisp_Object Qutf_8_emacs;
363 /* Coding-systems are handed between Emacs Lisp programs and C internal
364 routines by the following three variables. */
365 /* Coding-system for reading files and receiving data from process. */
366 Lisp_Object Vcoding_system_for_read;
367 /* Coding-system for writing files and sending data to process. */
368 Lisp_Object Vcoding_system_for_write;
369 /* Coding-system actually used in the latest I/O. */
370 Lisp_Object Vlast_coding_system_used;
371 /* Set to non-nil when an error is detected while code conversion. */
372 Lisp_Object Vlast_code_conversion_error;
373 /* A vector of length 256 which contains information about special
374 Latin codes (especially for dealing with Microsoft codes). */
375 Lisp_Object Vlatin_extra_code_table;
377 /* Flag to inhibit code conversion of end-of-line format. */
378 int inhibit_eol_conversion;
380 /* Flag to inhibit ISO2022 escape sequence detection. */
381 int inhibit_iso_escape_detection;
383 /* Flag to inhibit detection of binary files through null bytes. */
384 int inhibit_null_byte_detection;
386 /* Flag to make buffer-file-coding-system inherit from process-coding. */
387 int inherit_process_coding_system;
389 /* Coding system to be used to encode text for terminal display when
390 terminal coding system is nil. */
391 struct coding_system safe_terminal_coding;
393 Lisp_Object Vfile_coding_system_alist;
394 Lisp_Object Vprocess_coding_system_alist;
395 Lisp_Object Vnetwork_coding_system_alist;
397 Lisp_Object Vlocale_coding_system;
399 #endif /* emacs */
401 /* Flag to tell if we look up translation table on character code
402 conversion. */
403 Lisp_Object Venable_character_translation;
404 /* Standard translation table to look up on decoding (reading). */
405 Lisp_Object Vstandard_translation_table_for_decode;
406 /* Standard translation table to look up on encoding (writing). */
407 Lisp_Object Vstandard_translation_table_for_encode;
409 Lisp_Object Qtranslation_table;
410 Lisp_Object Qtranslation_table_id;
411 Lisp_Object Qtranslation_table_for_decode;
412 Lisp_Object Qtranslation_table_for_encode;
414 /* Alist of charsets vs revision number. */
415 static Lisp_Object Vcharset_revision_table;
417 /* Default coding systems used for process I/O. */
418 Lisp_Object Vdefault_process_coding_system;
420 /* Char table for translating Quail and self-inserting input. */
421 Lisp_Object Vtranslation_table_for_input;
423 /* Two special coding systems. */
424 Lisp_Object Vsjis_coding_system;
425 Lisp_Object Vbig5_coding_system;
427 /* ISO2022 section */
429 #define CODING_ISO_INITIAL(coding, reg) \
430 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
431 coding_attr_iso_initial), \
432 reg)))
435 #define CODING_ISO_REQUEST(coding, charset_id) \
436 (((charset_id) <= (coding)->max_charset_id \
437 ? ((coding)->safe_charsets[charset_id] != 255 \
438 ? (coding)->safe_charsets[charset_id] \
439 : -1) \
440 : -1))
443 #define CODING_ISO_FLAGS(coding) \
444 ((coding)->spec.iso_2022.flags)
445 #define CODING_ISO_DESIGNATION(coding, reg) \
446 ((coding)->spec.iso_2022.current_designation[reg])
447 #define CODING_ISO_INVOCATION(coding, plane) \
448 ((coding)->spec.iso_2022.current_invocation[plane])
449 #define CODING_ISO_SINGLE_SHIFTING(coding) \
450 ((coding)->spec.iso_2022.single_shifting)
451 #define CODING_ISO_BOL(coding) \
452 ((coding)->spec.iso_2022.bol)
453 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
454 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
455 #define CODING_ISO_CMP_STATUS(coding) \
456 (&(coding)->spec.iso_2022.cmp_status)
457 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
458 ((coding)->spec.iso_2022.ctext_extended_segment_len)
459 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
460 ((coding)->spec.iso_2022.embedded_utf_8)
462 /* Control characters of ISO2022. */
463 /* code */ /* function */
464 #define ISO_CODE_LF 0x0A /* line-feed */
465 #define ISO_CODE_CR 0x0D /* carriage-return */
466 #define ISO_CODE_SO 0x0E /* shift-out */
467 #define ISO_CODE_SI 0x0F /* shift-in */
468 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
469 #define ISO_CODE_ESC 0x1B /* escape */
470 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
471 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
472 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
474 /* All code (1-byte) of ISO2022 is classified into one of the
475 followings. */
476 enum iso_code_class_type
478 ISO_control_0, /* Control codes in the range
479 0x00..0x1F and 0x7F, except for the
480 following 5 codes. */
481 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
482 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
483 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
484 ISO_escape, /* ISO_CODE_SO (0x1B) */
485 ISO_control_1, /* Control codes in the range
486 0x80..0x9F, except for the
487 following 3 codes. */
488 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
489 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
490 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
491 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
492 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
493 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
494 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
497 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
498 `iso-flags' attribute of an iso2022 coding system. */
500 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
501 instead of the correct short-form sequence (e.g. ESC $ A). */
502 #define CODING_ISO_FLAG_LONG_FORM 0x0001
504 /* If set, reset graphic planes and registers at end-of-line to the
505 initial state. */
506 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
508 /* If set, reset graphic planes and registers before any control
509 characters to the initial state. */
510 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
512 /* If set, encode by 7-bit environment. */
513 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
515 /* If set, use locking-shift function. */
516 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
518 /* If set, use single-shift function. Overwrite
519 CODING_ISO_FLAG_LOCKING_SHIFT. */
520 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
522 /* If set, use designation escape sequence. */
523 #define CODING_ISO_FLAG_DESIGNATION 0x0040
525 /* If set, produce revision number sequence. */
526 #define CODING_ISO_FLAG_REVISION 0x0080
528 /* If set, produce ISO6429's direction specifying sequence. */
529 #define CODING_ISO_FLAG_DIRECTION 0x0100
531 /* If set, assume designation states are reset at beginning of line on
532 output. */
533 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
535 /* If set, designation sequence should be placed at beginning of line
536 on output. */
537 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
539 /* If set, do not encode unsafe charactes on output. */
540 #define CODING_ISO_FLAG_SAFE 0x0800
542 /* If set, extra latin codes (128..159) are accepted as a valid code
543 on input. */
544 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
546 #define CODING_ISO_FLAG_COMPOSITION 0x2000
548 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
550 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
552 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
554 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
556 /* A character to be produced on output if encoding of the original
557 character is prohibited by CODING_ISO_FLAG_SAFE. */
558 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
560 /* UTF-8 section */
561 #define CODING_UTF_8_BOM(coding) \
562 ((coding)->spec.utf_8_bom)
564 /* UTF-16 section */
565 #define CODING_UTF_16_BOM(coding) \
566 ((coding)->spec.utf_16.bom)
568 #define CODING_UTF_16_ENDIAN(coding) \
569 ((coding)->spec.utf_16.endian)
571 #define CODING_UTF_16_SURROGATE(coding) \
572 ((coding)->spec.utf_16.surrogate)
575 /* CCL section */
576 #define CODING_CCL_DECODER(coding) \
577 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
578 #define CODING_CCL_ENCODER(coding) \
579 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
580 #define CODING_CCL_VALIDS(coding) \
581 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
583 /* Index for each coding category in `coding_categories' */
585 enum coding_category
587 coding_category_iso_7,
588 coding_category_iso_7_tight,
589 coding_category_iso_8_1,
590 coding_category_iso_8_2,
591 coding_category_iso_7_else,
592 coding_category_iso_8_else,
593 coding_category_utf_8_auto,
594 coding_category_utf_8_nosig,
595 coding_category_utf_8_sig,
596 coding_category_utf_16_auto,
597 coding_category_utf_16_be,
598 coding_category_utf_16_le,
599 coding_category_utf_16_be_nosig,
600 coding_category_utf_16_le_nosig,
601 coding_category_charset,
602 coding_category_sjis,
603 coding_category_big5,
604 coding_category_ccl,
605 coding_category_emacs_mule,
606 /* All above are targets of code detection. */
607 coding_category_raw_text,
608 coding_category_undecided,
609 coding_category_max
612 /* Definitions of flag bits used in detect_coding_XXXX. */
613 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
614 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
615 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
616 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
617 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
618 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
619 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
620 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
621 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
622 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
623 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
624 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
625 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
626 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
627 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
628 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
629 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
630 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
631 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
632 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
634 /* This value is returned if detect_coding_mask () find nothing other
635 than ASCII characters. */
636 #define CATEGORY_MASK_ANY \
637 (CATEGORY_MASK_ISO_7 \
638 | CATEGORY_MASK_ISO_7_TIGHT \
639 | CATEGORY_MASK_ISO_8_1 \
640 | CATEGORY_MASK_ISO_8_2 \
641 | CATEGORY_MASK_ISO_7_ELSE \
642 | CATEGORY_MASK_ISO_8_ELSE \
643 | CATEGORY_MASK_UTF_8_AUTO \
644 | CATEGORY_MASK_UTF_8_NOSIG \
645 | CATEGORY_MASK_UTF_8_SIG \
646 | CATEGORY_MASK_UTF_16_AUTO \
647 | CATEGORY_MASK_UTF_16_BE \
648 | CATEGORY_MASK_UTF_16_LE \
649 | CATEGORY_MASK_UTF_16_BE_NOSIG \
650 | CATEGORY_MASK_UTF_16_LE_NOSIG \
651 | CATEGORY_MASK_CHARSET \
652 | CATEGORY_MASK_SJIS \
653 | CATEGORY_MASK_BIG5 \
654 | CATEGORY_MASK_CCL \
655 | CATEGORY_MASK_EMACS_MULE)
658 #define CATEGORY_MASK_ISO_7BIT \
659 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
661 #define CATEGORY_MASK_ISO_8BIT \
662 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
664 #define CATEGORY_MASK_ISO_ELSE \
665 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
667 #define CATEGORY_MASK_ISO_ESCAPE \
668 (CATEGORY_MASK_ISO_7 \
669 | CATEGORY_MASK_ISO_7_TIGHT \
670 | CATEGORY_MASK_ISO_7_ELSE \
671 | CATEGORY_MASK_ISO_8_ELSE)
673 #define CATEGORY_MASK_ISO \
674 ( CATEGORY_MASK_ISO_7BIT \
675 | CATEGORY_MASK_ISO_8BIT \
676 | CATEGORY_MASK_ISO_ELSE)
678 #define CATEGORY_MASK_UTF_16 \
679 (CATEGORY_MASK_UTF_16_AUTO \
680 | CATEGORY_MASK_UTF_16_BE \
681 | CATEGORY_MASK_UTF_16_LE \
682 | CATEGORY_MASK_UTF_16_BE_NOSIG \
683 | CATEGORY_MASK_UTF_16_LE_NOSIG)
685 #define CATEGORY_MASK_UTF_8 \
686 (CATEGORY_MASK_UTF_8_AUTO \
687 | CATEGORY_MASK_UTF_8_NOSIG \
688 | CATEGORY_MASK_UTF_8_SIG)
690 /* List of symbols `coding-category-xxx' ordered by priority. This
691 variable is exposed to Emacs Lisp. */
692 static Lisp_Object Vcoding_category_list;
694 /* Table of coding categories (Lisp symbols). This variable is for
695 internal use oly. */
696 static Lisp_Object Vcoding_category_table;
698 /* Table of coding-categories ordered by priority. */
699 static enum coding_category coding_priorities[coding_category_max];
701 /* Nth element is a coding context for the coding system bound to the
702 Nth coding category. */
703 static struct coding_system coding_categories[coding_category_max];
705 /*** Commonly used macros and functions ***/
707 #ifndef min
708 #define min(a, b) ((a) < (b) ? (a) : (b))
709 #endif
710 #ifndef max
711 #define max(a, b) ((a) > (b) ? (a) : (b))
712 #endif
714 #define CODING_GET_INFO(coding, attrs, charset_list) \
715 do { \
716 (attrs) = CODING_ID_ATTRS ((coding)->id); \
717 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
718 } while (0)
721 /* Safely get one byte from the source text pointed by SRC which ends
722 at SRC_END, and set C to that byte. If there are not enough bytes
723 in the source, it jumps to `no_more_source'. If multibytep is
724 nonzero, and a multibyte character is found at SRC, set C to the
725 negative value of the character code. The caller should declare
726 and set these variables appropriately in advance:
727 src, src_end, multibytep */
729 #define ONE_MORE_BYTE(c) \
730 do { \
731 if (src == src_end) \
733 if (src_base < src) \
734 record_conversion_result \
735 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
736 goto no_more_source; \
738 c = *src++; \
739 if (multibytep && (c & 0x80)) \
741 if ((c & 0xFE) == 0xC0) \
742 c = ((c & 1) << 6) | *src++; \
743 else \
745 src--; \
746 c = - string_char (src, &src, NULL); \
747 record_conversion_result \
748 (coding, CODING_RESULT_INVALID_SRC); \
751 consumed_chars++; \
752 } while (0)
754 /* Safely get two bytes from the source text pointed by SRC which ends
755 at SRC_END, and set C1 and C2 to those bytes while skipping the
756 heading multibyte characters. If there are not enough bytes in the
757 source, it jumps to `no_more_source'. If multibytep is nonzero and
758 a multibyte character is found for C2, set C2 to the negative value
759 of the character code. The caller should declare and set these
760 variables appropriately in advance:
761 src, src_end, multibytep
762 It is intended that this macro is used in detect_coding_utf_16. */
764 #define TWO_MORE_BYTES(c1, c2) \
765 do { \
766 do { \
767 if (src == src_end) \
768 goto no_more_source; \
769 c1 = *src++; \
770 if (multibytep && (c1 & 0x80)) \
772 if ((c1 & 0xFE) == 0xC0) \
773 c1 = ((c1 & 1) << 6) | *src++; \
774 else \
776 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
777 c1 = -1; \
780 } while (c1 < 0); \
781 if (src == src_end) \
782 goto no_more_source; \
783 c2 = *src++; \
784 if (multibytep && (c2 & 0x80)) \
786 if ((c2 & 0xFE) == 0xC0) \
787 c2 = ((c2 & 1) << 6) | *src++; \
788 else \
789 c2 = -1; \
791 } while (0)
794 #define ONE_MORE_BYTE_NO_CHECK(c) \
795 do { \
796 c = *src++; \
797 if (multibytep && (c & 0x80)) \
799 if ((c & 0xFE) == 0xC0) \
800 c = ((c & 1) << 6) | *src++; \
801 else \
803 src--; \
804 c = - string_char (src, &src, NULL); \
805 record_conversion_result \
806 (coding, CODING_RESULT_INVALID_SRC); \
809 consumed_chars++; \
810 } while (0)
813 /* Store a byte C in the place pointed by DST and increment DST to the
814 next free point, and increment PRODUCED_CHARS. The caller should
815 assure that C is 0..127, and declare and set the variable `dst'
816 appropriately in advance.
820 #define EMIT_ONE_ASCII_BYTE(c) \
821 do { \
822 produced_chars++; \
823 *dst++ = (c); \
824 } while (0)
827 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
829 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
830 do { \
831 produced_chars += 2; \
832 *dst++ = (c1), *dst++ = (c2); \
833 } while (0)
836 /* Store a byte C in the place pointed by DST and increment DST to the
837 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
838 nonzero, store in an appropriate multibyte from. The caller should
839 declare and set the variables `dst' and `multibytep' appropriately
840 in advance. */
842 #define EMIT_ONE_BYTE(c) \
843 do { \
844 produced_chars++; \
845 if (multibytep) \
847 int ch = (c); \
848 if (ch >= 0x80) \
849 ch = BYTE8_TO_CHAR (ch); \
850 CHAR_STRING_ADVANCE (ch, dst); \
852 else \
853 *dst++ = (c); \
854 } while (0)
857 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
859 #define EMIT_TWO_BYTES(c1, c2) \
860 do { \
861 produced_chars += 2; \
862 if (multibytep) \
864 int ch; \
866 ch = (c1); \
867 if (ch >= 0x80) \
868 ch = BYTE8_TO_CHAR (ch); \
869 CHAR_STRING_ADVANCE (ch, dst); \
870 ch = (c2); \
871 if (ch >= 0x80) \
872 ch = BYTE8_TO_CHAR (ch); \
873 CHAR_STRING_ADVANCE (ch, dst); \
875 else \
877 *dst++ = (c1); \
878 *dst++ = (c2); \
880 } while (0)
883 #define EMIT_THREE_BYTES(c1, c2, c3) \
884 do { \
885 EMIT_ONE_BYTE (c1); \
886 EMIT_TWO_BYTES (c2, c3); \
887 } while (0)
890 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
891 do { \
892 EMIT_TWO_BYTES (c1, c2); \
893 EMIT_TWO_BYTES (c3, c4); \
894 } while (0)
897 /* Prototypes for static functions. */
898 static void record_conversion_result P_ ((struct coding_system *coding,
899 enum coding_result_code result));
900 static int detect_coding_utf_8 P_ ((struct coding_system *,
901 struct coding_detection_info *info));
902 static void decode_coding_utf_8 P_ ((struct coding_system *));
903 static int encode_coding_utf_8 P_ ((struct coding_system *));
905 static int detect_coding_utf_16 P_ ((struct coding_system *,
906 struct coding_detection_info *info));
907 static void decode_coding_utf_16 P_ ((struct coding_system *));
908 static int encode_coding_utf_16 P_ ((struct coding_system *));
910 static int detect_coding_iso_2022 P_ ((struct coding_system *,
911 struct coding_detection_info *info));
912 static void decode_coding_iso_2022 P_ ((struct coding_system *));
913 static int encode_coding_iso_2022 P_ ((struct coding_system *));
915 static int detect_coding_emacs_mule P_ ((struct coding_system *,
916 struct coding_detection_info *info));
917 static void decode_coding_emacs_mule P_ ((struct coding_system *));
918 static int encode_coding_emacs_mule P_ ((struct coding_system *));
920 static int detect_coding_sjis P_ ((struct coding_system *,
921 struct coding_detection_info *info));
922 static void decode_coding_sjis P_ ((struct coding_system *));
923 static int encode_coding_sjis P_ ((struct coding_system *));
925 static int detect_coding_big5 P_ ((struct coding_system *,
926 struct coding_detection_info *info));
927 static void decode_coding_big5 P_ ((struct coding_system *));
928 static int encode_coding_big5 P_ ((struct coding_system *));
930 static int detect_coding_ccl P_ ((struct coding_system *,
931 struct coding_detection_info *info));
932 static void decode_coding_ccl P_ ((struct coding_system *));
933 static int encode_coding_ccl P_ ((struct coding_system *));
935 static void decode_coding_raw_text P_ ((struct coding_system *));
936 static int encode_coding_raw_text P_ ((struct coding_system *));
938 static void coding_set_source P_ ((struct coding_system *));
939 static void coding_set_destination P_ ((struct coding_system *));
940 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
941 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
942 EMACS_INT, EMACS_INT));
943 static unsigned char *alloc_destination P_ ((struct coding_system *,
944 EMACS_INT, unsigned char *));
945 static void setup_iso_safe_charsets P_ ((Lisp_Object));
946 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
947 int *, int *,
948 unsigned char *));
949 static int detect_eol P_ ((const unsigned char *,
950 EMACS_INT, enum coding_category));
951 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
952 static void decode_eol P_ ((struct coding_system *));
953 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
954 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *));
955 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
956 static INLINE void produce_charset P_ ((struct coding_system *, int *,
957 EMACS_INT));
958 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
959 static int decode_coding P_ ((struct coding_system *));
960 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
961 struct coding_system *,
962 int *, EMACS_INT *));
963 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
964 struct coding_system *,
965 int *, EMACS_INT *));
966 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
967 static int encode_coding P_ ((struct coding_system *));
968 static Lisp_Object make_conversion_work_buffer P_ ((int));
969 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
970 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
971 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
973 static void
974 record_conversion_result (struct coding_system *coding,
975 enum coding_result_code result)
977 coding->result = result;
978 switch (result)
980 case CODING_RESULT_INSUFFICIENT_SRC:
981 Vlast_code_conversion_error = Qinsufficient_source;
982 break;
983 case CODING_RESULT_INCONSISTENT_EOL:
984 Vlast_code_conversion_error = Qinconsistent_eol;
985 break;
986 case CODING_RESULT_INVALID_SRC:
987 Vlast_code_conversion_error = Qinvalid_source;
988 break;
989 case CODING_RESULT_INTERRUPT:
990 Vlast_code_conversion_error = Qinterrupted;
991 break;
992 case CODING_RESULT_INSUFFICIENT_MEM:
993 Vlast_code_conversion_error = Qinsufficient_memory;
994 break;
995 default:
996 Vlast_code_conversion_error = intern ("Unknown error");
1000 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
1001 do { \
1002 charset_map_loaded = 0; \
1003 c = DECODE_CHAR (charset, code); \
1004 if (charset_map_loaded) \
1006 const unsigned char *orig = coding->source; \
1007 EMACS_INT offset; \
1009 coding_set_source (coding); \
1010 offset = coding->source - orig; \
1011 src += offset; \
1012 src_base += offset; \
1013 src_end += offset; \
1015 } while (0)
1018 /* If there are at least BYTES length of room at dst, allocate memory
1019 for coding->destination and update dst and dst_end. We don't have
1020 to take care of coding->source which will be relocated. It is
1021 handled by calling coding_set_source in encode_coding. */
1023 #define ASSURE_DESTINATION(bytes) \
1024 do { \
1025 if (dst + (bytes) >= dst_end) \
1027 int more_bytes = charbuf_end - charbuf + (bytes); \
1029 dst = alloc_destination (coding, more_bytes, dst); \
1030 dst_end = coding->destination + coding->dst_bytes; \
1032 } while (0)
1035 /* Store multibyte form of the character C in P, and advance P to the
1036 end of the multibyte form. This is like CHAR_STRING_ADVANCE but it
1037 never calls MAYBE_UNIFY_CHAR. */
1039 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) \
1040 do { \
1041 if ((c) <= MAX_1_BYTE_CHAR) \
1042 *(p)++ = (c); \
1043 else if ((c) <= MAX_2_BYTE_CHAR) \
1044 *(p)++ = (0xC0 | ((c) >> 6)), \
1045 *(p)++ = (0x80 | ((c) & 0x3F)); \
1046 else if ((c) <= MAX_3_BYTE_CHAR) \
1047 *(p)++ = (0xE0 | ((c) >> 12)), \
1048 *(p)++ = (0x80 | (((c) >> 6) & 0x3F)), \
1049 *(p)++ = (0x80 | ((c) & 0x3F)); \
1050 else if ((c) <= MAX_4_BYTE_CHAR) \
1051 *(p)++ = (0xF0 | (c >> 18)), \
1052 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
1053 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
1054 *(p)++ = (0x80 | (c & 0x3F)); \
1055 else if ((c) <= MAX_5_BYTE_CHAR) \
1056 *(p)++ = 0xF8, \
1057 *(p)++ = (0x80 | ((c >> 18) & 0x0F)), \
1058 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
1059 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
1060 *(p)++ = (0x80 | (c & 0x3F)); \
1061 else \
1062 (p) += BYTE8_STRING ((c) - 0x3FFF80, p); \
1063 } while (0)
1066 /* Return the character code of character whose multibyte form is at
1067 P, and advance P to the end of the multibyte form. This is like
1068 STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR. */
1070 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) \
1071 (!((p)[0] & 0x80) \
1072 ? *(p)++ \
1073 : ! ((p)[0] & 0x20) \
1074 ? ((p) += 2, \
1075 ((((p)[-2] & 0x1F) << 6) \
1076 | ((p)[-1] & 0x3F) \
1077 | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0))) \
1078 : ! ((p)[0] & 0x10) \
1079 ? ((p) += 3, \
1080 ((((p)[-3] & 0x0F) << 12) \
1081 | (((p)[-2] & 0x3F) << 6) \
1082 | ((p)[-1] & 0x3F))) \
1083 : ! ((p)[0] & 0x08) \
1084 ? ((p) += 4, \
1085 ((((p)[-4] & 0xF) << 18) \
1086 | (((p)[-3] & 0x3F) << 12) \
1087 | (((p)[-2] & 0x3F) << 6) \
1088 | ((p)[-1] & 0x3F))) \
1089 : ((p) += 5, \
1090 ((((p)[-4] & 0x3F) << 18) \
1091 | (((p)[-3] & 0x3F) << 12) \
1092 | (((p)[-2] & 0x3F) << 6) \
1093 | ((p)[-1] & 0x3F))))
1096 static void
1097 coding_set_source (coding)
1098 struct coding_system *coding;
1100 if (BUFFERP (coding->src_object))
1102 struct buffer *buf = XBUFFER (coding->src_object);
1104 if (coding->src_pos < 0)
1105 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1106 else
1107 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1109 else if (STRINGP (coding->src_object))
1111 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1113 else
1114 /* Otherwise, the source is C string and is never relocated
1115 automatically. Thus we don't have to update anything. */
1119 static void
1120 coding_set_destination (coding)
1121 struct coding_system *coding;
1123 if (BUFFERP (coding->dst_object))
1125 if (coding->src_pos < 0)
1127 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1128 coding->dst_bytes = (GAP_END_ADDR
1129 - (coding->src_bytes - coding->consumed)
1130 - coding->destination);
1132 else
1134 /* We are sure that coding->dst_pos_byte is before the gap
1135 of the buffer. */
1136 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1137 + coding->dst_pos_byte - BEG_BYTE);
1138 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1139 - coding->destination);
1142 else
1143 /* Otherwise, the destination is C string and is never relocated
1144 automatically. Thus we don't have to update anything. */
1149 static void
1150 coding_alloc_by_realloc (coding, bytes)
1151 struct coding_system *coding;
1152 EMACS_INT bytes;
1154 coding->destination = (unsigned char *) xrealloc (coding->destination,
1155 coding->dst_bytes + bytes);
1156 coding->dst_bytes += bytes;
1159 static void
1160 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1161 struct coding_system *coding;
1162 EMACS_INT gap_head_used, bytes;
1164 if (EQ (coding->src_object, coding->dst_object))
1166 /* The gap may contain the produced data at the head and not-yet
1167 consumed data at the tail. To preserve those data, we at
1168 first make the gap size to zero, then increase the gap
1169 size. */
1170 EMACS_INT add = GAP_SIZE;
1172 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1173 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1174 make_gap (bytes);
1175 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1176 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1178 else
1180 Lisp_Object this_buffer;
1182 this_buffer = Fcurrent_buffer ();
1183 set_buffer_internal (XBUFFER (coding->dst_object));
1184 make_gap (bytes);
1185 set_buffer_internal (XBUFFER (this_buffer));
1190 static unsigned char *
1191 alloc_destination (coding, nbytes, dst)
1192 struct coding_system *coding;
1193 EMACS_INT nbytes;
1194 unsigned char *dst;
1196 EMACS_INT offset = dst - coding->destination;
1198 if (BUFFERP (coding->dst_object))
1200 struct buffer *buf = XBUFFER (coding->dst_object);
1202 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1204 else
1205 coding_alloc_by_realloc (coding, nbytes);
1206 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1207 coding_set_destination (coding);
1208 dst = coding->destination + offset;
1209 return dst;
1212 /** Macros for annotations. */
1214 /* An annotation data is stored in the array coding->charbuf in this
1215 format:
1216 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1217 LENGTH is the number of elements in the annotation.
1218 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1219 NCHARS is the number of characters in the text annotated.
1221 The format of the following elements depend on ANNOTATION_MASK.
1223 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1224 follows:
1225 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1227 NBYTES is the number of bytes specified in the header part of
1228 old-style emacs-mule encoding, or 0 for the other kind of
1229 composition.
1231 METHOD is one of enum composition_method.
1233 Optionnal COMPOSITION-COMPONENTS are characters and composition
1234 rules.
1236 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1237 follows.
1239 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1240 recover from an invalid annotation, and should be skipped by
1241 produce_annotation. */
1243 /* Maximum length of the header of annotation data. */
1244 #define MAX_ANNOTATION_LENGTH 5
1246 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1247 do { \
1248 *(buf)++ = -(len); \
1249 *(buf)++ = (mask); \
1250 *(buf)++ = (nchars); \
1251 coding->annotated = 1; \
1252 } while (0);
1254 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1255 do { \
1256 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1257 *buf++ = nbytes; \
1258 *buf++ = method; \
1259 } while (0)
1262 #define ADD_CHARSET_DATA(buf, nchars, id) \
1263 do { \
1264 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1265 *buf++ = id; \
1266 } while (0)
1269 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1274 /*** 3. UTF-8 ***/
1276 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1277 Check if a text is encoded in UTF-8. If it is, return 1, else
1278 return 0. */
1280 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1281 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1282 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1283 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1284 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1285 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1287 #define UTF_BOM 0xFEFF
1288 #define UTF_8_BOM_1 0xEF
1289 #define UTF_8_BOM_2 0xBB
1290 #define UTF_8_BOM_3 0xBF
1292 static int
1293 detect_coding_utf_8 (coding, detect_info)
1294 struct coding_system *coding;
1295 struct coding_detection_info *detect_info;
1297 const unsigned char *src = coding->source, *src_base;
1298 const unsigned char *src_end = coding->source + coding->src_bytes;
1299 int multibytep = coding->src_multibyte;
1300 int consumed_chars = 0;
1301 int bom_found = 0;
1302 int found = 0;
1304 detect_info->checked |= CATEGORY_MASK_UTF_8;
1305 /* A coding system of this category is always ASCII compatible. */
1306 src += coding->head_ascii;
1308 while (1)
1310 int c, c1, c2, c3, c4;
1312 src_base = src;
1313 ONE_MORE_BYTE (c);
1314 if (c < 0 || UTF_8_1_OCTET_P (c))
1315 continue;
1316 ONE_MORE_BYTE (c1);
1317 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1318 break;
1319 if (UTF_8_2_OCTET_LEADING_P (c))
1321 found = 1;
1322 continue;
1324 ONE_MORE_BYTE (c2);
1325 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1326 break;
1327 if (UTF_8_3_OCTET_LEADING_P (c))
1329 found = 1;
1330 if (src_base == coding->source
1331 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1332 bom_found = 1;
1333 continue;
1335 ONE_MORE_BYTE (c3);
1336 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1337 break;
1338 if (UTF_8_4_OCTET_LEADING_P (c))
1340 found = 1;
1341 continue;
1343 ONE_MORE_BYTE (c4);
1344 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1345 break;
1346 if (UTF_8_5_OCTET_LEADING_P (c))
1348 found = 1;
1349 continue;
1351 break;
1353 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1354 return 0;
1356 no_more_source:
1357 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1359 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1360 return 0;
1362 if (bom_found)
1364 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1365 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1367 else
1369 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1370 if (found)
1371 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1373 return 1;
1377 static void
1378 decode_coding_utf_8 (coding)
1379 struct coding_system *coding;
1381 const unsigned char *src = coding->source + coding->consumed;
1382 const unsigned char *src_end = coding->source + coding->src_bytes;
1383 const unsigned char *src_base;
1384 int *charbuf = coding->charbuf + coding->charbuf_used;
1385 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1386 int consumed_chars = 0, consumed_chars_base = 0;
1387 int multibytep = coding->src_multibyte;
1388 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1389 Lisp_Object attr, charset_list;
1390 int eol_crlf =
1391 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1392 int byte_after_cr = -1;
1394 CODING_GET_INFO (coding, attr, charset_list);
1396 if (bom != utf_without_bom)
1398 int c1, c2, c3;
1400 src_base = src;
1401 ONE_MORE_BYTE (c1);
1402 if (! UTF_8_3_OCTET_LEADING_P (c1))
1403 src = src_base;
1404 else
1406 ONE_MORE_BYTE (c2);
1407 if (! UTF_8_EXTRA_OCTET_P (c2))
1408 src = src_base;
1409 else
1411 ONE_MORE_BYTE (c3);
1412 if (! UTF_8_EXTRA_OCTET_P (c3))
1413 src = src_base;
1414 else
1416 if ((c1 != UTF_8_BOM_1)
1417 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1418 src = src_base;
1419 else
1420 CODING_UTF_8_BOM (coding) = utf_without_bom;
1425 CODING_UTF_8_BOM (coding) = utf_without_bom;
1429 while (1)
1431 int c, c1, c2, c3, c4, c5;
1433 src_base = src;
1434 consumed_chars_base = consumed_chars;
1436 if (charbuf >= charbuf_end)
1438 if (byte_after_cr >= 0)
1439 src_base--;
1440 break;
1443 if (byte_after_cr >= 0)
1444 c1 = byte_after_cr, byte_after_cr = -1;
1445 else
1446 ONE_MORE_BYTE (c1);
1447 if (c1 < 0)
1449 c = - c1;
1451 else if (UTF_8_1_OCTET_P(c1))
1453 if (eol_crlf && c1 == '\r')
1454 ONE_MORE_BYTE (byte_after_cr);
1455 c = c1;
1457 else
1459 ONE_MORE_BYTE (c2);
1460 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1461 goto invalid_code;
1462 if (UTF_8_2_OCTET_LEADING_P (c1))
1464 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1465 /* Reject overlong sequences here and below. Encoders
1466 producing them are incorrect, they can be misleading,
1467 and they mess up read/write invariance. */
1468 if (c < 128)
1469 goto invalid_code;
1471 else
1473 ONE_MORE_BYTE (c3);
1474 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1475 goto invalid_code;
1476 if (UTF_8_3_OCTET_LEADING_P (c1))
1478 c = (((c1 & 0xF) << 12)
1479 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1480 if (c < 0x800
1481 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1482 goto invalid_code;
1484 else
1486 ONE_MORE_BYTE (c4);
1487 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1488 goto invalid_code;
1489 if (UTF_8_4_OCTET_LEADING_P (c1))
1491 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1492 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1493 if (c < 0x10000)
1494 goto invalid_code;
1496 else
1498 ONE_MORE_BYTE (c5);
1499 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1500 goto invalid_code;
1501 if (UTF_8_5_OCTET_LEADING_P (c1))
1503 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1504 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1505 | (c5 & 0x3F));
1506 if ((c > MAX_CHAR) || (c < 0x200000))
1507 goto invalid_code;
1509 else
1510 goto invalid_code;
1516 *charbuf++ = c;
1517 continue;
1519 invalid_code:
1520 src = src_base;
1521 consumed_chars = consumed_chars_base;
1522 ONE_MORE_BYTE (c);
1523 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1524 coding->errors++;
1527 no_more_source:
1528 coding->consumed_char += consumed_chars_base;
1529 coding->consumed = src_base - coding->source;
1530 coding->charbuf_used = charbuf - coding->charbuf;
1534 static int
1535 encode_coding_utf_8 (coding)
1536 struct coding_system *coding;
1538 int multibytep = coding->dst_multibyte;
1539 int *charbuf = coding->charbuf;
1540 int *charbuf_end = charbuf + coding->charbuf_used;
1541 unsigned char *dst = coding->destination + coding->produced;
1542 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1543 int produced_chars = 0;
1544 int c;
1546 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1548 ASSURE_DESTINATION (3);
1549 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1550 CODING_UTF_8_BOM (coding) = utf_without_bom;
1553 if (multibytep)
1555 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1557 while (charbuf < charbuf_end)
1559 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1561 ASSURE_DESTINATION (safe_room);
1562 c = *charbuf++;
1563 if (CHAR_BYTE8_P (c))
1565 c = CHAR_TO_BYTE8 (c);
1566 EMIT_ONE_BYTE (c);
1568 else
1570 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1571 for (p = str; p < pend; p++)
1572 EMIT_ONE_BYTE (*p);
1576 else
1578 int safe_room = MAX_MULTIBYTE_LENGTH;
1580 while (charbuf < charbuf_end)
1582 ASSURE_DESTINATION (safe_room);
1583 c = *charbuf++;
1584 if (CHAR_BYTE8_P (c))
1585 *dst++ = CHAR_TO_BYTE8 (c);
1586 else
1587 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1588 produced_chars++;
1591 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1592 coding->produced_char += produced_chars;
1593 coding->produced = dst - coding->destination;
1594 return 0;
1598 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1599 Check if a text is encoded in one of UTF-16 based coding systems.
1600 If it is, return 1, else return 0. */
1602 #define UTF_16_HIGH_SURROGATE_P(val) \
1603 (((val) & 0xFC00) == 0xD800)
1605 #define UTF_16_LOW_SURROGATE_P(val) \
1606 (((val) & 0xFC00) == 0xDC00)
1608 #define UTF_16_INVALID_P(val) \
1609 (((val) == 0xFFFE) \
1610 || ((val) == 0xFFFF) \
1611 || UTF_16_LOW_SURROGATE_P (val))
1614 static int
1615 detect_coding_utf_16 (coding, detect_info)
1616 struct coding_system *coding;
1617 struct coding_detection_info *detect_info;
1619 const unsigned char *src = coding->source, *src_base = src;
1620 const unsigned char *src_end = coding->source + coding->src_bytes;
1621 int multibytep = coding->src_multibyte;
1622 int consumed_chars = 0;
1623 int c1, c2;
1625 detect_info->checked |= CATEGORY_MASK_UTF_16;
1626 if (coding->mode & CODING_MODE_LAST_BLOCK
1627 && (coding->src_chars & 1))
1629 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1630 return 0;
1633 TWO_MORE_BYTES (c1, c2);
1634 if ((c1 == 0xFF) && (c2 == 0xFE))
1636 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1637 | CATEGORY_MASK_UTF_16_AUTO);
1638 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1639 | CATEGORY_MASK_UTF_16_BE_NOSIG
1640 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1642 else if ((c1 == 0xFE) && (c2 == 0xFF))
1644 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1645 | CATEGORY_MASK_UTF_16_AUTO);
1646 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1647 | CATEGORY_MASK_UTF_16_BE_NOSIG
1648 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1650 else if (c2 < 0)
1652 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1653 return 0;
1655 else
1657 /* We check the dispersion of Eth and Oth bytes where E is even and
1658 O is odd. If both are high, we assume binary data.*/
1659 unsigned char e[256], o[256];
1660 unsigned e_num = 1, o_num = 1;
1662 memset (e, 0, 256);
1663 memset (o, 0, 256);
1664 e[c1] = 1;
1665 o[c2] = 1;
1667 detect_info->rejected
1668 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1670 while (1)
1672 TWO_MORE_BYTES (c1, c2);
1673 if (c2 < 0)
1674 break;
1675 if (! e[c1])
1677 e[c1] = 1;
1678 e_num++;
1679 if (e_num >= 128)
1680 break;
1682 if (! o[c2])
1684 o[c1] = 1;
1685 o_num++;
1686 if (o_num >= 128)
1687 break;
1690 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1691 return 0;
1694 no_more_source:
1695 return 1;
1698 static void
1699 decode_coding_utf_16 (coding)
1700 struct coding_system *coding;
1702 const unsigned char *src = coding->source + coding->consumed;
1703 const unsigned char *src_end = coding->source + coding->src_bytes;
1704 const unsigned char *src_base;
1705 int *charbuf = coding->charbuf + coding->charbuf_used;
1706 /* We may produces at most 3 chars in one loop. */
1707 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1708 int consumed_chars = 0, consumed_chars_base = 0;
1709 int multibytep = coding->src_multibyte;
1710 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1711 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1712 int surrogate = CODING_UTF_16_SURROGATE (coding);
1713 Lisp_Object attr, charset_list;
1714 int eol_crlf =
1715 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1716 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1718 CODING_GET_INFO (coding, attr, charset_list);
1720 if (bom == utf_with_bom)
1722 int c, c1, c2;
1724 src_base = src;
1725 ONE_MORE_BYTE (c1);
1726 ONE_MORE_BYTE (c2);
1727 c = (c1 << 8) | c2;
1729 if (endian == utf_16_big_endian
1730 ? c != 0xFEFF : c != 0xFFFE)
1732 /* The first two bytes are not BOM. Treat them as bytes
1733 for a normal character. */
1734 src = src_base;
1735 coding->errors++;
1737 CODING_UTF_16_BOM (coding) = utf_without_bom;
1739 else if (bom == utf_detect_bom)
1741 /* We have already tried to detect BOM and failed in
1742 detect_coding. */
1743 CODING_UTF_16_BOM (coding) = utf_without_bom;
1746 while (1)
1748 int c, c1, c2;
1750 src_base = src;
1751 consumed_chars_base = consumed_chars;
1753 if (charbuf >= charbuf_end)
1755 if (byte_after_cr1 >= 0)
1756 src_base -= 2;
1757 break;
1760 if (byte_after_cr1 >= 0)
1761 c1 = byte_after_cr1, byte_after_cr1 = -1;
1762 else
1763 ONE_MORE_BYTE (c1);
1764 if (c1 < 0)
1766 *charbuf++ = -c1;
1767 continue;
1769 if (byte_after_cr2 >= 0)
1770 c2 = byte_after_cr2, byte_after_cr2 = -1;
1771 else
1772 ONE_MORE_BYTE (c2);
1773 if (c2 < 0)
1775 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1776 *charbuf++ = -c2;
1777 continue;
1779 c = (endian == utf_16_big_endian
1780 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1782 if (surrogate)
1784 if (! UTF_16_LOW_SURROGATE_P (c))
1786 if (endian == utf_16_big_endian)
1787 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1788 else
1789 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1790 *charbuf++ = c1;
1791 *charbuf++ = c2;
1792 coding->errors++;
1793 if (UTF_16_HIGH_SURROGATE_P (c))
1794 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1795 else
1796 *charbuf++ = c;
1798 else
1800 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1801 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1802 *charbuf++ = 0x10000 + c;
1805 else
1807 if (UTF_16_HIGH_SURROGATE_P (c))
1808 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1809 else
1811 if (eol_crlf && c == '\r')
1813 ONE_MORE_BYTE (byte_after_cr1);
1814 ONE_MORE_BYTE (byte_after_cr2);
1816 *charbuf++ = c;
1821 no_more_source:
1822 coding->consumed_char += consumed_chars_base;
1823 coding->consumed = src_base - coding->source;
1824 coding->charbuf_used = charbuf - coding->charbuf;
1827 static int
1828 encode_coding_utf_16 (coding)
1829 struct coding_system *coding;
1831 int multibytep = coding->dst_multibyte;
1832 int *charbuf = coding->charbuf;
1833 int *charbuf_end = charbuf + coding->charbuf_used;
1834 unsigned char *dst = coding->destination + coding->produced;
1835 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1836 int safe_room = 8;
1837 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1838 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1839 int produced_chars = 0;
1840 Lisp_Object attrs, charset_list;
1841 int c;
1843 CODING_GET_INFO (coding, attrs, charset_list);
1845 if (bom != utf_without_bom)
1847 ASSURE_DESTINATION (safe_room);
1848 if (big_endian)
1849 EMIT_TWO_BYTES (0xFE, 0xFF);
1850 else
1851 EMIT_TWO_BYTES (0xFF, 0xFE);
1852 CODING_UTF_16_BOM (coding) = utf_without_bom;
1855 while (charbuf < charbuf_end)
1857 ASSURE_DESTINATION (safe_room);
1858 c = *charbuf++;
1859 if (c >= MAX_UNICODE_CHAR)
1860 c = coding->default_char;
1862 if (c < 0x10000)
1864 if (big_endian)
1865 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1866 else
1867 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1869 else
1871 int c1, c2;
1873 c -= 0x10000;
1874 c1 = (c >> 10) + 0xD800;
1875 c2 = (c & 0x3FF) + 0xDC00;
1876 if (big_endian)
1877 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1878 else
1879 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1882 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1883 coding->produced = dst - coding->destination;
1884 coding->produced_char += produced_chars;
1885 return 0;
1889 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1891 /* Emacs' internal format for representation of multiple character
1892 sets is a kind of multi-byte encoding, i.e. characters are
1893 represented by variable-length sequences of one-byte codes.
1895 ASCII characters and control characters (e.g. `tab', `newline') are
1896 represented by one-byte sequences which are their ASCII codes, in
1897 the range 0x00 through 0x7F.
1899 8-bit characters of the range 0x80..0x9F are represented by
1900 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1901 code + 0x20).
1903 8-bit characters of the range 0xA0..0xFF are represented by
1904 one-byte sequences which are their 8-bit code.
1906 The other characters are represented by a sequence of `base
1907 leading-code', optional `extended leading-code', and one or two
1908 `position-code's. The length of the sequence is determined by the
1909 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1910 whereas extended leading-code and position-code take the range 0xA0
1911 through 0xFF. See `charset.h' for more details about leading-code
1912 and position-code.
1914 --- CODE RANGE of Emacs' internal format ---
1915 character set range
1916 ------------- -----
1917 ascii 0x00..0x7F
1918 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1919 eight-bit-graphic 0xA0..0xBF
1920 ELSE 0x81..0x9D + [0xA0..0xFF]+
1921 ---------------------------------------------
1923 As this is the internal character representation, the format is
1924 usually not used externally (i.e. in a file or in a data sent to a
1925 process). But, it is possible to have a text externally in this
1926 format (i.e. by encoding by the coding system `emacs-mule').
1928 In that case, a sequence of one-byte codes has a slightly different
1929 form.
1931 At first, all characters in eight-bit-control are represented by
1932 one-byte sequences which are their 8-bit code.
1934 Next, character composition data are represented by the byte
1935 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1936 where,
1937 METHOD is 0xF2 plus one of composition method (enum
1938 composition_method),
1940 BYTES is 0xA0 plus a byte length of this composition data,
1942 CHARS is 0xA0 plus a number of characters composed by this
1943 data,
1945 COMPONENTs are characters of multibye form or composition
1946 rules encoded by two-byte of ASCII codes.
1948 In addition, for backward compatibility, the following formats are
1949 also recognized as composition data on decoding.
1951 0x80 MSEQ ...
1952 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1954 Here,
1955 MSEQ is a multibyte form but in these special format:
1956 ASCII: 0xA0 ASCII_CODE+0x80,
1957 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1958 RULE is a one byte code of the range 0xA0..0xF0 that
1959 represents a composition rule.
1962 char emacs_mule_bytes[256];
1965 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1966 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1967 else return 0. */
1969 static int
1970 detect_coding_emacs_mule (coding, detect_info)
1971 struct coding_system *coding;
1972 struct coding_detection_info *detect_info;
1974 const unsigned char *src = coding->source, *src_base;
1975 const unsigned char *src_end = coding->source + coding->src_bytes;
1976 int multibytep = coding->src_multibyte;
1977 int consumed_chars = 0;
1978 int c;
1979 int found = 0;
1981 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1982 /* A coding system of this category is always ASCII compatible. */
1983 src += coding->head_ascii;
1985 while (1)
1987 src_base = src;
1988 ONE_MORE_BYTE (c);
1989 if (c < 0)
1990 continue;
1991 if (c == 0x80)
1993 /* Perhaps the start of composite character. We simply skip
1994 it because analyzing it is too heavy for detecting. But,
1995 at least, we check that the composite character
1996 constitutes of more than 4 bytes. */
1997 const unsigned char *src_base;
1999 repeat:
2000 src_base = src;
2003 ONE_MORE_BYTE (c);
2005 while (c >= 0xA0);
2007 if (src - src_base <= 4)
2008 break;
2009 found = CATEGORY_MASK_EMACS_MULE;
2010 if (c == 0x80)
2011 goto repeat;
2014 if (c < 0x80)
2016 if (c < 0x20
2017 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2018 break;
2020 else
2022 int more_bytes = emacs_mule_bytes[*src_base] - 1;
2024 while (more_bytes > 0)
2026 ONE_MORE_BYTE (c);
2027 if (c < 0xA0)
2029 src--; /* Unread the last byte. */
2030 break;
2032 more_bytes--;
2034 if (more_bytes != 0)
2035 break;
2036 found = CATEGORY_MASK_EMACS_MULE;
2039 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2040 return 0;
2042 no_more_source:
2043 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2045 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2046 return 0;
2048 detect_info->found |= found;
2049 return 1;
2053 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2054 character. If CMP_STATUS indicates that we must expect MSEQ or
2055 RULE described above, decode it and return the negative value of
2056 the deocded character or rule. If an invalid byte is found, return
2057 -1. If SRC is too short, return -2. */
2060 emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status)
2061 struct coding_system *coding;
2062 const unsigned char *src;
2063 int *nbytes, *nchars, *id;
2064 struct composition_status *cmp_status;
2066 const unsigned char *src_end = coding->source + coding->src_bytes;
2067 const unsigned char *src_base = src;
2068 int multibytep = coding->src_multibyte;
2069 struct charset *charset;
2070 unsigned code;
2071 int c;
2072 int consumed_chars = 0;
2073 int mseq_found = 0;
2075 ONE_MORE_BYTE (c);
2076 if (c < 0)
2078 c = -c;
2079 charset = emacs_mule_charset[0];
2081 else
2083 if (c >= 0xA0)
2085 if (cmp_status->state != COMPOSING_NO
2086 && cmp_status->old_form)
2088 if (cmp_status->state == COMPOSING_CHAR)
2090 if (c == 0xA0)
2092 ONE_MORE_BYTE (c);
2093 c -= 0x80;
2094 if (c < 0)
2095 goto invalid_code;
2097 else
2098 c -= 0x20;
2099 mseq_found = 1;
2101 else
2103 *nbytes = src - src_base;
2104 *nchars = consumed_chars;
2105 return -c;
2108 else
2109 goto invalid_code;
2112 switch (emacs_mule_bytes[c])
2114 case 2:
2115 if (! (charset = emacs_mule_charset[c]))
2116 goto invalid_code;
2117 ONE_MORE_BYTE (c);
2118 if (c < 0xA0)
2119 goto invalid_code;
2120 code = c & 0x7F;
2121 break;
2123 case 3:
2124 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2125 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2127 ONE_MORE_BYTE (c);
2128 if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
2129 goto invalid_code;
2130 ONE_MORE_BYTE (c);
2131 if (c < 0xA0)
2132 goto invalid_code;
2133 code = c & 0x7F;
2135 else
2137 if (! (charset = emacs_mule_charset[c]))
2138 goto invalid_code;
2139 ONE_MORE_BYTE (c);
2140 if (c < 0xA0)
2141 goto invalid_code;
2142 code = (c & 0x7F) << 8;
2143 ONE_MORE_BYTE (c);
2144 if (c < 0xA0)
2145 goto invalid_code;
2146 code |= c & 0x7F;
2148 break;
2150 case 4:
2151 ONE_MORE_BYTE (c);
2152 if (c < 0 || ! (charset = emacs_mule_charset[c]))
2153 goto invalid_code;
2154 ONE_MORE_BYTE (c);
2155 if (c < 0xA0)
2156 goto invalid_code;
2157 code = (c & 0x7F) << 8;
2158 ONE_MORE_BYTE (c);
2159 if (c < 0xA0)
2160 goto invalid_code;
2161 code |= c & 0x7F;
2162 break;
2164 case 1:
2165 code = c;
2166 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
2167 ? charset_ascii : charset_eight_bit);
2168 break;
2170 default:
2171 abort ();
2173 c = DECODE_CHAR (charset, code);
2174 if (c < 0)
2175 goto invalid_code;
2177 *nbytes = src - src_base;
2178 *nchars = consumed_chars;
2179 if (id)
2180 *id = charset->id;
2181 return (mseq_found ? -c : c);
2183 no_more_source:
2184 return -2;
2186 invalid_code:
2187 return -1;
2191 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2193 /* Handle these composition sequence ('|': the end of header elements,
2194 BYTES and CHARS >= 0xA0):
2196 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2197 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2198 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2200 and these old form:
2202 (4) relative composition: 0x80 | MSEQ ... MSEQ
2203 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2205 When the starter 0x80 and the following header elements are found,
2206 this annotation header is produced.
2208 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2210 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2211 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2213 Then, upon reading the following elements, these codes are produced
2214 until the composition end is found:
2216 (1) CHAR ... CHAR
2217 (2) ALT ... ALT CHAR ... CHAR
2218 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2219 (4) CHAR ... CHAR
2220 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2222 When the composition end is found, LENGTH and NCHARS in the
2223 annotation header is updated as below:
2225 (1) LENGTH: unchanged, NCHARS: unchanged
2226 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2227 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2228 (4) LENGTH: unchanged, NCHARS: number of CHARs
2229 (5) LENGTH: unchanged, NCHARS: number of CHARs
2231 If an error is found while composing, the annotation header is
2232 changed to the original composition header (plus filler -1s) as
2233 below:
2235 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2236 (5) [ 0x80 0xFF -1 -1- -1 ]
2238 and the sequence [ -2 DECODED-RULE ] is changed to the original
2239 byte sequence as below:
2240 o the original byte sequence is B: [ B -1 ]
2241 o the original byte sequence is B1 B2: [ B1 B2 ]
2243 Most of the routines are implemented by macros because many
2244 variables and labels in the caller decode_coding_emacs_mule must be
2245 accessible, and they are usually called just once (thus doesn't
2246 increase the size of compiled object). */
2248 /* Decode a composition rule represented by C as a component of
2249 composition sequence of Emacs 20 style. Set RULE to the decoded
2250 rule. */
2252 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2253 do { \
2254 int gref, nref; \
2256 c -= 0xA0; \
2257 if (c < 0 || c >= 81) \
2258 goto invalid_code; \
2259 gref = c / 9, nref = c % 9; \
2260 if (gref == 4) gref = 10; \
2261 if (nref == 4) nref = 10; \
2262 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2263 } while (0)
2266 /* Decode a composition rule represented by C and the following byte
2267 at SRC as a component of composition sequence of Emacs 21 style.
2268 Set RULE to the decoded rule. */
2270 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2271 do { \
2272 int gref, nref; \
2274 gref = c - 0x20; \
2275 if (gref < 0 || gref >= 81) \
2276 goto invalid_code; \
2277 ONE_MORE_BYTE (c); \
2278 nref = c - 0x20; \
2279 if (nref < 0 || nref >= 81) \
2280 goto invalid_code; \
2281 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2282 } while (0)
2285 /* Start of Emacs 21 style format. The first three bytes at SRC are
2286 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2287 byte length of this composition information, CHARS is the number of
2288 characters composed by this composition. */
2290 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2291 do { \
2292 enum composition_method method = c - 0xF2; \
2293 int *charbuf_base = charbuf; \
2294 int nbytes, nchars; \
2296 ONE_MORE_BYTE (c); \
2297 if (c < 0) \
2298 goto invalid_code; \
2299 nbytes = c - 0xA0; \
2300 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2301 goto invalid_code; \
2302 ONE_MORE_BYTE (c); \
2303 nchars = c - 0xA0; \
2304 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2305 goto invalid_code; \
2306 cmp_status->old_form = 0; \
2307 cmp_status->method = method; \
2308 if (method == COMPOSITION_RELATIVE) \
2309 cmp_status->state = COMPOSING_CHAR; \
2310 else \
2311 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2312 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2313 cmp_status->nchars = nchars; \
2314 cmp_status->ncomps = nbytes - 4; \
2315 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2316 } while (0)
2319 /* Start of Emacs 20 style format for relative composition. */
2321 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2322 do { \
2323 cmp_status->old_form = 1; \
2324 cmp_status->method = COMPOSITION_RELATIVE; \
2325 cmp_status->state = COMPOSING_CHAR; \
2326 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2327 cmp_status->nchars = cmp_status->ncomps = 0; \
2328 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2329 } while (0)
2332 /* Start of Emacs 20 style format for rule-base composition. */
2334 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2335 do { \
2336 cmp_status->old_form = 1; \
2337 cmp_status->method = COMPOSITION_WITH_RULE; \
2338 cmp_status->state = COMPOSING_CHAR; \
2339 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2340 cmp_status->nchars = cmp_status->ncomps = 0; \
2341 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2342 } while (0)
2345 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2346 do { \
2347 const unsigned char *current_src = src; \
2349 ONE_MORE_BYTE (c); \
2350 if (c < 0) \
2351 goto invalid_code; \
2352 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2353 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2354 DECODE_EMACS_MULE_21_COMPOSITION (); \
2355 else if (c < 0xA0) \
2356 goto invalid_code; \
2357 else if (c < 0xC0) \
2359 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2360 /* Re-read C as a composition component. */ \
2361 src = current_src; \
2363 else if (c == 0xFF) \
2364 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2365 else \
2366 goto invalid_code; \
2367 } while (0)
2369 #define EMACS_MULE_COMPOSITION_END() \
2370 do { \
2371 int idx = - cmp_status->length; \
2373 if (cmp_status->old_form) \
2374 charbuf[idx + 2] = cmp_status->nchars; \
2375 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2376 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2377 cmp_status->state = COMPOSING_NO; \
2378 } while (0)
2381 static int
2382 emacs_mule_finish_composition (charbuf, cmp_status)
2383 int *charbuf;
2384 struct composition_status *cmp_status;
2386 int idx = - cmp_status->length;
2387 int new_chars;
2389 if (cmp_status->old_form && cmp_status->nchars > 0)
2391 charbuf[idx + 2] = cmp_status->nchars;
2392 new_chars = 0;
2393 if (cmp_status->method == COMPOSITION_WITH_RULE
2394 && cmp_status->state == COMPOSING_CHAR)
2396 /* The last rule was invalid. */
2397 int rule = charbuf[-1] + 0xA0;
2399 charbuf[-2] = BYTE8_TO_CHAR (rule);
2400 charbuf[-1] = -1;
2401 new_chars = 1;
2404 else
2406 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2408 if (cmp_status->method == COMPOSITION_WITH_RULE)
2410 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2411 charbuf[idx++] = -3;
2412 charbuf[idx++] = 0;
2413 new_chars = 1;
2415 else
2417 int nchars = charbuf[idx + 1] + 0xA0;
2418 int nbytes = charbuf[idx + 2] + 0xA0;
2420 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2421 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2422 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2423 charbuf[idx++] = -1;
2424 new_chars = 4;
2427 cmp_status->state = COMPOSING_NO;
2428 return new_chars;
2431 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2432 do { \
2433 if (cmp_status->state != COMPOSING_NO) \
2434 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2435 } while (0)
2438 static void
2439 decode_coding_emacs_mule (coding)
2440 struct coding_system *coding;
2442 const unsigned char *src = coding->source + coding->consumed;
2443 const unsigned char *src_end = coding->source + coding->src_bytes;
2444 const unsigned char *src_base;
2445 int *charbuf = coding->charbuf + coding->charbuf_used;
2446 /* We may produce two annocations (charset and composition) in one
2447 loop and one more charset annocation at the end. */
2448 int *charbuf_end
2449 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2450 int consumed_chars = 0, consumed_chars_base;
2451 int multibytep = coding->src_multibyte;
2452 Lisp_Object attrs, charset_list;
2453 int char_offset = coding->produced_char;
2454 int last_offset = char_offset;
2455 int last_id = charset_ascii;
2456 int eol_crlf =
2457 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2458 int byte_after_cr = -1;
2459 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2461 CODING_GET_INFO (coding, attrs, charset_list);
2463 if (cmp_status->state != COMPOSING_NO)
2465 int i;
2467 for (i = 0; i < cmp_status->length; i++)
2468 *charbuf++ = cmp_status->carryover[i];
2469 coding->annotated = 1;
2472 while (1)
2474 int c, id;
2476 src_base = src;
2477 consumed_chars_base = consumed_chars;
2479 if (charbuf >= charbuf_end)
2481 if (byte_after_cr >= 0)
2482 src_base--;
2483 break;
2486 if (byte_after_cr >= 0)
2487 c = byte_after_cr, byte_after_cr = -1;
2488 else
2489 ONE_MORE_BYTE (c);
2491 if (c < 0 || c == 0x80)
2493 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2494 if (c < 0)
2496 *charbuf++ = -c;
2497 char_offset++;
2499 else
2500 DECODE_EMACS_MULE_COMPOSITION_START ();
2501 continue;
2504 if (c < 0x80)
2506 if (eol_crlf && c == '\r')
2507 ONE_MORE_BYTE (byte_after_cr);
2508 id = charset_ascii;
2509 if (cmp_status->state != COMPOSING_NO)
2511 if (cmp_status->old_form)
2512 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2513 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2514 cmp_status->ncomps--;
2517 else
2519 int nchars, nbytes;
2521 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2522 cmp_status);
2523 if (c < 0)
2525 if (c == -1)
2526 goto invalid_code;
2527 if (c == -2)
2528 break;
2530 src = src_base + nbytes;
2531 consumed_chars = consumed_chars_base + nchars;
2532 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2533 cmp_status->ncomps -= nchars;
2536 /* Now if C >= 0, we found a normally encoded characer, if C <
2537 0, we found an old-style composition component character or
2538 rule. */
2540 if (cmp_status->state == COMPOSING_NO)
2542 if (last_id != id)
2544 if (last_id != charset_ascii)
2545 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2546 last_id);
2547 last_id = id;
2548 last_offset = char_offset;
2550 *charbuf++ = c;
2551 char_offset++;
2553 else if (cmp_status->state == COMPOSING_CHAR)
2555 if (cmp_status->old_form)
2557 if (c >= 0)
2559 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2560 *charbuf++ = c;
2561 char_offset++;
2563 else
2565 *charbuf++ = -c;
2566 cmp_status->nchars++;
2567 cmp_status->length++;
2568 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2569 EMACS_MULE_COMPOSITION_END ();
2570 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2571 cmp_status->state = COMPOSING_RULE;
2574 else
2576 *charbuf++ = c;
2577 cmp_status->length++;
2578 cmp_status->nchars--;
2579 if (cmp_status->nchars == 0)
2580 EMACS_MULE_COMPOSITION_END ();
2583 else if (cmp_status->state == COMPOSING_RULE)
2585 int rule;
2587 if (c >= 0)
2589 EMACS_MULE_COMPOSITION_END ();
2590 *charbuf++ = c;
2591 char_offset++;
2593 else
2595 c = -c;
2596 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2597 if (rule < 0)
2598 goto invalid_code;
2599 *charbuf++ = -2;
2600 *charbuf++ = rule;
2601 cmp_status->length += 2;
2602 cmp_status->state = COMPOSING_CHAR;
2605 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2607 *charbuf++ = c;
2608 cmp_status->length++;
2609 if (cmp_status->ncomps == 0)
2610 cmp_status->state = COMPOSING_CHAR;
2611 else if (cmp_status->ncomps > 0)
2613 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2614 cmp_status->state = COMPOSING_COMPONENT_RULE;
2616 else
2617 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2619 else /* COMPOSING_COMPONENT_RULE */
2621 int rule;
2623 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2624 if (rule < 0)
2625 goto invalid_code;
2626 *charbuf++ = -2;
2627 *charbuf++ = rule;
2628 cmp_status->length += 2;
2629 cmp_status->ncomps--;
2630 if (cmp_status->ncomps > 0)
2631 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2632 else
2633 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2635 continue;
2637 retry:
2638 src = src_base;
2639 consumed_chars = consumed_chars_base;
2640 continue;
2642 invalid_code:
2643 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2644 src = src_base;
2645 consumed_chars = consumed_chars_base;
2646 ONE_MORE_BYTE (c);
2647 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2648 char_offset++;
2649 coding->errors++;
2652 no_more_source:
2653 if (cmp_status->state != COMPOSING_NO)
2655 if (coding->mode & CODING_MODE_LAST_BLOCK)
2656 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2657 else
2659 int i;
2661 charbuf -= cmp_status->length;
2662 for (i = 0; i < cmp_status->length; i++)
2663 cmp_status->carryover[i] = charbuf[i];
2666 if (last_id != charset_ascii)
2667 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2668 coding->consumed_char += consumed_chars_base;
2669 coding->consumed = src_base - coding->source;
2670 coding->charbuf_used = charbuf - coding->charbuf;
2674 #define EMACS_MULE_LEADING_CODES(id, codes) \
2675 do { \
2676 if (id < 0xA0) \
2677 codes[0] = id, codes[1] = 0; \
2678 else if (id < 0xE0) \
2679 codes[0] = 0x9A, codes[1] = id; \
2680 else if (id < 0xF0) \
2681 codes[0] = 0x9B, codes[1] = id; \
2682 else if (id < 0xF5) \
2683 codes[0] = 0x9C, codes[1] = id; \
2684 else \
2685 codes[0] = 0x9D, codes[1] = id; \
2686 } while (0);
2689 static int
2690 encode_coding_emacs_mule (coding)
2691 struct coding_system *coding;
2693 int multibytep = coding->dst_multibyte;
2694 int *charbuf = coding->charbuf;
2695 int *charbuf_end = charbuf + coding->charbuf_used;
2696 unsigned char *dst = coding->destination + coding->produced;
2697 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2698 int safe_room = 8;
2699 int produced_chars = 0;
2700 Lisp_Object attrs, charset_list;
2701 int c;
2702 int preferred_charset_id = -1;
2704 CODING_GET_INFO (coding, attrs, charset_list);
2705 if (! EQ (charset_list, Vemacs_mule_charset_list))
2707 CODING_ATTR_CHARSET_LIST (attrs)
2708 = charset_list = Vemacs_mule_charset_list;
2711 while (charbuf < charbuf_end)
2713 ASSURE_DESTINATION (safe_room);
2714 c = *charbuf++;
2716 if (c < 0)
2718 /* Handle an annotation. */
2719 switch (*charbuf)
2721 case CODING_ANNOTATE_COMPOSITION_MASK:
2722 /* Not yet implemented. */
2723 break;
2724 case CODING_ANNOTATE_CHARSET_MASK:
2725 preferred_charset_id = charbuf[3];
2726 if (preferred_charset_id >= 0
2727 && NILP (Fmemq (make_number (preferred_charset_id),
2728 charset_list)))
2729 preferred_charset_id = -1;
2730 break;
2731 default:
2732 abort ();
2734 charbuf += -c - 1;
2735 continue;
2738 if (ASCII_CHAR_P (c))
2739 EMIT_ONE_ASCII_BYTE (c);
2740 else if (CHAR_BYTE8_P (c))
2742 c = CHAR_TO_BYTE8 (c);
2743 EMIT_ONE_BYTE (c);
2745 else
2747 struct charset *charset;
2748 unsigned code;
2749 int dimension;
2750 int emacs_mule_id;
2751 unsigned char leading_codes[2];
2753 if (preferred_charset_id >= 0)
2755 charset = CHARSET_FROM_ID (preferred_charset_id);
2756 if (CHAR_CHARSET_P (c, charset))
2757 code = ENCODE_CHAR (charset, c);
2758 else
2759 charset = char_charset (c, charset_list, &code);
2761 else
2762 charset = char_charset (c, charset_list, &code);
2763 if (! charset)
2765 c = coding->default_char;
2766 if (ASCII_CHAR_P (c))
2768 EMIT_ONE_ASCII_BYTE (c);
2769 continue;
2771 charset = char_charset (c, charset_list, &code);
2773 dimension = CHARSET_DIMENSION (charset);
2774 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2775 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2776 EMIT_ONE_BYTE (leading_codes[0]);
2777 if (leading_codes[1])
2778 EMIT_ONE_BYTE (leading_codes[1]);
2779 if (dimension == 1)
2780 EMIT_ONE_BYTE (code | 0x80);
2781 else
2783 code |= 0x8080;
2784 EMIT_ONE_BYTE (code >> 8);
2785 EMIT_ONE_BYTE (code & 0xFF);
2789 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2790 coding->produced_char += produced_chars;
2791 coding->produced = dst - coding->destination;
2792 return 0;
2796 /*** 7. ISO2022 handlers ***/
2798 /* The following note describes the coding system ISO2022 briefly.
2799 Since the intention of this note is to help understand the
2800 functions in this file, some parts are NOT ACCURATE or are OVERLY
2801 SIMPLIFIED. For thorough understanding, please refer to the
2802 original document of ISO2022. This is equivalent to the standard
2803 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2805 ISO2022 provides many mechanisms to encode several character sets
2806 in 7-bit and 8-bit environments. For 7-bit environments, all text
2807 is encoded using bytes less than 128. This may make the encoded
2808 text a little bit longer, but the text passes more easily through
2809 several types of gateway, some of which strip off the MSB (Most
2810 Significant Bit).
2812 There are two kinds of character sets: control character sets and
2813 graphic character sets. The former contain control characters such
2814 as `newline' and `escape' to provide control functions (control
2815 functions are also provided by escape sequences). The latter
2816 contain graphic characters such as 'A' and '-'. Emacs recognizes
2817 two control character sets and many graphic character sets.
2819 Graphic character sets are classified into one of the following
2820 four classes, according to the number of bytes (DIMENSION) and
2821 number of characters in one dimension (CHARS) of the set:
2822 - DIMENSION1_CHARS94
2823 - DIMENSION1_CHARS96
2824 - DIMENSION2_CHARS94
2825 - DIMENSION2_CHARS96
2827 In addition, each character set is assigned an identification tag,
2828 unique for each set, called the "final character" (denoted as <F>
2829 hereafter). The <F> of each character set is decided by ECMA(*)
2830 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2831 (0x30..0x3F are for private use only).
2833 Note (*): ECMA = European Computer Manufacturers Association
2835 Here are examples of graphic character sets [NAME(<F>)]:
2836 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2837 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2838 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2839 o DIMENSION2_CHARS96 -- none for the moment
2841 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2842 C0 [0x00..0x1F] -- control character plane 0
2843 GL [0x20..0x7F] -- graphic character plane 0
2844 C1 [0x80..0x9F] -- control character plane 1
2845 GR [0xA0..0xFF] -- graphic character plane 1
2847 A control character set is directly designated and invoked to C0 or
2848 C1 by an escape sequence. The most common case is that:
2849 - ISO646's control character set is designated/invoked to C0, and
2850 - ISO6429's control character set is designated/invoked to C1,
2851 and usually these designations/invocations are omitted in encoded
2852 text. In a 7-bit environment, only C0 can be used, and a control
2853 character for C1 is encoded by an appropriate escape sequence to
2854 fit into the environment. All control characters for C1 are
2855 defined to have corresponding escape sequences.
2857 A graphic character set is at first designated to one of four
2858 graphic registers (G0 through G3), then these graphic registers are
2859 invoked to GL or GR. These designations and invocations can be
2860 done independently. The most common case is that G0 is invoked to
2861 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2862 these invocations and designations are omitted in encoded text.
2863 In a 7-bit environment, only GL can be used.
2865 When a graphic character set of CHARS94 is invoked to GL, codes
2866 0x20 and 0x7F of the GL area work as control characters SPACE and
2867 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2868 be used.
2870 There are two ways of invocation: locking-shift and single-shift.
2871 With locking-shift, the invocation lasts until the next different
2872 invocation, whereas with single-shift, the invocation affects the
2873 following character only and doesn't affect the locking-shift
2874 state. Invocations are done by the following control characters or
2875 escape sequences:
2877 ----------------------------------------------------------------------
2878 abbrev function cntrl escape seq description
2879 ----------------------------------------------------------------------
2880 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2881 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2882 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2883 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2884 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2885 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2886 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2887 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2888 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2889 ----------------------------------------------------------------------
2890 (*) These are not used by any known coding system.
2892 Control characters for these functions are defined by macros
2893 ISO_CODE_XXX in `coding.h'.
2895 Designations are done by the following escape sequences:
2896 ----------------------------------------------------------------------
2897 escape sequence description
2898 ----------------------------------------------------------------------
2899 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2900 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2901 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2902 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2903 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2904 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2905 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2906 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2907 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2908 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2909 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2910 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2911 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2912 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2913 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2914 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2915 ----------------------------------------------------------------------
2917 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2918 of dimension 1, chars 94, and final character <F>, etc...
2920 Note (*): Although these designations are not allowed in ISO2022,
2921 Emacs accepts them on decoding, and produces them on encoding
2922 CHARS96 character sets in a coding system which is characterized as
2923 7-bit environment, non-locking-shift, and non-single-shift.
2925 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2926 '(' must be omitted. We refer to this as "short-form" hereafter.
2928 Now you may notice that there are a lot of ways of encoding the
2929 same multilingual text in ISO2022. Actually, there exist many
2930 coding systems such as Compound Text (used in X11's inter client
2931 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2932 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2933 localized platforms), and all of these are variants of ISO2022.
2935 In addition to the above, Emacs handles two more kinds of escape
2936 sequences: ISO6429's direction specification and Emacs' private
2937 sequence for specifying character composition.
2939 ISO6429's direction specification takes the following form:
2940 o CSI ']' -- end of the current direction
2941 o CSI '0' ']' -- end of the current direction
2942 o CSI '1' ']' -- start of left-to-right text
2943 o CSI '2' ']' -- start of right-to-left text
2944 The control character CSI (0x9B: control sequence introducer) is
2945 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2947 Character composition specification takes the following form:
2948 o ESC '0' -- start relative composition
2949 o ESC '1' -- end composition
2950 o ESC '2' -- start rule-base composition (*)
2951 o ESC '3' -- start relative composition with alternate chars (**)
2952 o ESC '4' -- start rule-base composition with alternate chars (**)
2953 Since these are not standard escape sequences of any ISO standard,
2954 the use of them with these meanings is restricted to Emacs only.
2956 (*) This form is used only in Emacs 20.7 and older versions,
2957 but newer versions can safely decode it.
2958 (**) This form is used only in Emacs 21.1 and newer versions,
2959 and older versions can't decode it.
2961 Here's a list of example usages of these composition escape
2962 sequences (categorized by `enum composition_method').
2964 COMPOSITION_RELATIVE:
2965 ESC 0 CHAR [ CHAR ] ESC 1
2966 COMPOSITION_WITH_RULE:
2967 ESC 2 CHAR [ RULE CHAR ] ESC 1
2968 COMPOSITION_WITH_ALTCHARS:
2969 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2970 COMPOSITION_WITH_RULE_ALTCHARS:
2971 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2973 enum iso_code_class_type iso_code_class[256];
2975 #define SAFE_CHARSET_P(coding, id) \
2976 ((id) <= (coding)->max_charset_id \
2977 && (coding)->safe_charsets[id] != 255)
2980 #define SHIFT_OUT_OK(category) \
2981 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2983 static void
2984 setup_iso_safe_charsets (attrs)
2985 Lisp_Object attrs;
2987 Lisp_Object charset_list, safe_charsets;
2988 Lisp_Object request;
2989 Lisp_Object reg_usage;
2990 Lisp_Object tail;
2991 int reg94, reg96;
2992 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2993 int max_charset_id;
2995 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2996 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2997 && ! EQ (charset_list, Viso_2022_charset_list))
2999 CODING_ATTR_CHARSET_LIST (attrs)
3000 = charset_list = Viso_2022_charset_list;
3001 ASET (attrs, coding_attr_safe_charsets, Qnil);
3004 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
3005 return;
3007 max_charset_id = 0;
3008 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3010 int id = XINT (XCAR (tail));
3011 if (max_charset_id < id)
3012 max_charset_id = id;
3015 safe_charsets = make_uninit_string (max_charset_id + 1);
3016 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
3017 request = AREF (attrs, coding_attr_iso_request);
3018 reg_usage = AREF (attrs, coding_attr_iso_usage);
3019 reg94 = XINT (XCAR (reg_usage));
3020 reg96 = XINT (XCDR (reg_usage));
3022 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3024 Lisp_Object id;
3025 Lisp_Object reg;
3026 struct charset *charset;
3028 id = XCAR (tail);
3029 charset = CHARSET_FROM_ID (XINT (id));
3030 reg = Fcdr (Fassq (id, request));
3031 if (! NILP (reg))
3032 SSET (safe_charsets, XINT (id), XINT (reg));
3033 else if (charset->iso_chars_96)
3035 if (reg96 < 4)
3036 SSET (safe_charsets, XINT (id), reg96);
3038 else
3040 if (reg94 < 4)
3041 SSET (safe_charsets, XINT (id), reg94);
3044 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3048 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3049 Check if a text is encoded in one of ISO-2022 based codig systems.
3050 If it is, return 1, else return 0. */
3052 static int
3053 detect_coding_iso_2022 (coding, detect_info)
3054 struct coding_system *coding;
3055 struct coding_detection_info *detect_info;
3057 const unsigned char *src = coding->source, *src_base = src;
3058 const unsigned char *src_end = coding->source + coding->src_bytes;
3059 int multibytep = coding->src_multibyte;
3060 int single_shifting = 0;
3061 int id;
3062 int c, c1;
3063 int consumed_chars = 0;
3064 int i;
3065 int rejected = 0;
3066 int found = 0;
3067 int composition_count = -1;
3069 detect_info->checked |= CATEGORY_MASK_ISO;
3071 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3073 struct coding_system *this = &(coding_categories[i]);
3074 Lisp_Object attrs, val;
3076 if (this->id < 0)
3077 continue;
3078 attrs = CODING_ID_ATTRS (this->id);
3079 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3080 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3081 setup_iso_safe_charsets (attrs);
3082 val = CODING_ATTR_SAFE_CHARSETS (attrs);
3083 this->max_charset_id = SCHARS (val) - 1;
3084 this->safe_charsets = SDATA (val);
3087 /* A coding system of this category is always ASCII compatible. */
3088 src += coding->head_ascii;
3090 while (rejected != CATEGORY_MASK_ISO)
3092 src_base = src;
3093 ONE_MORE_BYTE (c);
3094 switch (c)
3096 case ISO_CODE_ESC:
3097 if (inhibit_iso_escape_detection)
3098 break;
3099 single_shifting = 0;
3100 ONE_MORE_BYTE (c);
3101 if (c >= '(' && c <= '/')
3103 /* Designation sequence for a charset of dimension 1. */
3104 ONE_MORE_BYTE (c1);
3105 if (c1 < ' ' || c1 >= 0x80
3106 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3107 /* Invalid designation sequence. Just ignore. */
3108 break;
3110 else if (c == '$')
3112 /* Designation sequence for a charset of dimension 2. */
3113 ONE_MORE_BYTE (c);
3114 if (c >= '@' && c <= 'B')
3115 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3116 id = iso_charset_table[1][0][c];
3117 else if (c >= '(' && c <= '/')
3119 ONE_MORE_BYTE (c1);
3120 if (c1 < ' ' || c1 >= 0x80
3121 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3122 /* Invalid designation sequence. Just ignore. */
3123 break;
3125 else
3126 /* Invalid designation sequence. Just ignore it. */
3127 break;
3129 else if (c == 'N' || c == 'O')
3131 /* ESC <Fe> for SS2 or SS3. */
3132 single_shifting = 1;
3133 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3134 break;
3136 else if (c == '1')
3138 /* End of composition. */
3139 if (composition_count < 0
3140 || composition_count > MAX_COMPOSITION_COMPONENTS)
3141 /* Invalid */
3142 break;
3143 composition_count = -1;
3144 found |= CATEGORY_MASK_ISO;
3146 else if (c >= '0' && c <= '4')
3148 /* ESC <Fp> for start/end composition. */
3149 composition_count = 0;
3150 break;
3152 else
3154 /* Invalid escape sequence. Just ignore it. */
3155 break;
3158 /* We found a valid designation sequence for CHARSET. */
3159 rejected |= CATEGORY_MASK_ISO_8BIT;
3160 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3161 id))
3162 found |= CATEGORY_MASK_ISO_7;
3163 else
3164 rejected |= CATEGORY_MASK_ISO_7;
3165 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3166 id))
3167 found |= CATEGORY_MASK_ISO_7_TIGHT;
3168 else
3169 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3170 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3171 id))
3172 found |= CATEGORY_MASK_ISO_7_ELSE;
3173 else
3174 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3175 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3176 id))
3177 found |= CATEGORY_MASK_ISO_8_ELSE;
3178 else
3179 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3180 break;
3182 case ISO_CODE_SO:
3183 case ISO_CODE_SI:
3184 /* Locking shift out/in. */
3185 if (inhibit_iso_escape_detection)
3186 break;
3187 single_shifting = 0;
3188 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3189 break;
3191 case ISO_CODE_CSI:
3192 /* Control sequence introducer. */
3193 single_shifting = 0;
3194 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3195 found |= CATEGORY_MASK_ISO_8_ELSE;
3196 goto check_extra_latin;
3198 case ISO_CODE_SS2:
3199 case ISO_CODE_SS3:
3200 /* Single shift. */
3201 if (inhibit_iso_escape_detection)
3202 break;
3203 single_shifting = 0;
3204 rejected |= CATEGORY_MASK_ISO_7BIT;
3205 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3206 & CODING_ISO_FLAG_SINGLE_SHIFT)
3207 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3208 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3209 & CODING_ISO_FLAG_SINGLE_SHIFT)
3210 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3211 if (single_shifting)
3212 break;
3213 goto check_extra_latin;
3215 default:
3216 if (c < 0)
3217 continue;
3218 if (c < 0x80)
3220 if (composition_count >= 0)
3221 composition_count++;
3222 single_shifting = 0;
3223 break;
3225 if (c >= 0xA0)
3227 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3228 found |= CATEGORY_MASK_ISO_8_1;
3229 /* Check the length of succeeding codes of the range
3230 0xA0..0FF. If the byte length is even, we include
3231 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3232 only when we are not single shifting. */
3233 if (! single_shifting
3234 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3236 int i = 1;
3237 while (src < src_end)
3239 ONE_MORE_BYTE (c);
3240 if (c < 0xA0)
3241 break;
3242 i++;
3245 if (i & 1 && src < src_end)
3247 rejected |= CATEGORY_MASK_ISO_8_2;
3248 if (composition_count >= 0)
3249 composition_count += i;
3251 else
3253 found |= CATEGORY_MASK_ISO_8_2;
3254 if (composition_count >= 0)
3255 composition_count += i / 2;
3258 break;
3260 check_extra_latin:
3261 single_shifting = 0;
3262 if (! VECTORP (Vlatin_extra_code_table)
3263 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3265 rejected = CATEGORY_MASK_ISO;
3266 break;
3268 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3269 & CODING_ISO_FLAG_LATIN_EXTRA)
3270 found |= CATEGORY_MASK_ISO_8_1;
3271 else
3272 rejected |= CATEGORY_MASK_ISO_8_1;
3273 rejected |= CATEGORY_MASK_ISO_8_2;
3276 detect_info->rejected |= CATEGORY_MASK_ISO;
3277 return 0;
3279 no_more_source:
3280 detect_info->rejected |= rejected;
3281 detect_info->found |= (found & ~rejected);
3282 return 1;
3286 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3287 escape sequence should be kept. */
3288 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3289 do { \
3290 int id, prev; \
3292 if (final < '0' || final >= 128 \
3293 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3294 || !SAFE_CHARSET_P (coding, id)) \
3296 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3297 chars_96 = -1; \
3298 break; \
3300 prev = CODING_ISO_DESIGNATION (coding, reg); \
3301 if (id == charset_jisx0201_roman) \
3303 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3304 id = charset_ascii; \
3306 else if (id == charset_jisx0208_1978) \
3308 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3309 id = charset_jisx0208; \
3311 CODING_ISO_DESIGNATION (coding, reg) = id; \
3312 /* If there was an invalid designation to REG previously, and this \
3313 designation is ASCII to REG, we should keep this designation \
3314 sequence. */ \
3315 if (prev == -2 && id == charset_ascii) \
3316 chars_96 = -1; \
3317 } while (0)
3320 /* Handle these composition sequence (ALT: alternate char):
3322 (1) relative composition: ESC 0 CHAR ... ESC 1
3323 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3324 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3325 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3327 When the start sequence (ESC 0/2/3/4) is found, this annotation
3328 header is produced.
3330 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3332 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3333 produced until the end sequence (ESC 1) is found:
3335 (1) CHAR ... CHAR
3336 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3337 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3338 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3340 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3341 annotation header is updated as below:
3343 (1) LENGTH: unchanged, NCHARS: number of CHARs
3344 (2) LENGTH: unchanged, NCHARS: number of CHARs
3345 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3346 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3348 If an error is found while composing, the annotation header is
3349 changed to:
3351 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3353 and the sequence [ -2 DECODED-RULE ] is changed to the original
3354 byte sequence as below:
3355 o the original byte sequence is B: [ B -1 ]
3356 o the original byte sequence is B1 B2: [ B1 B2 ]
3357 and the sequence [ -1 -1 ] is changed to the original byte
3358 sequence:
3359 [ ESC '0' ]
3362 /* Decode a composition rule C1 and maybe one more byte from the
3363 source, and set RULE to the encoded composition rule, NBYTES to the
3364 length of the composition rule. If the rule is invalid, set RULE
3365 to some negative value. */
3367 #define DECODE_COMPOSITION_RULE(rule, nbytes) \
3368 do { \
3369 rule = c1 - 32; \
3370 if (rule < 0) \
3371 break; \
3372 if (rule < 81) /* old format (before ver.21) */ \
3374 int gref = (rule) / 9; \
3375 int nref = (rule) % 9; \
3376 if (gref == 4) gref = 10; \
3377 if (nref == 4) nref = 10; \
3378 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3379 nbytes = 1; \
3381 else /* new format (after ver.21) */ \
3383 int c; \
3385 ONE_MORE_BYTE (c); \
3386 rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32); \
3387 if (rule >= 0) \
3388 rule += 0x100; /* to destinguish it from the old format */ \
3389 nbytes = 2; \
3391 } while (0)
3393 #define ENCODE_COMPOSITION_RULE(rule) \
3394 do { \
3395 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3397 if (rule < 0x100) /* old format */ \
3399 if (gref == 10) gref = 4; \
3400 if (nref == 10) nref = 4; \
3401 charbuf[idx] = 32 + gref * 9 + nref; \
3402 charbuf[idx + 1] = -1; \
3403 new_chars++; \
3405 else /* new format */ \
3407 charbuf[idx] = 32 + 81 + gref; \
3408 charbuf[idx + 1] = 32 + nref; \
3409 new_chars += 2; \
3411 } while (0)
3413 /* Finish the current composition as invalid. */
3415 static int finish_composition P_ ((int *, struct composition_status *));
3417 static int
3418 finish_composition (charbuf, cmp_status)
3419 int *charbuf;
3420 struct composition_status *cmp_status;
3422 int idx = - cmp_status->length;
3423 int new_chars;
3425 /* Recover the original ESC sequence */
3426 charbuf[idx++] = ISO_CODE_ESC;
3427 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3428 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3429 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3430 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3431 : '4');
3432 charbuf[idx++] = -2;
3433 charbuf[idx++] = 0;
3434 charbuf[idx++] = -1;
3435 new_chars = cmp_status->nchars;
3436 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3437 for (; idx < 0; idx++)
3439 int elt = charbuf[idx];
3441 if (elt == -2)
3443 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3444 idx++;
3446 else if (elt == -1)
3448 charbuf[idx++] = ISO_CODE_ESC;
3449 charbuf[idx] = '0';
3450 new_chars += 2;
3453 cmp_status->state = COMPOSING_NO;
3454 return new_chars;
3457 /* If characers are under composition, finish the composition. */
3458 #define MAYBE_FINISH_COMPOSITION() \
3459 do { \
3460 if (cmp_status->state != COMPOSING_NO) \
3461 char_offset += finish_composition (charbuf, cmp_status); \
3462 } while (0)
3464 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3466 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3467 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3468 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3469 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3471 Produce this annotation sequence now:
3473 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3476 #define DECODE_COMPOSITION_START(c1) \
3477 do { \
3478 if (c1 == '0' \
3479 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3480 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3481 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3482 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3484 *charbuf++ = -1; \
3485 *charbuf++= -1; \
3486 cmp_status->state = COMPOSING_CHAR; \
3487 cmp_status->length += 2; \
3489 else \
3491 MAYBE_FINISH_COMPOSITION (); \
3492 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3493 : c1 == '2' ? COMPOSITION_WITH_RULE \
3494 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3495 : COMPOSITION_WITH_RULE_ALTCHARS); \
3496 cmp_status->state \
3497 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3498 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3499 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3500 cmp_status->nchars = cmp_status->ncomps = 0; \
3501 coding->annotated = 1; \
3503 } while (0)
3506 /* Handle composition end sequence ESC 1. */
3508 #define DECODE_COMPOSITION_END() \
3509 do { \
3510 if (cmp_status->nchars == 0 \
3511 || ((cmp_status->state == COMPOSING_CHAR) \
3512 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3514 MAYBE_FINISH_COMPOSITION (); \
3515 goto invalid_code; \
3517 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3518 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3519 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3520 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3521 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3522 char_offset += cmp_status->nchars; \
3523 cmp_status->state = COMPOSING_NO; \
3524 } while (0)
3526 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3528 #define STORE_COMPOSITION_RULE(rule) \
3529 do { \
3530 *charbuf++ = -2; \
3531 *charbuf++ = rule; \
3532 cmp_status->length += 2; \
3533 cmp_status->state--; \
3534 } while (0)
3536 /* Store a composed char or a component char C in charbuf, and update
3537 cmp_status. */
3539 #define STORE_COMPOSITION_CHAR(c) \
3540 do { \
3541 *charbuf++ = (c); \
3542 cmp_status->length++; \
3543 if (cmp_status->state == COMPOSING_CHAR) \
3544 cmp_status->nchars++; \
3545 else \
3546 cmp_status->ncomps++; \
3547 if (cmp_status->method == COMPOSITION_WITH_RULE \
3548 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3549 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3550 cmp_status->state++; \
3551 } while (0)
3554 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3556 static void
3557 decode_coding_iso_2022 (coding)
3558 struct coding_system *coding;
3560 const unsigned char *src = coding->source + coding->consumed;
3561 const unsigned char *src_end = coding->source + coding->src_bytes;
3562 const unsigned char *src_base;
3563 int *charbuf = coding->charbuf + coding->charbuf_used;
3564 /* We may produce two annocations (charset and composition) in one
3565 loop and one more charset annocation at the end. */
3566 int *charbuf_end
3567 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3568 int consumed_chars = 0, consumed_chars_base;
3569 int multibytep = coding->src_multibyte;
3570 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3571 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3572 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3573 int charset_id_2, charset_id_3;
3574 struct charset *charset;
3575 int c;
3576 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3577 Lisp_Object attrs, charset_list;
3578 int char_offset = coding->produced_char;
3579 int last_offset = char_offset;
3580 int last_id = charset_ascii;
3581 int eol_crlf =
3582 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3583 int byte_after_cr = -1;
3584 int i;
3586 CODING_GET_INFO (coding, attrs, charset_list);
3587 setup_iso_safe_charsets (attrs);
3588 /* Charset list may have been changed. */
3589 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3590 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3592 if (cmp_status->state != COMPOSING_NO)
3594 for (i = 0; i < cmp_status->length; i++)
3595 *charbuf++ = cmp_status->carryover[i];
3596 coding->annotated = 1;
3599 while (1)
3601 int c1, c2;
3603 src_base = src;
3604 consumed_chars_base = consumed_chars;
3606 if (charbuf >= charbuf_end)
3608 if (byte_after_cr >= 0)
3609 src_base--;
3610 break;
3613 if (byte_after_cr >= 0)
3614 c1 = byte_after_cr, byte_after_cr = -1;
3615 else
3616 ONE_MORE_BYTE (c1);
3617 if (c1 < 0)
3618 goto invalid_code;
3620 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3622 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3623 char_offset++;
3624 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3625 continue;
3628 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3630 if (c1 == ISO_CODE_ESC)
3632 if (src + 1 >= src_end)
3633 goto no_more_source;
3634 *charbuf++ = ISO_CODE_ESC;
3635 char_offset++;
3636 if (src[0] == '%' && src[1] == '@')
3638 src += 2;
3639 consumed_chars += 2;
3640 char_offset += 2;
3641 /* We are sure charbuf can contain two more chars. */
3642 *charbuf++ = '%';
3643 *charbuf++ = '@';
3644 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3647 else
3649 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3650 char_offset++;
3652 continue;
3655 if ((cmp_status->state == COMPOSING_RULE
3656 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3657 && c1 != ISO_CODE_ESC)
3659 int rule, nbytes;
3661 DECODE_COMPOSITION_RULE (rule, nbytes);
3662 if (rule < 0)
3663 goto invalid_code;
3664 STORE_COMPOSITION_RULE (rule);
3665 continue;
3668 /* We produce at most one character. */
3669 switch (iso_code_class [c1])
3671 case ISO_0x20_or_0x7F:
3672 if (charset_id_0 < 0
3673 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3674 /* This is SPACE or DEL. */
3675 charset = CHARSET_FROM_ID (charset_ascii);
3676 else
3677 charset = CHARSET_FROM_ID (charset_id_0);
3678 break;
3680 case ISO_graphic_plane_0:
3681 if (charset_id_0 < 0)
3682 charset = CHARSET_FROM_ID (charset_ascii);
3683 else
3684 charset = CHARSET_FROM_ID (charset_id_0);
3685 break;
3687 case ISO_0xA0_or_0xFF:
3688 if (charset_id_1 < 0
3689 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3690 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3691 goto invalid_code;
3692 /* This is a graphic character, we fall down ... */
3694 case ISO_graphic_plane_1:
3695 if (charset_id_1 < 0)
3696 goto invalid_code;
3697 charset = CHARSET_FROM_ID (charset_id_1);
3698 break;
3700 case ISO_control_0:
3701 if (eol_crlf && c1 == '\r')
3702 ONE_MORE_BYTE (byte_after_cr);
3703 MAYBE_FINISH_COMPOSITION ();
3704 charset = CHARSET_FROM_ID (charset_ascii);
3705 break;
3707 case ISO_control_1:
3708 goto invalid_code;
3710 case ISO_shift_out:
3711 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3712 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3713 goto invalid_code;
3714 CODING_ISO_INVOCATION (coding, 0) = 1;
3715 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3716 continue;
3718 case ISO_shift_in:
3719 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3720 goto invalid_code;
3721 CODING_ISO_INVOCATION (coding, 0) = 0;
3722 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3723 continue;
3725 case ISO_single_shift_2_7:
3726 case ISO_single_shift_2:
3727 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3728 goto invalid_code;
3729 /* SS2 is handled as an escape sequence of ESC 'N' */
3730 c1 = 'N';
3731 goto label_escape_sequence;
3733 case ISO_single_shift_3:
3734 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3735 goto invalid_code;
3736 /* SS2 is handled as an escape sequence of ESC 'O' */
3737 c1 = 'O';
3738 goto label_escape_sequence;
3740 case ISO_control_sequence_introducer:
3741 /* CSI is handled as an escape sequence of ESC '[' ... */
3742 c1 = '[';
3743 goto label_escape_sequence;
3745 case ISO_escape:
3746 ONE_MORE_BYTE (c1);
3747 label_escape_sequence:
3748 /* Escape sequences handled here are invocation,
3749 designation, direction specification, and character
3750 composition specification. */
3751 switch (c1)
3753 case '&': /* revision of following character set */
3754 ONE_MORE_BYTE (c1);
3755 if (!(c1 >= '@' && c1 <= '~'))
3756 goto invalid_code;
3757 ONE_MORE_BYTE (c1);
3758 if (c1 != ISO_CODE_ESC)
3759 goto invalid_code;
3760 ONE_MORE_BYTE (c1);
3761 goto label_escape_sequence;
3763 case '$': /* designation of 2-byte character set */
3764 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3765 goto invalid_code;
3767 int reg, chars96;
3769 ONE_MORE_BYTE (c1);
3770 if (c1 >= '@' && c1 <= 'B')
3771 { /* designation of JISX0208.1978, GB2312.1980,
3772 or JISX0208.1980 */
3773 reg = 0, chars96 = 0;
3775 else if (c1 >= 0x28 && c1 <= 0x2B)
3776 { /* designation of DIMENSION2_CHARS94 character set */
3777 reg = c1 - 0x28, chars96 = 0;
3778 ONE_MORE_BYTE (c1);
3780 else if (c1 >= 0x2C && c1 <= 0x2F)
3781 { /* designation of DIMENSION2_CHARS96 character set */
3782 reg = c1 - 0x2C, chars96 = 1;
3783 ONE_MORE_BYTE (c1);
3785 else
3786 goto invalid_code;
3787 DECODE_DESIGNATION (reg, 2, chars96, c1);
3788 /* We must update these variables now. */
3789 if (reg == 0)
3790 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3791 else if (reg == 1)
3792 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3793 if (chars96 < 0)
3794 goto invalid_code;
3796 continue;
3798 case 'n': /* invocation of locking-shift-2 */
3799 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3800 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3801 goto invalid_code;
3802 CODING_ISO_INVOCATION (coding, 0) = 2;
3803 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3804 continue;
3806 case 'o': /* invocation of locking-shift-3 */
3807 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3808 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3809 goto invalid_code;
3810 CODING_ISO_INVOCATION (coding, 0) = 3;
3811 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3812 continue;
3814 case 'N': /* invocation of single-shift-2 */
3815 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3816 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3817 goto invalid_code;
3818 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3819 if (charset_id_2 < 0)
3820 charset = CHARSET_FROM_ID (charset_ascii);
3821 else
3822 charset = CHARSET_FROM_ID (charset_id_2);
3823 ONE_MORE_BYTE (c1);
3824 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3825 goto invalid_code;
3826 break;
3828 case 'O': /* invocation of single-shift-3 */
3829 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3830 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3831 goto invalid_code;
3832 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3833 if (charset_id_3 < 0)
3834 charset = CHARSET_FROM_ID (charset_ascii);
3835 else
3836 charset = CHARSET_FROM_ID (charset_id_3);
3837 ONE_MORE_BYTE (c1);
3838 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3839 goto invalid_code;
3840 break;
3842 case '0': case '2': case '3': case '4': /* start composition */
3843 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3844 goto invalid_code;
3845 if (last_id != charset_ascii)
3847 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3848 last_id = charset_ascii;
3849 last_offset = char_offset;
3851 DECODE_COMPOSITION_START (c1);
3852 continue;
3854 case '1': /* end composition */
3855 if (cmp_status->state == COMPOSING_NO)
3856 goto invalid_code;
3857 DECODE_COMPOSITION_END ();
3858 continue;
3860 case '[': /* specification of direction */
3861 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3862 goto invalid_code;
3863 /* For the moment, nested direction is not supported.
3864 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3865 left-to-right, and nozero means right-to-left. */
3866 ONE_MORE_BYTE (c1);
3867 switch (c1)
3869 case ']': /* end of the current direction */
3870 coding->mode &= ~CODING_MODE_DIRECTION;
3872 case '0': /* end of the current direction */
3873 case '1': /* start of left-to-right direction */
3874 ONE_MORE_BYTE (c1);
3875 if (c1 == ']')
3876 coding->mode &= ~CODING_MODE_DIRECTION;
3877 else
3878 goto invalid_code;
3879 break;
3881 case '2': /* start of right-to-left direction */
3882 ONE_MORE_BYTE (c1);
3883 if (c1 == ']')
3884 coding->mode |= CODING_MODE_DIRECTION;
3885 else
3886 goto invalid_code;
3887 break;
3889 default:
3890 goto invalid_code;
3892 continue;
3894 case '%':
3895 ONE_MORE_BYTE (c1);
3896 if (c1 == '/')
3898 /* CTEXT extended segment:
3899 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3900 We keep these bytes as is for the moment.
3901 They may be decoded by post-read-conversion. */
3902 int dim, M, L;
3903 int size;
3905 ONE_MORE_BYTE (dim);
3906 if (dim < 0 || dim > 4)
3907 goto invalid_code;
3908 ONE_MORE_BYTE (M);
3909 if (M < 128)
3910 goto invalid_code;
3911 ONE_MORE_BYTE (L);
3912 if (L < 128)
3913 goto invalid_code;
3914 size = ((M - 128) * 128) + (L - 128);
3915 if (charbuf + 6 > charbuf_end)
3916 goto break_loop;
3917 *charbuf++ = ISO_CODE_ESC;
3918 *charbuf++ = '%';
3919 *charbuf++ = '/';
3920 *charbuf++ = dim;
3921 *charbuf++ = BYTE8_TO_CHAR (M);
3922 *charbuf++ = BYTE8_TO_CHAR (L);
3923 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3925 else if (c1 == 'G')
3927 /* XFree86 extension for embedding UTF-8 in CTEXT:
3928 ESC % G --UTF-8-BYTES-- ESC % @
3929 We keep these bytes as is for the moment.
3930 They may be decoded by post-read-conversion. */
3931 if (charbuf + 3 > charbuf_end)
3932 goto break_loop;
3933 *charbuf++ = ISO_CODE_ESC;
3934 *charbuf++ = '%';
3935 *charbuf++ = 'G';
3936 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3938 else
3939 goto invalid_code;
3940 continue;
3941 break;
3943 default:
3944 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3945 goto invalid_code;
3947 int reg, chars96;
3949 if (c1 >= 0x28 && c1 <= 0x2B)
3950 { /* designation of DIMENSION1_CHARS94 character set */
3951 reg = c1 - 0x28, chars96 = 0;
3952 ONE_MORE_BYTE (c1);
3954 else if (c1 >= 0x2C && c1 <= 0x2F)
3955 { /* designation of DIMENSION1_CHARS96 character set */
3956 reg = c1 - 0x2C, chars96 = 1;
3957 ONE_MORE_BYTE (c1);
3959 else
3960 goto invalid_code;
3961 DECODE_DESIGNATION (reg, 1, chars96, c1);
3962 /* We must update these variables now. */
3963 if (reg == 0)
3964 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3965 else if (reg == 1)
3966 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3967 if (chars96 < 0)
3968 goto invalid_code;
3970 continue;
3974 if (cmp_status->state == COMPOSING_NO
3975 && charset->id != charset_ascii
3976 && last_id != charset->id)
3978 if (last_id != charset_ascii)
3979 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3980 last_id = charset->id;
3981 last_offset = char_offset;
3984 /* Now we know CHARSET and 1st position code C1 of a character.
3985 Produce a decoded character while getting 2nd position code
3986 C2 if necessary. */
3987 c1 &= 0x7F;
3988 if (CHARSET_DIMENSION (charset) > 1)
3990 ONE_MORE_BYTE (c2);
3991 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3992 /* C2 is not in a valid range. */
3993 goto invalid_code;
3994 c1 = (c1 << 8) | (c2 & 0x7F);
3995 if (CHARSET_DIMENSION (charset) > 2)
3997 ONE_MORE_BYTE (c2);
3998 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3999 /* C2 is not in a valid range. */
4000 goto invalid_code;
4001 c1 = (c1 << 8) | (c2 & 0x7F);
4005 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4006 if (c < 0)
4008 MAYBE_FINISH_COMPOSITION ();
4009 for (; src_base < src; src_base++, char_offset++)
4011 if (ASCII_BYTE_P (*src_base))
4012 *charbuf++ = *src_base;
4013 else
4014 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4017 else if (cmp_status->state == COMPOSING_NO)
4019 *charbuf++ = c;
4020 char_offset++;
4022 else if ((cmp_status->state == COMPOSING_CHAR
4023 ? cmp_status->nchars
4024 : cmp_status->ncomps)
4025 >= MAX_COMPOSITION_COMPONENTS)
4027 /* Too long composition. */
4028 MAYBE_FINISH_COMPOSITION ();
4029 *charbuf++ = c;
4030 char_offset++;
4032 else
4033 STORE_COMPOSITION_CHAR (c);
4034 continue;
4036 invalid_code:
4037 MAYBE_FINISH_COMPOSITION ();
4038 src = src_base;
4039 consumed_chars = consumed_chars_base;
4040 ONE_MORE_BYTE (c);
4041 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4042 char_offset++;
4043 coding->errors++;
4044 continue;
4046 break_loop:
4047 break;
4050 no_more_source:
4051 if (cmp_status->state != COMPOSING_NO)
4053 if (coding->mode & CODING_MODE_LAST_BLOCK)
4054 MAYBE_FINISH_COMPOSITION ();
4055 else
4057 charbuf -= cmp_status->length;
4058 for (i = 0; i < cmp_status->length; i++)
4059 cmp_status->carryover[i] = charbuf[i];
4062 else if (last_id != charset_ascii)
4063 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4064 coding->consumed_char += consumed_chars_base;
4065 coding->consumed = src_base - coding->source;
4066 coding->charbuf_used = charbuf - coding->charbuf;
4070 /* ISO2022 encoding stuff. */
4073 It is not enough to say just "ISO2022" on encoding, we have to
4074 specify more details. In Emacs, each coding system of ISO2022
4075 variant has the following specifications:
4076 1. Initial designation to G0 thru G3.
4077 2. Allows short-form designation?
4078 3. ASCII should be designated to G0 before control characters?
4079 4. ASCII should be designated to G0 at end of line?
4080 5. 7-bit environment or 8-bit environment?
4081 6. Use locking-shift?
4082 7. Use Single-shift?
4083 And the following two are only for Japanese:
4084 8. Use ASCII in place of JIS0201-1976-Roman?
4085 9. Use JISX0208-1983 in place of JISX0208-1978?
4086 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4087 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
4088 details.
4091 /* Produce codes (escape sequence) for designating CHARSET to graphic
4092 register REG at DST, and increment DST. If <final-char> of CHARSET is
4093 '@', 'A', or 'B' and the coding system CODING allows, produce
4094 designation sequence of short-form. */
4096 #define ENCODE_DESIGNATION(charset, reg, coding) \
4097 do { \
4098 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
4099 char *intermediate_char_94 = "()*+"; \
4100 char *intermediate_char_96 = ",-./"; \
4101 int revision = -1; \
4102 int c; \
4104 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
4105 revision = CHARSET_ISO_REVISION (charset); \
4107 if (revision >= 0) \
4109 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
4110 EMIT_ONE_BYTE ('@' + revision); \
4112 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
4113 if (CHARSET_DIMENSION (charset) == 1) \
4115 if (! CHARSET_ISO_CHARS_96 (charset)) \
4116 c = intermediate_char_94[reg]; \
4117 else \
4118 c = intermediate_char_96[reg]; \
4119 EMIT_ONE_ASCII_BYTE (c); \
4121 else \
4123 EMIT_ONE_ASCII_BYTE ('$'); \
4124 if (! CHARSET_ISO_CHARS_96 (charset)) \
4126 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
4127 || reg != 0 \
4128 || final_char < '@' || final_char > 'B') \
4129 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
4131 else \
4132 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
4134 EMIT_ONE_ASCII_BYTE (final_char); \
4136 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
4137 } while (0)
4140 /* The following two macros produce codes (control character or escape
4141 sequence) for ISO2022 single-shift functions (single-shift-2 and
4142 single-shift-3). */
4144 #define ENCODE_SINGLE_SHIFT_2 \
4145 do { \
4146 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4147 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4148 else \
4149 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4150 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4151 } while (0)
4154 #define ENCODE_SINGLE_SHIFT_3 \
4155 do { \
4156 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4157 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4158 else \
4159 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4160 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4161 } while (0)
4164 /* The following four macros produce codes (control character or
4165 escape sequence) for ISO2022 locking-shift functions (shift-in,
4166 shift-out, locking-shift-2, and locking-shift-3). */
4168 #define ENCODE_SHIFT_IN \
4169 do { \
4170 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4171 CODING_ISO_INVOCATION (coding, 0) = 0; \
4172 } while (0)
4175 #define ENCODE_SHIFT_OUT \
4176 do { \
4177 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4178 CODING_ISO_INVOCATION (coding, 0) = 1; \
4179 } while (0)
4182 #define ENCODE_LOCKING_SHIFT_2 \
4183 do { \
4184 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4185 CODING_ISO_INVOCATION (coding, 0) = 2; \
4186 } while (0)
4189 #define ENCODE_LOCKING_SHIFT_3 \
4190 do { \
4191 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4192 CODING_ISO_INVOCATION (coding, 0) = 3; \
4193 } while (0)
4196 /* Produce codes for a DIMENSION1 character whose character set is
4197 CHARSET and whose position-code is C1. Designation and invocation
4198 sequences are also produced in advance if necessary. */
4200 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4201 do { \
4202 int id = CHARSET_ID (charset); \
4204 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4205 && id == charset_ascii) \
4207 id = charset_jisx0201_roman; \
4208 charset = CHARSET_FROM_ID (id); \
4211 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4213 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4214 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4215 else \
4216 EMIT_ONE_BYTE (c1 | 0x80); \
4217 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4218 break; \
4220 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4222 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4223 break; \
4225 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4227 EMIT_ONE_BYTE (c1 | 0x80); \
4228 break; \
4230 else \
4231 /* Since CHARSET is not yet invoked to any graphic planes, we \
4232 must invoke it, or, at first, designate it to some graphic \
4233 register. Then repeat the loop to actually produce the \
4234 character. */ \
4235 dst = encode_invocation_designation (charset, coding, dst, \
4236 &produced_chars); \
4237 } while (1)
4240 /* Produce codes for a DIMENSION2 character whose character set is
4241 CHARSET and whose position-codes are C1 and C2. Designation and
4242 invocation codes are also produced in advance if necessary. */
4244 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4245 do { \
4246 int id = CHARSET_ID (charset); \
4248 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4249 && id == charset_jisx0208) \
4251 id = charset_jisx0208_1978; \
4252 charset = CHARSET_FROM_ID (id); \
4255 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4257 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4258 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4259 else \
4260 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4261 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4262 break; \
4264 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4266 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4267 break; \
4269 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4271 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4272 break; \
4274 else \
4275 /* Since CHARSET is not yet invoked to any graphic planes, we \
4276 must invoke it, or, at first, designate it to some graphic \
4277 register. Then repeat the loop to actually produce the \
4278 character. */ \
4279 dst = encode_invocation_designation (charset, coding, dst, \
4280 &produced_chars); \
4281 } while (1)
4284 #define ENCODE_ISO_CHARACTER(charset, c) \
4285 do { \
4286 int code = ENCODE_CHAR ((charset),(c)); \
4288 if (CHARSET_DIMENSION (charset) == 1) \
4289 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4290 else \
4291 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4292 } while (0)
4295 /* Produce designation and invocation codes at a place pointed by DST
4296 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4297 Return new DST. */
4299 unsigned char *
4300 encode_invocation_designation (charset, coding, dst, p_nchars)
4301 struct charset *charset;
4302 struct coding_system *coding;
4303 unsigned char *dst;
4304 int *p_nchars;
4306 int multibytep = coding->dst_multibyte;
4307 int produced_chars = *p_nchars;
4308 int reg; /* graphic register number */
4309 int id = CHARSET_ID (charset);
4311 /* At first, check designations. */
4312 for (reg = 0; reg < 4; reg++)
4313 if (id == CODING_ISO_DESIGNATION (coding, reg))
4314 break;
4316 if (reg >= 4)
4318 /* CHARSET is not yet designated to any graphic registers. */
4319 /* At first check the requested designation. */
4320 reg = CODING_ISO_REQUEST (coding, id);
4321 if (reg < 0)
4322 /* Since CHARSET requests no special designation, designate it
4323 to graphic register 0. */
4324 reg = 0;
4326 ENCODE_DESIGNATION (charset, reg, coding);
4329 if (CODING_ISO_INVOCATION (coding, 0) != reg
4330 && CODING_ISO_INVOCATION (coding, 1) != reg)
4332 /* Since the graphic register REG is not invoked to any graphic
4333 planes, invoke it to graphic plane 0. */
4334 switch (reg)
4336 case 0: /* graphic register 0 */
4337 ENCODE_SHIFT_IN;
4338 break;
4340 case 1: /* graphic register 1 */
4341 ENCODE_SHIFT_OUT;
4342 break;
4344 case 2: /* graphic register 2 */
4345 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4346 ENCODE_SINGLE_SHIFT_2;
4347 else
4348 ENCODE_LOCKING_SHIFT_2;
4349 break;
4351 case 3: /* graphic register 3 */
4352 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4353 ENCODE_SINGLE_SHIFT_3;
4354 else
4355 ENCODE_LOCKING_SHIFT_3;
4356 break;
4360 *p_nchars = produced_chars;
4361 return dst;
4364 /* The following three macros produce codes for indicating direction
4365 of text. */
4366 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
4367 do { \
4368 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
4369 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
4370 else \
4371 EMIT_ONE_BYTE (ISO_CODE_CSI); \
4372 } while (0)
4375 #define ENCODE_DIRECTION_R2L() \
4376 do { \
4377 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
4378 EMIT_TWO_ASCII_BYTES ('2', ']'); \
4379 } while (0)
4382 #define ENCODE_DIRECTION_L2R() \
4383 do { \
4384 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
4385 EMIT_TWO_ASCII_BYTES ('0', ']'); \
4386 } while (0)
4389 /* Produce codes for designation and invocation to reset the graphic
4390 planes and registers to initial state. */
4391 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4392 do { \
4393 int reg; \
4394 struct charset *charset; \
4396 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4397 ENCODE_SHIFT_IN; \
4398 for (reg = 0; reg < 4; reg++) \
4399 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4400 && (CODING_ISO_DESIGNATION (coding, reg) \
4401 != CODING_ISO_INITIAL (coding, reg))) \
4403 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4404 ENCODE_DESIGNATION (charset, reg, coding); \
4406 } while (0)
4409 /* Produce designation sequences of charsets in the line started from
4410 SRC to a place pointed by DST, and return updated DST.
4412 If the current block ends before any end-of-line, we may fail to
4413 find all the necessary designations. */
4415 static unsigned char *
4416 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
4417 struct coding_system *coding;
4418 int *charbuf, *charbuf_end;
4419 unsigned char *dst;
4421 struct charset *charset;
4422 /* Table of charsets to be designated to each graphic register. */
4423 int r[4];
4424 int c, found = 0, reg;
4425 int produced_chars = 0;
4426 int multibytep = coding->dst_multibyte;
4427 Lisp_Object attrs;
4428 Lisp_Object charset_list;
4430 attrs = CODING_ID_ATTRS (coding->id);
4431 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4432 if (EQ (charset_list, Qiso_2022))
4433 charset_list = Viso_2022_charset_list;
4435 for (reg = 0; reg < 4; reg++)
4436 r[reg] = -1;
4438 while (found < 4)
4440 int id;
4442 c = *charbuf++;
4443 if (c == '\n')
4444 break;
4445 charset = char_charset (c, charset_list, NULL);
4446 id = CHARSET_ID (charset);
4447 reg = CODING_ISO_REQUEST (coding, id);
4448 if (reg >= 0 && r[reg] < 0)
4450 found++;
4451 r[reg] = id;
4455 if (found)
4457 for (reg = 0; reg < 4; reg++)
4458 if (r[reg] >= 0
4459 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4460 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4463 return dst;
4466 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4468 static int
4469 encode_coding_iso_2022 (coding)
4470 struct coding_system *coding;
4472 int multibytep = coding->dst_multibyte;
4473 int *charbuf = coding->charbuf;
4474 int *charbuf_end = charbuf + coding->charbuf_used;
4475 unsigned char *dst = coding->destination + coding->produced;
4476 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4477 int safe_room = 16;
4478 int bol_designation
4479 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4480 && CODING_ISO_BOL (coding));
4481 int produced_chars = 0;
4482 Lisp_Object attrs, eol_type, charset_list;
4483 int ascii_compatible;
4484 int c;
4485 int preferred_charset_id = -1;
4487 CODING_GET_INFO (coding, attrs, charset_list);
4488 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4489 if (VECTORP (eol_type))
4490 eol_type = Qunix;
4492 setup_iso_safe_charsets (attrs);
4493 /* Charset list may have been changed. */
4494 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4495 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4497 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4499 while (charbuf < charbuf_end)
4501 ASSURE_DESTINATION (safe_room);
4503 if (bol_designation)
4505 unsigned char *dst_prev = dst;
4507 /* We have to produce designation sequences if any now. */
4508 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4509 bol_designation = 0;
4510 /* We are sure that designation sequences are all ASCII bytes. */
4511 produced_chars += dst - dst_prev;
4514 c = *charbuf++;
4516 if (c < 0)
4518 /* Handle an annotation. */
4519 switch (*charbuf)
4521 case CODING_ANNOTATE_COMPOSITION_MASK:
4522 /* Not yet implemented. */
4523 break;
4524 case CODING_ANNOTATE_CHARSET_MASK:
4525 preferred_charset_id = charbuf[2];
4526 if (preferred_charset_id >= 0
4527 && NILP (Fmemq (make_number (preferred_charset_id),
4528 charset_list)))
4529 preferred_charset_id = -1;
4530 break;
4531 default:
4532 abort ();
4534 charbuf += -c - 1;
4535 continue;
4538 /* Now encode the character C. */
4539 if (c < 0x20 || c == 0x7F)
4541 if (c == '\n'
4542 || (c == '\r' && EQ (eol_type, Qmac)))
4544 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4545 ENCODE_RESET_PLANE_AND_REGISTER ();
4546 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4548 int i;
4550 for (i = 0; i < 4; i++)
4551 CODING_ISO_DESIGNATION (coding, i)
4552 = CODING_ISO_INITIAL (coding, i);
4554 bol_designation
4555 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4557 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4558 ENCODE_RESET_PLANE_AND_REGISTER ();
4559 EMIT_ONE_ASCII_BYTE (c);
4561 else if (ASCII_CHAR_P (c))
4563 if (ascii_compatible)
4564 EMIT_ONE_ASCII_BYTE (c);
4565 else
4567 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4568 ENCODE_ISO_CHARACTER (charset, c);
4571 else if (CHAR_BYTE8_P (c))
4573 c = CHAR_TO_BYTE8 (c);
4574 EMIT_ONE_BYTE (c);
4576 else
4578 struct charset *charset;
4580 if (preferred_charset_id >= 0)
4582 charset = CHARSET_FROM_ID (preferred_charset_id);
4583 if (! CHAR_CHARSET_P (c, charset))
4584 charset = char_charset (c, charset_list, NULL);
4586 else
4587 charset = char_charset (c, charset_list, NULL);
4588 if (!charset)
4590 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4592 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4593 charset = CHARSET_FROM_ID (charset_ascii);
4595 else
4597 c = coding->default_char;
4598 charset = char_charset (c, charset_list, NULL);
4601 ENCODE_ISO_CHARACTER (charset, c);
4605 if (coding->mode & CODING_MODE_LAST_BLOCK
4606 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4608 ASSURE_DESTINATION (safe_room);
4609 ENCODE_RESET_PLANE_AND_REGISTER ();
4611 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4612 CODING_ISO_BOL (coding) = bol_designation;
4613 coding->produced_char += produced_chars;
4614 coding->produced = dst - coding->destination;
4615 return 0;
4619 /*** 8,9. SJIS and BIG5 handlers ***/
4621 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4622 quite widely. So, for the moment, Emacs supports them in the bare
4623 C code. But, in the future, they may be supported only by CCL. */
4625 /* SJIS is a coding system encoding three character sets: ASCII, right
4626 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4627 as is. A character of charset katakana-jisx0201 is encoded by
4628 "position-code + 0x80". A character of charset japanese-jisx0208
4629 is encoded in 2-byte but two position-codes are divided and shifted
4630 so that it fit in the range below.
4632 --- CODE RANGE of SJIS ---
4633 (character set) (range)
4634 ASCII 0x00 .. 0x7F
4635 KATAKANA-JISX0201 0xA0 .. 0xDF
4636 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4637 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4638 -------------------------------
4642 /* BIG5 is a coding system encoding two character sets: ASCII and
4643 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4644 character set and is encoded in two-byte.
4646 --- CODE RANGE of BIG5 ---
4647 (character set) (range)
4648 ASCII 0x00 .. 0x7F
4649 Big5 (1st byte) 0xA1 .. 0xFE
4650 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4651 --------------------------
4655 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4656 Check if a text is encoded in SJIS. If it is, return
4657 CATEGORY_MASK_SJIS, else return 0. */
4659 static int
4660 detect_coding_sjis (coding, detect_info)
4661 struct coding_system *coding;
4662 struct coding_detection_info *detect_info;
4664 const unsigned char *src = coding->source, *src_base;
4665 const unsigned char *src_end = coding->source + coding->src_bytes;
4666 int multibytep = coding->src_multibyte;
4667 int consumed_chars = 0;
4668 int found = 0;
4669 int c;
4671 detect_info->checked |= CATEGORY_MASK_SJIS;
4672 /* A coding system of this category is always ASCII compatible. */
4673 src += coding->head_ascii;
4675 while (1)
4677 src_base = src;
4678 ONE_MORE_BYTE (c);
4679 if (c < 0x80)
4680 continue;
4681 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4683 ONE_MORE_BYTE (c);
4684 if (c < 0x40 || c == 0x7F || c > 0xFC)
4685 break;
4686 found = CATEGORY_MASK_SJIS;
4688 else if (c >= 0xA0 && c < 0xE0)
4689 found = CATEGORY_MASK_SJIS;
4690 else
4691 break;
4693 detect_info->rejected |= CATEGORY_MASK_SJIS;
4694 return 0;
4696 no_more_source:
4697 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4699 detect_info->rejected |= CATEGORY_MASK_SJIS;
4700 return 0;
4702 detect_info->found |= found;
4703 return 1;
4706 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4707 Check if a text is encoded in BIG5. If it is, return
4708 CATEGORY_MASK_BIG5, else return 0. */
4710 static int
4711 detect_coding_big5 (coding, detect_info)
4712 struct coding_system *coding;
4713 struct coding_detection_info *detect_info;
4715 const unsigned char *src = coding->source, *src_base;
4716 const unsigned char *src_end = coding->source + coding->src_bytes;
4717 int multibytep = coding->src_multibyte;
4718 int consumed_chars = 0;
4719 int found = 0;
4720 int c;
4722 detect_info->checked |= CATEGORY_MASK_BIG5;
4723 /* A coding system of this category is always ASCII compatible. */
4724 src += coding->head_ascii;
4726 while (1)
4728 src_base = src;
4729 ONE_MORE_BYTE (c);
4730 if (c < 0x80)
4731 continue;
4732 if (c >= 0xA1)
4734 ONE_MORE_BYTE (c);
4735 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4736 return 0;
4737 found = CATEGORY_MASK_BIG5;
4739 else
4740 break;
4742 detect_info->rejected |= CATEGORY_MASK_BIG5;
4743 return 0;
4745 no_more_source:
4746 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4748 detect_info->rejected |= CATEGORY_MASK_BIG5;
4749 return 0;
4751 detect_info->found |= found;
4752 return 1;
4755 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4756 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
4758 static void
4759 decode_coding_sjis (coding)
4760 struct coding_system *coding;
4762 const unsigned char *src = coding->source + coding->consumed;
4763 const unsigned char *src_end = coding->source + coding->src_bytes;
4764 const unsigned char *src_base;
4765 int *charbuf = coding->charbuf + coding->charbuf_used;
4766 /* We may produce one charset annocation in one loop and one more at
4767 the end. */
4768 int *charbuf_end
4769 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4770 int consumed_chars = 0, consumed_chars_base;
4771 int multibytep = coding->src_multibyte;
4772 struct charset *charset_roman, *charset_kanji, *charset_kana;
4773 struct charset *charset_kanji2;
4774 Lisp_Object attrs, charset_list, val;
4775 int char_offset = coding->produced_char;
4776 int last_offset = char_offset;
4777 int last_id = charset_ascii;
4778 int eol_crlf =
4779 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4780 int byte_after_cr = -1;
4782 CODING_GET_INFO (coding, attrs, charset_list);
4784 val = charset_list;
4785 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4786 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4787 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4788 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4790 while (1)
4792 int c, c1;
4793 struct charset *charset;
4795 src_base = src;
4796 consumed_chars_base = consumed_chars;
4798 if (charbuf >= charbuf_end)
4800 if (byte_after_cr >= 0)
4801 src_base--;
4802 break;
4805 if (byte_after_cr >= 0)
4806 c = byte_after_cr, byte_after_cr = -1;
4807 else
4808 ONE_MORE_BYTE (c);
4809 if (c < 0)
4810 goto invalid_code;
4811 if (c < 0x80)
4813 if (eol_crlf && c == '\r')
4814 ONE_MORE_BYTE (byte_after_cr);
4815 charset = charset_roman;
4817 else if (c == 0x80 || c == 0xA0)
4818 goto invalid_code;
4819 else if (c >= 0xA1 && c <= 0xDF)
4821 /* SJIS -> JISX0201-Kana */
4822 c &= 0x7F;
4823 charset = charset_kana;
4825 else if (c <= 0xEF)
4827 /* SJIS -> JISX0208 */
4828 ONE_MORE_BYTE (c1);
4829 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4830 goto invalid_code;
4831 c = (c << 8) | c1;
4832 SJIS_TO_JIS (c);
4833 charset = charset_kanji;
4835 else if (c <= 0xFC && charset_kanji2)
4837 /* SJIS -> JISX0213-2 */
4838 ONE_MORE_BYTE (c1);
4839 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4840 goto invalid_code;
4841 c = (c << 8) | c1;
4842 SJIS_TO_JIS2 (c);
4843 charset = charset_kanji2;
4845 else
4846 goto invalid_code;
4847 if (charset->id != charset_ascii
4848 && last_id != charset->id)
4850 if (last_id != charset_ascii)
4851 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4852 last_id = charset->id;
4853 last_offset = char_offset;
4855 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4856 *charbuf++ = c;
4857 char_offset++;
4858 continue;
4860 invalid_code:
4861 src = src_base;
4862 consumed_chars = consumed_chars_base;
4863 ONE_MORE_BYTE (c);
4864 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4865 char_offset++;
4866 coding->errors++;
4869 no_more_source:
4870 if (last_id != charset_ascii)
4871 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4872 coding->consumed_char += consumed_chars_base;
4873 coding->consumed = src_base - coding->source;
4874 coding->charbuf_used = charbuf - coding->charbuf;
4877 static void
4878 decode_coding_big5 (coding)
4879 struct coding_system *coding;
4881 const unsigned char *src = coding->source + coding->consumed;
4882 const unsigned char *src_end = coding->source + coding->src_bytes;
4883 const unsigned char *src_base;
4884 int *charbuf = coding->charbuf + coding->charbuf_used;
4885 /* We may produce one charset annocation in one loop and one more at
4886 the end. */
4887 int *charbuf_end
4888 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4889 int consumed_chars = 0, consumed_chars_base;
4890 int multibytep = coding->src_multibyte;
4891 struct charset *charset_roman, *charset_big5;
4892 Lisp_Object attrs, charset_list, val;
4893 int char_offset = coding->produced_char;
4894 int last_offset = char_offset;
4895 int last_id = charset_ascii;
4896 int eol_crlf =
4897 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4898 int byte_after_cr = -1;
4900 CODING_GET_INFO (coding, attrs, charset_list);
4901 val = charset_list;
4902 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4903 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4905 while (1)
4907 int c, c1;
4908 struct charset *charset;
4910 src_base = src;
4911 consumed_chars_base = consumed_chars;
4913 if (charbuf >= charbuf_end)
4915 if (byte_after_cr >= 0)
4916 src_base--;
4917 break;
4920 if (byte_after_cr >= 0)
4921 c = byte_after_cr, byte_after_cr = -1;
4922 else
4923 ONE_MORE_BYTE (c);
4925 if (c < 0)
4926 goto invalid_code;
4927 if (c < 0x80)
4929 if (eol_crlf && c == '\r')
4930 ONE_MORE_BYTE (byte_after_cr);
4931 charset = charset_roman;
4933 else
4935 /* BIG5 -> Big5 */
4936 if (c < 0xA1 || c > 0xFE)
4937 goto invalid_code;
4938 ONE_MORE_BYTE (c1);
4939 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4940 goto invalid_code;
4941 c = c << 8 | c1;
4942 charset = charset_big5;
4944 if (charset->id != charset_ascii
4945 && last_id != charset->id)
4947 if (last_id != charset_ascii)
4948 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4949 last_id = charset->id;
4950 last_offset = char_offset;
4952 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4953 *charbuf++ = c;
4954 char_offset++;
4955 continue;
4957 invalid_code:
4958 src = src_base;
4959 consumed_chars = consumed_chars_base;
4960 ONE_MORE_BYTE (c);
4961 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4962 char_offset++;
4963 coding->errors++;
4966 no_more_source:
4967 if (last_id != charset_ascii)
4968 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4969 coding->consumed_char += consumed_chars_base;
4970 coding->consumed = src_base - coding->source;
4971 coding->charbuf_used = charbuf - coding->charbuf;
4974 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4975 This function can encode charsets `ascii', `katakana-jisx0201',
4976 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4977 are sure that all these charsets are registered as official charset
4978 (i.e. do not have extended leading-codes). Characters of other
4979 charsets are produced without any encoding. If SJIS_P is 1, encode
4980 SJIS text, else encode BIG5 text. */
4982 static int
4983 encode_coding_sjis (coding)
4984 struct coding_system *coding;
4986 int multibytep = coding->dst_multibyte;
4987 int *charbuf = coding->charbuf;
4988 int *charbuf_end = charbuf + coding->charbuf_used;
4989 unsigned char *dst = coding->destination + coding->produced;
4990 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4991 int safe_room = 4;
4992 int produced_chars = 0;
4993 Lisp_Object attrs, charset_list, val;
4994 int ascii_compatible;
4995 struct charset *charset_roman, *charset_kanji, *charset_kana;
4996 struct charset *charset_kanji2;
4997 int c;
4999 CODING_GET_INFO (coding, attrs, charset_list);
5000 val = charset_list;
5001 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5002 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5003 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5004 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
5006 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5008 while (charbuf < charbuf_end)
5010 ASSURE_DESTINATION (safe_room);
5011 c = *charbuf++;
5012 /* Now encode the character C. */
5013 if (ASCII_CHAR_P (c) && ascii_compatible)
5014 EMIT_ONE_ASCII_BYTE (c);
5015 else if (CHAR_BYTE8_P (c))
5017 c = CHAR_TO_BYTE8 (c);
5018 EMIT_ONE_BYTE (c);
5020 else
5022 unsigned code;
5023 struct charset *charset = char_charset (c, charset_list, &code);
5025 if (!charset)
5027 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5029 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5030 charset = CHARSET_FROM_ID (charset_ascii);
5032 else
5034 c = coding->default_char;
5035 charset = char_charset (c, charset_list, &code);
5038 if (code == CHARSET_INVALID_CODE (charset))
5039 abort ();
5040 if (charset == charset_kanji)
5042 int c1, c2;
5043 JIS_TO_SJIS (code);
5044 c1 = code >> 8, c2 = code & 0xFF;
5045 EMIT_TWO_BYTES (c1, c2);
5047 else if (charset == charset_kana)
5048 EMIT_ONE_BYTE (code | 0x80);
5049 else if (charset_kanji2 && charset == charset_kanji2)
5051 int c1, c2;
5053 c1 = code >> 8;
5054 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
5055 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5057 JIS_TO_SJIS2 (code);
5058 c1 = code >> 8, c2 = code & 0xFF;
5059 EMIT_TWO_BYTES (c1, c2);
5061 else
5062 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5064 else
5065 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5068 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5069 coding->produced_char += produced_chars;
5070 coding->produced = dst - coding->destination;
5071 return 0;
5074 static int
5075 encode_coding_big5 (coding)
5076 struct coding_system *coding;
5078 int multibytep = coding->dst_multibyte;
5079 int *charbuf = coding->charbuf;
5080 int *charbuf_end = charbuf + coding->charbuf_used;
5081 unsigned char *dst = coding->destination + coding->produced;
5082 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5083 int safe_room = 4;
5084 int produced_chars = 0;
5085 Lisp_Object attrs, charset_list, val;
5086 int ascii_compatible;
5087 struct charset *charset_roman, *charset_big5;
5088 int c;
5090 CODING_GET_INFO (coding, attrs, charset_list);
5091 val = charset_list;
5092 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5093 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5094 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5096 while (charbuf < charbuf_end)
5098 ASSURE_DESTINATION (safe_room);
5099 c = *charbuf++;
5100 /* Now encode the character C. */
5101 if (ASCII_CHAR_P (c) && ascii_compatible)
5102 EMIT_ONE_ASCII_BYTE (c);
5103 else if (CHAR_BYTE8_P (c))
5105 c = CHAR_TO_BYTE8 (c);
5106 EMIT_ONE_BYTE (c);
5108 else
5110 unsigned code;
5111 struct charset *charset = char_charset (c, charset_list, &code);
5113 if (! charset)
5115 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5117 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5118 charset = CHARSET_FROM_ID (charset_ascii);
5120 else
5122 c = coding->default_char;
5123 charset = char_charset (c, charset_list, &code);
5126 if (code == CHARSET_INVALID_CODE (charset))
5127 abort ();
5128 if (charset == charset_big5)
5130 int c1, c2;
5132 c1 = code >> 8, c2 = code & 0xFF;
5133 EMIT_TWO_BYTES (c1, c2);
5135 else
5136 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5139 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5140 coding->produced_char += produced_chars;
5141 coding->produced = dst - coding->destination;
5142 return 0;
5146 /*** 10. CCL handlers ***/
5148 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5149 Check if a text is encoded in a coding system of which
5150 encoder/decoder are written in CCL program. If it is, return
5151 CATEGORY_MASK_CCL, else return 0. */
5153 static int
5154 detect_coding_ccl (coding, detect_info)
5155 struct coding_system *coding;
5156 struct coding_detection_info *detect_info;
5158 const unsigned char *src = coding->source, *src_base;
5159 const unsigned char *src_end = coding->source + coding->src_bytes;
5160 int multibytep = coding->src_multibyte;
5161 int consumed_chars = 0;
5162 int found = 0;
5163 unsigned char *valids;
5164 int head_ascii = coding->head_ascii;
5165 Lisp_Object attrs;
5167 detect_info->checked |= CATEGORY_MASK_CCL;
5169 coding = &coding_categories[coding_category_ccl];
5170 valids = CODING_CCL_VALIDS (coding);
5171 attrs = CODING_ID_ATTRS (coding->id);
5172 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5173 src += head_ascii;
5175 while (1)
5177 int c;
5179 src_base = src;
5180 ONE_MORE_BYTE (c);
5181 if (c < 0 || ! valids[c])
5182 break;
5183 if ((valids[c] > 1))
5184 found = CATEGORY_MASK_CCL;
5186 detect_info->rejected |= CATEGORY_MASK_CCL;
5187 return 0;
5189 no_more_source:
5190 detect_info->found |= found;
5191 return 1;
5194 static void
5195 decode_coding_ccl (coding)
5196 struct coding_system *coding;
5198 const unsigned char *src = coding->source + coding->consumed;
5199 const unsigned char *src_end = coding->source + coding->src_bytes;
5200 int *charbuf = coding->charbuf + coding->charbuf_used;
5201 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5202 int consumed_chars = 0;
5203 int multibytep = coding->src_multibyte;
5204 struct ccl_program ccl;
5205 int source_charbuf[1024];
5206 int source_byteidx[1024];
5207 Lisp_Object attrs, charset_list;
5209 CODING_GET_INFO (coding, attrs, charset_list);
5210 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
5212 while (src < src_end)
5214 const unsigned char *p = src;
5215 int *source, *source_end;
5216 int i = 0;
5218 if (multibytep)
5219 while (i < 1024 && p < src_end)
5221 source_byteidx[i] = p - src;
5222 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5224 else
5225 while (i < 1024 && p < src_end)
5226 source_charbuf[i++] = *p++;
5228 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5229 ccl.last_block = 1;
5231 source = source_charbuf;
5232 source_end = source + i;
5233 while (source < source_end)
5235 ccl_driver (&ccl, source, charbuf,
5236 source_end - source, charbuf_end - charbuf,
5237 charset_list);
5238 source += ccl.consumed;
5239 charbuf += ccl.produced;
5240 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
5241 break;
5243 if (source < source_end)
5244 src += source_byteidx[source - source_charbuf];
5245 else
5246 src = p;
5247 consumed_chars += source - source_charbuf;
5249 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
5250 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
5251 break;
5254 switch (ccl.status)
5256 case CCL_STAT_SUSPEND_BY_SRC:
5257 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5258 break;
5259 case CCL_STAT_SUSPEND_BY_DST:
5260 break;
5261 case CCL_STAT_QUIT:
5262 case CCL_STAT_INVALID_CMD:
5263 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5264 break;
5265 default:
5266 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5267 break;
5269 coding->consumed_char += consumed_chars;
5270 coding->consumed = src - coding->source;
5271 coding->charbuf_used = charbuf - coding->charbuf;
5274 static int
5275 encode_coding_ccl (coding)
5276 struct coding_system *coding;
5278 struct ccl_program ccl;
5279 int multibytep = coding->dst_multibyte;
5280 int *charbuf = coding->charbuf;
5281 int *charbuf_end = charbuf + coding->charbuf_used;
5282 unsigned char *dst = coding->destination + coding->produced;
5283 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5284 int destination_charbuf[1024];
5285 int i, produced_chars = 0;
5286 Lisp_Object attrs, charset_list;
5288 CODING_GET_INFO (coding, attrs, charset_list);
5289 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
5291 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
5292 ccl.dst_multibyte = coding->dst_multibyte;
5294 while (charbuf < charbuf_end)
5296 ccl_driver (&ccl, charbuf, destination_charbuf,
5297 charbuf_end - charbuf, 1024, charset_list);
5298 if (multibytep)
5300 ASSURE_DESTINATION (ccl.produced * 2);
5301 for (i = 0; i < ccl.produced; i++)
5302 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5304 else
5306 ASSURE_DESTINATION (ccl.produced);
5307 for (i = 0; i < ccl.produced; i++)
5308 *dst++ = destination_charbuf[i] & 0xFF;
5309 produced_chars += ccl.produced;
5311 charbuf += ccl.consumed;
5312 if (ccl.status == CCL_STAT_QUIT
5313 || ccl.status == CCL_STAT_INVALID_CMD)
5314 break;
5317 switch (ccl.status)
5319 case CCL_STAT_SUSPEND_BY_SRC:
5320 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5321 break;
5322 case CCL_STAT_SUSPEND_BY_DST:
5323 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5324 break;
5325 case CCL_STAT_QUIT:
5326 case CCL_STAT_INVALID_CMD:
5327 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5328 break;
5329 default:
5330 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5331 break;
5334 coding->produced_char += produced_chars;
5335 coding->produced = dst - coding->destination;
5336 return 0;
5341 /*** 10, 11. no-conversion handlers ***/
5343 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5345 static void
5346 decode_coding_raw_text (coding)
5347 struct coding_system *coding;
5349 int eol_crlf =
5350 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5352 coding->chars_at_source = 1;
5353 coding->consumed_char = coding->src_chars;
5354 coding->consumed = coding->src_bytes;
5355 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
5357 coding->consumed_char--;
5358 coding->consumed--;
5359 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5361 else
5362 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5365 static int
5366 encode_coding_raw_text (coding)
5367 struct coding_system *coding;
5369 int multibytep = coding->dst_multibyte;
5370 int *charbuf = coding->charbuf;
5371 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5372 unsigned char *dst = coding->destination + coding->produced;
5373 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5374 int produced_chars = 0;
5375 int c;
5377 if (multibytep)
5379 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5381 if (coding->src_multibyte)
5382 while (charbuf < charbuf_end)
5384 ASSURE_DESTINATION (safe_room);
5385 c = *charbuf++;
5386 if (ASCII_CHAR_P (c))
5387 EMIT_ONE_ASCII_BYTE (c);
5388 else if (CHAR_BYTE8_P (c))
5390 c = CHAR_TO_BYTE8 (c);
5391 EMIT_ONE_BYTE (c);
5393 else
5395 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5397 CHAR_STRING_ADVANCE (c, p1);
5398 while (p0 < p1)
5400 EMIT_ONE_BYTE (*p0);
5401 p0++;
5405 else
5406 while (charbuf < charbuf_end)
5408 ASSURE_DESTINATION (safe_room);
5409 c = *charbuf++;
5410 EMIT_ONE_BYTE (c);
5413 else
5415 if (coding->src_multibyte)
5417 int safe_room = MAX_MULTIBYTE_LENGTH;
5419 while (charbuf < charbuf_end)
5421 ASSURE_DESTINATION (safe_room);
5422 c = *charbuf++;
5423 if (ASCII_CHAR_P (c))
5424 *dst++ = c;
5425 else if (CHAR_BYTE8_P (c))
5426 *dst++ = CHAR_TO_BYTE8 (c);
5427 else
5428 CHAR_STRING_ADVANCE (c, dst);
5431 else
5433 ASSURE_DESTINATION (charbuf_end - charbuf);
5434 while (charbuf < charbuf_end && dst < dst_end)
5435 *dst++ = *charbuf++;
5437 produced_chars = dst - (coding->destination + coding->produced);
5439 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5440 coding->produced_char += produced_chars;
5441 coding->produced = dst - coding->destination;
5442 return 0;
5445 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5446 Check if a text is encoded in a charset-based coding system. If it
5447 is, return 1, else return 0. */
5449 static int
5450 detect_coding_charset (coding, detect_info)
5451 struct coding_system *coding;
5452 struct coding_detection_info *detect_info;
5454 const unsigned char *src = coding->source, *src_base;
5455 const unsigned char *src_end = coding->source + coding->src_bytes;
5456 int multibytep = coding->src_multibyte;
5457 int consumed_chars = 0;
5458 Lisp_Object attrs, valids, name;
5459 int found = 0;
5460 int head_ascii = coding->head_ascii;
5461 int check_latin_extra = 0;
5463 detect_info->checked |= CATEGORY_MASK_CHARSET;
5465 coding = &coding_categories[coding_category_charset];
5466 attrs = CODING_ID_ATTRS (coding->id);
5467 valids = AREF (attrs, coding_attr_charset_valids);
5468 name = CODING_ID_NAME (coding->id);
5469 if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5470 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5471 || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5472 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5473 check_latin_extra = 1;
5475 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5476 src += head_ascii;
5478 while (1)
5480 int c;
5481 Lisp_Object val;
5482 struct charset *charset;
5483 int dim, idx;
5485 src_base = src;
5486 ONE_MORE_BYTE (c);
5487 if (c < 0)
5488 continue;
5489 val = AREF (valids, c);
5490 if (NILP (val))
5491 break;
5492 if (c >= 0x80)
5494 if (c < 0xA0
5495 && check_latin_extra
5496 && (!VECTORP (Vlatin_extra_code_table)
5497 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5498 break;
5499 found = CATEGORY_MASK_CHARSET;
5501 if (INTEGERP (val))
5503 charset = CHARSET_FROM_ID (XFASTINT (val));
5504 dim = CHARSET_DIMENSION (charset);
5505 for (idx = 1; idx < dim; idx++)
5507 if (src == src_end)
5508 goto too_short;
5509 ONE_MORE_BYTE (c);
5510 if (c < charset->code_space[(dim - 1 - idx) * 2]
5511 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5512 break;
5514 if (idx < dim)
5515 break;
5517 else
5519 idx = 1;
5520 for (; CONSP (val); val = XCDR (val))
5522 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5523 dim = CHARSET_DIMENSION (charset);
5524 while (idx < dim)
5526 if (src == src_end)
5527 goto too_short;
5528 ONE_MORE_BYTE (c);
5529 if (c < charset->code_space[(dim - 1 - idx) * 4]
5530 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5531 break;
5532 idx++;
5534 if (idx == dim)
5536 val = Qnil;
5537 break;
5540 if (CONSP (val))
5541 break;
5544 too_short:
5545 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5546 return 0;
5548 no_more_source:
5549 detect_info->found |= found;
5550 return 1;
5553 static void
5554 decode_coding_charset (coding)
5555 struct coding_system *coding;
5557 const unsigned char *src = coding->source + coding->consumed;
5558 const unsigned char *src_end = coding->source + coding->src_bytes;
5559 const unsigned char *src_base;
5560 int *charbuf = coding->charbuf + coding->charbuf_used;
5561 /* We may produce one charset annocation in one loop and one more at
5562 the end. */
5563 int *charbuf_end
5564 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5565 int consumed_chars = 0, consumed_chars_base;
5566 int multibytep = coding->src_multibyte;
5567 Lisp_Object attrs, charset_list, valids;
5568 int char_offset = coding->produced_char;
5569 int last_offset = char_offset;
5570 int last_id = charset_ascii;
5571 int eol_crlf =
5572 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5573 int byte_after_cr = -1;
5575 CODING_GET_INFO (coding, attrs, charset_list);
5576 valids = AREF (attrs, coding_attr_charset_valids);
5578 while (1)
5580 int c;
5581 Lisp_Object val;
5582 struct charset *charset;
5583 int dim;
5584 int len = 1;
5585 unsigned code;
5587 src_base = src;
5588 consumed_chars_base = consumed_chars;
5590 if (charbuf >= charbuf_end)
5592 if (byte_after_cr >= 0)
5593 src_base--;
5594 break;
5597 if (byte_after_cr >= 0)
5599 c = byte_after_cr;
5600 byte_after_cr = -1;
5602 else
5604 ONE_MORE_BYTE (c);
5605 if (eol_crlf && c == '\r')
5606 ONE_MORE_BYTE (byte_after_cr);
5608 if (c < 0)
5609 goto invalid_code;
5610 code = c;
5612 val = AREF (valids, c);
5613 if (! INTEGERP (val) && ! CONSP (val))
5614 goto invalid_code;
5615 if (INTEGERP (val))
5617 charset = CHARSET_FROM_ID (XFASTINT (val));
5618 dim = CHARSET_DIMENSION (charset);
5619 while (len < dim)
5621 ONE_MORE_BYTE (c);
5622 code = (code << 8) | c;
5623 len++;
5625 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5626 charset, code, c);
5628 else
5630 /* VAL is a list of charset IDs. It is assured that the
5631 list is sorted by charset dimensions (smaller one
5632 comes first). */
5633 while (CONSP (val))
5635 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5636 dim = CHARSET_DIMENSION (charset);
5637 while (len < dim)
5639 ONE_MORE_BYTE (c);
5640 code = (code << 8) | c;
5641 len++;
5643 CODING_DECODE_CHAR (coding, src, src_base,
5644 src_end, charset, code, c);
5645 if (c >= 0)
5646 break;
5647 val = XCDR (val);
5650 if (c < 0)
5651 goto invalid_code;
5652 if (charset->id != charset_ascii
5653 && last_id != charset->id)
5655 if (last_id != charset_ascii)
5656 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5657 last_id = charset->id;
5658 last_offset = char_offset;
5661 *charbuf++ = c;
5662 char_offset++;
5663 continue;
5665 invalid_code:
5666 src = src_base;
5667 consumed_chars = consumed_chars_base;
5668 ONE_MORE_BYTE (c);
5669 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5670 char_offset++;
5671 coding->errors++;
5674 no_more_source:
5675 if (last_id != charset_ascii)
5676 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5677 coding->consumed_char += consumed_chars_base;
5678 coding->consumed = src_base - coding->source;
5679 coding->charbuf_used = charbuf - coding->charbuf;
5682 static int
5683 encode_coding_charset (coding)
5684 struct coding_system *coding;
5686 int multibytep = coding->dst_multibyte;
5687 int *charbuf = coding->charbuf;
5688 int *charbuf_end = charbuf + coding->charbuf_used;
5689 unsigned char *dst = coding->destination + coding->produced;
5690 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5691 int safe_room = MAX_MULTIBYTE_LENGTH;
5692 int produced_chars = 0;
5693 Lisp_Object attrs, charset_list;
5694 int ascii_compatible;
5695 int c;
5697 CODING_GET_INFO (coding, attrs, charset_list);
5698 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5700 while (charbuf < charbuf_end)
5702 struct charset *charset;
5703 unsigned code;
5705 ASSURE_DESTINATION (safe_room);
5706 c = *charbuf++;
5707 if (ascii_compatible && ASCII_CHAR_P (c))
5708 EMIT_ONE_ASCII_BYTE (c);
5709 else if (CHAR_BYTE8_P (c))
5711 c = CHAR_TO_BYTE8 (c);
5712 EMIT_ONE_BYTE (c);
5714 else
5716 charset = char_charset (c, charset_list, &code);
5717 if (charset)
5719 if (CHARSET_DIMENSION (charset) == 1)
5720 EMIT_ONE_BYTE (code);
5721 else if (CHARSET_DIMENSION (charset) == 2)
5722 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5723 else if (CHARSET_DIMENSION (charset) == 3)
5724 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5725 else
5726 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5727 (code >> 8) & 0xFF, code & 0xFF);
5729 else
5731 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5732 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5733 else
5734 c = coding->default_char;
5735 EMIT_ONE_BYTE (c);
5740 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5741 coding->produced_char += produced_chars;
5742 coding->produced = dst - coding->destination;
5743 return 0;
5747 /*** 7. C library functions ***/
5749 /* Setup coding context CODING from information about CODING_SYSTEM.
5750 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5751 CODING_SYSTEM is invalid, signal an error. */
5753 void
5754 setup_coding_system (coding_system, coding)
5755 Lisp_Object coding_system;
5756 struct coding_system *coding;
5758 Lisp_Object attrs;
5759 Lisp_Object eol_type;
5760 Lisp_Object coding_type;
5761 Lisp_Object val;
5763 if (NILP (coding_system))
5764 coding_system = Qundecided;
5766 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5768 attrs = CODING_ID_ATTRS (coding->id);
5769 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5771 coding->mode = 0;
5772 coding->head_ascii = -1;
5773 if (VECTORP (eol_type))
5774 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5775 | CODING_REQUIRE_DETECTION_MASK);
5776 else if (! EQ (eol_type, Qunix))
5777 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5778 | CODING_REQUIRE_ENCODING_MASK);
5779 else
5780 coding->common_flags = 0;
5781 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5782 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5783 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5784 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5785 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5786 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5788 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5789 coding->max_charset_id = SCHARS (val) - 1;
5790 coding->safe_charsets = SDATA (val);
5791 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5793 coding_type = CODING_ATTR_TYPE (attrs);
5794 if (EQ (coding_type, Qundecided))
5796 coding->detector = NULL;
5797 coding->decoder = decode_coding_raw_text;
5798 coding->encoder = encode_coding_raw_text;
5799 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5801 else if (EQ (coding_type, Qiso_2022))
5803 int i;
5804 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5806 /* Invoke graphic register 0 to plane 0. */
5807 CODING_ISO_INVOCATION (coding, 0) = 0;
5808 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5809 CODING_ISO_INVOCATION (coding, 1)
5810 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5811 /* Setup the initial status of designation. */
5812 for (i = 0; i < 4; i++)
5813 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5814 /* Not single shifting initially. */
5815 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5816 /* Beginning of buffer should also be regarded as bol. */
5817 CODING_ISO_BOL (coding) = 1;
5818 coding->detector = detect_coding_iso_2022;
5819 coding->decoder = decode_coding_iso_2022;
5820 coding->encoder = encode_coding_iso_2022;
5821 if (flags & CODING_ISO_FLAG_SAFE)
5822 coding->mode |= CODING_MODE_SAFE_ENCODING;
5823 coding->common_flags
5824 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5825 | CODING_REQUIRE_FLUSHING_MASK);
5826 if (flags & CODING_ISO_FLAG_COMPOSITION)
5827 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5828 if (flags & CODING_ISO_FLAG_DESIGNATION)
5829 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5830 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5832 setup_iso_safe_charsets (attrs);
5833 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5834 coding->max_charset_id = SCHARS (val) - 1;
5835 coding->safe_charsets = SDATA (val);
5837 CODING_ISO_FLAGS (coding) = flags;
5838 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5839 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5840 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5841 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5843 else if (EQ (coding_type, Qcharset))
5845 coding->detector = detect_coding_charset;
5846 coding->decoder = decode_coding_charset;
5847 coding->encoder = encode_coding_charset;
5848 coding->common_flags
5849 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5851 else if (EQ (coding_type, Qutf_8))
5853 val = AREF (attrs, coding_attr_utf_bom);
5854 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5855 : EQ (val, Qt) ? utf_with_bom
5856 : utf_without_bom);
5857 coding->detector = detect_coding_utf_8;
5858 coding->decoder = decode_coding_utf_8;
5859 coding->encoder = encode_coding_utf_8;
5860 coding->common_flags
5861 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5862 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5863 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5865 else if (EQ (coding_type, Qutf_16))
5867 val = AREF (attrs, coding_attr_utf_bom);
5868 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5869 : EQ (val, Qt) ? utf_with_bom
5870 : utf_without_bom);
5871 val = AREF (attrs, coding_attr_utf_16_endian);
5872 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5873 : utf_16_little_endian);
5874 CODING_UTF_16_SURROGATE (coding) = 0;
5875 coding->detector = detect_coding_utf_16;
5876 coding->decoder = decode_coding_utf_16;
5877 coding->encoder = encode_coding_utf_16;
5878 coding->common_flags
5879 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5880 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5881 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5883 else if (EQ (coding_type, Qccl))
5885 coding->detector = detect_coding_ccl;
5886 coding->decoder = decode_coding_ccl;
5887 coding->encoder = encode_coding_ccl;
5888 coding->common_flags
5889 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5890 | CODING_REQUIRE_FLUSHING_MASK);
5892 else if (EQ (coding_type, Qemacs_mule))
5894 coding->detector = detect_coding_emacs_mule;
5895 coding->decoder = decode_coding_emacs_mule;
5896 coding->encoder = encode_coding_emacs_mule;
5897 coding->common_flags
5898 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5899 coding->spec.emacs_mule.full_support = 1;
5900 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5901 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5903 Lisp_Object tail, safe_charsets;
5904 int max_charset_id = 0;
5906 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5907 tail = XCDR (tail))
5908 if (max_charset_id < XFASTINT (XCAR (tail)))
5909 max_charset_id = XFASTINT (XCAR (tail));
5910 safe_charsets = make_uninit_string (max_charset_id + 1);
5911 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5912 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5913 tail = XCDR (tail))
5914 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5915 coding->max_charset_id = max_charset_id;
5916 coding->safe_charsets = SDATA (safe_charsets);
5917 coding->spec.emacs_mule.full_support = 1;
5919 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5920 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5922 else if (EQ (coding_type, Qshift_jis))
5924 coding->detector = detect_coding_sjis;
5925 coding->decoder = decode_coding_sjis;
5926 coding->encoder = encode_coding_sjis;
5927 coding->common_flags
5928 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5930 else if (EQ (coding_type, Qbig5))
5932 coding->detector = detect_coding_big5;
5933 coding->decoder = decode_coding_big5;
5934 coding->encoder = encode_coding_big5;
5935 coding->common_flags
5936 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5938 else /* EQ (coding_type, Qraw_text) */
5940 coding->detector = NULL;
5941 coding->decoder = decode_coding_raw_text;
5942 coding->encoder = encode_coding_raw_text;
5943 if (! EQ (eol_type, Qunix))
5945 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5946 if (! VECTORP (eol_type))
5947 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5952 return;
5955 /* Return a list of charsets supported by CODING. */
5957 Lisp_Object
5958 coding_charset_list (coding)
5959 struct coding_system *coding;
5961 Lisp_Object attrs, charset_list;
5963 CODING_GET_INFO (coding, attrs, charset_list);
5964 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5966 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5968 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5969 charset_list = Viso_2022_charset_list;
5971 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5973 charset_list = Vemacs_mule_charset_list;
5975 return charset_list;
5979 /* Return a list of charsets supported by CODING-SYSTEM. */
5981 Lisp_Object
5982 coding_system_charset_list (coding_system)
5983 Lisp_Object coding_system;
5985 int id;
5986 Lisp_Object attrs, charset_list;
5988 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5989 attrs = CODING_ID_ATTRS (id);
5991 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5993 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5995 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5996 charset_list = Viso_2022_charset_list;
5997 else
5998 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6000 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6002 charset_list = Vemacs_mule_charset_list;
6004 else
6006 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6008 return charset_list;
6012 /* Return raw-text or one of its subsidiaries that has the same
6013 eol_type as CODING-SYSTEM. */
6015 Lisp_Object
6016 raw_text_coding_system (coding_system)
6017 Lisp_Object coding_system;
6019 Lisp_Object spec, attrs;
6020 Lisp_Object eol_type, raw_text_eol_type;
6022 if (NILP (coding_system))
6023 return Qraw_text;
6024 spec = CODING_SYSTEM_SPEC (coding_system);
6025 attrs = AREF (spec, 0);
6027 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6028 return coding_system;
6030 eol_type = AREF (spec, 2);
6031 if (VECTORP (eol_type))
6032 return Qraw_text;
6033 spec = CODING_SYSTEM_SPEC (Qraw_text);
6034 raw_text_eol_type = AREF (spec, 2);
6035 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6036 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6037 : AREF (raw_text_eol_type, 2));
6041 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
6042 does, return one of the subsidiary that has the same eol-spec as
6043 PARENT. Otherwise, return CODING_SYSTEM. If PARENT is nil,
6044 inherit end-of-line format from the system's setting
6045 (system_eol_type). */
6047 Lisp_Object
6048 coding_inherit_eol_type (coding_system, parent)
6049 Lisp_Object coding_system, parent;
6051 Lisp_Object spec, eol_type;
6053 if (NILP (coding_system))
6054 coding_system = Qraw_text;
6055 spec = CODING_SYSTEM_SPEC (coding_system);
6056 eol_type = AREF (spec, 2);
6057 if (VECTORP (eol_type))
6059 Lisp_Object parent_eol_type;
6061 if (! NILP (parent))
6063 Lisp_Object parent_spec;
6065 parent_spec = CODING_SYSTEM_SPEC (parent);
6066 parent_eol_type = AREF (parent_spec, 2);
6068 else
6069 parent_eol_type = system_eol_type;
6070 if (EQ (parent_eol_type, Qunix))
6071 coding_system = AREF (eol_type, 0);
6072 else if (EQ (parent_eol_type, Qdos))
6073 coding_system = AREF (eol_type, 1);
6074 else if (EQ (parent_eol_type, Qmac))
6075 coding_system = AREF (eol_type, 2);
6077 return coding_system;
6080 /* Emacs has a mechanism to automatically detect a coding system if it
6081 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6082 it's impossible to distinguish some coding systems accurately
6083 because they use the same range of codes. So, at first, coding
6084 systems are categorized into 7, those are:
6086 o coding-category-emacs-mule
6088 The category for a coding system which has the same code range
6089 as Emacs' internal format. Assigned the coding-system (Lisp
6090 symbol) `emacs-mule' by default.
6092 o coding-category-sjis
6094 The category for a coding system which has the same code range
6095 as SJIS. Assigned the coding-system (Lisp
6096 symbol) `japanese-shift-jis' by default.
6098 o coding-category-iso-7
6100 The category for a coding system which has the same code range
6101 as ISO2022 of 7-bit environment. This doesn't use any locking
6102 shift and single shift functions. This can encode/decode all
6103 charsets. Assigned the coding-system (Lisp symbol)
6104 `iso-2022-7bit' by default.
6106 o coding-category-iso-7-tight
6108 Same as coding-category-iso-7 except that this can
6109 encode/decode only the specified charsets.
6111 o coding-category-iso-8-1
6113 The category for a coding system which has the same code range
6114 as ISO2022 of 8-bit environment and graphic plane 1 used only
6115 for DIMENSION1 charset. This doesn't use any locking shift
6116 and single shift functions. Assigned the coding-system (Lisp
6117 symbol) `iso-latin-1' by default.
6119 o coding-category-iso-8-2
6121 The category for a coding system which has the same code range
6122 as ISO2022 of 8-bit environment and graphic plane 1 used only
6123 for DIMENSION2 charset. This doesn't use any locking shift
6124 and single shift functions. Assigned the coding-system (Lisp
6125 symbol) `japanese-iso-8bit' by default.
6127 o coding-category-iso-7-else
6129 The category for a coding system which has the same code range
6130 as ISO2022 of 7-bit environemnt but uses locking shift or
6131 single shift functions. Assigned the coding-system (Lisp
6132 symbol) `iso-2022-7bit-lock' by default.
6134 o coding-category-iso-8-else
6136 The category for a coding system which has the same code range
6137 as ISO2022 of 8-bit environemnt but uses locking shift or
6138 single shift functions. Assigned the coding-system (Lisp
6139 symbol) `iso-2022-8bit-ss2' by default.
6141 o coding-category-big5
6143 The category for a coding system which has the same code range
6144 as BIG5. Assigned the coding-system (Lisp symbol)
6145 `cn-big5' by default.
6147 o coding-category-utf-8
6149 The category for a coding system which has the same code range
6150 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6151 symbol) `utf-8' by default.
6153 o coding-category-utf-16-be
6155 The category for a coding system in which a text has an
6156 Unicode signature (cf. Unicode Standard) in the order of BIG
6157 endian at the head. Assigned the coding-system (Lisp symbol)
6158 `utf-16-be' by default.
6160 o coding-category-utf-16-le
6162 The category for a coding system in which a text has an
6163 Unicode signature (cf. Unicode Standard) in the order of
6164 LITTLE endian at the head. Assigned the coding-system (Lisp
6165 symbol) `utf-16-le' by default.
6167 o coding-category-ccl
6169 The category for a coding system of which encoder/decoder is
6170 written in CCL programs. The default value is nil, i.e., no
6171 coding system is assigned.
6173 o coding-category-binary
6175 The category for a coding system not categorized in any of the
6176 above. Assigned the coding-system (Lisp symbol)
6177 `no-conversion' by default.
6179 Each of them is a Lisp symbol and the value is an actual
6180 `coding-system's (this is also a Lisp symbol) assigned by a user.
6181 What Emacs does actually is to detect a category of coding system.
6182 Then, it uses a `coding-system' assigned to it. If Emacs can't
6183 decide only one possible category, it selects a category of the
6184 highest priority. Priorities of categories are also specified by a
6185 user in a Lisp variable `coding-category-list'.
6189 #define EOL_SEEN_NONE 0
6190 #define EOL_SEEN_LF 1
6191 #define EOL_SEEN_CR 2
6192 #define EOL_SEEN_CRLF 4
6194 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6195 SOURCE is encoded. If CATEGORY is one of
6196 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6197 two-byte, else they are encoded by one-byte.
6199 Return one of EOL_SEEN_XXX. */
6201 #define MAX_EOL_CHECK_COUNT 3
6203 static int
6204 detect_eol (source, src_bytes, category)
6205 const unsigned char *source;
6206 EMACS_INT src_bytes;
6207 enum coding_category category;
6209 const unsigned char *src = source, *src_end = src + src_bytes;
6210 unsigned char c;
6211 int total = 0;
6212 int eol_seen = EOL_SEEN_NONE;
6214 if ((1 << category) & CATEGORY_MASK_UTF_16)
6216 int msb, lsb;
6218 msb = category == (coding_category_utf_16_le
6219 | coding_category_utf_16_le_nosig);
6220 lsb = 1 - msb;
6222 while (src + 1 < src_end)
6224 c = src[lsb];
6225 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6227 int this_eol;
6229 if (c == '\n')
6230 this_eol = EOL_SEEN_LF;
6231 else if (src + 3 >= src_end
6232 || src[msb + 2] != 0
6233 || src[lsb + 2] != '\n')
6234 this_eol = EOL_SEEN_CR;
6235 else
6237 this_eol = EOL_SEEN_CRLF;
6238 src += 2;
6241 if (eol_seen == EOL_SEEN_NONE)
6242 /* This is the first end-of-line. */
6243 eol_seen = this_eol;
6244 else if (eol_seen != this_eol)
6246 /* The found type is different from what found before.
6247 Allow for stray ^M characters in DOS EOL files. */
6248 if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6249 || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6250 eol_seen = EOL_SEEN_CRLF;
6251 else
6253 eol_seen = EOL_SEEN_LF;
6254 break;
6257 if (++total == MAX_EOL_CHECK_COUNT)
6258 break;
6260 src += 2;
6263 else
6265 while (src < src_end)
6267 c = *src++;
6268 if (c == '\n' || c == '\r')
6270 int this_eol;
6272 if (c == '\n')
6273 this_eol = EOL_SEEN_LF;
6274 else if (src >= src_end || *src != '\n')
6275 this_eol = EOL_SEEN_CR;
6276 else
6277 this_eol = EOL_SEEN_CRLF, src++;
6279 if (eol_seen == EOL_SEEN_NONE)
6280 /* This is the first end-of-line. */
6281 eol_seen = this_eol;
6282 else if (eol_seen != this_eol)
6284 /* The found type is different from what found before.
6285 Allow for stray ^M characters in DOS EOL files. */
6286 if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6287 || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6288 eol_seen = EOL_SEEN_CRLF;
6289 else
6291 eol_seen = EOL_SEEN_LF;
6292 break;
6295 if (++total == MAX_EOL_CHECK_COUNT)
6296 break;
6300 return eol_seen;
6304 static Lisp_Object
6305 adjust_coding_eol_type (coding, eol_seen)
6306 struct coding_system *coding;
6307 int eol_seen;
6309 Lisp_Object eol_type;
6311 eol_type = CODING_ID_EOL_TYPE (coding->id);
6312 if (eol_seen & EOL_SEEN_LF)
6314 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6315 eol_type = Qunix;
6317 else if (eol_seen & EOL_SEEN_CRLF)
6319 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6320 eol_type = Qdos;
6322 else if (eol_seen & EOL_SEEN_CR)
6324 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6325 eol_type = Qmac;
6327 return eol_type;
6330 /* Detect how a text specified in CODING is encoded. If a coding
6331 system is detected, update fields of CODING by the detected coding
6332 system. */
6334 void
6335 detect_coding (coding)
6336 struct coding_system *coding;
6338 const unsigned char *src, *src_end;
6339 int saved_mode = coding->mode;
6341 coding->consumed = coding->consumed_char = 0;
6342 coding->produced = coding->produced_char = 0;
6343 coding_set_source (coding);
6345 src_end = coding->source + coding->src_bytes;
6346 coding->head_ascii = 0;
6348 /* If we have not yet decided the text encoding type, detect it
6349 now. */
6350 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6352 int c, i;
6353 struct coding_detection_info detect_info;
6354 int null_byte_found = 0, eight_bit_found = 0;
6356 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6357 for (src = coding->source; src < src_end; src++)
6359 c = *src;
6360 if (c & 0x80)
6362 eight_bit_found = 1;
6363 if (null_byte_found)
6364 break;
6366 else if (c < 0x20)
6368 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6369 && ! inhibit_iso_escape_detection
6370 && ! detect_info.checked)
6372 if (detect_coding_iso_2022 (coding, &detect_info))
6374 /* We have scanned the whole data. */
6375 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6377 /* We didn't find an 8-bit code. We may
6378 have found a null-byte, but it's very
6379 rare that a binary file confirm to
6380 ISO-2022. */
6381 src = src_end;
6382 coding->head_ascii = src - coding->source;
6384 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6385 break;
6388 else if (! c && !inhibit_null_byte_detection)
6390 null_byte_found = 1;
6391 if (eight_bit_found)
6392 break;
6394 if (! eight_bit_found)
6395 coding->head_ascii++;
6397 else if (! eight_bit_found)
6398 coding->head_ascii++;
6401 if (null_byte_found || eight_bit_found
6402 || coding->head_ascii < coding->src_bytes
6403 || detect_info.found)
6405 enum coding_category category;
6406 struct coding_system *this;
6408 if (coding->head_ascii == coding->src_bytes)
6409 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6410 for (i = 0; i < coding_category_raw_text; i++)
6412 category = coding_priorities[i];
6413 this = coding_categories + category;
6414 if (detect_info.found & (1 << category))
6415 break;
6417 else
6419 if (null_byte_found)
6421 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6422 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6424 for (i = 0; i < coding_category_raw_text; i++)
6426 category = coding_priorities[i];
6427 this = coding_categories + category;
6428 if (this->id < 0)
6430 /* No coding system of this category is defined. */
6431 detect_info.rejected |= (1 << category);
6433 else if (category >= coding_category_raw_text)
6434 continue;
6435 else if (detect_info.checked & (1 << category))
6437 if (detect_info.found & (1 << category))
6438 break;
6440 else if ((*(this->detector)) (coding, &detect_info)
6441 && detect_info.found & (1 << category))
6443 if (category == coding_category_utf_16_auto)
6445 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6446 category = coding_category_utf_16_le;
6447 else
6448 category = coding_category_utf_16_be;
6450 break;
6455 if (i < coding_category_raw_text)
6456 setup_coding_system (CODING_ID_NAME (this->id), coding);
6457 else if (null_byte_found)
6458 setup_coding_system (Qno_conversion, coding);
6459 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6460 == CATEGORY_MASK_ANY)
6461 setup_coding_system (Qraw_text, coding);
6462 else if (detect_info.rejected)
6463 for (i = 0; i < coding_category_raw_text; i++)
6464 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6466 this = coding_categories + coding_priorities[i];
6467 setup_coding_system (CODING_ID_NAME (this->id), coding);
6468 break;
6472 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6473 == coding_category_utf_8_auto)
6475 Lisp_Object coding_systems;
6476 struct coding_detection_info detect_info;
6478 coding_systems
6479 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6480 detect_info.found = detect_info.rejected = 0;
6481 coding->head_ascii = 0;
6482 if (CONSP (coding_systems)
6483 && detect_coding_utf_8 (coding, &detect_info))
6485 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6486 setup_coding_system (XCAR (coding_systems), coding);
6487 else
6488 setup_coding_system (XCDR (coding_systems), coding);
6491 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6492 == coding_category_utf_16_auto)
6494 Lisp_Object coding_systems;
6495 struct coding_detection_info detect_info;
6497 coding_systems
6498 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6499 detect_info.found = detect_info.rejected = 0;
6500 coding->head_ascii = 0;
6501 if (CONSP (coding_systems)
6502 && detect_coding_utf_16 (coding, &detect_info))
6504 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6505 setup_coding_system (XCAR (coding_systems), coding);
6506 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6507 setup_coding_system (XCDR (coding_systems), coding);
6510 coding->mode = saved_mode;
6514 static void
6515 decode_eol (coding)
6516 struct coding_system *coding;
6518 Lisp_Object eol_type;
6519 unsigned char *p, *pbeg, *pend;
6521 eol_type = CODING_ID_EOL_TYPE (coding->id);
6522 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6523 return;
6525 if (NILP (coding->dst_object))
6526 pbeg = coding->destination;
6527 else
6528 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6529 pend = pbeg + coding->produced;
6531 if (VECTORP (eol_type))
6533 int eol_seen = EOL_SEEN_NONE;
6535 for (p = pbeg; p < pend; p++)
6537 if (*p == '\n')
6538 eol_seen |= EOL_SEEN_LF;
6539 else if (*p == '\r')
6541 if (p + 1 < pend && *(p + 1) == '\n')
6543 eol_seen |= EOL_SEEN_CRLF;
6544 p++;
6546 else
6547 eol_seen |= EOL_SEEN_CR;
6550 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6551 if ((eol_seen & EOL_SEEN_CRLF) != 0
6552 && (eol_seen & EOL_SEEN_CR) != 0
6553 && (eol_seen & EOL_SEEN_LF) == 0)
6554 eol_seen = EOL_SEEN_CRLF;
6555 else if (eol_seen != EOL_SEEN_NONE
6556 && eol_seen != EOL_SEEN_LF
6557 && eol_seen != EOL_SEEN_CRLF
6558 && eol_seen != EOL_SEEN_CR)
6559 eol_seen = EOL_SEEN_LF;
6560 if (eol_seen != EOL_SEEN_NONE)
6561 eol_type = adjust_coding_eol_type (coding, eol_seen);
6564 if (EQ (eol_type, Qmac))
6566 for (p = pbeg; p < pend; p++)
6567 if (*p == '\r')
6568 *p = '\n';
6570 else if (EQ (eol_type, Qdos))
6572 int n = 0;
6574 if (NILP (coding->dst_object))
6576 /* Start deleting '\r' from the tail to minimize the memory
6577 movement. */
6578 for (p = pend - 2; p >= pbeg; p--)
6579 if (*p == '\r')
6581 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6582 n++;
6585 else
6587 int pos_byte = coding->dst_pos_byte;
6588 int pos = coding->dst_pos;
6589 int pos_end = pos + coding->produced_char - 1;
6591 while (pos < pos_end)
6593 p = BYTE_POS_ADDR (pos_byte);
6594 if (*p == '\r' && p[1] == '\n')
6596 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6597 n++;
6598 pos_end--;
6600 pos++;
6601 if (coding->dst_multibyte)
6602 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6603 else
6604 pos_byte++;
6607 coding->produced -= n;
6608 coding->produced_char -= n;
6613 /* Return a translation table (or list of them) from coding system
6614 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6615 decoding (ENCODEP is zero). */
6617 static Lisp_Object
6618 get_translation_table (attrs, encodep, max_lookup)
6619 Lisp_Object attrs;
6620 int encodep, *max_lookup;
6622 Lisp_Object standard, translation_table;
6623 Lisp_Object val;
6625 if (encodep)
6626 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6627 standard = Vstandard_translation_table_for_encode;
6628 else
6629 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6630 standard = Vstandard_translation_table_for_decode;
6631 if (NILP (translation_table))
6632 translation_table = standard;
6633 else
6635 if (SYMBOLP (translation_table))
6636 translation_table = Fget (translation_table, Qtranslation_table);
6637 else if (CONSP (translation_table))
6639 translation_table = Fcopy_sequence (translation_table);
6640 for (val = translation_table; CONSP (val); val = XCDR (val))
6641 if (SYMBOLP (XCAR (val)))
6642 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6644 if (CHAR_TABLE_P (standard))
6646 if (CONSP (translation_table))
6647 translation_table = nconc2 (translation_table,
6648 Fcons (standard, Qnil));
6649 else
6650 translation_table = Fcons (translation_table,
6651 Fcons (standard, Qnil));
6655 if (max_lookup)
6657 *max_lookup = 1;
6658 if (CHAR_TABLE_P (translation_table)
6659 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6661 val = XCHAR_TABLE (translation_table)->extras[1];
6662 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6663 *max_lookup = XFASTINT (val);
6665 else if (CONSP (translation_table))
6667 Lisp_Object tail, val;
6669 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6670 if (CHAR_TABLE_P (XCAR (tail))
6671 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6673 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6674 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6675 *max_lookup = XFASTINT (val);
6679 return translation_table;
6682 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6683 do { \
6684 trans = Qnil; \
6685 if (CHAR_TABLE_P (table)) \
6687 trans = CHAR_TABLE_REF (table, c); \
6688 if (CHARACTERP (trans)) \
6689 c = XFASTINT (trans), trans = Qnil; \
6691 else if (CONSP (table)) \
6693 Lisp_Object tail; \
6695 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6696 if (CHAR_TABLE_P (XCAR (tail))) \
6698 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6699 if (CHARACTERP (trans)) \
6700 c = XFASTINT (trans), trans = Qnil; \
6701 else if (! NILP (trans)) \
6702 break; \
6705 } while (0)
6708 /* Return a translation of character(s) at BUF according to TRANS.
6709 TRANS is TO-CHAR or ((FROM . TO) ...) where
6710 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6711 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6712 translation is found, and Qnil if not found..
6713 If BUF is too short to lookup characters in FROM, return Qt. */
6715 static Lisp_Object
6716 get_translation (trans, buf, buf_end)
6717 Lisp_Object trans;
6718 int *buf, *buf_end;
6721 if (INTEGERP (trans))
6722 return trans;
6723 for (; CONSP (trans); trans = XCDR (trans))
6725 Lisp_Object val = XCAR (trans);
6726 Lisp_Object from = XCAR (val);
6727 int len = ASIZE (from);
6728 int i;
6730 for (i = 0; i < len; i++)
6732 if (buf + i == buf_end)
6733 return Qt;
6734 if (XINT (AREF (from, i)) != buf[i])
6735 break;
6737 if (i == len)
6738 return val;
6740 return Qnil;
6744 static int
6745 produce_chars (coding, translation_table, last_block)
6746 struct coding_system *coding;
6747 Lisp_Object translation_table;
6748 int last_block;
6750 unsigned char *dst = coding->destination + coding->produced;
6751 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6752 EMACS_INT produced;
6753 EMACS_INT produced_chars = 0;
6754 int carryover = 0;
6756 if (! coding->chars_at_source)
6758 /* Source characters are in coding->charbuf. */
6759 int *buf = coding->charbuf;
6760 int *buf_end = buf + coding->charbuf_used;
6762 if (EQ (coding->src_object, coding->dst_object))
6764 coding_set_source (coding);
6765 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6768 while (buf < buf_end)
6770 int c = *buf, i;
6772 if (c >= 0)
6774 int from_nchars = 1, to_nchars = 1;
6775 Lisp_Object trans = Qnil;
6777 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6778 if (! NILP (trans))
6780 trans = get_translation (trans, buf, buf_end);
6781 if (INTEGERP (trans))
6782 c = XINT (trans);
6783 else if (CONSP (trans))
6785 from_nchars = ASIZE (XCAR (trans));
6786 trans = XCDR (trans);
6787 if (INTEGERP (trans))
6788 c = XINT (trans);
6789 else
6791 to_nchars = ASIZE (trans);
6792 c = XINT (AREF (trans, 0));
6795 else if (EQ (trans, Qt) && ! last_block)
6796 break;
6799 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6801 dst = alloc_destination (coding,
6802 buf_end - buf
6803 + MAX_MULTIBYTE_LENGTH * to_nchars,
6804 dst);
6805 if (EQ (coding->src_object, coding->dst_object))
6807 coding_set_source (coding);
6808 dst_end = (((unsigned char *) coding->source)
6809 + coding->consumed);
6811 else
6812 dst_end = coding->destination + coding->dst_bytes;
6815 for (i = 0; i < to_nchars; i++)
6817 if (i > 0)
6818 c = XINT (AREF (trans, i));
6819 if (coding->dst_multibyte
6820 || ! CHAR_BYTE8_P (c))
6821 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6822 else
6823 *dst++ = CHAR_TO_BYTE8 (c);
6825 produced_chars += to_nchars;
6826 buf += from_nchars;
6828 else
6829 /* This is an annotation datum. (-C) is the length. */
6830 buf += -c;
6832 carryover = buf_end - buf;
6834 else
6836 /* Source characters are at coding->source. */
6837 const unsigned char *src = coding->source;
6838 const unsigned char *src_end = src + coding->consumed;
6840 if (EQ (coding->dst_object, coding->src_object))
6841 dst_end = (unsigned char *) src;
6842 if (coding->src_multibyte != coding->dst_multibyte)
6844 if (coding->src_multibyte)
6846 int multibytep = 1;
6847 EMACS_INT consumed_chars = 0;
6849 while (1)
6851 const unsigned char *src_base = src;
6852 int c;
6854 ONE_MORE_BYTE (c);
6855 if (dst == dst_end)
6857 if (EQ (coding->src_object, coding->dst_object))
6858 dst_end = (unsigned char *) src;
6859 if (dst == dst_end)
6861 EMACS_INT offset = src - coding->source;
6863 dst = alloc_destination (coding, src_end - src + 1,
6864 dst);
6865 dst_end = coding->destination + coding->dst_bytes;
6866 coding_set_source (coding);
6867 src = coding->source + offset;
6868 src_end = coding->source + coding->src_bytes;
6869 if (EQ (coding->src_object, coding->dst_object))
6870 dst_end = (unsigned char *) src;
6873 *dst++ = c;
6874 produced_chars++;
6876 no_more_source:
6879 else
6880 while (src < src_end)
6882 int multibytep = 1;
6883 int c = *src++;
6885 if (dst >= dst_end - 1)
6887 if (EQ (coding->src_object, coding->dst_object))
6888 dst_end = (unsigned char *) src;
6889 if (dst >= dst_end - 1)
6891 EMACS_INT offset = src - coding->source;
6892 EMACS_INT more_bytes;
6894 if (EQ (coding->src_object, coding->dst_object))
6895 more_bytes = ((src_end - src) / 2) + 2;
6896 else
6897 more_bytes = src_end - src + 2;
6898 dst = alloc_destination (coding, more_bytes, dst);
6899 dst_end = coding->destination + coding->dst_bytes;
6900 coding_set_source (coding);
6901 src = coding->source + offset;
6902 src_end = coding->source + coding->src_bytes;
6903 if (EQ (coding->src_object, coding->dst_object))
6904 dst_end = (unsigned char *) src;
6907 EMIT_ONE_BYTE (c);
6910 else
6912 if (!EQ (coding->src_object, coding->dst_object))
6914 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6916 if (require > 0)
6918 EMACS_INT offset = src - coding->source;
6920 dst = alloc_destination (coding, require, dst);
6921 coding_set_source (coding);
6922 src = coding->source + offset;
6923 src_end = coding->source + coding->src_bytes;
6926 produced_chars = coding->consumed_char;
6927 while (src < src_end)
6928 *dst++ = *src++;
6932 produced = dst - (coding->destination + coding->produced);
6933 if (BUFFERP (coding->dst_object) && produced_chars > 0)
6934 insert_from_gap (produced_chars, produced);
6935 coding->produced += produced;
6936 coding->produced_char += produced_chars;
6937 return carryover;
6940 /* Compose text in CODING->object according to the annotation data at
6941 CHARBUF. CHARBUF is an array:
6942 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6945 static INLINE void
6946 produce_composition (coding, charbuf, pos)
6947 struct coding_system *coding;
6948 int *charbuf;
6949 EMACS_INT pos;
6951 int len;
6952 EMACS_INT to;
6953 enum composition_method method;
6954 Lisp_Object components;
6956 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6957 to = pos + charbuf[2];
6958 method = (enum composition_method) (charbuf[4]);
6960 if (method == COMPOSITION_RELATIVE)
6961 components = Qnil;
6962 else
6964 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6965 int i, j;
6967 if (method == COMPOSITION_WITH_RULE)
6968 len = charbuf[2] * 3 - 2;
6969 charbuf += MAX_ANNOTATION_LENGTH;
6970 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6971 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6973 if (charbuf[i] >= 0)
6974 args[j] = make_number (charbuf[i]);
6975 else
6977 i++;
6978 args[j] = make_number (charbuf[i] % 0x100);
6981 components = (i == j ? Fstring (j, args) : Fvector (j, args));
6983 compose_text (pos, to, components, Qnil, coding->dst_object);
6987 /* Put `charset' property on text in CODING->object according to
6988 the annotation data at CHARBUF. CHARBUF is an array:
6989 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6992 static INLINE void
6993 produce_charset (coding, charbuf, pos)
6994 struct coding_system *coding;
6995 int *charbuf;
6996 EMACS_INT pos;
6998 EMACS_INT from = pos - charbuf[2];
6999 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7001 Fput_text_property (make_number (from), make_number (pos),
7002 Qcharset, CHARSET_NAME (charset),
7003 coding->dst_object);
7007 #define CHARBUF_SIZE 0x4000
7009 #define ALLOC_CONVERSION_WORK_AREA(coding) \
7010 do { \
7011 int size = CHARBUF_SIZE; \
7013 coding->charbuf = NULL; \
7014 while (size > 1024) \
7016 coding->charbuf = (int *) alloca (sizeof (int) * size); \
7017 if (coding->charbuf) \
7018 break; \
7019 size >>= 1; \
7021 if (! coding->charbuf) \
7023 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
7024 return coding->result; \
7026 coding->charbuf_size = size; \
7027 } while (0)
7030 static void
7031 produce_annotation (coding, pos)
7032 struct coding_system *coding;
7033 EMACS_INT pos;
7035 int *charbuf = coding->charbuf;
7036 int *charbuf_end = charbuf + coding->charbuf_used;
7038 if (NILP (coding->dst_object))
7039 return;
7041 while (charbuf < charbuf_end)
7043 if (*charbuf >= 0)
7044 pos++, charbuf++;
7045 else
7047 int len = -*charbuf;
7049 if (len > 2)
7050 switch (charbuf[1])
7052 case CODING_ANNOTATE_COMPOSITION_MASK:
7053 produce_composition (coding, charbuf, pos);
7054 break;
7055 case CODING_ANNOTATE_CHARSET_MASK:
7056 produce_charset (coding, charbuf, pos);
7057 break;
7059 charbuf += len;
7064 /* Decode the data at CODING->src_object into CODING->dst_object.
7065 CODING->src_object is a buffer, a string, or nil.
7066 CODING->dst_object is a buffer.
7068 If CODING->src_object is a buffer, it must be the current buffer.
7069 In this case, if CODING->src_pos is positive, it is a position of
7070 the source text in the buffer, otherwise, the source text is in the
7071 gap area of the buffer, and CODING->src_pos specifies the offset of
7072 the text from GPT (which must be the same as PT). If this is the
7073 same buffer as CODING->dst_object, CODING->src_pos must be
7074 negative.
7076 If CODING->src_object is a string, CODING->src_pos is an index to
7077 that string.
7079 If CODING->src_object is nil, CODING->source must already point to
7080 the non-relocatable memory area. In this case, CODING->src_pos is
7081 an offset from CODING->source.
7083 The decoded data is inserted at the current point of the buffer
7084 CODING->dst_object.
7087 static int
7088 decode_coding (coding)
7089 struct coding_system *coding;
7091 Lisp_Object attrs;
7092 Lisp_Object undo_list;
7093 Lisp_Object translation_table;
7094 int carryover;
7095 int i;
7097 if (BUFFERP (coding->src_object)
7098 && coding->src_pos > 0
7099 && coding->src_pos < GPT
7100 && coding->src_pos + coding->src_chars > GPT)
7101 move_gap_both (coding->src_pos, coding->src_pos_byte);
7103 undo_list = Qt;
7104 if (BUFFERP (coding->dst_object))
7106 if (current_buffer != XBUFFER (coding->dst_object))
7107 set_buffer_internal (XBUFFER (coding->dst_object));
7108 if (GPT != PT)
7109 move_gap_both (PT, PT_BYTE);
7110 undo_list = current_buffer->undo_list;
7111 current_buffer->undo_list = Qt;
7114 coding->consumed = coding->consumed_char = 0;
7115 coding->produced = coding->produced_char = 0;
7116 coding->chars_at_source = 0;
7117 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7118 coding->errors = 0;
7120 ALLOC_CONVERSION_WORK_AREA (coding);
7122 attrs = CODING_ID_ATTRS (coding->id);
7123 translation_table = get_translation_table (attrs, 0, NULL);
7125 carryover = 0;
7128 EMACS_INT pos = coding->dst_pos + coding->produced_char;
7130 coding_set_source (coding);
7131 coding->annotated = 0;
7132 coding->charbuf_used = carryover;
7133 (*(coding->decoder)) (coding);
7134 coding_set_destination (coding);
7135 carryover = produce_chars (coding, translation_table, 0);
7136 if (coding->annotated)
7137 produce_annotation (coding, pos);
7138 for (i = 0; i < carryover; i++)
7139 coding->charbuf[i]
7140 = coding->charbuf[coding->charbuf_used - carryover + i];
7142 while (coding->consumed < coding->src_bytes
7143 && (coding->result == CODING_RESULT_SUCCESS
7144 || coding->result == CODING_RESULT_INVALID_SRC));
7146 if (carryover > 0)
7148 coding_set_destination (coding);
7149 coding->charbuf_used = carryover;
7150 produce_chars (coding, translation_table, 1);
7153 coding->carryover_bytes = 0;
7154 if (coding->consumed < coding->src_bytes)
7156 int nbytes = coding->src_bytes - coding->consumed;
7157 const unsigned char *src;
7159 coding_set_source (coding);
7160 coding_set_destination (coding);
7161 src = coding->source + coding->consumed;
7163 if (coding->mode & CODING_MODE_LAST_BLOCK)
7165 /* Flush out unprocessed data as binary chars. We are sure
7166 that the number of data is less than the size of
7167 coding->charbuf. */
7168 coding->charbuf_used = 0;
7169 coding->chars_at_source = 0;
7171 while (nbytes-- > 0)
7173 int c = *src++;
7175 if (c & 0x80)
7176 c = BYTE8_TO_CHAR (c);
7177 coding->charbuf[coding->charbuf_used++] = c;
7179 produce_chars (coding, Qnil, 1);
7181 else
7183 /* Record unprocessed bytes in coding->carryover. We are
7184 sure that the number of data is less than the size of
7185 coding->carryover. */
7186 unsigned char *p = coding->carryover;
7188 if (nbytes > sizeof coding->carryover)
7189 nbytes = sizeof coding->carryover;
7190 coding->carryover_bytes = nbytes;
7191 while (nbytes-- > 0)
7192 *p++ = *src++;
7194 coding->consumed = coding->src_bytes;
7197 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7198 && !inhibit_eol_conversion)
7199 decode_eol (coding);
7200 if (BUFFERP (coding->dst_object))
7202 current_buffer->undo_list = undo_list;
7203 record_insert (coding->dst_pos, coding->produced_char);
7205 return coding->result;
7209 /* Extract an annotation datum from a composition starting at POS and
7210 ending before LIMIT of CODING->src_object (buffer or string), store
7211 the data in BUF, set *STOP to a starting position of the next
7212 composition (if any) or to LIMIT, and return the address of the
7213 next element of BUF.
7215 If such an annotation is not found, set *STOP to a starting
7216 position of a composition after POS (if any) or to LIMIT, and
7217 return BUF. */
7219 static INLINE int *
7220 handle_composition_annotation (pos, limit, coding, buf, stop)
7221 EMACS_INT pos, limit;
7222 struct coding_system *coding;
7223 int *buf;
7224 EMACS_INT *stop;
7226 EMACS_INT start, end;
7227 Lisp_Object prop;
7229 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7230 || end > limit)
7231 *stop = limit;
7232 else if (start > pos)
7233 *stop = start;
7234 else
7236 if (start == pos)
7238 /* We found a composition. Store the corresponding
7239 annotation data in BUF. */
7240 int *head = buf;
7241 enum composition_method method = COMPOSITION_METHOD (prop);
7242 int nchars = COMPOSITION_LENGTH (prop);
7244 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7245 if (method != COMPOSITION_RELATIVE)
7247 Lisp_Object components;
7248 int len, i, i_byte;
7250 components = COMPOSITION_COMPONENTS (prop);
7251 if (VECTORP (components))
7253 len = XVECTOR (components)->size;
7254 for (i = 0; i < len; i++)
7255 *buf++ = XINT (AREF (components, i));
7257 else if (STRINGP (components))
7259 len = SCHARS (components);
7260 i = i_byte = 0;
7261 while (i < len)
7263 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7264 buf++;
7267 else if (INTEGERP (components))
7269 len = 1;
7270 *buf++ = XINT (components);
7272 else if (CONSP (components))
7274 for (len = 0; CONSP (components);
7275 len++, components = XCDR (components))
7276 *buf++ = XINT (XCAR (components));
7278 else
7279 abort ();
7280 *head -= len;
7284 if (find_composition (end, limit, &start, &end, &prop,
7285 coding->src_object)
7286 && end <= limit)
7287 *stop = start;
7288 else
7289 *stop = limit;
7291 return buf;
7295 /* Extract an annotation datum from a text property `charset' at POS of
7296 CODING->src_object (buffer of string), store the data in BUF, set
7297 *STOP to the position where the value of `charset' property changes
7298 (limiting by LIMIT), and return the address of the next element of
7299 BUF.
7301 If the property value is nil, set *STOP to the position where the
7302 property value is non-nil (limiting by LIMIT), and return BUF. */
7304 static INLINE int *
7305 handle_charset_annotation (pos, limit, coding, buf, stop)
7306 EMACS_INT pos, limit;
7307 struct coding_system *coding;
7308 int *buf;
7309 EMACS_INT *stop;
7311 Lisp_Object val, next;
7312 int id;
7314 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7315 if (! NILP (val) && CHARSETP (val))
7316 id = XINT (CHARSET_SYMBOL_ID (val));
7317 else
7318 id = -1;
7319 ADD_CHARSET_DATA (buf, 0, id);
7320 next = Fnext_single_property_change (make_number (pos), Qcharset,
7321 coding->src_object,
7322 make_number (limit));
7323 *stop = XINT (next);
7324 return buf;
7328 static void
7329 consume_chars (coding, translation_table, max_lookup)
7330 struct coding_system *coding;
7331 Lisp_Object translation_table;
7332 int max_lookup;
7334 int *buf = coding->charbuf;
7335 int *buf_end = coding->charbuf + coding->charbuf_size;
7336 const unsigned char *src = coding->source + coding->consumed;
7337 const unsigned char *src_end = coding->source + coding->src_bytes;
7338 EMACS_INT pos = coding->src_pos + coding->consumed_char;
7339 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7340 int multibytep = coding->src_multibyte;
7341 Lisp_Object eol_type;
7342 int c;
7343 EMACS_INT stop, stop_composition, stop_charset;
7344 int *lookup_buf = NULL;
7346 if (! NILP (translation_table))
7347 lookup_buf = alloca (sizeof (int) * max_lookup);
7349 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7350 if (VECTORP (eol_type))
7351 eol_type = Qunix;
7353 /* Note: composition handling is not yet implemented. */
7354 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7356 if (NILP (coding->src_object))
7357 stop = stop_composition = stop_charset = end_pos;
7358 else
7360 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7361 stop = stop_composition = pos;
7362 else
7363 stop = stop_composition = end_pos;
7364 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7365 stop = stop_charset = pos;
7366 else
7367 stop_charset = end_pos;
7370 /* Compensate for CRLF and conversion. */
7371 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7372 while (buf < buf_end)
7374 Lisp_Object trans;
7376 if (pos == stop)
7378 if (pos == end_pos)
7379 break;
7380 if (pos == stop_composition)
7381 buf = handle_composition_annotation (pos, end_pos, coding,
7382 buf, &stop_composition);
7383 if (pos == stop_charset)
7384 buf = handle_charset_annotation (pos, end_pos, coding,
7385 buf, &stop_charset);
7386 stop = (stop_composition < stop_charset
7387 ? stop_composition : stop_charset);
7390 if (! multibytep)
7392 EMACS_INT bytes;
7394 if (coding->encoder == encode_coding_raw_text)
7395 c = *src++, pos++;
7396 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7397 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7398 else
7399 c = BYTE8_TO_CHAR (*src), src++, pos++;
7401 else
7402 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7403 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7404 c = '\n';
7405 if (! EQ (eol_type, Qunix))
7407 if (c == '\n')
7409 if (EQ (eol_type, Qdos))
7410 *buf++ = '\r';
7411 else
7412 c = '\r';
7416 trans = Qnil;
7417 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7418 if (NILP (trans))
7419 *buf++ = c;
7420 else
7422 int from_nchars = 1, to_nchars = 1;
7423 int *lookup_buf_end;
7424 const unsigned char *p = src;
7425 int i;
7427 lookup_buf[0] = c;
7428 for (i = 1; i < max_lookup && p < src_end; i++)
7429 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7430 lookup_buf_end = lookup_buf + i;
7431 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7432 if (INTEGERP (trans))
7433 c = XINT (trans);
7434 else if (CONSP (trans))
7436 from_nchars = ASIZE (XCAR (trans));
7437 trans = XCDR (trans);
7438 if (INTEGERP (trans))
7439 c = XINT (trans);
7440 else
7442 to_nchars = ASIZE (trans);
7443 if (buf + to_nchars > buf_end)
7444 break;
7445 c = XINT (AREF (trans, 0));
7448 else
7449 break;
7450 *buf++ = c;
7451 for (i = 1; i < to_nchars; i++)
7452 *buf++ = XINT (AREF (trans, i));
7453 for (i = 1; i < from_nchars; i++, pos++)
7454 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7458 coding->consumed = src - coding->source;
7459 coding->consumed_char = pos - coding->src_pos;
7460 coding->charbuf_used = buf - coding->charbuf;
7461 coding->chars_at_source = 0;
7465 /* Encode the text at CODING->src_object into CODING->dst_object.
7466 CODING->src_object is a buffer or a string.
7467 CODING->dst_object is a buffer or nil.
7469 If CODING->src_object is a buffer, it must be the current buffer.
7470 In this case, if CODING->src_pos is positive, it is a position of
7471 the source text in the buffer, otherwise. the source text is in the
7472 gap area of the buffer, and coding->src_pos specifies the offset of
7473 the text from GPT (which must be the same as PT). If this is the
7474 same buffer as CODING->dst_object, CODING->src_pos must be
7475 negative and CODING should not have `pre-write-conversion'.
7477 If CODING->src_object is a string, CODING should not have
7478 `pre-write-conversion'.
7480 If CODING->dst_object is a buffer, the encoded data is inserted at
7481 the current point of that buffer.
7483 If CODING->dst_object is nil, the encoded data is placed at the
7484 memory area specified by CODING->destination. */
7486 static int
7487 encode_coding (coding)
7488 struct coding_system *coding;
7490 Lisp_Object attrs;
7491 Lisp_Object translation_table;
7492 int max_lookup;
7494 attrs = CODING_ID_ATTRS (coding->id);
7495 if (coding->encoder == encode_coding_raw_text)
7496 translation_table = Qnil, max_lookup = 0;
7497 else
7498 translation_table = get_translation_table (attrs, 1, &max_lookup);
7500 if (BUFFERP (coding->dst_object))
7502 set_buffer_internal (XBUFFER (coding->dst_object));
7503 coding->dst_multibyte
7504 = ! NILP (current_buffer->enable_multibyte_characters);
7507 coding->consumed = coding->consumed_char = 0;
7508 coding->produced = coding->produced_char = 0;
7509 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7510 coding->errors = 0;
7512 ALLOC_CONVERSION_WORK_AREA (coding);
7514 do {
7515 coding_set_source (coding);
7516 consume_chars (coding, translation_table, max_lookup);
7517 coding_set_destination (coding);
7518 (*(coding->encoder)) (coding);
7519 } while (coding->consumed_char < coding->src_chars);
7521 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7522 insert_from_gap (coding->produced_char, coding->produced);
7524 return (coding->result);
7528 /* Name (or base name) of work buffer for code conversion. */
7529 static Lisp_Object Vcode_conversion_workbuf_name;
7531 /* A working buffer used by the top level conversion. Once it is
7532 created, it is never destroyed. It has the name
7533 Vcode_conversion_workbuf_name. The other working buffers are
7534 destroyed after the use is finished, and their names are modified
7535 versions of Vcode_conversion_workbuf_name. */
7536 static Lisp_Object Vcode_conversion_reused_workbuf;
7538 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
7539 static int reused_workbuf_in_use;
7542 /* Return a working buffer of code convesion. MULTIBYTE specifies the
7543 multibyteness of returning buffer. */
7545 static Lisp_Object
7546 make_conversion_work_buffer (multibyte)
7547 int multibyte;
7549 Lisp_Object name, workbuf;
7550 struct buffer *current;
7552 if (reused_workbuf_in_use++)
7554 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7555 workbuf = Fget_buffer_create (name);
7557 else
7559 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7560 Vcode_conversion_reused_workbuf
7561 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7562 workbuf = Vcode_conversion_reused_workbuf;
7564 current = current_buffer;
7565 set_buffer_internal (XBUFFER (workbuf));
7566 /* We can't allow modification hooks to run in the work buffer. For
7567 instance, directory_files_internal assumes that file decoding
7568 doesn't compile new regexps. */
7569 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7570 Ferase_buffer ();
7571 current_buffer->undo_list = Qt;
7572 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7573 set_buffer_internal (current);
7574 return workbuf;
7578 static Lisp_Object
7579 code_conversion_restore (arg)
7580 Lisp_Object arg;
7582 Lisp_Object current, workbuf;
7583 struct gcpro gcpro1;
7585 GCPRO1 (arg);
7586 current = XCAR (arg);
7587 workbuf = XCDR (arg);
7588 if (! NILP (workbuf))
7590 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7591 reused_workbuf_in_use = 0;
7592 else if (! NILP (Fbuffer_live_p (workbuf)))
7593 Fkill_buffer (workbuf);
7595 set_buffer_internal (XBUFFER (current));
7596 UNGCPRO;
7597 return Qnil;
7600 Lisp_Object
7601 code_conversion_save (with_work_buf, multibyte)
7602 int with_work_buf, multibyte;
7604 Lisp_Object workbuf = Qnil;
7606 if (with_work_buf)
7607 workbuf = make_conversion_work_buffer (multibyte);
7608 record_unwind_protect (code_conversion_restore,
7609 Fcons (Fcurrent_buffer (), workbuf));
7610 return workbuf;
7614 decode_coding_gap (coding, chars, bytes)
7615 struct coding_system *coding;
7616 EMACS_INT chars, bytes;
7618 int count = specpdl_ptr - specpdl;
7619 Lisp_Object attrs;
7621 code_conversion_save (0, 0);
7623 coding->src_object = Fcurrent_buffer ();
7624 coding->src_chars = chars;
7625 coding->src_bytes = bytes;
7626 coding->src_pos = -chars;
7627 coding->src_pos_byte = -bytes;
7628 coding->src_multibyte = chars < bytes;
7629 coding->dst_object = coding->src_object;
7630 coding->dst_pos = PT;
7631 coding->dst_pos_byte = PT_BYTE;
7632 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7634 if (CODING_REQUIRE_DETECTION (coding))
7635 detect_coding (coding);
7637 coding->mode |= CODING_MODE_LAST_BLOCK;
7638 current_buffer->text->inhibit_shrinking = 1;
7639 decode_coding (coding);
7640 current_buffer->text->inhibit_shrinking = 0;
7642 attrs = CODING_ID_ATTRS (coding->id);
7643 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7645 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7646 Lisp_Object val;
7648 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7649 val = call1 (CODING_ATTR_POST_READ (attrs),
7650 make_number (coding->produced_char));
7651 CHECK_NATNUM (val);
7652 coding->produced_char += Z - prev_Z;
7653 coding->produced += Z_BYTE - prev_Z_BYTE;
7656 unbind_to (count, Qnil);
7657 return coding->result;
7661 encode_coding_gap (coding, chars, bytes)
7662 struct coding_system *coding;
7663 EMACS_INT chars, bytes;
7665 int count = specpdl_ptr - specpdl;
7667 code_conversion_save (0, 0);
7669 coding->src_object = Fcurrent_buffer ();
7670 coding->src_chars = chars;
7671 coding->src_bytes = bytes;
7672 coding->src_pos = -chars;
7673 coding->src_pos_byte = -bytes;
7674 coding->src_multibyte = chars < bytes;
7675 coding->dst_object = coding->src_object;
7676 coding->dst_pos = PT;
7677 coding->dst_pos_byte = PT_BYTE;
7679 encode_coding (coding);
7681 unbind_to (count, Qnil);
7682 return coding->result;
7686 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7687 SRC_OBJECT into DST_OBJECT by coding context CODING.
7689 SRC_OBJECT is a buffer, a string, or Qnil.
7691 If it is a buffer, the text is at point of the buffer. FROM and TO
7692 are positions in the buffer.
7694 If it is a string, the text is at the beginning of the string.
7695 FROM and TO are indices to the string.
7697 If it is nil, the text is at coding->source. FROM and TO are
7698 indices to coding->source.
7700 DST_OBJECT is a buffer, Qt, or Qnil.
7702 If it is a buffer, the decoded text is inserted at point of the
7703 buffer. If the buffer is the same as SRC_OBJECT, the source text
7704 is deleted.
7706 If it is Qt, a string is made from the decoded text, and
7707 set in CODING->dst_object.
7709 If it is Qnil, the decoded text is stored at CODING->destination.
7710 The caller must allocate CODING->dst_bytes bytes at
7711 CODING->destination by xmalloc. If the decoded text is longer than
7712 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7715 void
7716 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7717 dst_object)
7718 struct coding_system *coding;
7719 Lisp_Object src_object;
7720 EMACS_INT from, from_byte, to, to_byte;
7721 Lisp_Object dst_object;
7723 int count = specpdl_ptr - specpdl;
7724 unsigned char *destination;
7725 EMACS_INT dst_bytes;
7726 EMACS_INT chars = to - from;
7727 EMACS_INT bytes = to_byte - from_byte;
7728 Lisp_Object attrs;
7729 int saved_pt = -1, saved_pt_byte;
7730 int need_marker_adjustment = 0;
7731 Lisp_Object old_deactivate_mark;
7733 old_deactivate_mark = Vdeactivate_mark;
7735 if (NILP (dst_object))
7737 destination = coding->destination;
7738 dst_bytes = coding->dst_bytes;
7741 coding->src_object = src_object;
7742 coding->src_chars = chars;
7743 coding->src_bytes = bytes;
7744 coding->src_multibyte = chars < bytes;
7746 if (STRINGP (src_object))
7748 coding->src_pos = from;
7749 coding->src_pos_byte = from_byte;
7751 else if (BUFFERP (src_object))
7753 set_buffer_internal (XBUFFER (src_object));
7754 if (from != GPT)
7755 move_gap_both (from, from_byte);
7756 if (EQ (src_object, dst_object))
7758 struct Lisp_Marker *tail;
7760 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7762 tail->need_adjustment
7763 = tail->charpos == (tail->insertion_type ? from : to);
7764 need_marker_adjustment |= tail->need_adjustment;
7766 saved_pt = PT, saved_pt_byte = PT_BYTE;
7767 TEMP_SET_PT_BOTH (from, from_byte);
7768 current_buffer->text->inhibit_shrinking = 1;
7769 del_range_both (from, from_byte, to, to_byte, 1);
7770 coding->src_pos = -chars;
7771 coding->src_pos_byte = -bytes;
7773 else
7775 coding->src_pos = from;
7776 coding->src_pos_byte = from_byte;
7780 if (CODING_REQUIRE_DETECTION (coding))
7781 detect_coding (coding);
7782 attrs = CODING_ID_ATTRS (coding->id);
7784 if (EQ (dst_object, Qt)
7785 || (! NILP (CODING_ATTR_POST_READ (attrs))
7786 && NILP (dst_object)))
7788 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7789 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7790 coding->dst_pos = BEG;
7791 coding->dst_pos_byte = BEG_BYTE;
7793 else if (BUFFERP (dst_object))
7795 code_conversion_save (0, 0);
7796 coding->dst_object = dst_object;
7797 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7798 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7799 coding->dst_multibyte
7800 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7802 else
7804 code_conversion_save (0, 0);
7805 coding->dst_object = Qnil;
7806 /* Most callers presume this will return a multibyte result, and they
7807 won't use `binary' or `raw-text' anyway, so let's not worry about
7808 CODING_FOR_UNIBYTE. */
7809 coding->dst_multibyte = 1;
7812 decode_coding (coding);
7814 if (BUFFERP (coding->dst_object))
7815 set_buffer_internal (XBUFFER (coding->dst_object));
7817 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7819 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7820 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7821 Lisp_Object val;
7823 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7824 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7825 old_deactivate_mark);
7826 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7827 make_number (coding->produced_char));
7828 UNGCPRO;
7829 CHECK_NATNUM (val);
7830 coding->produced_char += Z - prev_Z;
7831 coding->produced += Z_BYTE - prev_Z_BYTE;
7834 if (EQ (dst_object, Qt))
7836 coding->dst_object = Fbuffer_string ();
7838 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7840 set_buffer_internal (XBUFFER (coding->dst_object));
7841 if (dst_bytes < coding->produced)
7843 destination = xrealloc (destination, coding->produced);
7844 if (! destination)
7846 record_conversion_result (coding,
7847 CODING_RESULT_INSUFFICIENT_DST);
7848 unbind_to (count, Qnil);
7849 return;
7851 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7852 move_gap_both (BEGV, BEGV_BYTE);
7853 bcopy (BEGV_ADDR, destination, coding->produced);
7854 coding->destination = destination;
7858 if (saved_pt >= 0)
7860 /* This is the case of:
7861 (BUFFERP (src_object) && EQ (src_object, dst_object))
7862 As we have moved PT while replacing the original buffer
7863 contents, we must recover it now. */
7864 set_buffer_internal (XBUFFER (src_object));
7865 current_buffer->text->inhibit_shrinking = 0;
7866 if (saved_pt < from)
7867 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7868 else if (saved_pt < from + chars)
7869 TEMP_SET_PT_BOTH (from, from_byte);
7870 else if (! NILP (current_buffer->enable_multibyte_characters))
7871 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7872 saved_pt_byte + (coding->produced - bytes));
7873 else
7874 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7875 saved_pt_byte + (coding->produced - bytes));
7877 if (need_marker_adjustment)
7879 struct Lisp_Marker *tail;
7881 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7882 if (tail->need_adjustment)
7884 tail->need_adjustment = 0;
7885 if (tail->insertion_type)
7887 tail->bytepos = from_byte;
7888 tail->charpos = from;
7890 else
7892 tail->bytepos = from_byte + coding->produced;
7893 tail->charpos
7894 = (NILP (current_buffer->enable_multibyte_characters)
7895 ? tail->bytepos : from + coding->produced_char);
7901 Vdeactivate_mark = old_deactivate_mark;
7902 unbind_to (count, coding->dst_object);
7906 void
7907 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7908 dst_object)
7909 struct coding_system *coding;
7910 Lisp_Object src_object;
7911 EMACS_INT from, from_byte, to, to_byte;
7912 Lisp_Object dst_object;
7914 int count = specpdl_ptr - specpdl;
7915 EMACS_INT chars = to - from;
7916 EMACS_INT bytes = to_byte - from_byte;
7917 Lisp_Object attrs;
7918 int saved_pt = -1, saved_pt_byte;
7919 int need_marker_adjustment = 0;
7920 int kill_src_buffer = 0;
7921 Lisp_Object old_deactivate_mark;
7923 old_deactivate_mark = Vdeactivate_mark;
7925 coding->src_object = src_object;
7926 coding->src_chars = chars;
7927 coding->src_bytes = bytes;
7928 coding->src_multibyte = chars < bytes;
7930 attrs = CODING_ID_ATTRS (coding->id);
7932 if (EQ (src_object, dst_object))
7934 struct Lisp_Marker *tail;
7936 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7938 tail->need_adjustment
7939 = tail->charpos == (tail->insertion_type ? from : to);
7940 need_marker_adjustment |= tail->need_adjustment;
7944 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7946 coding->src_object = code_conversion_save (1, coding->src_multibyte);
7947 set_buffer_internal (XBUFFER (coding->src_object));
7948 if (STRINGP (src_object))
7949 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7950 else if (BUFFERP (src_object))
7951 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7952 else
7953 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7955 if (EQ (src_object, dst_object))
7957 set_buffer_internal (XBUFFER (src_object));
7958 saved_pt = PT, saved_pt_byte = PT_BYTE;
7959 del_range_both (from, from_byte, to, to_byte, 1);
7960 set_buffer_internal (XBUFFER (coding->src_object));
7964 Lisp_Object args[3];
7965 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7967 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7968 old_deactivate_mark);
7969 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7970 args[1] = make_number (BEG);
7971 args[2] = make_number (Z);
7972 safe_call (3, args);
7973 UNGCPRO;
7975 if (XBUFFER (coding->src_object) != current_buffer)
7976 kill_src_buffer = 1;
7977 coding->src_object = Fcurrent_buffer ();
7978 if (BEG != GPT)
7979 move_gap_both (BEG, BEG_BYTE);
7980 coding->src_chars = Z - BEG;
7981 coding->src_bytes = Z_BYTE - BEG_BYTE;
7982 coding->src_pos = BEG;
7983 coding->src_pos_byte = BEG_BYTE;
7984 coding->src_multibyte = Z < Z_BYTE;
7986 else if (STRINGP (src_object))
7988 code_conversion_save (0, 0);
7989 coding->src_pos = from;
7990 coding->src_pos_byte = from_byte;
7992 else if (BUFFERP (src_object))
7994 code_conversion_save (0, 0);
7995 set_buffer_internal (XBUFFER (src_object));
7996 if (EQ (src_object, dst_object))
7998 saved_pt = PT, saved_pt_byte = PT_BYTE;
7999 coding->src_object = del_range_1 (from, to, 1, 1);
8000 coding->src_pos = 0;
8001 coding->src_pos_byte = 0;
8003 else
8005 if (from < GPT && to >= GPT)
8006 move_gap_both (from, from_byte);
8007 coding->src_pos = from;
8008 coding->src_pos_byte = from_byte;
8011 else
8012 code_conversion_save (0, 0);
8014 if (BUFFERP (dst_object))
8016 coding->dst_object = dst_object;
8017 if (EQ (src_object, dst_object))
8019 coding->dst_pos = from;
8020 coding->dst_pos_byte = from_byte;
8022 else
8024 struct buffer *current = current_buffer;
8026 set_buffer_temp (XBUFFER (dst_object));
8027 coding->dst_pos = PT;
8028 coding->dst_pos_byte = PT_BYTE;
8029 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8030 set_buffer_temp (current);
8032 coding->dst_multibyte
8033 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
8035 else if (EQ (dst_object, Qt))
8037 coding->dst_object = Qnil;
8038 coding->dst_bytes = coding->src_chars;
8039 if (coding->dst_bytes == 0)
8040 coding->dst_bytes = 1;
8041 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
8042 coding->dst_multibyte = 0;
8044 else
8046 coding->dst_object = Qnil;
8047 coding->dst_multibyte = 0;
8050 encode_coding (coding);
8052 if (EQ (dst_object, Qt))
8054 if (BUFFERP (coding->dst_object))
8055 coding->dst_object = Fbuffer_string ();
8056 else
8058 coding->dst_object
8059 = make_unibyte_string ((char *) coding->destination,
8060 coding->produced);
8061 xfree (coding->destination);
8065 if (saved_pt >= 0)
8067 /* This is the case of:
8068 (BUFFERP (src_object) && EQ (src_object, dst_object))
8069 As we have moved PT while replacing the original buffer
8070 contents, we must recover it now. */
8071 set_buffer_internal (XBUFFER (src_object));
8072 if (saved_pt < from)
8073 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8074 else if (saved_pt < from + chars)
8075 TEMP_SET_PT_BOTH (from, from_byte);
8076 else if (! NILP (current_buffer->enable_multibyte_characters))
8077 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8078 saved_pt_byte + (coding->produced - bytes));
8079 else
8080 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8081 saved_pt_byte + (coding->produced - bytes));
8083 if (need_marker_adjustment)
8085 struct Lisp_Marker *tail;
8087 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8088 if (tail->need_adjustment)
8090 tail->need_adjustment = 0;
8091 if (tail->insertion_type)
8093 tail->bytepos = from_byte;
8094 tail->charpos = from;
8096 else
8098 tail->bytepos = from_byte + coding->produced;
8099 tail->charpos
8100 = (NILP (current_buffer->enable_multibyte_characters)
8101 ? tail->bytepos : from + coding->produced_char);
8107 if (kill_src_buffer)
8108 Fkill_buffer (coding->src_object);
8110 Vdeactivate_mark = old_deactivate_mark;
8111 unbind_to (count, Qnil);
8115 Lisp_Object
8116 preferred_coding_system ()
8118 int id = coding_categories[coding_priorities[0]].id;
8120 return CODING_ID_NAME (id);
8124 #ifdef emacs
8125 /*** 8. Emacs Lisp library functions ***/
8127 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8128 doc: /* Return t if OBJECT is nil or a coding-system.
8129 See the documentation of `define-coding-system' for information
8130 about coding-system objects. */)
8131 (object)
8132 Lisp_Object object;
8134 if (NILP (object)
8135 || CODING_SYSTEM_ID (object) >= 0)
8136 return Qt;
8137 if (! SYMBOLP (object)
8138 || NILP (Fget (object, Qcoding_system_define_form)))
8139 return Qnil;
8140 return Qt;
8143 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8144 Sread_non_nil_coding_system, 1, 1, 0,
8145 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8146 (prompt)
8147 Lisp_Object prompt;
8149 Lisp_Object val;
8152 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8153 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8155 while (SCHARS (val) == 0);
8156 return (Fintern (val, Qnil));
8159 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8160 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8161 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8162 Ignores case when completing coding systems (all Emacs coding systems
8163 are lower-case). */)
8164 (prompt, default_coding_system)
8165 Lisp_Object prompt, default_coding_system;
8167 Lisp_Object val;
8168 int count = SPECPDL_INDEX ();
8170 if (SYMBOLP (default_coding_system))
8171 default_coding_system = SYMBOL_NAME (default_coding_system);
8172 specbind (Qcompletion_ignore_case, Qt);
8173 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8174 Qt, Qnil, Qcoding_system_history,
8175 default_coding_system, Qnil);
8176 unbind_to (count, Qnil);
8177 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8180 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8181 1, 1, 0,
8182 doc: /* Check validity of CODING-SYSTEM.
8183 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8184 It is valid if it is nil or a symbol defined as a coding system by the
8185 function `define-coding-system'. */)
8186 (coding_system)
8187 Lisp_Object coding_system;
8189 Lisp_Object define_form;
8191 define_form = Fget (coding_system, Qcoding_system_define_form);
8192 if (! NILP (define_form))
8194 Fput (coding_system, Qcoding_system_define_form, Qnil);
8195 safe_eval (define_form);
8197 if (!NILP (Fcoding_system_p (coding_system)))
8198 return coding_system;
8199 xsignal1 (Qcoding_system_error, coding_system);
8203 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8204 HIGHEST is nonzero, return the coding system of the highest
8205 priority among the detected coding systems. Otherwize return a
8206 list of detected coding systems sorted by their priorities. If
8207 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8208 multibyte form but contains only ASCII and eight-bit chars.
8209 Otherwise, the bytes are raw bytes.
8211 CODING-SYSTEM controls the detection as below:
8213 If it is nil, detect both text-format and eol-format. If the
8214 text-format part of CODING-SYSTEM is already specified
8215 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8216 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8217 detect only text-format. */
8219 Lisp_Object
8220 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
8221 coding_system)
8222 const unsigned char *src;
8223 EMACS_INT src_chars, src_bytes;
8224 int highest;
8225 int multibytep;
8226 Lisp_Object coding_system;
8228 const unsigned char *src_end = src + src_bytes;
8229 Lisp_Object attrs, eol_type;
8230 Lisp_Object val = Qnil;
8231 struct coding_system coding;
8232 int id;
8233 struct coding_detection_info detect_info;
8234 enum coding_category base_category;
8235 int null_byte_found = 0, eight_bit_found = 0;
8237 if (NILP (coding_system))
8238 coding_system = Qundecided;
8239 setup_coding_system (coding_system, &coding);
8240 attrs = CODING_ID_ATTRS (coding.id);
8241 eol_type = CODING_ID_EOL_TYPE (coding.id);
8242 coding_system = CODING_ATTR_BASE_NAME (attrs);
8244 coding.source = src;
8245 coding.src_chars = src_chars;
8246 coding.src_bytes = src_bytes;
8247 coding.src_multibyte = multibytep;
8248 coding.consumed = 0;
8249 coding.mode |= CODING_MODE_LAST_BLOCK;
8250 coding.head_ascii = 0;
8252 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8254 /* At first, detect text-format if necessary. */
8255 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8256 if (base_category == coding_category_undecided)
8258 enum coding_category category;
8259 struct coding_system *this;
8260 int c, i;
8262 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8263 for (; src < src_end; src++)
8265 c = *src;
8266 if (c & 0x80)
8268 eight_bit_found = 1;
8269 if (null_byte_found)
8270 break;
8272 else if (c < 0x20)
8274 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8275 && ! inhibit_iso_escape_detection
8276 && ! detect_info.checked)
8278 if (detect_coding_iso_2022 (&coding, &detect_info))
8280 /* We have scanned the whole data. */
8281 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8283 /* We didn't find an 8-bit code. We may
8284 have found a null-byte, but it's very
8285 rare that a binary file confirm to
8286 ISO-2022. */
8287 src = src_end;
8288 coding.head_ascii = src - coding.source;
8290 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8291 break;
8294 else if (! c && !inhibit_null_byte_detection)
8296 null_byte_found = 1;
8297 if (eight_bit_found)
8298 break;
8300 if (! eight_bit_found)
8301 coding.head_ascii++;
8303 else if (! eight_bit_found)
8304 coding.head_ascii++;
8307 if (null_byte_found || eight_bit_found
8308 || coding.head_ascii < coding.src_bytes
8309 || detect_info.found)
8311 if (coding.head_ascii == coding.src_bytes)
8312 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8313 for (i = 0; i < coding_category_raw_text; i++)
8315 category = coding_priorities[i];
8316 this = coding_categories + category;
8317 if (detect_info.found & (1 << category))
8318 break;
8320 else
8322 if (null_byte_found)
8324 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8325 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8327 for (i = 0; i < coding_category_raw_text; i++)
8329 category = coding_priorities[i];
8330 this = coding_categories + category;
8332 if (this->id < 0)
8334 /* No coding system of this category is defined. */
8335 detect_info.rejected |= (1 << category);
8337 else if (category >= coding_category_raw_text)
8338 continue;
8339 else if (detect_info.checked & (1 << category))
8341 if (highest
8342 && (detect_info.found & (1 << category)))
8343 break;
8345 else if ((*(this->detector)) (&coding, &detect_info)
8346 && highest
8347 && (detect_info.found & (1 << category)))
8349 if (category == coding_category_utf_16_auto)
8351 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8352 category = coding_category_utf_16_le;
8353 else
8354 category = coding_category_utf_16_be;
8356 break;
8362 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8363 || null_byte_found)
8365 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8366 id = CODING_SYSTEM_ID (Qno_conversion);
8367 val = Fcons (make_number (id), Qnil);
8369 else if (! detect_info.rejected && ! detect_info.found)
8371 detect_info.found = CATEGORY_MASK_ANY;
8372 id = coding_categories[coding_category_undecided].id;
8373 val = Fcons (make_number (id), Qnil);
8375 else if (highest)
8377 if (detect_info.found)
8379 detect_info.found = 1 << category;
8380 val = Fcons (make_number (this->id), Qnil);
8382 else
8383 for (i = 0; i < coding_category_raw_text; i++)
8384 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8386 detect_info.found = 1 << coding_priorities[i];
8387 id = coding_categories[coding_priorities[i]].id;
8388 val = Fcons (make_number (id), Qnil);
8389 break;
8392 else
8394 int mask = detect_info.rejected | detect_info.found;
8395 int found = 0;
8397 for (i = coding_category_raw_text - 1; i >= 0; i--)
8399 category = coding_priorities[i];
8400 if (! (mask & (1 << category)))
8402 found |= 1 << category;
8403 id = coding_categories[category].id;
8404 if (id >= 0)
8405 val = Fcons (make_number (id), val);
8408 for (i = coding_category_raw_text - 1; i >= 0; i--)
8410 category = coding_priorities[i];
8411 if (detect_info.found & (1 << category))
8413 id = coding_categories[category].id;
8414 val = Fcons (make_number (id), val);
8417 detect_info.found |= found;
8420 else if (base_category == coding_category_utf_8_auto)
8422 if (detect_coding_utf_8 (&coding, &detect_info))
8424 struct coding_system *this;
8426 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8427 this = coding_categories + coding_category_utf_8_sig;
8428 else
8429 this = coding_categories + coding_category_utf_8_nosig;
8430 val = Fcons (make_number (this->id), Qnil);
8433 else if (base_category == coding_category_utf_16_auto)
8435 if (detect_coding_utf_16 (&coding, &detect_info))
8437 struct coding_system *this;
8439 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8440 this = coding_categories + coding_category_utf_16_le;
8441 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8442 this = coding_categories + coding_category_utf_16_be;
8443 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8444 this = coding_categories + coding_category_utf_16_be_nosig;
8445 else
8446 this = coding_categories + coding_category_utf_16_le_nosig;
8447 val = Fcons (make_number (this->id), Qnil);
8450 else
8452 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8453 val = Fcons (make_number (coding.id), Qnil);
8456 /* Then, detect eol-format if necessary. */
8458 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8459 Lisp_Object tail;
8461 if (VECTORP (eol_type))
8463 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8465 if (null_byte_found)
8466 normal_eol = EOL_SEEN_LF;
8467 else
8468 normal_eol = detect_eol (coding.source, src_bytes,
8469 coding_category_raw_text);
8471 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8472 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8473 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8474 coding_category_utf_16_be);
8475 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8476 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8477 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8478 coding_category_utf_16_le);
8480 else
8482 if (EQ (eol_type, Qunix))
8483 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8484 else if (EQ (eol_type, Qdos))
8485 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8486 else
8487 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8490 for (tail = val; CONSP (tail); tail = XCDR (tail))
8492 enum coding_category category;
8493 int this_eol;
8495 id = XINT (XCAR (tail));
8496 attrs = CODING_ID_ATTRS (id);
8497 category = XINT (CODING_ATTR_CATEGORY (attrs));
8498 eol_type = CODING_ID_EOL_TYPE (id);
8499 if (VECTORP (eol_type))
8501 if (category == coding_category_utf_16_be
8502 || category == coding_category_utf_16_be_nosig)
8503 this_eol = utf_16_be_eol;
8504 else if (category == coding_category_utf_16_le
8505 || category == coding_category_utf_16_le_nosig)
8506 this_eol = utf_16_le_eol;
8507 else
8508 this_eol = normal_eol;
8510 if (this_eol == EOL_SEEN_LF)
8511 XSETCAR (tail, AREF (eol_type, 0));
8512 else if (this_eol == EOL_SEEN_CRLF)
8513 XSETCAR (tail, AREF (eol_type, 1));
8514 else if (this_eol == EOL_SEEN_CR)
8515 XSETCAR (tail, AREF (eol_type, 2));
8516 else
8517 XSETCAR (tail, CODING_ID_NAME (id));
8519 else
8520 XSETCAR (tail, CODING_ID_NAME (id));
8524 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8528 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8529 2, 3, 0,
8530 doc: /* Detect coding system of the text in the region between START and END.
8531 Return a list of possible coding systems ordered by priority.
8532 The coding systems to try and their priorities follows what
8533 the function `coding-system-priority-list' (which see) returns.
8535 If only ASCII characters are found (except for such ISO-2022 control
8536 characters as ESC), it returns a list of single element `undecided'
8537 or its subsidiary coding system according to a detected end-of-line
8538 format.
8540 If optional argument HIGHEST is non-nil, return the coding system of
8541 highest priority. */)
8542 (start, end, highest)
8543 Lisp_Object start, end, highest;
8545 int from, to;
8546 int from_byte, to_byte;
8548 CHECK_NUMBER_COERCE_MARKER (start);
8549 CHECK_NUMBER_COERCE_MARKER (end);
8551 validate_region (&start, &end);
8552 from = XINT (start), to = XINT (end);
8553 from_byte = CHAR_TO_BYTE (from);
8554 to_byte = CHAR_TO_BYTE (to);
8556 if (from < GPT && to >= GPT)
8557 move_gap_both (to, to_byte);
8559 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8560 to - from, to_byte - from_byte,
8561 !NILP (highest),
8562 !NILP (current_buffer
8563 ->enable_multibyte_characters),
8564 Qnil);
8567 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8568 1, 2, 0,
8569 doc: /* Detect coding system of the text in STRING.
8570 Return a list of possible coding systems ordered by priority.
8571 The coding systems to try and their priorities follows what
8572 the function `coding-system-priority-list' (which see) returns.
8574 If only ASCII characters are found (except for such ISO-2022 control
8575 characters as ESC), it returns a list of single element `undecided'
8576 or its subsidiary coding system according to a detected end-of-line
8577 format.
8579 If optional argument HIGHEST is non-nil, return the coding system of
8580 highest priority. */)
8581 (string, highest)
8582 Lisp_Object string, highest;
8584 CHECK_STRING (string);
8586 return detect_coding_system (SDATA (string),
8587 SCHARS (string), SBYTES (string),
8588 !NILP (highest), STRING_MULTIBYTE (string),
8589 Qnil);
8593 static INLINE int
8594 char_encodable_p (c, attrs)
8595 int c;
8596 Lisp_Object attrs;
8598 Lisp_Object tail;
8599 struct charset *charset;
8600 Lisp_Object translation_table;
8602 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8603 if (! NILP (translation_table))
8604 c = translate_char (translation_table, c);
8605 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8606 CONSP (tail); tail = XCDR (tail))
8608 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8609 if (CHAR_CHARSET_P (c, charset))
8610 break;
8612 return (! NILP (tail));
8616 /* Return a list of coding systems that safely encode the text between
8617 START and END. If EXCLUDE is non-nil, it is a list of coding
8618 systems not to check. The returned list doesn't contain any such
8619 coding systems. In any case, if the text contains only ASCII or is
8620 unibyte, return t. */
8622 DEFUN ("find-coding-systems-region-internal",
8623 Ffind_coding_systems_region_internal,
8624 Sfind_coding_systems_region_internal, 2, 3, 0,
8625 doc: /* Internal use only. */)
8626 (start, end, exclude)
8627 Lisp_Object start, end, exclude;
8629 Lisp_Object coding_attrs_list, safe_codings;
8630 EMACS_INT start_byte, end_byte;
8631 const unsigned char *p, *pbeg, *pend;
8632 int c;
8633 Lisp_Object tail, elt;
8635 if (STRINGP (start))
8637 if (!STRING_MULTIBYTE (start)
8638 || SCHARS (start) == SBYTES (start))
8639 return Qt;
8640 start_byte = 0;
8641 end_byte = SBYTES (start);
8643 else
8645 CHECK_NUMBER_COERCE_MARKER (start);
8646 CHECK_NUMBER_COERCE_MARKER (end);
8647 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8648 args_out_of_range (start, end);
8649 if (NILP (current_buffer->enable_multibyte_characters))
8650 return Qt;
8651 start_byte = CHAR_TO_BYTE (XINT (start));
8652 end_byte = CHAR_TO_BYTE (XINT (end));
8653 if (XINT (end) - XINT (start) == end_byte - start_byte)
8654 return Qt;
8656 if (XINT (start) < GPT && XINT (end) > GPT)
8658 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8659 move_gap_both (XINT (start), start_byte);
8660 else
8661 move_gap_both (XINT (end), end_byte);
8665 coding_attrs_list = Qnil;
8666 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8667 if (NILP (exclude)
8668 || NILP (Fmemq (XCAR (tail), exclude)))
8670 Lisp_Object attrs;
8672 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8673 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8674 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8676 ASET (attrs, coding_attr_trans_tbl,
8677 get_translation_table (attrs, 1, NULL));
8678 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8682 if (STRINGP (start))
8683 p = pbeg = SDATA (start);
8684 else
8685 p = pbeg = BYTE_POS_ADDR (start_byte);
8686 pend = p + (end_byte - start_byte);
8688 while (p < pend && ASCII_BYTE_P (*p)) p++;
8689 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8691 while (p < pend)
8693 if (ASCII_BYTE_P (*p))
8694 p++;
8695 else
8697 c = STRING_CHAR_ADVANCE (p);
8699 charset_map_loaded = 0;
8700 for (tail = coding_attrs_list; CONSP (tail);)
8702 elt = XCAR (tail);
8703 if (NILP (elt))
8704 tail = XCDR (tail);
8705 else if (char_encodable_p (c, elt))
8706 tail = XCDR (tail);
8707 else if (CONSP (XCDR (tail)))
8709 XSETCAR (tail, XCAR (XCDR (tail)));
8710 XSETCDR (tail, XCDR (XCDR (tail)));
8712 else
8714 XSETCAR (tail, Qnil);
8715 tail = XCDR (tail);
8718 if (charset_map_loaded)
8720 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8722 if (STRINGP (start))
8723 pbeg = SDATA (start);
8724 else
8725 pbeg = BYTE_POS_ADDR (start_byte);
8726 p = pbeg + p_offset;
8727 pend = pbeg + pend_offset;
8732 safe_codings = list2 (Qraw_text, Qno_conversion);
8733 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8734 if (! NILP (XCAR (tail)))
8735 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8737 return safe_codings;
8741 DEFUN ("unencodable-char-position", Funencodable_char_position,
8742 Sunencodable_char_position, 3, 5, 0,
8743 doc: /*
8744 Return position of first un-encodable character in a region.
8745 START and END specify the region and CODING-SYSTEM specifies the
8746 encoding to check. Return nil if CODING-SYSTEM does encode the region.
8748 If optional 4th argument COUNT is non-nil, it specifies at most how
8749 many un-encodable characters to search. In this case, the value is a
8750 list of positions.
8752 If optional 5th argument STRING is non-nil, it is a string to search
8753 for un-encodable characters. In that case, START and END are indexes
8754 to the string. */)
8755 (start, end, coding_system, count, string)
8756 Lisp_Object start, end, coding_system, count, string;
8758 int n;
8759 struct coding_system coding;
8760 Lisp_Object attrs, charset_list, translation_table;
8761 Lisp_Object positions;
8762 int from, to;
8763 const unsigned char *p, *stop, *pend;
8764 int ascii_compatible;
8766 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8767 attrs = CODING_ID_ATTRS (coding.id);
8768 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8769 return Qnil;
8770 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8771 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8772 translation_table = get_translation_table (attrs, 1, NULL);
8774 if (NILP (string))
8776 validate_region (&start, &end);
8777 from = XINT (start);
8778 to = XINT (end);
8779 if (NILP (current_buffer->enable_multibyte_characters)
8780 || (ascii_compatible
8781 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8782 return Qnil;
8783 p = CHAR_POS_ADDR (from);
8784 pend = CHAR_POS_ADDR (to);
8785 if (from < GPT && to >= GPT)
8786 stop = GPT_ADDR;
8787 else
8788 stop = pend;
8790 else
8792 CHECK_STRING (string);
8793 CHECK_NATNUM (start);
8794 CHECK_NATNUM (end);
8795 from = XINT (start);
8796 to = XINT (end);
8797 if (from > to
8798 || to > SCHARS (string))
8799 args_out_of_range_3 (string, start, end);
8800 if (! STRING_MULTIBYTE (string))
8801 return Qnil;
8802 p = SDATA (string) + string_char_to_byte (string, from);
8803 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8804 if (ascii_compatible && (to - from) == (pend - p))
8805 return Qnil;
8808 if (NILP (count))
8809 n = 1;
8810 else
8812 CHECK_NATNUM (count);
8813 n = XINT (count);
8816 positions = Qnil;
8817 while (1)
8819 int c;
8821 if (ascii_compatible)
8822 while (p < stop && ASCII_BYTE_P (*p))
8823 p++, from++;
8824 if (p >= stop)
8826 if (p >= pend)
8827 break;
8828 stop = pend;
8829 p = GAP_END_ADDR;
8832 c = STRING_CHAR_ADVANCE (p);
8833 if (! (ASCII_CHAR_P (c) && ascii_compatible)
8834 && ! char_charset (translate_char (translation_table, c),
8835 charset_list, NULL))
8837 positions = Fcons (make_number (from), positions);
8838 n--;
8839 if (n == 0)
8840 break;
8843 from++;
8846 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8850 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8851 Scheck_coding_systems_region, 3, 3, 0,
8852 doc: /* Check if the region is encodable by coding systems.
8854 START and END are buffer positions specifying the region.
8855 CODING-SYSTEM-LIST is a list of coding systems to check.
8857 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8858 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8859 whole region, POS0, POS1, ... are buffer positions where non-encodable
8860 characters are found.
8862 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8863 value is nil.
8865 START may be a string. In that case, check if the string is
8866 encodable, and the value contains indices to the string instead of
8867 buffer positions. END is ignored.
8869 If the current buffer (or START if it is a string) is unibyte, the value
8870 is nil. */)
8871 (start, end, coding_system_list)
8872 Lisp_Object start, end, coding_system_list;
8874 Lisp_Object list;
8875 EMACS_INT start_byte, end_byte;
8876 int pos;
8877 const unsigned char *p, *pbeg, *pend;
8878 int c;
8879 Lisp_Object tail, elt, attrs;
8881 if (STRINGP (start))
8883 if (!STRING_MULTIBYTE (start)
8884 || SCHARS (start) == SBYTES (start))
8885 return Qnil;
8886 start_byte = 0;
8887 end_byte = SBYTES (start);
8888 pos = 0;
8890 else
8892 CHECK_NUMBER_COERCE_MARKER (start);
8893 CHECK_NUMBER_COERCE_MARKER (end);
8894 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8895 args_out_of_range (start, end);
8896 if (NILP (current_buffer->enable_multibyte_characters))
8897 return Qnil;
8898 start_byte = CHAR_TO_BYTE (XINT (start));
8899 end_byte = CHAR_TO_BYTE (XINT (end));
8900 if (XINT (end) - XINT (start) == end_byte - start_byte)
8901 return Qnil;
8903 if (XINT (start) < GPT && XINT (end) > GPT)
8905 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8906 move_gap_both (XINT (start), start_byte);
8907 else
8908 move_gap_both (XINT (end), end_byte);
8910 pos = XINT (start);
8913 list = Qnil;
8914 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8916 elt = XCAR (tail);
8917 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8918 ASET (attrs, coding_attr_trans_tbl,
8919 get_translation_table (attrs, 1, NULL));
8920 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8923 if (STRINGP (start))
8924 p = pbeg = SDATA (start);
8925 else
8926 p = pbeg = BYTE_POS_ADDR (start_byte);
8927 pend = p + (end_byte - start_byte);
8929 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8930 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8932 while (p < pend)
8934 if (ASCII_BYTE_P (*p))
8935 p++;
8936 else
8938 c = STRING_CHAR_ADVANCE (p);
8940 charset_map_loaded = 0;
8941 for (tail = list; CONSP (tail); tail = XCDR (tail))
8943 elt = XCDR (XCAR (tail));
8944 if (! char_encodable_p (c, XCAR (elt)))
8945 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8947 if (charset_map_loaded)
8949 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8951 if (STRINGP (start))
8952 pbeg = SDATA (start);
8953 else
8954 pbeg = BYTE_POS_ADDR (start_byte);
8955 p = pbeg + p_offset;
8956 pend = pbeg + pend_offset;
8959 pos++;
8962 tail = list;
8963 list = Qnil;
8964 for (; CONSP (tail); tail = XCDR (tail))
8966 elt = XCAR (tail);
8967 if (CONSP (XCDR (XCDR (elt))))
8968 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8969 list);
8972 return list;
8976 Lisp_Object
8977 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
8978 Lisp_Object start, end, coding_system, dst_object;
8979 int encodep, norecord;
8981 struct coding_system coding;
8982 EMACS_INT from, from_byte, to, to_byte;
8983 Lisp_Object src_object;
8985 CHECK_NUMBER_COERCE_MARKER (start);
8986 CHECK_NUMBER_COERCE_MARKER (end);
8987 if (NILP (coding_system))
8988 coding_system = Qno_conversion;
8989 else
8990 CHECK_CODING_SYSTEM (coding_system);
8991 src_object = Fcurrent_buffer ();
8992 if (NILP (dst_object))
8993 dst_object = src_object;
8994 else if (! EQ (dst_object, Qt))
8995 CHECK_BUFFER (dst_object);
8997 validate_region (&start, &end);
8998 from = XFASTINT (start);
8999 from_byte = CHAR_TO_BYTE (from);
9000 to = XFASTINT (end);
9001 to_byte = CHAR_TO_BYTE (to);
9003 setup_coding_system (coding_system, &coding);
9004 coding.mode |= CODING_MODE_LAST_BLOCK;
9006 if (encodep)
9007 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9008 dst_object);
9009 else
9010 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9011 dst_object);
9012 if (! norecord)
9013 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9015 return (BUFFERP (dst_object)
9016 ? make_number (coding.produced_char)
9017 : coding.dst_object);
9021 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9022 3, 4, "r\nzCoding system: ",
9023 doc: /* Decode the current region from the specified coding system.
9024 When called from a program, takes four arguments:
9025 START, END, CODING-SYSTEM, and DESTINATION.
9026 START and END are buffer positions.
9028 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9029 If nil, the region between START and END is replaced by the decoded text.
9030 If buffer, the decoded text is inserted in that buffer after point (point
9031 does not move).
9032 In those cases, the length of the decoded text is returned.
9033 If DESTINATION is t, the decoded text is returned.
9035 This function sets `last-coding-system-used' to the precise coding system
9036 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9037 not fully specified.) */)
9038 (start, end, coding_system, destination)
9039 Lisp_Object start, end, coding_system, destination;
9041 return code_convert_region (start, end, coding_system, destination, 0, 0);
9044 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9045 3, 4, "r\nzCoding system: ",
9046 doc: /* Encode the current region by specified coding system.
9047 When called from a program, takes four arguments:
9048 START, END, CODING-SYSTEM and DESTINATION.
9049 START and END are buffer positions.
9051 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9052 If nil, the region between START and END is replace by the encoded text.
9053 If buffer, the encoded text is inserted in that buffer after point (point
9054 does not move).
9055 In those cases, the length of the encoded text is returned.
9056 If DESTINATION is t, the encoded text is returned.
9058 This function sets `last-coding-system-used' to the precise coding system
9059 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9060 not fully specified.) */)
9061 (start, end, coding_system, destination)
9062 Lisp_Object start, end, coding_system, destination;
9064 return code_convert_region (start, end, coding_system, destination, 1, 0);
9067 Lisp_Object
9068 code_convert_string (string, coding_system, dst_object,
9069 encodep, nocopy, norecord)
9070 Lisp_Object string, coding_system, dst_object;
9071 int encodep, nocopy, norecord;
9073 struct coding_system coding;
9074 EMACS_INT chars, bytes;
9076 CHECK_STRING (string);
9077 if (NILP (coding_system))
9079 if (! norecord)
9080 Vlast_coding_system_used = Qno_conversion;
9081 if (NILP (dst_object))
9082 return (nocopy ? Fcopy_sequence (string) : string);
9085 if (NILP (coding_system))
9086 coding_system = Qno_conversion;
9087 else
9088 CHECK_CODING_SYSTEM (coding_system);
9089 if (NILP (dst_object))
9090 dst_object = Qt;
9091 else if (! EQ (dst_object, Qt))
9092 CHECK_BUFFER (dst_object);
9094 setup_coding_system (coding_system, &coding);
9095 coding.mode |= CODING_MODE_LAST_BLOCK;
9096 chars = SCHARS (string);
9097 bytes = SBYTES (string);
9098 if (encodep)
9099 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9100 else
9101 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9102 if (! norecord)
9103 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9105 return (BUFFERP (dst_object)
9106 ? make_number (coding.produced_char)
9107 : coding.dst_object);
9111 /* Encode or decode STRING according to CODING_SYSTEM.
9112 Do not set Vlast_coding_system_used.
9114 This function is called only from macros DECODE_FILE and
9115 ENCODE_FILE, thus we ignore character composition. */
9117 Lisp_Object
9118 code_convert_string_norecord (string, coding_system, encodep)
9119 Lisp_Object string, coding_system;
9120 int encodep;
9122 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9126 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9127 2, 4, 0,
9128 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9130 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9131 if the decoding operation is trivial.
9133 Optional fourth arg BUFFER non-nil means that the decoded text is
9134 inserted in that buffer after point (point does not move). In this
9135 case, the return value is the length of the decoded text.
9137 This function sets `last-coding-system-used' to the precise coding system
9138 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9139 not fully specified.) */)
9140 (string, coding_system, nocopy, buffer)
9141 Lisp_Object string, coding_system, nocopy, buffer;
9143 return code_convert_string (string, coding_system, buffer,
9144 0, ! NILP (nocopy), 0);
9147 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9148 2, 4, 0,
9149 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9151 Optional third arg NOCOPY non-nil means it is OK to return STRING
9152 itself if the encoding operation is trivial.
9154 Optional fourth arg BUFFER non-nil means that the encoded text is
9155 inserted in that buffer after point (point does not move). In this
9156 case, the return value is the length of the encoded text.
9158 This function sets `last-coding-system-used' to the precise coding system
9159 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9160 not fully specified.) */)
9161 (string, coding_system, nocopy, buffer)
9162 Lisp_Object string, coding_system, nocopy, buffer;
9164 return code_convert_string (string, coding_system, buffer,
9165 1, ! NILP (nocopy), 1);
9169 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9170 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9171 Return the corresponding character. */)
9172 (code)
9173 Lisp_Object code;
9175 Lisp_Object spec, attrs, val;
9176 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9177 int c;
9179 CHECK_NATNUM (code);
9180 c = XFASTINT (code);
9181 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9182 attrs = AREF (spec, 0);
9184 if (ASCII_BYTE_P (c)
9185 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9186 return code;
9188 val = CODING_ATTR_CHARSET_LIST (attrs);
9189 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9190 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9191 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9193 if (c <= 0x7F)
9194 charset = charset_roman;
9195 else if (c >= 0xA0 && c < 0xDF)
9197 charset = charset_kana;
9198 c -= 0x80;
9200 else
9202 int s1 = c >> 8, s2 = c & 0xFF;
9204 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9205 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9206 error ("Invalid code: %d", code);
9207 SJIS_TO_JIS (c);
9208 charset = charset_kanji;
9210 c = DECODE_CHAR (charset, c);
9211 if (c < 0)
9212 error ("Invalid code: %d", code);
9213 return make_number (c);
9217 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9218 doc: /* Encode a Japanese character CH to shift_jis encoding.
9219 Return the corresponding code in SJIS. */)
9220 (ch)
9221 Lisp_Object ch;
9223 Lisp_Object spec, attrs, charset_list;
9224 int c;
9225 struct charset *charset;
9226 unsigned code;
9228 CHECK_CHARACTER (ch);
9229 c = XFASTINT (ch);
9230 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9231 attrs = AREF (spec, 0);
9233 if (ASCII_CHAR_P (c)
9234 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9235 return ch;
9237 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9238 charset = char_charset (c, charset_list, &code);
9239 if (code == CHARSET_INVALID_CODE (charset))
9240 error ("Can't encode by shift_jis encoding: %d", c);
9241 JIS_TO_SJIS (code);
9243 return make_number (code);
9246 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9247 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9248 Return the corresponding character. */)
9249 (code)
9250 Lisp_Object code;
9252 Lisp_Object spec, attrs, val;
9253 struct charset *charset_roman, *charset_big5, *charset;
9254 int c;
9256 CHECK_NATNUM (code);
9257 c = XFASTINT (code);
9258 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9259 attrs = AREF (spec, 0);
9261 if (ASCII_BYTE_P (c)
9262 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9263 return code;
9265 val = CODING_ATTR_CHARSET_LIST (attrs);
9266 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9267 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9269 if (c <= 0x7F)
9270 charset = charset_roman;
9271 else
9273 int b1 = c >> 8, b2 = c & 0x7F;
9274 if (b1 < 0xA1 || b1 > 0xFE
9275 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9276 error ("Invalid code: %d", code);
9277 charset = charset_big5;
9279 c = DECODE_CHAR (charset, (unsigned )c);
9280 if (c < 0)
9281 error ("Invalid code: %d", code);
9282 return make_number (c);
9285 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9286 doc: /* Encode the Big5 character CH to BIG5 coding system.
9287 Return the corresponding character code in Big5. */)
9288 (ch)
9289 Lisp_Object ch;
9291 Lisp_Object spec, attrs, charset_list;
9292 struct charset *charset;
9293 int c;
9294 unsigned code;
9296 CHECK_CHARACTER (ch);
9297 c = XFASTINT (ch);
9298 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9299 attrs = AREF (spec, 0);
9300 if (ASCII_CHAR_P (c)
9301 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9302 return ch;
9304 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9305 charset = char_charset (c, charset_list, &code);
9306 if (code == CHARSET_INVALID_CODE (charset))
9307 error ("Can't encode by Big5 encoding: %d", c);
9309 return make_number (code);
9313 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9314 Sset_terminal_coding_system_internal, 1, 2, 0,
9315 doc: /* Internal use only. */)
9316 (coding_system, terminal)
9317 Lisp_Object coding_system;
9318 Lisp_Object terminal;
9320 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9321 CHECK_SYMBOL (coding_system);
9322 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9323 /* We had better not send unsafe characters to terminal. */
9324 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9325 /* Characer composition should be disabled. */
9326 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9327 terminal_coding->src_multibyte = 1;
9328 terminal_coding->dst_multibyte = 0;
9329 return Qnil;
9332 DEFUN ("set-safe-terminal-coding-system-internal",
9333 Fset_safe_terminal_coding_system_internal,
9334 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9335 doc: /* Internal use only. */)
9336 (coding_system)
9337 Lisp_Object coding_system;
9339 CHECK_SYMBOL (coding_system);
9340 setup_coding_system (Fcheck_coding_system (coding_system),
9341 &safe_terminal_coding);
9342 /* Characer composition should be disabled. */
9343 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9344 safe_terminal_coding.src_multibyte = 1;
9345 safe_terminal_coding.dst_multibyte = 0;
9346 return Qnil;
9349 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9350 Sterminal_coding_system, 0, 1, 0,
9351 doc: /* Return coding system specified for terminal output on the given terminal.
9352 TERMINAL may be a terminal object, a frame, or nil for the selected
9353 frame's terminal device. */)
9354 (terminal)
9355 Lisp_Object terminal;
9357 struct coding_system *terminal_coding
9358 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9359 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9361 /* For backward compatibility, return nil if it is `undecided'. */
9362 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9365 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9366 Sset_keyboard_coding_system_internal, 1, 2, 0,
9367 doc: /* Internal use only. */)
9368 (coding_system, terminal)
9369 Lisp_Object coding_system;
9370 Lisp_Object terminal;
9372 struct terminal *t = get_terminal (terminal, 1);
9373 CHECK_SYMBOL (coding_system);
9374 setup_coding_system (Fcheck_coding_system (coding_system),
9375 TERMINAL_KEYBOARD_CODING (t));
9376 /* Characer composition should be disabled. */
9377 TERMINAL_KEYBOARD_CODING (t)->common_flags
9378 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9379 return Qnil;
9382 DEFUN ("keyboard-coding-system",
9383 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9384 doc: /* Return coding system specified for decoding keyboard input. */)
9385 (terminal)
9386 Lisp_Object terminal;
9388 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9389 (get_terminal (terminal, 1))->id);
9393 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9394 Sfind_operation_coding_system, 1, MANY, 0,
9395 doc: /* Choose a coding system for an operation based on the target name.
9396 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9397 DECODING-SYSTEM is the coding system to use for decoding
9398 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9399 for encoding (in case OPERATION does encoding).
9401 The first argument OPERATION specifies an I/O primitive:
9402 For file I/O, `insert-file-contents' or `write-region'.
9403 For process I/O, `call-process', `call-process-region', or `start-process'.
9404 For network I/O, `open-network-stream'.
9406 The remaining arguments should be the same arguments that were passed
9407 to the primitive. Depending on which primitive, one of those arguments
9408 is selected as the TARGET. For example, if OPERATION does file I/O,
9409 whichever argument specifies the file name is TARGET.
9411 TARGET has a meaning which depends on OPERATION:
9412 For file I/O, TARGET is a file name (except for the special case below).
9413 For process I/O, TARGET is a process name.
9414 For network I/O, TARGET is a service name or a port number.
9416 This function looks up what is specified for TARGET in
9417 `file-coding-system-alist', `process-coding-system-alist',
9418 or `network-coding-system-alist' depending on OPERATION.
9419 They may specify a coding system, a cons of coding systems,
9420 or a function symbol to call.
9421 In the last case, we call the function with one argument,
9422 which is a list of all the arguments given to this function.
9423 If the function can't decide a coding system, it can return
9424 `undecided' so that the normal code-detection is performed.
9426 If OPERATION is `insert-file-contents', the argument corresponding to
9427 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9428 file name to look up, and BUFFER is a buffer that contains the file's
9429 contents (not yet decoded). If `file-coding-system-alist' specifies a
9430 function to call for FILENAME, that function should examine the
9431 contents of BUFFER instead of reading the file.
9433 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9434 (nargs, args)
9435 int nargs;
9436 Lisp_Object *args;
9438 Lisp_Object operation, target_idx, target, val;
9439 register Lisp_Object chain;
9441 if (nargs < 2)
9442 error ("Too few arguments");
9443 operation = args[0];
9444 if (!SYMBOLP (operation)
9445 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9446 error ("Invalid first argument");
9447 if (nargs < 1 + XINT (target_idx))
9448 error ("Too few arguments for operation: %s",
9449 SDATA (SYMBOL_NAME (operation)));
9450 target = args[XINT (target_idx) + 1];
9451 if (!(STRINGP (target)
9452 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9453 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9454 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9455 error ("Invalid %dth argument", XINT (target_idx) + 1);
9456 if (CONSP (target))
9457 target = XCAR (target);
9459 chain = ((EQ (operation, Qinsert_file_contents)
9460 || EQ (operation, Qwrite_region))
9461 ? Vfile_coding_system_alist
9462 : (EQ (operation, Qopen_network_stream)
9463 ? Vnetwork_coding_system_alist
9464 : Vprocess_coding_system_alist));
9465 if (NILP (chain))
9466 return Qnil;
9468 for (; CONSP (chain); chain = XCDR (chain))
9470 Lisp_Object elt;
9472 elt = XCAR (chain);
9473 if (CONSP (elt)
9474 && ((STRINGP (target)
9475 && STRINGP (XCAR (elt))
9476 && fast_string_match (XCAR (elt), target) >= 0)
9477 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9479 val = XCDR (elt);
9480 /* Here, if VAL is both a valid coding system and a valid
9481 function symbol, we return VAL as a coding system. */
9482 if (CONSP (val))
9483 return val;
9484 if (! SYMBOLP (val))
9485 return Qnil;
9486 if (! NILP (Fcoding_system_p (val)))
9487 return Fcons (val, val);
9488 if (! NILP (Ffboundp (val)))
9490 /* We use call1 rather than safe_call1
9491 so as to get bug reports about functions called here
9492 which don't handle the current interface. */
9493 val = call1 (val, Flist (nargs, args));
9494 if (CONSP (val))
9495 return val;
9496 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9497 return Fcons (val, val);
9499 return Qnil;
9502 return Qnil;
9505 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9506 Sset_coding_system_priority, 0, MANY, 0,
9507 doc: /* Assign higher priority to the coding systems given as arguments.
9508 If multiple coding systems belong to the same category,
9509 all but the first one are ignored.
9511 usage: (set-coding-system-priority &rest coding-systems) */)
9512 (nargs, args)
9513 int nargs;
9514 Lisp_Object *args;
9516 int i, j;
9517 int changed[coding_category_max];
9518 enum coding_category priorities[coding_category_max];
9520 bzero (changed, sizeof changed);
9522 for (i = j = 0; i < nargs; i++)
9524 enum coding_category category;
9525 Lisp_Object spec, attrs;
9527 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9528 attrs = AREF (spec, 0);
9529 category = XINT (CODING_ATTR_CATEGORY (attrs));
9530 if (changed[category])
9531 /* Ignore this coding system because a coding system of the
9532 same category already had a higher priority. */
9533 continue;
9534 changed[category] = 1;
9535 priorities[j++] = category;
9536 if (coding_categories[category].id >= 0
9537 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9538 setup_coding_system (args[i], &coding_categories[category]);
9539 Fset (AREF (Vcoding_category_table, category), args[i]);
9542 /* Now we have decided top J priorities. Reflect the order of the
9543 original priorities to the remaining priorities. */
9545 for (i = j, j = 0; i < coding_category_max; i++, j++)
9547 while (j < coding_category_max
9548 && changed[coding_priorities[j]])
9549 j++;
9550 if (j == coding_category_max)
9551 abort ();
9552 priorities[i] = coding_priorities[j];
9555 bcopy (priorities, coding_priorities, sizeof priorities);
9557 /* Update `coding-category-list'. */
9558 Vcoding_category_list = Qnil;
9559 for (i = coding_category_max - 1; i >= 0; i--)
9560 Vcoding_category_list
9561 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9562 Vcoding_category_list);
9564 return Qnil;
9567 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9568 Scoding_system_priority_list, 0, 1, 0,
9569 doc: /* Return a list of coding systems ordered by their priorities.
9570 The list contains a subset of coding systems; i.e. coding systems
9571 assigned to each coding category (see `coding-category-list').
9573 HIGHESTP non-nil means just return the highest priority one. */)
9574 (highestp)
9575 Lisp_Object highestp;
9577 int i;
9578 Lisp_Object val;
9580 for (i = 0, val = Qnil; i < coding_category_max; i++)
9582 enum coding_category category = coding_priorities[i];
9583 int id = coding_categories[category].id;
9584 Lisp_Object attrs;
9586 if (id < 0)
9587 continue;
9588 attrs = CODING_ID_ATTRS (id);
9589 if (! NILP (highestp))
9590 return CODING_ATTR_BASE_NAME (attrs);
9591 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9593 return Fnreverse (val);
9596 static char *suffixes[] = { "-unix", "-dos", "-mac" };
9598 static Lisp_Object
9599 make_subsidiaries (base)
9600 Lisp_Object base;
9602 Lisp_Object subsidiaries;
9603 int base_name_len = SBYTES (SYMBOL_NAME (base));
9604 char *buf = (char *) alloca (base_name_len + 6);
9605 int i;
9607 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
9608 subsidiaries = Fmake_vector (make_number (3), Qnil);
9609 for (i = 0; i < 3; i++)
9611 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9612 ASET (subsidiaries, i, intern (buf));
9614 return subsidiaries;
9618 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9619 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9620 doc: /* For internal use only.
9621 usage: (define-coding-system-internal ...) */)
9622 (nargs, args)
9623 int nargs;
9624 Lisp_Object *args;
9626 Lisp_Object name;
9627 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9628 Lisp_Object attrs; /* Vector of attributes. */
9629 Lisp_Object eol_type;
9630 Lisp_Object aliases;
9631 Lisp_Object coding_type, charset_list, safe_charsets;
9632 enum coding_category category;
9633 Lisp_Object tail, val;
9634 int max_charset_id = 0;
9635 int i;
9637 if (nargs < coding_arg_max)
9638 goto short_args;
9640 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9642 name = args[coding_arg_name];
9643 CHECK_SYMBOL (name);
9644 CODING_ATTR_BASE_NAME (attrs) = name;
9646 val = args[coding_arg_mnemonic];
9647 if (! STRINGP (val))
9648 CHECK_CHARACTER (val);
9649 CODING_ATTR_MNEMONIC (attrs) = val;
9651 coding_type = args[coding_arg_coding_type];
9652 CHECK_SYMBOL (coding_type);
9653 CODING_ATTR_TYPE (attrs) = coding_type;
9655 charset_list = args[coding_arg_charset_list];
9656 if (SYMBOLP (charset_list))
9658 if (EQ (charset_list, Qiso_2022))
9660 if (! EQ (coding_type, Qiso_2022))
9661 error ("Invalid charset-list");
9662 charset_list = Viso_2022_charset_list;
9664 else if (EQ (charset_list, Qemacs_mule))
9666 if (! EQ (coding_type, Qemacs_mule))
9667 error ("Invalid charset-list");
9668 charset_list = Vemacs_mule_charset_list;
9670 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9671 if (max_charset_id < XFASTINT (XCAR (tail)))
9672 max_charset_id = XFASTINT (XCAR (tail));
9674 else
9676 charset_list = Fcopy_sequence (charset_list);
9677 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9679 struct charset *charset;
9681 val = XCAR (tail);
9682 CHECK_CHARSET_GET_CHARSET (val, charset);
9683 if (EQ (coding_type, Qiso_2022)
9684 ? CHARSET_ISO_FINAL (charset) < 0
9685 : EQ (coding_type, Qemacs_mule)
9686 ? CHARSET_EMACS_MULE_ID (charset) < 0
9687 : 0)
9688 error ("Can't handle charset `%s'",
9689 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9691 XSETCAR (tail, make_number (charset->id));
9692 if (max_charset_id < charset->id)
9693 max_charset_id = charset->id;
9696 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9698 safe_charsets = make_uninit_string (max_charset_id + 1);
9699 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9700 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9701 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9702 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9704 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9706 val = args[coding_arg_decode_translation_table];
9707 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9708 CHECK_SYMBOL (val);
9709 CODING_ATTR_DECODE_TBL (attrs) = val;
9711 val = args[coding_arg_encode_translation_table];
9712 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9713 CHECK_SYMBOL (val);
9714 CODING_ATTR_ENCODE_TBL (attrs) = val;
9716 val = args[coding_arg_post_read_conversion];
9717 CHECK_SYMBOL (val);
9718 CODING_ATTR_POST_READ (attrs) = val;
9720 val = args[coding_arg_pre_write_conversion];
9721 CHECK_SYMBOL (val);
9722 CODING_ATTR_PRE_WRITE (attrs) = val;
9724 val = args[coding_arg_default_char];
9725 if (NILP (val))
9726 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9727 else
9729 CHECK_CHARACTER (val);
9730 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9733 val = args[coding_arg_for_unibyte];
9734 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9736 val = args[coding_arg_plist];
9737 CHECK_LIST (val);
9738 CODING_ATTR_PLIST (attrs) = val;
9740 if (EQ (coding_type, Qcharset))
9742 /* Generate a lisp vector of 256 elements. Each element is nil,
9743 integer, or a list of charset IDs.
9745 If Nth element is nil, the byte code N is invalid in this
9746 coding system.
9748 If Nth element is a number NUM, N is the first byte of a
9749 charset whose ID is NUM.
9751 If Nth element is a list of charset IDs, N is the first byte
9752 of one of them. The list is sorted by dimensions of the
9753 charsets. A charset of smaller dimension comes firtst. */
9754 val = Fmake_vector (make_number (256), Qnil);
9756 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9758 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9759 int dim = CHARSET_DIMENSION (charset);
9760 int idx = (dim - 1) * 4;
9762 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9763 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9765 for (i = charset->code_space[idx];
9766 i <= charset->code_space[idx + 1]; i++)
9768 Lisp_Object tmp, tmp2;
9769 int dim2;
9771 tmp = AREF (val, i);
9772 if (NILP (tmp))
9773 tmp = XCAR (tail);
9774 else if (NUMBERP (tmp))
9776 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9777 if (dim < dim2)
9778 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9779 else
9780 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9782 else
9784 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9786 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9787 if (dim < dim2)
9788 break;
9790 if (NILP (tmp2))
9791 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9792 else
9794 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9795 XSETCAR (tmp2, XCAR (tail));
9798 ASET (val, i, tmp);
9801 ASET (attrs, coding_attr_charset_valids, val);
9802 category = coding_category_charset;
9804 else if (EQ (coding_type, Qccl))
9806 Lisp_Object valids;
9808 if (nargs < coding_arg_ccl_max)
9809 goto short_args;
9811 val = args[coding_arg_ccl_decoder];
9812 CHECK_CCL_PROGRAM (val);
9813 if (VECTORP (val))
9814 val = Fcopy_sequence (val);
9815 ASET (attrs, coding_attr_ccl_decoder, val);
9817 val = args[coding_arg_ccl_encoder];
9818 CHECK_CCL_PROGRAM (val);
9819 if (VECTORP (val))
9820 val = Fcopy_sequence (val);
9821 ASET (attrs, coding_attr_ccl_encoder, val);
9823 val = args[coding_arg_ccl_valids];
9824 valids = Fmake_string (make_number (256), make_number (0));
9825 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9827 int from, to;
9829 val = Fcar (tail);
9830 if (INTEGERP (val))
9832 from = to = XINT (val);
9833 if (from < 0 || from > 255)
9834 args_out_of_range_3 (val, make_number (0), make_number (255));
9836 else
9838 CHECK_CONS (val);
9839 CHECK_NATNUM_CAR (val);
9840 CHECK_NATNUM_CDR (val);
9841 from = XINT (XCAR (val));
9842 if (from > 255)
9843 args_out_of_range_3 (XCAR (val),
9844 make_number (0), make_number (255));
9845 to = XINT (XCDR (val));
9846 if (to < from || to > 255)
9847 args_out_of_range_3 (XCDR (val),
9848 XCAR (val), make_number (255));
9850 for (i = from; i <= to; i++)
9851 SSET (valids, i, 1);
9853 ASET (attrs, coding_attr_ccl_valids, valids);
9855 category = coding_category_ccl;
9857 else if (EQ (coding_type, Qutf_16))
9859 Lisp_Object bom, endian;
9861 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9863 if (nargs < coding_arg_utf16_max)
9864 goto short_args;
9866 bom = args[coding_arg_utf16_bom];
9867 if (! NILP (bom) && ! EQ (bom, Qt))
9869 CHECK_CONS (bom);
9870 val = XCAR (bom);
9871 CHECK_CODING_SYSTEM (val);
9872 val = XCDR (bom);
9873 CHECK_CODING_SYSTEM (val);
9875 ASET (attrs, coding_attr_utf_bom, bom);
9877 endian = args[coding_arg_utf16_endian];
9878 CHECK_SYMBOL (endian);
9879 if (NILP (endian))
9880 endian = Qbig;
9881 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9882 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9883 ASET (attrs, coding_attr_utf_16_endian, endian);
9885 category = (CONSP (bom)
9886 ? coding_category_utf_16_auto
9887 : NILP (bom)
9888 ? (EQ (endian, Qbig)
9889 ? coding_category_utf_16_be_nosig
9890 : coding_category_utf_16_le_nosig)
9891 : (EQ (endian, Qbig)
9892 ? coding_category_utf_16_be
9893 : coding_category_utf_16_le));
9895 else if (EQ (coding_type, Qiso_2022))
9897 Lisp_Object initial, reg_usage, request, flags;
9898 int i;
9900 if (nargs < coding_arg_iso2022_max)
9901 goto short_args;
9903 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9904 CHECK_VECTOR (initial);
9905 for (i = 0; i < 4; i++)
9907 val = Faref (initial, make_number (i));
9908 if (! NILP (val))
9910 struct charset *charset;
9912 CHECK_CHARSET_GET_CHARSET (val, charset);
9913 ASET (initial, i, make_number (CHARSET_ID (charset)));
9914 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9915 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9917 else
9918 ASET (initial, i, make_number (-1));
9921 reg_usage = args[coding_arg_iso2022_reg_usage];
9922 CHECK_CONS (reg_usage);
9923 CHECK_NUMBER_CAR (reg_usage);
9924 CHECK_NUMBER_CDR (reg_usage);
9926 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9927 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9929 int id;
9930 Lisp_Object tmp;
9932 val = Fcar (tail);
9933 CHECK_CONS (val);
9934 tmp = XCAR (val);
9935 CHECK_CHARSET_GET_ID (tmp, id);
9936 CHECK_NATNUM_CDR (val);
9937 if (XINT (XCDR (val)) >= 4)
9938 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9939 XSETCAR (val, make_number (id));
9942 flags = args[coding_arg_iso2022_flags];
9943 CHECK_NATNUM (flags);
9944 i = XINT (flags);
9945 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9946 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9948 ASET (attrs, coding_attr_iso_initial, initial);
9949 ASET (attrs, coding_attr_iso_usage, reg_usage);
9950 ASET (attrs, coding_attr_iso_request, request);
9951 ASET (attrs, coding_attr_iso_flags, flags);
9952 setup_iso_safe_charsets (attrs);
9954 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9955 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9956 | CODING_ISO_FLAG_SINGLE_SHIFT))
9957 ? coding_category_iso_7_else
9958 : EQ (args[coding_arg_charset_list], Qiso_2022)
9959 ? coding_category_iso_7
9960 : coding_category_iso_7_tight);
9961 else
9963 int id = XINT (AREF (initial, 1));
9965 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9966 || EQ (args[coding_arg_charset_list], Qiso_2022)
9967 || id < 0)
9968 ? coding_category_iso_8_else
9969 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9970 ? coding_category_iso_8_1
9971 : coding_category_iso_8_2);
9973 if (category != coding_category_iso_8_1
9974 && category != coding_category_iso_8_2)
9975 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9977 else if (EQ (coding_type, Qemacs_mule))
9979 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9980 ASET (attrs, coding_attr_emacs_mule_full, Qt);
9981 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9982 category = coding_category_emacs_mule;
9984 else if (EQ (coding_type, Qshift_jis))
9987 struct charset *charset;
9989 if (XINT (Flength (charset_list)) != 3
9990 && XINT (Flength (charset_list)) != 4)
9991 error ("There should be three or four charsets");
9993 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9994 if (CHARSET_DIMENSION (charset) != 1)
9995 error ("Dimension of charset %s is not one",
9996 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9997 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9998 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10000 charset_list = XCDR (charset_list);
10001 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10002 if (CHARSET_DIMENSION (charset) != 1)
10003 error ("Dimension of charset %s is not one",
10004 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10006 charset_list = XCDR (charset_list);
10007 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10008 if (CHARSET_DIMENSION (charset) != 2)
10009 error ("Dimension of charset %s is not two",
10010 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10012 charset_list = XCDR (charset_list);
10013 if (! NILP (charset_list))
10015 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10016 if (CHARSET_DIMENSION (charset) != 2)
10017 error ("Dimension of charset %s is not two",
10018 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10021 category = coding_category_sjis;
10022 Vsjis_coding_system = name;
10024 else if (EQ (coding_type, Qbig5))
10026 struct charset *charset;
10028 if (XINT (Flength (charset_list)) != 2)
10029 error ("There should be just two charsets");
10031 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10032 if (CHARSET_DIMENSION (charset) != 1)
10033 error ("Dimension of charset %s is not one",
10034 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10035 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10036 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10038 charset_list = XCDR (charset_list);
10039 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10040 if (CHARSET_DIMENSION (charset) != 2)
10041 error ("Dimension of charset %s is not two",
10042 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10044 category = coding_category_big5;
10045 Vbig5_coding_system = name;
10047 else if (EQ (coding_type, Qraw_text))
10049 category = coding_category_raw_text;
10050 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10052 else if (EQ (coding_type, Qutf_8))
10054 Lisp_Object bom;
10056 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10058 if (nargs < coding_arg_utf8_max)
10059 goto short_args;
10061 bom = args[coding_arg_utf8_bom];
10062 if (! NILP (bom) && ! EQ (bom, Qt))
10064 CHECK_CONS (bom);
10065 val = XCAR (bom);
10066 CHECK_CODING_SYSTEM (val);
10067 val = XCDR (bom);
10068 CHECK_CODING_SYSTEM (val);
10070 ASET (attrs, coding_attr_utf_bom, bom);
10072 category = (CONSP (bom) ? coding_category_utf_8_auto
10073 : NILP (bom) ? coding_category_utf_8_nosig
10074 : coding_category_utf_8_sig);
10076 else if (EQ (coding_type, Qundecided))
10077 category = coding_category_undecided;
10078 else
10079 error ("Invalid coding system type: %s",
10080 SDATA (SYMBOL_NAME (coding_type)));
10082 CODING_ATTR_CATEGORY (attrs) = make_number (category);
10083 CODING_ATTR_PLIST (attrs)
10084 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10085 CODING_ATTR_PLIST (attrs)));
10086 CODING_ATTR_PLIST (attrs)
10087 = Fcons (QCascii_compatible_p,
10088 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10089 CODING_ATTR_PLIST (attrs)));
10091 eol_type = args[coding_arg_eol_type];
10092 if (! NILP (eol_type)
10093 && ! EQ (eol_type, Qunix)
10094 && ! EQ (eol_type, Qdos)
10095 && ! EQ (eol_type, Qmac))
10096 error ("Invalid eol-type");
10098 aliases = Fcons (name, Qnil);
10100 if (NILP (eol_type))
10102 eol_type = make_subsidiaries (name);
10103 for (i = 0; i < 3; i++)
10105 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10107 this_name = AREF (eol_type, i);
10108 this_aliases = Fcons (this_name, Qnil);
10109 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10110 this_spec = Fmake_vector (make_number (3), attrs);
10111 ASET (this_spec, 1, this_aliases);
10112 ASET (this_spec, 2, this_eol_type);
10113 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10114 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10115 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10116 if (NILP (val))
10117 Vcoding_system_alist
10118 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10119 Vcoding_system_alist);
10123 spec_vec = Fmake_vector (make_number (3), attrs);
10124 ASET (spec_vec, 1, aliases);
10125 ASET (spec_vec, 2, eol_type);
10127 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10128 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10129 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10130 if (NILP (val))
10131 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10132 Vcoding_system_alist);
10135 int id = coding_categories[category].id;
10137 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10138 setup_coding_system (name, &coding_categories[category]);
10141 return Qnil;
10143 short_args:
10144 return Fsignal (Qwrong_number_of_arguments,
10145 Fcons (intern ("define-coding-system-internal"),
10146 make_number (nargs)));
10150 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10151 3, 3, 0,
10152 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10153 (coding_system, prop, val)
10154 Lisp_Object coding_system, prop, val;
10156 Lisp_Object spec, attrs;
10158 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10159 attrs = AREF (spec, 0);
10160 if (EQ (prop, QCmnemonic))
10162 if (! STRINGP (val))
10163 CHECK_CHARACTER (val);
10164 CODING_ATTR_MNEMONIC (attrs) = val;
10166 else if (EQ (prop, QCdefault_char))
10168 if (NILP (val))
10169 val = make_number (' ');
10170 else
10171 CHECK_CHARACTER (val);
10172 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10174 else if (EQ (prop, QCdecode_translation_table))
10176 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10177 CHECK_SYMBOL (val);
10178 CODING_ATTR_DECODE_TBL (attrs) = val;
10180 else if (EQ (prop, QCencode_translation_table))
10182 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10183 CHECK_SYMBOL (val);
10184 CODING_ATTR_ENCODE_TBL (attrs) = val;
10186 else if (EQ (prop, QCpost_read_conversion))
10188 CHECK_SYMBOL (val);
10189 CODING_ATTR_POST_READ (attrs) = val;
10191 else if (EQ (prop, QCpre_write_conversion))
10193 CHECK_SYMBOL (val);
10194 CODING_ATTR_PRE_WRITE (attrs) = val;
10196 else if (EQ (prop, QCascii_compatible_p))
10198 CODING_ATTR_ASCII_COMPAT (attrs) = val;
10201 CODING_ATTR_PLIST (attrs)
10202 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10203 return val;
10207 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10208 Sdefine_coding_system_alias, 2, 2, 0,
10209 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10210 (alias, coding_system)
10211 Lisp_Object alias, coding_system;
10213 Lisp_Object spec, aliases, eol_type, val;
10215 CHECK_SYMBOL (alias);
10216 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10217 aliases = AREF (spec, 1);
10218 /* ALIASES should be a list of length more than zero, and the first
10219 element is a base coding system. Append ALIAS at the tail of the
10220 list. */
10221 while (!NILP (XCDR (aliases)))
10222 aliases = XCDR (aliases);
10223 XSETCDR (aliases, Fcons (alias, Qnil));
10225 eol_type = AREF (spec, 2);
10226 if (VECTORP (eol_type))
10228 Lisp_Object subsidiaries;
10229 int i;
10231 subsidiaries = make_subsidiaries (alias);
10232 for (i = 0; i < 3; i++)
10233 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10234 AREF (eol_type, i));
10237 Fputhash (alias, spec, Vcoding_system_hash_table);
10238 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10239 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10240 if (NILP (val))
10241 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10242 Vcoding_system_alist);
10244 return Qnil;
10247 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10248 1, 1, 0,
10249 doc: /* Return the base of CODING-SYSTEM.
10250 Any alias or subsidiary coding system is not a base coding system. */)
10251 (coding_system)
10252 Lisp_Object coding_system;
10254 Lisp_Object spec, attrs;
10256 if (NILP (coding_system))
10257 return (Qno_conversion);
10258 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10259 attrs = AREF (spec, 0);
10260 return CODING_ATTR_BASE_NAME (attrs);
10263 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10264 1, 1, 0,
10265 doc: "Return the property list of CODING-SYSTEM.")
10266 (coding_system)
10267 Lisp_Object coding_system;
10269 Lisp_Object spec, attrs;
10271 if (NILP (coding_system))
10272 coding_system = Qno_conversion;
10273 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10274 attrs = AREF (spec, 0);
10275 return CODING_ATTR_PLIST (attrs);
10279 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10280 1, 1, 0,
10281 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10282 (coding_system)
10283 Lisp_Object coding_system;
10285 Lisp_Object spec;
10287 if (NILP (coding_system))
10288 coding_system = Qno_conversion;
10289 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10290 return AREF (spec, 1);
10293 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10294 Scoding_system_eol_type, 1, 1, 0,
10295 doc: /* Return eol-type of CODING-SYSTEM.
10296 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10298 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10299 and CR respectively.
10301 A vector value indicates that a format of end-of-line should be
10302 detected automatically. Nth element of the vector is the subsidiary
10303 coding system whose eol-type is N. */)
10304 (coding_system)
10305 Lisp_Object coding_system;
10307 Lisp_Object spec, eol_type;
10308 int n;
10310 if (NILP (coding_system))
10311 coding_system = Qno_conversion;
10312 if (! CODING_SYSTEM_P (coding_system))
10313 return Qnil;
10314 spec = CODING_SYSTEM_SPEC (coding_system);
10315 eol_type = AREF (spec, 2);
10316 if (VECTORP (eol_type))
10317 return Fcopy_sequence (eol_type);
10318 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10319 return make_number (n);
10322 #endif /* emacs */
10325 /*** 9. Post-amble ***/
10327 void
10328 init_coding_once ()
10330 int i;
10332 for (i = 0; i < coding_category_max; i++)
10334 coding_categories[i].id = -1;
10335 coding_priorities[i] = i;
10338 /* ISO2022 specific initialize routine. */
10339 for (i = 0; i < 0x20; i++)
10340 iso_code_class[i] = ISO_control_0;
10341 for (i = 0x21; i < 0x7F; i++)
10342 iso_code_class[i] = ISO_graphic_plane_0;
10343 for (i = 0x80; i < 0xA0; i++)
10344 iso_code_class[i] = ISO_control_1;
10345 for (i = 0xA1; i < 0xFF; i++)
10346 iso_code_class[i] = ISO_graphic_plane_1;
10347 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10348 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10349 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10350 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10351 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10352 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10353 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10354 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10355 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10357 for (i = 0; i < 256; i++)
10359 emacs_mule_bytes[i] = 1;
10361 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10362 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10363 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10364 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10367 #ifdef emacs
10369 void
10370 syms_of_coding ()
10372 staticpro (&Vcoding_system_hash_table);
10374 Lisp_Object args[2];
10375 args[0] = QCtest;
10376 args[1] = Qeq;
10377 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10380 staticpro (&Vsjis_coding_system);
10381 Vsjis_coding_system = Qnil;
10383 staticpro (&Vbig5_coding_system);
10384 Vbig5_coding_system = Qnil;
10386 staticpro (&Vcode_conversion_reused_workbuf);
10387 Vcode_conversion_reused_workbuf = Qnil;
10389 staticpro (&Vcode_conversion_workbuf_name);
10390 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
10392 reused_workbuf_in_use = 0;
10394 DEFSYM (Qcharset, "charset");
10395 DEFSYM (Qtarget_idx, "target-idx");
10396 DEFSYM (Qcoding_system_history, "coding-system-history");
10397 Fset (Qcoding_system_history, Qnil);
10399 /* Target FILENAME is the first argument. */
10400 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10401 /* Target FILENAME is the third argument. */
10402 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10404 DEFSYM (Qcall_process, "call-process");
10405 /* Target PROGRAM is the first argument. */
10406 Fput (Qcall_process, Qtarget_idx, make_number (0));
10408 DEFSYM (Qcall_process_region, "call-process-region");
10409 /* Target PROGRAM is the third argument. */
10410 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10412 DEFSYM (Qstart_process, "start-process");
10413 /* Target PROGRAM is the third argument. */
10414 Fput (Qstart_process, Qtarget_idx, make_number (2));
10416 DEFSYM (Qopen_network_stream, "open-network-stream");
10417 /* Target SERVICE is the fourth argument. */
10418 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10420 DEFSYM (Qcoding_system, "coding-system");
10421 DEFSYM (Qcoding_aliases, "coding-aliases");
10423 DEFSYM (Qeol_type, "eol-type");
10424 DEFSYM (Qunix, "unix");
10425 DEFSYM (Qdos, "dos");
10427 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10428 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10429 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10430 DEFSYM (Qdefault_char, "default-char");
10431 DEFSYM (Qundecided, "undecided");
10432 DEFSYM (Qno_conversion, "no-conversion");
10433 DEFSYM (Qraw_text, "raw-text");
10435 DEFSYM (Qiso_2022, "iso-2022");
10437 DEFSYM (Qutf_8, "utf-8");
10438 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10440 DEFSYM (Qutf_16, "utf-16");
10441 DEFSYM (Qbig, "big");
10442 DEFSYM (Qlittle, "little");
10444 DEFSYM (Qshift_jis, "shift-jis");
10445 DEFSYM (Qbig5, "big5");
10447 DEFSYM (Qcoding_system_p, "coding-system-p");
10449 DEFSYM (Qcoding_system_error, "coding-system-error");
10450 Fput (Qcoding_system_error, Qerror_conditions,
10451 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
10452 Fput (Qcoding_system_error, Qerror_message,
10453 build_string ("Invalid coding system"));
10455 /* Intern this now in case it isn't already done.
10456 Setting this variable twice is harmless.
10457 But don't staticpro it here--that is done in alloc.c. */
10458 Qchar_table_extra_slots = intern ("char-table-extra-slots");
10460 DEFSYM (Qtranslation_table, "translation-table");
10461 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10462 DEFSYM (Qtranslation_table_id, "translation-table-id");
10463 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10464 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10466 DEFSYM (Qvalid_codes, "valid-codes");
10468 DEFSYM (Qemacs_mule, "emacs-mule");
10470 DEFSYM (QCcategory, ":category");
10471 DEFSYM (QCmnemonic, ":mnemonic");
10472 DEFSYM (QCdefault_char, ":default-char");
10473 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10474 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10475 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10476 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10477 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10479 Vcoding_category_table
10480 = Fmake_vector (make_number (coding_category_max), Qnil);
10481 staticpro (&Vcoding_category_table);
10482 /* Followings are target of code detection. */
10483 ASET (Vcoding_category_table, coding_category_iso_7,
10484 intern ("coding-category-iso-7"));
10485 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10486 intern ("coding-category-iso-7-tight"));
10487 ASET (Vcoding_category_table, coding_category_iso_8_1,
10488 intern ("coding-category-iso-8-1"));
10489 ASET (Vcoding_category_table, coding_category_iso_8_2,
10490 intern ("coding-category-iso-8-2"));
10491 ASET (Vcoding_category_table, coding_category_iso_7_else,
10492 intern ("coding-category-iso-7-else"));
10493 ASET (Vcoding_category_table, coding_category_iso_8_else,
10494 intern ("coding-category-iso-8-else"));
10495 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10496 intern ("coding-category-utf-8-auto"));
10497 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10498 intern ("coding-category-utf-8"));
10499 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10500 intern ("coding-category-utf-8-sig"));
10501 ASET (Vcoding_category_table, coding_category_utf_16_be,
10502 intern ("coding-category-utf-16-be"));
10503 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10504 intern ("coding-category-utf-16-auto"));
10505 ASET (Vcoding_category_table, coding_category_utf_16_le,
10506 intern ("coding-category-utf-16-le"));
10507 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10508 intern ("coding-category-utf-16-be-nosig"));
10509 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10510 intern ("coding-category-utf-16-le-nosig"));
10511 ASET (Vcoding_category_table, coding_category_charset,
10512 intern ("coding-category-charset"));
10513 ASET (Vcoding_category_table, coding_category_sjis,
10514 intern ("coding-category-sjis"));
10515 ASET (Vcoding_category_table, coding_category_big5,
10516 intern ("coding-category-big5"));
10517 ASET (Vcoding_category_table, coding_category_ccl,
10518 intern ("coding-category-ccl"));
10519 ASET (Vcoding_category_table, coding_category_emacs_mule,
10520 intern ("coding-category-emacs-mule"));
10521 /* Followings are NOT target of code detection. */
10522 ASET (Vcoding_category_table, coding_category_raw_text,
10523 intern ("coding-category-raw-text"));
10524 ASET (Vcoding_category_table, coding_category_undecided,
10525 intern ("coding-category-undecided"));
10527 DEFSYM (Qinsufficient_source, "insufficient-source");
10528 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10529 DEFSYM (Qinvalid_source, "invalid-source");
10530 DEFSYM (Qinterrupted, "interrupted");
10531 DEFSYM (Qinsufficient_memory, "insufficient-memory");
10532 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10534 defsubr (&Scoding_system_p);
10535 defsubr (&Sread_coding_system);
10536 defsubr (&Sread_non_nil_coding_system);
10537 defsubr (&Scheck_coding_system);
10538 defsubr (&Sdetect_coding_region);
10539 defsubr (&Sdetect_coding_string);
10540 defsubr (&Sfind_coding_systems_region_internal);
10541 defsubr (&Sunencodable_char_position);
10542 defsubr (&Scheck_coding_systems_region);
10543 defsubr (&Sdecode_coding_region);
10544 defsubr (&Sencode_coding_region);
10545 defsubr (&Sdecode_coding_string);
10546 defsubr (&Sencode_coding_string);
10547 defsubr (&Sdecode_sjis_char);
10548 defsubr (&Sencode_sjis_char);
10549 defsubr (&Sdecode_big5_char);
10550 defsubr (&Sencode_big5_char);
10551 defsubr (&Sset_terminal_coding_system_internal);
10552 defsubr (&Sset_safe_terminal_coding_system_internal);
10553 defsubr (&Sterminal_coding_system);
10554 defsubr (&Sset_keyboard_coding_system_internal);
10555 defsubr (&Skeyboard_coding_system);
10556 defsubr (&Sfind_operation_coding_system);
10557 defsubr (&Sset_coding_system_priority);
10558 defsubr (&Sdefine_coding_system_internal);
10559 defsubr (&Sdefine_coding_system_alias);
10560 defsubr (&Scoding_system_put);
10561 defsubr (&Scoding_system_base);
10562 defsubr (&Scoding_system_plist);
10563 defsubr (&Scoding_system_aliases);
10564 defsubr (&Scoding_system_eol_type);
10565 defsubr (&Scoding_system_priority_list);
10567 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10568 doc: /* List of coding systems.
10570 Do not alter the value of this variable manually. This variable should be
10571 updated by the functions `define-coding-system' and
10572 `define-coding-system-alias'. */);
10573 Vcoding_system_list = Qnil;
10575 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10576 doc: /* Alist of coding system names.
10577 Each element is one element list of coding system name.
10578 This variable is given to `completing-read' as COLLECTION argument.
10580 Do not alter the value of this variable manually. This variable should be
10581 updated by the functions `make-coding-system' and
10582 `define-coding-system-alias'. */);
10583 Vcoding_system_alist = Qnil;
10585 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10586 doc: /* List of coding-categories (symbols) ordered by priority.
10588 On detecting a coding system, Emacs tries code detection algorithms
10589 associated with each coding-category one by one in this order. When
10590 one algorithm agrees with a byte sequence of source text, the coding
10591 system bound to the corresponding coding-category is selected.
10593 Don't modify this variable directly, but use `set-coding-priority'. */);
10595 int i;
10597 Vcoding_category_list = Qnil;
10598 for (i = coding_category_max - 1; i >= 0; i--)
10599 Vcoding_category_list
10600 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10601 Vcoding_category_list);
10604 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10605 doc: /* Specify the coding system for read operations.
10606 It is useful to bind this variable with `let', but do not set it globally.
10607 If the value is a coding system, it is used for decoding on read operation.
10608 If not, an appropriate element is used from one of the coding system alists.
10609 There are three such tables: `file-coding-system-alist',
10610 `process-coding-system-alist', and `network-coding-system-alist'. */);
10611 Vcoding_system_for_read = Qnil;
10613 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10614 doc: /* Specify the coding system for write operations.
10615 Programs bind this variable with `let', but you should not set it globally.
10616 If the value is a coding system, it is used for encoding of output,
10617 when writing it to a file and when sending it to a file or subprocess.
10619 If this does not specify a coding system, an appropriate element
10620 is used from one of the coding system alists.
10621 There are three such tables: `file-coding-system-alist',
10622 `process-coding-system-alist', and `network-coding-system-alist'.
10623 For output to files, if the above procedure does not specify a coding system,
10624 the value of `buffer-file-coding-system' is used. */);
10625 Vcoding_system_for_write = Qnil;
10627 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10628 doc: /*
10629 Coding system used in the latest file or process I/O. */);
10630 Vlast_coding_system_used = Qnil;
10632 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10633 doc: /*
10634 Error status of the last code conversion.
10636 When an error was detected in the last code conversion, this variable
10637 is set to one of the following symbols.
10638 `insufficient-source'
10639 `inconsistent-eol'
10640 `invalid-source'
10641 `interrupted'
10642 `insufficient-memory'
10643 When no error was detected, the value doesn't change. So, to check
10644 the error status of a code conversion by this variable, you must
10645 explicitly set this variable to nil before performing code
10646 conversion. */);
10647 Vlast_code_conversion_error = Qnil;
10649 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10650 doc: /*
10651 *Non-nil means always inhibit code conversion of end-of-line format.
10652 See info node `Coding Systems' and info node `Text and Binary' concerning
10653 such conversion. */);
10654 inhibit_eol_conversion = 0;
10656 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10657 doc: /*
10658 Non-nil means process buffer inherits coding system of process output.
10659 Bind it to t if the process output is to be treated as if it were a file
10660 read from some filesystem. */);
10661 inherit_process_coding_system = 0;
10663 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10664 doc: /*
10665 Alist to decide a coding system to use for a file I/O operation.
10666 The format is ((PATTERN . VAL) ...),
10667 where PATTERN is a regular expression matching a file name,
10668 VAL is a coding system, a cons of coding systems, or a function symbol.
10669 If VAL is a coding system, it is used for both decoding and encoding
10670 the file contents.
10671 If VAL is a cons of coding systems, the car part is used for decoding,
10672 and the cdr part is used for encoding.
10673 If VAL is a function symbol, the function must return a coding system
10674 or a cons of coding systems which are used as above. The function is
10675 called with an argument that is a list of the arguments with which
10676 `find-operation-coding-system' was called. If the function can't decide
10677 a coding system, it can return `undecided' so that the normal
10678 code-detection is performed.
10680 See also the function `find-operation-coding-system'
10681 and the variable `auto-coding-alist'. */);
10682 Vfile_coding_system_alist = Qnil;
10684 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10685 doc: /*
10686 Alist to decide a coding system to use for a process I/O operation.
10687 The format is ((PATTERN . VAL) ...),
10688 where PATTERN is a regular expression matching a program name,
10689 VAL is a coding system, a cons of coding systems, or a function symbol.
10690 If VAL is a coding system, it is used for both decoding what received
10691 from the program and encoding what sent to the program.
10692 If VAL is a cons of coding systems, the car part is used for decoding,
10693 and the cdr part is used for encoding.
10694 If VAL is a function symbol, the function must return a coding system
10695 or a cons of coding systems which are used as above.
10697 See also the function `find-operation-coding-system'. */);
10698 Vprocess_coding_system_alist = Qnil;
10700 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10701 doc: /*
10702 Alist to decide a coding system to use for a network I/O operation.
10703 The format is ((PATTERN . VAL) ...),
10704 where PATTERN is a regular expression matching a network service name
10705 or is a port number to connect to,
10706 VAL is a coding system, a cons of coding systems, or a function symbol.
10707 If VAL is a coding system, it is used for both decoding what received
10708 from the network stream and encoding what sent to the network stream.
10709 If VAL is a cons of coding systems, the car part is used for decoding,
10710 and the cdr part is used for encoding.
10711 If VAL is a function symbol, the function must return a coding system
10712 or a cons of coding systems which are used as above.
10714 See also the function `find-operation-coding-system'. */);
10715 Vnetwork_coding_system_alist = Qnil;
10717 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10718 doc: /* Coding system to use with system messages.
10719 Also used for decoding keyboard input on X Window system. */);
10720 Vlocale_coding_system = Qnil;
10722 /* The eol mnemonics are reset in startup.el system-dependently. */
10723 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10724 doc: /*
10725 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
10726 eol_mnemonic_unix = build_string (":");
10728 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10729 doc: /*
10730 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
10731 eol_mnemonic_dos = build_string ("\\");
10733 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10734 doc: /*
10735 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
10736 eol_mnemonic_mac = build_string ("/");
10738 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10739 doc: /*
10740 *String displayed in mode line when end-of-line format is not yet determined. */);
10741 eol_mnemonic_undecided = build_string (":");
10743 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10744 doc: /*
10745 *Non-nil enables character translation while encoding and decoding. */);
10746 Venable_character_translation = Qt;
10748 DEFVAR_LISP ("standard-translation-table-for-decode",
10749 &Vstandard_translation_table_for_decode,
10750 doc: /* Table for translating characters while decoding. */);
10751 Vstandard_translation_table_for_decode = Qnil;
10753 DEFVAR_LISP ("standard-translation-table-for-encode",
10754 &Vstandard_translation_table_for_encode,
10755 doc: /* Table for translating characters while encoding. */);
10756 Vstandard_translation_table_for_encode = Qnil;
10758 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10759 doc: /* Alist of charsets vs revision numbers.
10760 While encoding, if a charset (car part of an element) is found,
10761 designate it with the escape sequence identifying revision (cdr part
10762 of the element). */);
10763 Vcharset_revision_table = Qnil;
10765 DEFVAR_LISP ("default-process-coding-system",
10766 &Vdefault_process_coding_system,
10767 doc: /* Cons of coding systems used for process I/O by default.
10768 The car part is used for decoding a process output,
10769 the cdr part is used for encoding a text to be sent to a process. */);
10770 Vdefault_process_coding_system = Qnil;
10772 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10773 doc: /*
10774 Table of extra Latin codes in the range 128..159 (inclusive).
10775 This is a vector of length 256.
10776 If Nth element is non-nil, the existence of code N in a file
10777 \(or output of subprocess) doesn't prevent it to be detected as
10778 a coding system of ISO 2022 variant which has a flag
10779 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10780 or reading output of a subprocess.
10781 Only 128th through 159th elements have a meaning. */);
10782 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10784 DEFVAR_LISP ("select-safe-coding-system-function",
10785 &Vselect_safe_coding_system_function,
10786 doc: /*
10787 Function to call to select safe coding system for encoding a text.
10789 If set, this function is called to force a user to select a proper
10790 coding system which can encode the text in the case that a default
10791 coding system used in each operation can't encode the text. The
10792 function should take care that the buffer is not modified while
10793 the coding system is being selected.
10795 The default value is `select-safe-coding-system' (which see). */);
10796 Vselect_safe_coding_system_function = Qnil;
10798 DEFVAR_BOOL ("coding-system-require-warning",
10799 &coding_system_require_warning,
10800 doc: /* Internal use only.
10801 If non-nil, on writing a file, `select-safe-coding-system-function' is
10802 called even if `coding-system-for-write' is non-nil. The command
10803 `universal-coding-system-argument' binds this variable to t temporarily. */);
10804 coding_system_require_warning = 0;
10807 DEFVAR_BOOL ("inhibit-iso-escape-detection",
10808 &inhibit_iso_escape_detection,
10809 doc: /*
10810 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10812 When Emacs reads text, it tries to detect how the text is encoded.
10813 This code detection is sensitive to escape sequences. If Emacs sees
10814 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10815 of the ISO2022 encodings, and decodes text by the corresponding coding
10816 system (e.g. `iso-2022-7bit').
10818 However, there may be a case that you want to read escape sequences in
10819 a file as is. In such a case, you can set this variable to non-nil.
10820 Then the code detection will ignore any escape sequences, and no text is
10821 detected as encoded in some ISO-2022 encoding. The result is that all
10822 escape sequences become visible in a buffer.
10824 The default value is nil, and it is strongly recommended not to change
10825 it. That is because many Emacs Lisp source files that contain
10826 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10827 in Emacs's distribution, and they won't be decoded correctly on
10828 reading if you suppress escape sequence detection.
10830 The other way to read escape sequences in a file without decoding is
10831 to explicitly specify some coding system that doesn't use ISO-2022
10832 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
10833 inhibit_iso_escape_detection = 0;
10835 DEFVAR_BOOL ("inhibit-null-byte-detection",
10836 &inhibit_null_byte_detection,
10837 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10838 By default, Emacs treats it as binary data, and does not attempt to
10839 decode it. The effect is as if you specified `no-conversion' for
10840 reading that text.
10842 Set this to non-nil when a regular text happens to include null bytes.
10843 Examples are Index nodes of Info files and null-byte delimited output
10844 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10845 decode text as usual. */);
10846 inhibit_null_byte_detection = 0;
10848 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10849 doc: /* Char table for translating self-inserting characters.
10850 This is applied to the result of input methods, not their input.
10851 See also `keyboard-translate-table'.
10853 Use of this variable for character code unification was rendered
10854 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10855 internal character representation. */);
10856 Vtranslation_table_for_input = Qnil;
10859 Lisp_Object args[coding_arg_max];
10860 Lisp_Object plist[16];
10861 int i;
10863 for (i = 0; i < coding_arg_max; i++)
10864 args[i] = Qnil;
10866 plist[0] = intern (":name");
10867 plist[1] = args[coding_arg_name] = Qno_conversion;
10868 plist[2] = intern (":mnemonic");
10869 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10870 plist[4] = intern (":coding-type");
10871 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10872 plist[6] = intern (":ascii-compatible-p");
10873 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10874 plist[8] = intern (":default-char");
10875 plist[9] = args[coding_arg_default_char] = make_number (0);
10876 plist[10] = intern (":for-unibyte");
10877 plist[11] = args[coding_arg_for_unibyte] = Qt;
10878 plist[12] = intern (":docstring");
10879 plist[13] = build_string ("Do no conversion.\n\
10881 When you visit a file with this coding, the file is read into a\n\
10882 unibyte buffer as is, thus each byte of a file is treated as a\n\
10883 character.");
10884 plist[14] = intern (":eol-type");
10885 plist[15] = args[coding_arg_eol_type] = Qunix;
10886 args[coding_arg_plist] = Flist (16, plist);
10887 Fdefine_coding_system_internal (coding_arg_max, args);
10889 plist[1] = args[coding_arg_name] = Qundecided;
10890 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10891 plist[5] = args[coding_arg_coding_type] = Qundecided;
10892 /* This is already set.
10893 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10894 plist[8] = intern (":charset-list");
10895 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10896 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10897 plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
10898 plist[15] = args[coding_arg_eol_type] = Qnil;
10899 args[coding_arg_plist] = Flist (16, plist);
10900 Fdefine_coding_system_internal (coding_arg_max, args);
10903 setup_coding_system (Qno_conversion, &safe_terminal_coding);
10906 int i;
10908 for (i = 0; i < coding_category_max; i++)
10909 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10911 #if defined (MSDOS) || defined (WINDOWSNT)
10912 system_eol_type = Qdos;
10913 #else
10914 system_eol_type = Qunix;
10915 #endif
10916 staticpro (&system_eol_type);
10919 char *
10920 emacs_strerror (error_number)
10921 int error_number;
10923 char *str;
10925 synchronize_system_messages_locale ();
10926 str = strerror (error_number);
10928 if (! NILP (Vlocale_coding_system))
10930 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10931 Vlocale_coding_system,
10933 str = (char *) SDATA (dec);
10936 return str;
10939 #endif /* emacs */
10941 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10942 (do not change this comment) */