Comment (minor header format fixes).
[emacs.git] / src / coding.c
blob01878a37b5c0b5f45590f64f4f21d9f6016db41a
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5 2005, 2006, 2007, 2008, 2009
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8 Copyright (C) 2003
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
12 This file is part of GNU Emacs.
14 GNU Emacs is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
19 GNU Emacs is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
27 /*** TABLE OF CONTENTS ***
29 0. General comments
30 1. Preamble
31 2. Emacs' internal format (emacs-utf-8) handlers
32 3. UTF-8 handlers
33 4. UTF-16 handlers
34 5. Charset-base coding systems handlers
35 6. emacs-mule (old Emacs' internal format) handlers
36 7. ISO2022 handlers
37 8. Shift-JIS and BIG5 handlers
38 9. CCL handlers
39 10. C library functions
40 11. Emacs Lisp library functions
41 12. Postamble
45 /*** 0. General comments ***
48 CODING SYSTEM
50 A coding system is an object for an encoding mechanism that contains
51 information about how to convert byte sequences to character
52 sequences and vice versa. When we say "decode", it means converting
53 a byte sequence of a specific coding system into a character
54 sequence that is represented by Emacs' internal coding system
55 `emacs-utf-8', and when we say "encode", it means converting a
56 character sequence of emacs-utf-8 to a byte sequence of a specific
57 coding system.
59 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
60 C level, a coding system is represented by a vector of attributes
61 stored in the hash table Vcharset_hash_table. The conversion from
62 coding system symbol to attributes vector is done by looking up
63 Vcharset_hash_table by the symbol.
65 Coding systems are classified into the following types depending on
66 the encoding mechanism. Here's a brief description of the types.
68 o UTF-8
70 o UTF-16
72 o Charset-base coding system
74 A coding system defined by one or more (coded) character sets.
75 Decoding and encoding are done by a code converter defined for each
76 character set.
78 o Old Emacs internal format (emacs-mule)
80 The coding system adopted by old versions of Emacs (20 and 21).
82 o ISO2022-base coding system
84 The most famous coding system for multiple character sets. X's
85 Compound Text, various EUCs (Extended Unix Code), and coding systems
86 used in the Internet communication such as ISO-2022-JP are all
87 variants of ISO2022.
89 o SJIS (or Shift-JIS or MS-Kanji-Code)
91 A coding system to encode character sets: ASCII, JISX0201, and
92 JISX0208. Widely used for PC's in Japan. Details are described in
93 section 8.
95 o BIG5
97 A coding system to encode character sets: ASCII and Big5. Widely
98 used for Chinese (mainly in Taiwan and Hong Kong). Details are
99 described in section 8. In this file, when we write "big5" (all
100 lowercase), we mean the coding system, and when we write "Big5"
101 (capitalized), we mean the character set.
103 o CCL
105 If a user wants to decode/encode text encoded in a coding system
106 not listed above, he can supply a decoder and an encoder for it in
107 CCL (Code Conversion Language) programs. Emacs executes the CCL
108 program while decoding/encoding.
110 o Raw-text
112 A coding system for text containing raw eight-bit data. Emacs
113 treats each byte of source text as a character (except for
114 end-of-line conversion).
116 o No-conversion
118 Like raw text, but don't do end-of-line conversion.
121 END-OF-LINE FORMAT
123 How text end-of-line is encoded depends on operating system. For
124 instance, Unix's format is just one byte of LF (line-feed) code,
125 whereas DOS's format is two-byte sequence of `carriage-return' and
126 `line-feed' codes. MacOS's format is usually one byte of
127 `carriage-return'.
129 Since text character encoding and end-of-line encoding are
130 independent, any coding system described above can take any format
131 of end-of-line (except for no-conversion).
133 STRUCT CODING_SYSTEM
135 Before using a coding system for code conversion (i.e. decoding and
136 encoding), we setup a structure of type `struct coding_system'.
137 This structure keeps various information about a specific code
138 conversion (e.g. the location of source and destination data).
142 /* COMMON MACROS */
145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
147 These functions check if a byte sequence specified as a source in
148 CODING conforms to the format of XXX, and update the members of
149 DETECT_INFO.
151 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
153 Below is the template of these functions. */
155 #if 0
156 static int
157 detect_coding_XXX (coding, detect_info)
158 struct coding_system *coding;
159 struct coding_detection_info *detect_info;
161 const unsigned char *src = coding->source;
162 const unsigned char *src_end = coding->source + coding->src_bytes;
163 int multibytep = coding->src_multibyte;
164 int consumed_chars = 0;
165 int found = 0;
166 ...;
168 while (1)
170 /* Get one byte from the source. If the souce is exausted, jump
171 to no_more_source:. */
172 ONE_MORE_BYTE (c);
174 if (! __C_conforms_to_XXX___ (c))
175 break;
176 if (! __C_strongly_suggests_XXX__ (c))
177 found = CATEGORY_MASK_XXX;
179 /* The byte sequence is invalid for XXX. */
180 detect_info->rejected |= CATEGORY_MASK_XXX;
181 return 0;
183 no_more_source:
184 /* The source exausted successfully. */
185 detect_info->found |= found;
186 return 1;
188 #endif
190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
192 These functions decode a byte sequence specified as a source by
193 CODING. The resulting multibyte text goes to a place pointed to by
194 CODING->charbuf, the length of which should not exceed
195 CODING->charbuf_size;
197 These functions set the information of original and decoded texts in
198 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
199 They also set CODING->result to one of CODING_RESULT_XXX indicating
200 how the decoding is finished.
202 Below is the template of these functions. */
204 #if 0
205 static void
206 decode_coding_XXXX (coding)
207 struct coding_system *coding;
209 const unsigned char *src = coding->source + coding->consumed;
210 const unsigned char *src_end = coding->source + coding->src_bytes;
211 /* SRC_BASE remembers the start position in source in each loop.
212 The loop will be exited when there's not enough source code, or
213 when there's no room in CHARBUF for a decoded character. */
214 const unsigned char *src_base;
215 /* A buffer to produce decoded characters. */
216 int *charbuf = coding->charbuf + coding->charbuf_used;
217 int *charbuf_end = coding->charbuf + coding->charbuf_size;
218 int multibytep = coding->src_multibyte;
220 while (1)
222 src_base = src;
223 if (charbuf < charbuf_end)
224 /* No more room to produce a decoded character. */
225 break;
226 ONE_MORE_BYTE (c);
227 /* Decode it. */
230 no_more_source:
231 if (src_base < src_end
232 && coding->mode & CODING_MODE_LAST_BLOCK)
233 /* If the source ends by partial bytes to construct a character,
234 treat them as eight-bit raw data. */
235 while (src_base < src_end && charbuf < charbuf_end)
236 *charbuf++ = *src_base++;
237 /* Remember how many bytes and characters we consumed. If the
238 source is multibyte, the bytes and chars are not identical. */
239 coding->consumed = coding->consumed_char = src_base - coding->source;
240 /* Remember how many characters we produced. */
241 coding->charbuf_used = charbuf - coding->charbuf;
243 #endif
245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
247 These functions encode SRC_BYTES length text at SOURCE of Emacs'
248 internal multibyte format by CODING. The resulting byte sequence
249 goes to a place pointed to by DESTINATION, the length of which
250 should not exceed DST_BYTES.
252 These functions set the information of original and encoded texts in
253 the members produced, produced_char, consumed, and consumed_char of
254 the structure *CODING. They also set the member result to one of
255 CODING_RESULT_XXX indicating how the encoding finished.
257 DST_BYTES zero means that source area and destination area are
258 overlapped, which means that we can produce a encoded text until it
259 reaches at the head of not-yet-encoded source text.
261 Below is a template of these functions. */
262 #if 0
263 static void
264 encode_coding_XXX (coding)
265 struct coding_system *coding;
267 int multibytep = coding->dst_multibyte;
268 int *charbuf = coding->charbuf;
269 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
270 unsigned char *dst = coding->destination + coding->produced;
271 unsigned char *dst_end = coding->destination + coding->dst_bytes;
272 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
273 int produced_chars = 0;
275 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
277 int c = *charbuf;
278 /* Encode C into DST, and increment DST. */
280 label_no_more_destination:
281 /* How many chars and bytes we produced. */
282 coding->produced_char += produced_chars;
283 coding->produced = dst - coding->destination;
285 #endif
288 /*** 1. Preamble ***/
290 #include <config.h>
291 #include <stdio.h>
293 #include "lisp.h"
294 #include "buffer.h"
295 #include "character.h"
296 #include "charset.h"
297 #include "ccl.h"
298 #include "composite.h"
299 #include "coding.h"
300 #include "window.h"
301 #include "frame.h"
302 #include "termhooks.h"
304 Lisp_Object Vcoding_system_hash_table;
306 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
307 Lisp_Object Qunix, Qdos;
308 extern Lisp_Object Qmac; /* frame.c */
309 Lisp_Object Qbuffer_file_coding_system;
310 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
311 Lisp_Object Qdefault_char;
312 Lisp_Object Qno_conversion, Qundecided;
313 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
314 Lisp_Object Qbig, Qlittle;
315 Lisp_Object Qcoding_system_history;
316 Lisp_Object Qvalid_codes;
317 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
318 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
319 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
320 Lisp_Object QCascii_compatible_p;
322 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
323 Lisp_Object Qcall_process, Qcall_process_region;
324 Lisp_Object Qstart_process, Qopen_network_stream;
325 Lisp_Object Qtarget_idx;
327 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
328 Lisp_Object Qinterrupted, Qinsufficient_memory;
330 extern Lisp_Object Qcompletion_ignore_case;
332 /* If a symbol has this property, evaluate the value to define the
333 symbol as a coding system. */
334 static Lisp_Object Qcoding_system_define_form;
336 int coding_system_require_warning;
338 Lisp_Object Vselect_safe_coding_system_function;
340 /* Mnemonic string for each format of end-of-line. */
341 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
342 /* Mnemonic string to indicate format of end-of-line is not yet
343 decided. */
344 Lisp_Object eol_mnemonic_undecided;
346 /* Format of end-of-line decided by system. This is Qunix on
347 Unix and Mac, Qdos on DOS/Windows.
348 This has an effect only for external encoding (i.e. for output to
349 file and process), not for in-buffer or Lisp string encoding. */
350 static Lisp_Object system_eol_type;
352 #ifdef emacs
354 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
356 Lisp_Object Qcoding_system_p, Qcoding_system_error;
358 /* Coding system emacs-mule and raw-text are for converting only
359 end-of-line format. */
360 Lisp_Object Qemacs_mule, Qraw_text;
361 Lisp_Object Qutf_8_emacs;
363 /* Coding-systems are handed between Emacs Lisp programs and C internal
364 routines by the following three variables. */
365 /* Coding-system for reading files and receiving data from process. */
366 Lisp_Object Vcoding_system_for_read;
367 /* Coding-system for writing files and sending data to process. */
368 Lisp_Object Vcoding_system_for_write;
369 /* Coding-system actually used in the latest I/O. */
370 Lisp_Object Vlast_coding_system_used;
371 /* Set to non-nil when an error is detected while code conversion. */
372 Lisp_Object Vlast_code_conversion_error;
373 /* A vector of length 256 which contains information about special
374 Latin codes (especially for dealing with Microsoft codes). */
375 Lisp_Object Vlatin_extra_code_table;
377 /* Flag to inhibit code conversion of end-of-line format. */
378 int inhibit_eol_conversion;
380 /* Flag to inhibit ISO2022 escape sequence detection. */
381 int inhibit_iso_escape_detection;
383 /* Flag to inhibit detection of binary files through null bytes. */
384 int inhibit_null_byte_detection;
386 /* Flag to make buffer-file-coding-system inherit from process-coding. */
387 int inherit_process_coding_system;
389 /* Coding system to be used to encode text for terminal display when
390 terminal coding system is nil. */
391 struct coding_system safe_terminal_coding;
393 Lisp_Object Vfile_coding_system_alist;
394 Lisp_Object Vprocess_coding_system_alist;
395 Lisp_Object Vnetwork_coding_system_alist;
397 Lisp_Object Vlocale_coding_system;
399 #endif /* emacs */
401 /* Flag to tell if we look up translation table on character code
402 conversion. */
403 Lisp_Object Venable_character_translation;
404 /* Standard translation table to look up on decoding (reading). */
405 Lisp_Object Vstandard_translation_table_for_decode;
406 /* Standard translation table to look up on encoding (writing). */
407 Lisp_Object Vstandard_translation_table_for_encode;
409 Lisp_Object Qtranslation_table;
410 Lisp_Object Qtranslation_table_id;
411 Lisp_Object Qtranslation_table_for_decode;
412 Lisp_Object Qtranslation_table_for_encode;
414 /* Alist of charsets vs revision number. */
415 static Lisp_Object Vcharset_revision_table;
417 /* Default coding systems used for process I/O. */
418 Lisp_Object Vdefault_process_coding_system;
420 /* Char table for translating Quail and self-inserting input. */
421 Lisp_Object Vtranslation_table_for_input;
423 /* Two special coding systems. */
424 Lisp_Object Vsjis_coding_system;
425 Lisp_Object Vbig5_coding_system;
427 /* ISO2022 section */
429 #define CODING_ISO_INITIAL(coding, reg) \
430 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
431 coding_attr_iso_initial), \
432 reg)))
435 #define CODING_ISO_REQUEST(coding, charset_id) \
436 ((charset_id <= (coding)->max_charset_id \
437 ? (coding)->safe_charsets[charset_id] \
438 : -1))
441 #define CODING_ISO_FLAGS(coding) \
442 ((coding)->spec.iso_2022.flags)
443 #define CODING_ISO_DESIGNATION(coding, reg) \
444 ((coding)->spec.iso_2022.current_designation[reg])
445 #define CODING_ISO_INVOCATION(coding, plane) \
446 ((coding)->spec.iso_2022.current_invocation[plane])
447 #define CODING_ISO_SINGLE_SHIFTING(coding) \
448 ((coding)->spec.iso_2022.single_shifting)
449 #define CODING_ISO_BOL(coding) \
450 ((coding)->spec.iso_2022.bol)
451 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
452 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
454 /* Control characters of ISO2022. */
455 /* code */ /* function */
456 #define ISO_CODE_LF 0x0A /* line-feed */
457 #define ISO_CODE_CR 0x0D /* carriage-return */
458 #define ISO_CODE_SO 0x0E /* shift-out */
459 #define ISO_CODE_SI 0x0F /* shift-in */
460 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
461 #define ISO_CODE_ESC 0x1B /* escape */
462 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
463 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
464 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
466 /* All code (1-byte) of ISO2022 is classified into one of the
467 followings. */
468 enum iso_code_class_type
470 ISO_control_0, /* Control codes in the range
471 0x00..0x1F and 0x7F, except for the
472 following 5 codes. */
473 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
474 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
475 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
476 ISO_escape, /* ISO_CODE_SO (0x1B) */
477 ISO_control_1, /* Control codes in the range
478 0x80..0x9F, except for the
479 following 3 codes. */
480 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
481 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
482 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
483 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
484 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
485 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
486 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
489 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
490 `iso-flags' attribute of an iso2022 coding system. */
492 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
493 instead of the correct short-form sequence (e.g. ESC $ A). */
494 #define CODING_ISO_FLAG_LONG_FORM 0x0001
496 /* If set, reset graphic planes and registers at end-of-line to the
497 initial state. */
498 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
500 /* If set, reset graphic planes and registers before any control
501 characters to the initial state. */
502 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
504 /* If set, encode by 7-bit environment. */
505 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
507 /* If set, use locking-shift function. */
508 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
510 /* If set, use single-shift function. Overwrite
511 CODING_ISO_FLAG_LOCKING_SHIFT. */
512 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
514 /* If set, use designation escape sequence. */
515 #define CODING_ISO_FLAG_DESIGNATION 0x0040
517 /* If set, produce revision number sequence. */
518 #define CODING_ISO_FLAG_REVISION 0x0080
520 /* If set, produce ISO6429's direction specifying sequence. */
521 #define CODING_ISO_FLAG_DIRECTION 0x0100
523 /* If set, assume designation states are reset at beginning of line on
524 output. */
525 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
527 /* If set, designation sequence should be placed at beginning of line
528 on output. */
529 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
531 /* If set, do not encode unsafe charactes on output. */
532 #define CODING_ISO_FLAG_SAFE 0x0800
534 /* If set, extra latin codes (128..159) are accepted as a valid code
535 on input. */
536 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
538 #define CODING_ISO_FLAG_COMPOSITION 0x2000
540 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
542 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
544 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
546 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
548 /* A character to be produced on output if encoding of the original
549 character is prohibited by CODING_ISO_FLAG_SAFE. */
550 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
552 /* UTF-8 section */
553 #define CODING_UTF_8_BOM(coding) \
554 ((coding)->spec.utf_8_bom)
556 /* UTF-16 section */
557 #define CODING_UTF_16_BOM(coding) \
558 ((coding)->spec.utf_16.bom)
560 #define CODING_UTF_16_ENDIAN(coding) \
561 ((coding)->spec.utf_16.endian)
563 #define CODING_UTF_16_SURROGATE(coding) \
564 ((coding)->spec.utf_16.surrogate)
567 /* CCL section */
568 #define CODING_CCL_DECODER(coding) \
569 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
570 #define CODING_CCL_ENCODER(coding) \
571 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
572 #define CODING_CCL_VALIDS(coding) \
573 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
575 /* Index for each coding category in `coding_categories' */
577 enum coding_category
579 coding_category_iso_7,
580 coding_category_iso_7_tight,
581 coding_category_iso_8_1,
582 coding_category_iso_8_2,
583 coding_category_iso_7_else,
584 coding_category_iso_8_else,
585 coding_category_utf_8_auto,
586 coding_category_utf_8_nosig,
587 coding_category_utf_8_sig,
588 coding_category_utf_16_auto,
589 coding_category_utf_16_be,
590 coding_category_utf_16_le,
591 coding_category_utf_16_be_nosig,
592 coding_category_utf_16_le_nosig,
593 coding_category_charset,
594 coding_category_sjis,
595 coding_category_big5,
596 coding_category_ccl,
597 coding_category_emacs_mule,
598 /* All above are targets of code detection. */
599 coding_category_raw_text,
600 coding_category_undecided,
601 coding_category_max
604 /* Definitions of flag bits used in detect_coding_XXXX. */
605 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
606 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
607 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
608 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
609 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
610 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
611 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
612 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
613 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
614 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
615 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
616 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
617 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
618 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
619 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
620 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
621 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
622 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
623 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
624 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
626 /* This value is returned if detect_coding_mask () find nothing other
627 than ASCII characters. */
628 #define CATEGORY_MASK_ANY \
629 (CATEGORY_MASK_ISO_7 \
630 | CATEGORY_MASK_ISO_7_TIGHT \
631 | CATEGORY_MASK_ISO_8_1 \
632 | CATEGORY_MASK_ISO_8_2 \
633 | CATEGORY_MASK_ISO_7_ELSE \
634 | CATEGORY_MASK_ISO_8_ELSE \
635 | CATEGORY_MASK_UTF_8_AUTO \
636 | CATEGORY_MASK_UTF_8_NOSIG \
637 | CATEGORY_MASK_UTF_8_SIG \
638 | CATEGORY_MASK_UTF_16_AUTO \
639 | CATEGORY_MASK_UTF_16_BE \
640 | CATEGORY_MASK_UTF_16_LE \
641 | CATEGORY_MASK_UTF_16_BE_NOSIG \
642 | CATEGORY_MASK_UTF_16_LE_NOSIG \
643 | CATEGORY_MASK_CHARSET \
644 | CATEGORY_MASK_SJIS \
645 | CATEGORY_MASK_BIG5 \
646 | CATEGORY_MASK_CCL \
647 | CATEGORY_MASK_EMACS_MULE)
650 #define CATEGORY_MASK_ISO_7BIT \
651 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
653 #define CATEGORY_MASK_ISO_8BIT \
654 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
656 #define CATEGORY_MASK_ISO_ELSE \
657 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
659 #define CATEGORY_MASK_ISO_ESCAPE \
660 (CATEGORY_MASK_ISO_7 \
661 | CATEGORY_MASK_ISO_7_TIGHT \
662 | CATEGORY_MASK_ISO_7_ELSE \
663 | CATEGORY_MASK_ISO_8_ELSE)
665 #define CATEGORY_MASK_ISO \
666 ( CATEGORY_MASK_ISO_7BIT \
667 | CATEGORY_MASK_ISO_8BIT \
668 | CATEGORY_MASK_ISO_ELSE)
670 #define CATEGORY_MASK_UTF_16 \
671 (CATEGORY_MASK_UTF_16_AUTO \
672 | CATEGORY_MASK_UTF_16_BE \
673 | CATEGORY_MASK_UTF_16_LE \
674 | CATEGORY_MASK_UTF_16_BE_NOSIG \
675 | CATEGORY_MASK_UTF_16_LE_NOSIG)
677 #define CATEGORY_MASK_UTF_8 \
678 (CATEGORY_MASK_UTF_8_AUTO \
679 | CATEGORY_MASK_UTF_8_NOSIG \
680 | CATEGORY_MASK_UTF_8_SIG)
682 /* List of symbols `coding-category-xxx' ordered by priority. This
683 variable is exposed to Emacs Lisp. */
684 static Lisp_Object Vcoding_category_list;
686 /* Table of coding categories (Lisp symbols). This variable is for
687 internal use oly. */
688 static Lisp_Object Vcoding_category_table;
690 /* Table of coding-categories ordered by priority. */
691 static enum coding_category coding_priorities[coding_category_max];
693 /* Nth element is a coding context for the coding system bound to the
694 Nth coding category. */
695 static struct coding_system coding_categories[coding_category_max];
697 /*** Commonly used macros and functions ***/
699 #ifndef min
700 #define min(a, b) ((a) < (b) ? (a) : (b))
701 #endif
702 #ifndef max
703 #define max(a, b) ((a) > (b) ? (a) : (b))
704 #endif
706 #define CODING_GET_INFO(coding, attrs, charset_list) \
707 do { \
708 (attrs) = CODING_ID_ATTRS ((coding)->id); \
709 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
710 } while (0)
713 /* Safely get one byte from the source text pointed by SRC which ends
714 at SRC_END, and set C to that byte. If there are not enough bytes
715 in the source, it jumps to `no_more_source'. If multibytep is
716 nonzero, and a multibyte character is found at SRC, set C to the
717 negative value of the character code. The caller should declare
718 and set these variables appropriately in advance:
719 src, src_end, multibytep */
721 #define ONE_MORE_BYTE(c) \
722 do { \
723 if (src == src_end) \
725 if (src_base < src) \
726 record_conversion_result \
727 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
728 goto no_more_source; \
730 c = *src++; \
731 if (multibytep && (c & 0x80)) \
733 if ((c & 0xFE) == 0xC0) \
734 c = ((c & 1) << 6) | *src++; \
735 else \
737 src--; \
738 c = - string_char (src, &src, NULL); \
739 record_conversion_result \
740 (coding, CODING_RESULT_INVALID_SRC); \
743 consumed_chars++; \
744 } while (0)
747 #define ONE_MORE_BYTE_NO_CHECK(c) \
748 do { \
749 c = *src++; \
750 if (multibytep && (c & 0x80)) \
752 if ((c & 0xFE) == 0xC0) \
753 c = ((c & 1) << 6) | *src++; \
754 else \
756 src--; \
757 c = - string_char (src, &src, NULL); \
758 record_conversion_result \
759 (coding, CODING_RESULT_INVALID_SRC); \
762 consumed_chars++; \
763 } while (0)
766 /* Store a byte C in the place pointed by DST and increment DST to the
767 next free point, and increment PRODUCED_CHARS. The caller should
768 assure that C is 0..127, and declare and set the variable `dst'
769 appropriately in advance.
773 #define EMIT_ONE_ASCII_BYTE(c) \
774 do { \
775 produced_chars++; \
776 *dst++ = (c); \
777 } while (0)
780 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
782 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
783 do { \
784 produced_chars += 2; \
785 *dst++ = (c1), *dst++ = (c2); \
786 } while (0)
789 /* Store a byte C in the place pointed by DST and increment DST to the
790 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
791 nonzero, store in an appropriate multibyte from. The caller should
792 declare and set the variables `dst' and `multibytep' appropriately
793 in advance. */
795 #define EMIT_ONE_BYTE(c) \
796 do { \
797 produced_chars++; \
798 if (multibytep) \
800 int ch = (c); \
801 if (ch >= 0x80) \
802 ch = BYTE8_TO_CHAR (ch); \
803 CHAR_STRING_ADVANCE (ch, dst); \
805 else \
806 *dst++ = (c); \
807 } while (0)
810 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
812 #define EMIT_TWO_BYTES(c1, c2) \
813 do { \
814 produced_chars += 2; \
815 if (multibytep) \
817 int ch; \
819 ch = (c1); \
820 if (ch >= 0x80) \
821 ch = BYTE8_TO_CHAR (ch); \
822 CHAR_STRING_ADVANCE (ch, dst); \
823 ch = (c2); \
824 if (ch >= 0x80) \
825 ch = BYTE8_TO_CHAR (ch); \
826 CHAR_STRING_ADVANCE (ch, dst); \
828 else \
830 *dst++ = (c1); \
831 *dst++ = (c2); \
833 } while (0)
836 #define EMIT_THREE_BYTES(c1, c2, c3) \
837 do { \
838 EMIT_ONE_BYTE (c1); \
839 EMIT_TWO_BYTES (c2, c3); \
840 } while (0)
843 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
844 do { \
845 EMIT_TWO_BYTES (c1, c2); \
846 EMIT_TWO_BYTES (c3, c4); \
847 } while (0)
850 /* Prototypes for static functions. */
851 static void record_conversion_result P_ ((struct coding_system *coding,
852 enum coding_result_code result));
853 static int detect_coding_utf_8 P_ ((struct coding_system *,
854 struct coding_detection_info *info));
855 static void decode_coding_utf_8 P_ ((struct coding_system *));
856 static int encode_coding_utf_8 P_ ((struct coding_system *));
858 static int detect_coding_utf_16 P_ ((struct coding_system *,
859 struct coding_detection_info *info));
860 static void decode_coding_utf_16 P_ ((struct coding_system *));
861 static int encode_coding_utf_16 P_ ((struct coding_system *));
863 static int detect_coding_iso_2022 P_ ((struct coding_system *,
864 struct coding_detection_info *info));
865 static void decode_coding_iso_2022 P_ ((struct coding_system *));
866 static int encode_coding_iso_2022 P_ ((struct coding_system *));
868 static int detect_coding_emacs_mule P_ ((struct coding_system *,
869 struct coding_detection_info *info));
870 static void decode_coding_emacs_mule P_ ((struct coding_system *));
871 static int encode_coding_emacs_mule P_ ((struct coding_system *));
873 static int detect_coding_sjis P_ ((struct coding_system *,
874 struct coding_detection_info *info));
875 static void decode_coding_sjis P_ ((struct coding_system *));
876 static int encode_coding_sjis P_ ((struct coding_system *));
878 static int detect_coding_big5 P_ ((struct coding_system *,
879 struct coding_detection_info *info));
880 static void decode_coding_big5 P_ ((struct coding_system *));
881 static int encode_coding_big5 P_ ((struct coding_system *));
883 static int detect_coding_ccl P_ ((struct coding_system *,
884 struct coding_detection_info *info));
885 static void decode_coding_ccl P_ ((struct coding_system *));
886 static int encode_coding_ccl P_ ((struct coding_system *));
888 static void decode_coding_raw_text P_ ((struct coding_system *));
889 static int encode_coding_raw_text P_ ((struct coding_system *));
891 static void coding_set_source P_ ((struct coding_system *));
892 static void coding_set_destination P_ ((struct coding_system *));
893 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
894 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
895 EMACS_INT, EMACS_INT));
896 static unsigned char *alloc_destination P_ ((struct coding_system *,
897 EMACS_INT, unsigned char *));
898 static void setup_iso_safe_charsets P_ ((Lisp_Object));
899 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
900 int *, int *,
901 unsigned char *));
902 static int detect_eol P_ ((const unsigned char *,
903 EMACS_INT, enum coding_category));
904 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
905 static void decode_eol P_ ((struct coding_system *));
906 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
907 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
908 int, int *, int *));
909 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
910 static INLINE void produce_composition P_ ((struct coding_system *, int *,
911 EMACS_INT));
912 static INLINE void produce_charset P_ ((struct coding_system *, int *,
913 EMACS_INT));
914 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
915 static int decode_coding P_ ((struct coding_system *));
916 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
917 struct coding_system *,
918 int *, EMACS_INT *));
919 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
920 struct coding_system *,
921 int *, EMACS_INT *));
922 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
923 static int encode_coding P_ ((struct coding_system *));
924 static Lisp_Object make_conversion_work_buffer P_ ((int));
925 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
926 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
927 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
929 static void
930 record_conversion_result (struct coding_system *coding,
931 enum coding_result_code result)
933 coding->result = result;
934 switch (result)
936 case CODING_RESULT_INSUFFICIENT_SRC:
937 Vlast_code_conversion_error = Qinsufficient_source;
938 break;
939 case CODING_RESULT_INCONSISTENT_EOL:
940 Vlast_code_conversion_error = Qinconsistent_eol;
941 break;
942 case CODING_RESULT_INVALID_SRC:
943 Vlast_code_conversion_error = Qinvalid_source;
944 break;
945 case CODING_RESULT_INTERRUPT:
946 Vlast_code_conversion_error = Qinterrupted;
947 break;
948 case CODING_RESULT_INSUFFICIENT_MEM:
949 Vlast_code_conversion_error = Qinsufficient_memory;
950 break;
951 default:
952 Vlast_code_conversion_error = intern ("Unknown error");
956 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
957 do { \
958 charset_map_loaded = 0; \
959 c = DECODE_CHAR (charset, code); \
960 if (charset_map_loaded) \
962 const unsigned char *orig = coding->source; \
963 EMACS_INT offset; \
965 coding_set_source (coding); \
966 offset = coding->source - orig; \
967 src += offset; \
968 src_base += offset; \
969 src_end += offset; \
971 } while (0)
974 /* If there are at least BYTES length of room at dst, allocate memory
975 for coding->destination and update dst and dst_end. We don't have
976 to take care of coding->source which will be relocated. It is
977 handled by calling coding_set_source in encode_coding. */
979 #define ASSURE_DESTINATION(bytes) \
980 do { \
981 if (dst + (bytes) >= dst_end) \
983 int more_bytes = charbuf_end - charbuf + (bytes); \
985 dst = alloc_destination (coding, more_bytes, dst); \
986 dst_end = coding->destination + coding->dst_bytes; \
988 } while (0)
991 /* Store multibyte form of the character C in P, and advance P to the
992 end of the multibyte form. This is like CHAR_STRING_ADVANCE but it
993 never calls MAYBE_UNIFY_CHAR. */
995 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) \
996 do { \
997 if ((c) <= MAX_1_BYTE_CHAR) \
998 *(p)++ = (c); \
999 else if ((c) <= MAX_2_BYTE_CHAR) \
1000 *(p)++ = (0xC0 | ((c) >> 6)), \
1001 *(p)++ = (0x80 | ((c) & 0x3F)); \
1002 else if ((c) <= MAX_3_BYTE_CHAR) \
1003 *(p)++ = (0xE0 | ((c) >> 12)), \
1004 *(p)++ = (0x80 | (((c) >> 6) & 0x3F)), \
1005 *(p)++ = (0x80 | ((c) & 0x3F)); \
1006 else if ((c) <= MAX_4_BYTE_CHAR) \
1007 *(p)++ = (0xF0 | (c >> 18)), \
1008 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
1009 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
1010 *(p)++ = (0x80 | (c & 0x3F)); \
1011 else if ((c) <= MAX_5_BYTE_CHAR) \
1012 *(p)++ = 0xF8, \
1013 *(p)++ = (0x80 | ((c >> 18) & 0x0F)), \
1014 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
1015 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
1016 *(p)++ = (0x80 | (c & 0x3F)); \
1017 else \
1018 (p) += BYTE8_STRING ((c) - 0x3FFF80, p); \
1019 } while (0)
1022 /* Return the character code of character whose multibyte form is at
1023 P, and advance P to the end of the multibyte form. This is like
1024 STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR. */
1026 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) \
1027 (!((p)[0] & 0x80) \
1028 ? *(p)++ \
1029 : ! ((p)[0] & 0x20) \
1030 ? ((p) += 2, \
1031 ((((p)[-2] & 0x1F) << 6) \
1032 | ((p)[-1] & 0x3F) \
1033 | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0))) \
1034 : ! ((p)[0] & 0x10) \
1035 ? ((p) += 3, \
1036 ((((p)[-3] & 0x0F) << 12) \
1037 | (((p)[-2] & 0x3F) << 6) \
1038 | ((p)[-1] & 0x3F))) \
1039 : ! ((p)[0] & 0x08) \
1040 ? ((p) += 4, \
1041 ((((p)[-4] & 0xF) << 18) \
1042 | (((p)[-3] & 0x3F) << 12) \
1043 | (((p)[-2] & 0x3F) << 6) \
1044 | ((p)[-1] & 0x3F))) \
1045 : ((p) += 5, \
1046 ((((p)[-4] & 0x3F) << 18) \
1047 | (((p)[-3] & 0x3F) << 12) \
1048 | (((p)[-2] & 0x3F) << 6) \
1049 | ((p)[-1] & 0x3F))))
1052 static void
1053 coding_set_source (coding)
1054 struct coding_system *coding;
1056 if (BUFFERP (coding->src_object))
1058 struct buffer *buf = XBUFFER (coding->src_object);
1060 if (coding->src_pos < 0)
1061 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1062 else
1063 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1065 else if (STRINGP (coding->src_object))
1067 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1069 else
1070 /* Otherwise, the source is C string and is never relocated
1071 automatically. Thus we don't have to update anything. */
1075 static void
1076 coding_set_destination (coding)
1077 struct coding_system *coding;
1079 if (BUFFERP (coding->dst_object))
1081 if (coding->src_pos < 0)
1083 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1084 coding->dst_bytes = (GAP_END_ADDR
1085 - (coding->src_bytes - coding->consumed)
1086 - coding->destination);
1088 else
1090 /* We are sure that coding->dst_pos_byte is before the gap
1091 of the buffer. */
1092 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1093 + coding->dst_pos_byte - BEG_BYTE);
1094 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1095 - coding->destination);
1098 else
1099 /* Otherwise, the destination is C string and is never relocated
1100 automatically. Thus we don't have to update anything. */
1105 static void
1106 coding_alloc_by_realloc (coding, bytes)
1107 struct coding_system *coding;
1108 EMACS_INT bytes;
1110 coding->destination = (unsigned char *) xrealloc (coding->destination,
1111 coding->dst_bytes + bytes);
1112 coding->dst_bytes += bytes;
1115 static void
1116 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1117 struct coding_system *coding;
1118 EMACS_INT gap_head_used, bytes;
1120 if (EQ (coding->src_object, coding->dst_object))
1122 /* The gap may contain the produced data at the head and not-yet
1123 consumed data at the tail. To preserve those data, we at
1124 first make the gap size to zero, then increase the gap
1125 size. */
1126 EMACS_INT add = GAP_SIZE;
1128 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1129 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1130 make_gap (bytes);
1131 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1132 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1134 else
1136 Lisp_Object this_buffer;
1138 this_buffer = Fcurrent_buffer ();
1139 set_buffer_internal (XBUFFER (coding->dst_object));
1140 make_gap (bytes);
1141 set_buffer_internal (XBUFFER (this_buffer));
1146 static unsigned char *
1147 alloc_destination (coding, nbytes, dst)
1148 struct coding_system *coding;
1149 EMACS_INT nbytes;
1150 unsigned char *dst;
1152 EMACS_INT offset = dst - coding->destination;
1154 if (BUFFERP (coding->dst_object))
1156 struct buffer *buf = XBUFFER (coding->dst_object);
1158 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1160 else
1161 coding_alloc_by_realloc (coding, nbytes);
1162 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1163 coding_set_destination (coding);
1164 dst = coding->destination + offset;
1165 return dst;
1168 /** Macros for annotations. */
1170 /* Maximum length of annotation data (sum of annotations for
1171 composition and charset). */
1172 #define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1174 /* An annotation data is stored in the array coding->charbuf in this
1175 format:
1176 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1177 LENGTH is the number of elements in the annotation.
1178 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1179 NCHARS is the number of characters in the text annotated.
1181 The format of the following elements depend on ANNOTATION_MASK.
1183 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1184 follows:
1185 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1186 METHOD is one of enum composition_method.
1187 Optionnal COMPOSITION-COMPONENTS are characters and composition
1188 rules.
1190 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1191 follows. */
1193 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1194 do { \
1195 *(buf)++ = -(len); \
1196 *(buf)++ = (mask); \
1197 *(buf)++ = (nchars); \
1198 coding->annotated = 1; \
1199 } while (0);
1201 #define ADD_COMPOSITION_DATA(buf, nchars, method) \
1202 do { \
1203 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1204 *buf++ = method; \
1205 } while (0)
1208 #define ADD_CHARSET_DATA(buf, nchars, id) \
1209 do { \
1210 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1211 *buf++ = id; \
1212 } while (0)
1215 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1220 /*** 3. UTF-8 ***/
1222 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1223 Check if a text is encoded in UTF-8. If it is, return 1, else
1224 return 0. */
1226 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1227 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1228 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1229 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1230 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1231 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1233 #define UTF_BOM 0xFEFF
1234 #define UTF_8_BOM_1 0xEF
1235 #define UTF_8_BOM_2 0xBB
1236 #define UTF_8_BOM_3 0xBF
1238 static int
1239 detect_coding_utf_8 (coding, detect_info)
1240 struct coding_system *coding;
1241 struct coding_detection_info *detect_info;
1243 const unsigned char *src = coding->source, *src_base;
1244 const unsigned char *src_end = coding->source + coding->src_bytes;
1245 int multibytep = coding->src_multibyte;
1246 int consumed_chars = 0;
1247 int bom_found = 0;
1248 int found = 0;
1250 detect_info->checked |= CATEGORY_MASK_UTF_8;
1251 /* A coding system of this category is always ASCII compatible. */
1252 src += coding->head_ascii;
1254 while (1)
1256 int c, c1, c2, c3, c4;
1258 src_base = src;
1259 ONE_MORE_BYTE (c);
1260 if (c < 0 || UTF_8_1_OCTET_P (c))
1261 continue;
1262 ONE_MORE_BYTE (c1);
1263 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1264 break;
1265 if (UTF_8_2_OCTET_LEADING_P (c))
1267 found = 1;
1268 continue;
1270 ONE_MORE_BYTE (c2);
1271 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1272 break;
1273 if (UTF_8_3_OCTET_LEADING_P (c))
1275 found = 1;
1276 if (src_base == coding->source
1277 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1278 bom_found = 1;
1279 continue;
1281 ONE_MORE_BYTE (c3);
1282 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1283 break;
1284 if (UTF_8_4_OCTET_LEADING_P (c))
1286 found = 1;
1287 continue;
1289 ONE_MORE_BYTE (c4);
1290 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1291 break;
1292 if (UTF_8_5_OCTET_LEADING_P (c))
1294 found = 1;
1295 continue;
1297 break;
1299 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1300 return 0;
1302 no_more_source:
1303 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1305 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1306 return 0;
1308 if (bom_found)
1310 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1311 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1313 else
1315 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1316 if (found)
1317 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1319 return 1;
1323 static void
1324 decode_coding_utf_8 (coding)
1325 struct coding_system *coding;
1327 const unsigned char *src = coding->source + coding->consumed;
1328 const unsigned char *src_end = coding->source + coding->src_bytes;
1329 const unsigned char *src_base;
1330 int *charbuf = coding->charbuf + coding->charbuf_used;
1331 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1332 int consumed_chars = 0, consumed_chars_base = 0;
1333 int multibytep = coding->src_multibyte;
1334 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1335 Lisp_Object attr, charset_list;
1336 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1337 int byte_after_cr = -1;
1339 CODING_GET_INFO (coding, attr, charset_list);
1341 if (bom != utf_without_bom)
1343 int c1, c2, c3;
1345 src_base = src;
1346 ONE_MORE_BYTE (c1);
1347 if (! UTF_8_3_OCTET_LEADING_P (c1))
1348 src = src_base;
1349 else
1351 ONE_MORE_BYTE (c2);
1352 if (! UTF_8_EXTRA_OCTET_P (c2))
1353 src = src_base;
1354 else
1356 ONE_MORE_BYTE (c3);
1357 if (! UTF_8_EXTRA_OCTET_P (c3))
1358 src = src_base;
1359 else
1361 if ((c1 != UTF_8_BOM_1)
1362 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1363 src = src_base;
1364 else
1365 CODING_UTF_8_BOM (coding) = utf_without_bom;
1370 CODING_UTF_8_BOM (coding) = utf_without_bom;
1374 while (1)
1376 int c, c1, c2, c3, c4, c5;
1378 src_base = src;
1379 consumed_chars_base = consumed_chars;
1381 if (charbuf >= charbuf_end)
1383 if (byte_after_cr >= 0)
1384 src_base--;
1385 break;
1388 if (byte_after_cr >= 0)
1389 c1 = byte_after_cr, byte_after_cr = -1;
1390 else
1391 ONE_MORE_BYTE (c1);
1392 if (c1 < 0)
1394 c = - c1;
1396 else if (UTF_8_1_OCTET_P(c1))
1398 if (eol_crlf && c1 == '\r')
1399 ONE_MORE_BYTE (byte_after_cr);
1400 c = c1;
1402 else
1404 ONE_MORE_BYTE (c2);
1405 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1406 goto invalid_code;
1407 if (UTF_8_2_OCTET_LEADING_P (c1))
1409 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1410 /* Reject overlong sequences here and below. Encoders
1411 producing them are incorrect, they can be misleading,
1412 and they mess up read/write invariance. */
1413 if (c < 128)
1414 goto invalid_code;
1416 else
1418 ONE_MORE_BYTE (c3);
1419 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1420 goto invalid_code;
1421 if (UTF_8_3_OCTET_LEADING_P (c1))
1423 c = (((c1 & 0xF) << 12)
1424 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1425 if (c < 0x800
1426 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1427 goto invalid_code;
1429 else
1431 ONE_MORE_BYTE (c4);
1432 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1433 goto invalid_code;
1434 if (UTF_8_4_OCTET_LEADING_P (c1))
1436 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1437 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1438 if (c < 0x10000)
1439 goto invalid_code;
1441 else
1443 ONE_MORE_BYTE (c5);
1444 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1445 goto invalid_code;
1446 if (UTF_8_5_OCTET_LEADING_P (c1))
1448 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1449 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1450 | (c5 & 0x3F));
1451 if ((c > MAX_CHAR) || (c < 0x200000))
1452 goto invalid_code;
1454 else
1455 goto invalid_code;
1461 *charbuf++ = c;
1462 continue;
1464 invalid_code:
1465 src = src_base;
1466 consumed_chars = consumed_chars_base;
1467 ONE_MORE_BYTE (c);
1468 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1469 coding->errors++;
1472 no_more_source:
1473 coding->consumed_char += consumed_chars_base;
1474 coding->consumed = src_base - coding->source;
1475 coding->charbuf_used = charbuf - coding->charbuf;
1479 static int
1480 encode_coding_utf_8 (coding)
1481 struct coding_system *coding;
1483 int multibytep = coding->dst_multibyte;
1484 int *charbuf = coding->charbuf;
1485 int *charbuf_end = charbuf + coding->charbuf_used;
1486 unsigned char *dst = coding->destination + coding->produced;
1487 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1488 int produced_chars = 0;
1489 int c;
1491 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1493 ASSURE_DESTINATION (3);
1494 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1495 CODING_UTF_8_BOM (coding) = utf_without_bom;
1498 if (multibytep)
1500 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1502 while (charbuf < charbuf_end)
1504 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1506 ASSURE_DESTINATION (safe_room);
1507 c = *charbuf++;
1508 if (CHAR_BYTE8_P (c))
1510 c = CHAR_TO_BYTE8 (c);
1511 EMIT_ONE_BYTE (c);
1513 else
1515 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1516 for (p = str; p < pend; p++)
1517 EMIT_ONE_BYTE (*p);
1521 else
1523 int safe_room = MAX_MULTIBYTE_LENGTH;
1525 while (charbuf < charbuf_end)
1527 ASSURE_DESTINATION (safe_room);
1528 c = *charbuf++;
1529 if (CHAR_BYTE8_P (c))
1530 *dst++ = CHAR_TO_BYTE8 (c);
1531 else
1532 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1533 produced_chars++;
1536 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1537 coding->produced_char += produced_chars;
1538 coding->produced = dst - coding->destination;
1539 return 0;
1543 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1544 Check if a text is encoded in one of UTF-16 based coding systems.
1545 If it is, return 1, else return 0. */
1547 #define UTF_16_HIGH_SURROGATE_P(val) \
1548 (((val) & 0xFC00) == 0xD800)
1550 #define UTF_16_LOW_SURROGATE_P(val) \
1551 (((val) & 0xFC00) == 0xDC00)
1553 #define UTF_16_INVALID_P(val) \
1554 (((val) == 0xFFFE) \
1555 || ((val) == 0xFFFF) \
1556 || UTF_16_LOW_SURROGATE_P (val))
1559 static int
1560 detect_coding_utf_16 (coding, detect_info)
1561 struct coding_system *coding;
1562 struct coding_detection_info *detect_info;
1564 const unsigned char *src = coding->source, *src_base = src;
1565 const unsigned char *src_end = coding->source + coding->src_bytes;
1566 int multibytep = coding->src_multibyte;
1567 int consumed_chars = 0;
1568 int c1, c2;
1570 detect_info->checked |= CATEGORY_MASK_UTF_16;
1571 if (coding->mode & CODING_MODE_LAST_BLOCK
1572 && (coding->src_chars & 1))
1574 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1575 return 0;
1578 ONE_MORE_BYTE (c1);
1579 ONE_MORE_BYTE (c2);
1580 if ((c1 == 0xFF) && (c2 == 0xFE))
1582 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1583 | CATEGORY_MASK_UTF_16_AUTO);
1584 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1585 | CATEGORY_MASK_UTF_16_BE_NOSIG
1586 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1588 else if ((c1 == 0xFE) && (c2 == 0xFF))
1590 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1591 | CATEGORY_MASK_UTF_16_AUTO);
1592 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1593 | CATEGORY_MASK_UTF_16_BE_NOSIG
1594 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1596 else
1598 /* We check the dispersion of Eth and Oth bytes where E is even and
1599 O is odd. If both are high, we assume binary data.*/
1600 unsigned char e[256], o[256];
1601 unsigned e_num = 1, o_num = 1;
1603 memset (e, 0, 256);
1604 memset (o, 0, 256);
1605 e[c1] = 1;
1606 o[c2] = 1;
1608 detect_info->rejected
1609 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1611 while (1)
1613 ONE_MORE_BYTE (c1);
1614 ONE_MORE_BYTE (c2);
1615 if (! e[c1])
1617 e[c1] = 1;
1618 e_num++;
1619 if (e_num >= 128)
1620 break;
1622 if (! o[c2])
1624 o[c1] = 1;
1625 o_num++;
1626 if (o_num >= 128)
1627 break;
1630 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1631 return 0;
1634 no_more_source:
1635 return 1;
1638 static void
1639 decode_coding_utf_16 (coding)
1640 struct coding_system *coding;
1642 const unsigned char *src = coding->source + coding->consumed;
1643 const unsigned char *src_end = coding->source + coding->src_bytes;
1644 const unsigned char *src_base;
1645 int *charbuf = coding->charbuf + coding->charbuf_used;
1646 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1647 int consumed_chars = 0, consumed_chars_base = 0;
1648 int multibytep = coding->src_multibyte;
1649 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1650 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1651 int surrogate = CODING_UTF_16_SURROGATE (coding);
1652 Lisp_Object attr, charset_list;
1653 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1654 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1656 CODING_GET_INFO (coding, attr, charset_list);
1658 if (bom == utf_with_bom)
1660 int c, c1, c2;
1662 src_base = src;
1663 ONE_MORE_BYTE (c1);
1664 ONE_MORE_BYTE (c2);
1665 c = (c1 << 8) | c2;
1667 if (endian == utf_16_big_endian
1668 ? c != 0xFEFF : c != 0xFFFE)
1670 /* The first two bytes are not BOM. Treat them as bytes
1671 for a normal character. */
1672 src = src_base;
1673 coding->errors++;
1675 CODING_UTF_16_BOM (coding) = utf_without_bom;
1677 else if (bom == utf_detect_bom)
1679 /* We have already tried to detect BOM and failed in
1680 detect_coding. */
1681 CODING_UTF_16_BOM (coding) = utf_without_bom;
1684 while (1)
1686 int c, c1, c2;
1688 src_base = src;
1689 consumed_chars_base = consumed_chars;
1691 if (charbuf + 2 >= charbuf_end)
1693 if (byte_after_cr1 >= 0)
1694 src_base -= 2;
1695 break;
1698 if (byte_after_cr1 >= 0)
1699 c1 = byte_after_cr1, byte_after_cr1 = -1;
1700 else
1701 ONE_MORE_BYTE (c1);
1702 if (c1 < 0)
1704 *charbuf++ = -c1;
1705 continue;
1707 if (byte_after_cr2 >= 0)
1708 c2 = byte_after_cr2, byte_after_cr2 = -1;
1709 else
1710 ONE_MORE_BYTE (c2);
1711 if (c2 < 0)
1713 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1714 *charbuf++ = -c2;
1715 continue;
1717 c = (endian == utf_16_big_endian
1718 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1720 if (surrogate)
1722 if (! UTF_16_LOW_SURROGATE_P (c))
1724 if (endian == utf_16_big_endian)
1725 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1726 else
1727 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1728 *charbuf++ = c1;
1729 *charbuf++ = c2;
1730 coding->errors++;
1731 if (UTF_16_HIGH_SURROGATE_P (c))
1732 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1733 else
1734 *charbuf++ = c;
1736 else
1738 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1739 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1740 *charbuf++ = 0x10000 + c;
1743 else
1745 if (UTF_16_HIGH_SURROGATE_P (c))
1746 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1747 else
1749 if (eol_crlf && c == '\r')
1751 ONE_MORE_BYTE (byte_after_cr1);
1752 ONE_MORE_BYTE (byte_after_cr2);
1754 *charbuf++ = c;
1759 no_more_source:
1760 coding->consumed_char += consumed_chars_base;
1761 coding->consumed = src_base - coding->source;
1762 coding->charbuf_used = charbuf - coding->charbuf;
1765 static int
1766 encode_coding_utf_16 (coding)
1767 struct coding_system *coding;
1769 int multibytep = coding->dst_multibyte;
1770 int *charbuf = coding->charbuf;
1771 int *charbuf_end = charbuf + coding->charbuf_used;
1772 unsigned char *dst = coding->destination + coding->produced;
1773 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1774 int safe_room = 8;
1775 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1776 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1777 int produced_chars = 0;
1778 Lisp_Object attrs, charset_list;
1779 int c;
1781 CODING_GET_INFO (coding, attrs, charset_list);
1783 if (bom != utf_without_bom)
1785 ASSURE_DESTINATION (safe_room);
1786 if (big_endian)
1787 EMIT_TWO_BYTES (0xFE, 0xFF);
1788 else
1789 EMIT_TWO_BYTES (0xFF, 0xFE);
1790 CODING_UTF_16_BOM (coding) = utf_without_bom;
1793 while (charbuf < charbuf_end)
1795 ASSURE_DESTINATION (safe_room);
1796 c = *charbuf++;
1797 if (c >= MAX_UNICODE_CHAR)
1798 c = coding->default_char;
1800 if (c < 0x10000)
1802 if (big_endian)
1803 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1804 else
1805 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1807 else
1809 int c1, c2;
1811 c -= 0x10000;
1812 c1 = (c >> 10) + 0xD800;
1813 c2 = (c & 0x3FF) + 0xDC00;
1814 if (big_endian)
1815 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1816 else
1817 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1820 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1821 coding->produced = dst - coding->destination;
1822 coding->produced_char += produced_chars;
1823 return 0;
1827 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1829 /* Emacs' internal format for representation of multiple character
1830 sets is a kind of multi-byte encoding, i.e. characters are
1831 represented by variable-length sequences of one-byte codes.
1833 ASCII characters and control characters (e.g. `tab', `newline') are
1834 represented by one-byte sequences which are their ASCII codes, in
1835 the range 0x00 through 0x7F.
1837 8-bit characters of the range 0x80..0x9F are represented by
1838 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1839 code + 0x20).
1841 8-bit characters of the range 0xA0..0xFF are represented by
1842 one-byte sequences which are their 8-bit code.
1844 The other characters are represented by a sequence of `base
1845 leading-code', optional `extended leading-code', and one or two
1846 `position-code's. The length of the sequence is determined by the
1847 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1848 whereas extended leading-code and position-code take the range 0xA0
1849 through 0xFF. See `charset.h' for more details about leading-code
1850 and position-code.
1852 --- CODE RANGE of Emacs' internal format ---
1853 character set range
1854 ------------- -----
1855 ascii 0x00..0x7F
1856 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1857 eight-bit-graphic 0xA0..0xBF
1858 ELSE 0x81..0x9D + [0xA0..0xFF]+
1859 ---------------------------------------------
1861 As this is the internal character representation, the format is
1862 usually not used externally (i.e. in a file or in a data sent to a
1863 process). But, it is possible to have a text externally in this
1864 format (i.e. by encoding by the coding system `emacs-mule').
1866 In that case, a sequence of one-byte codes has a slightly different
1867 form.
1869 At first, all characters in eight-bit-control are represented by
1870 one-byte sequences which are their 8-bit code.
1872 Next, character composition data are represented by the byte
1873 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1874 where,
1875 METHOD is 0xF0 plus one of composition method (enum
1876 composition_method),
1878 BYTES is 0xA0 plus a byte length of this composition data,
1880 CHARS is 0x20 plus a number of characters composed by this
1881 data,
1883 COMPONENTs are characters of multibye form or composition
1884 rules encoded by two-byte of ASCII codes.
1886 In addition, for backward compatibility, the following formats are
1887 also recognized as composition data on decoding.
1889 0x80 MSEQ ...
1890 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1892 Here,
1893 MSEQ is a multibyte form but in these special format:
1894 ASCII: 0xA0 ASCII_CODE+0x80,
1895 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1896 RULE is a one byte code of the range 0xA0..0xF0 that
1897 represents a composition rule.
1900 char emacs_mule_bytes[256];
1903 emacs_mule_char (coding, src, nbytes, nchars, id)
1904 struct coding_system *coding;
1905 const unsigned char *src;
1906 int *nbytes, *nchars, *id;
1908 const unsigned char *src_end = coding->source + coding->src_bytes;
1909 const unsigned char *src_base = src;
1910 int multibytep = coding->src_multibyte;
1911 struct charset *charset;
1912 unsigned code;
1913 int c;
1914 int consumed_chars = 0;
1916 ONE_MORE_BYTE (c);
1917 if (c < 0)
1919 c = -c;
1920 charset = emacs_mule_charset[0];
1922 else
1924 if (c >= 0xA0)
1926 /* Old style component character of a composition. */
1927 if (c == 0xA0)
1929 ONE_MORE_BYTE (c);
1930 c -= 0x80;
1932 else
1933 c -= 0x20;
1936 switch (emacs_mule_bytes[c])
1938 case 2:
1939 if (! (charset = emacs_mule_charset[c]))
1940 goto invalid_code;
1941 ONE_MORE_BYTE (c);
1942 if (c < 0xA0)
1943 goto invalid_code;
1944 code = c & 0x7F;
1945 break;
1947 case 3:
1948 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1949 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1951 ONE_MORE_BYTE (c);
1952 if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1953 goto invalid_code;
1954 ONE_MORE_BYTE (c);
1955 if (c < 0xA0)
1956 goto invalid_code;
1957 code = c & 0x7F;
1959 else
1961 if (! (charset = emacs_mule_charset[c]))
1962 goto invalid_code;
1963 ONE_MORE_BYTE (c);
1964 if (c < 0xA0)
1965 goto invalid_code;
1966 code = (c & 0x7F) << 8;
1967 ONE_MORE_BYTE (c);
1968 if (c < 0xA0)
1969 goto invalid_code;
1970 code |= c & 0x7F;
1972 break;
1974 case 4:
1975 ONE_MORE_BYTE (c);
1976 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1977 goto invalid_code;
1978 ONE_MORE_BYTE (c);
1979 if (c < 0xA0)
1980 goto invalid_code;
1981 code = (c & 0x7F) << 8;
1982 ONE_MORE_BYTE (c);
1983 if (c < 0xA0)
1984 goto invalid_code;
1985 code |= c & 0x7F;
1986 break;
1988 case 1:
1989 code = c;
1990 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1991 ? charset_ascii : charset_eight_bit);
1992 break;
1994 default:
1995 abort ();
1997 c = DECODE_CHAR (charset, code);
1998 if (c < 0)
1999 goto invalid_code;
2001 *nbytes = src - src_base;
2002 *nchars = consumed_chars;
2003 if (id)
2004 *id = charset->id;
2005 return c;
2007 no_more_source:
2008 return -2;
2010 invalid_code:
2011 return -1;
2015 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2016 Check if a text is encoded in `emacs-mule'. If it is, return 1,
2017 else return 0. */
2019 static int
2020 detect_coding_emacs_mule (coding, detect_info)
2021 struct coding_system *coding;
2022 struct coding_detection_info *detect_info;
2024 const unsigned char *src = coding->source, *src_base;
2025 const unsigned char *src_end = coding->source + coding->src_bytes;
2026 int multibytep = coding->src_multibyte;
2027 int consumed_chars = 0;
2028 int c;
2029 int found = 0;
2031 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
2032 /* A coding system of this category is always ASCII compatible. */
2033 src += coding->head_ascii;
2035 while (1)
2037 src_base = src;
2038 ONE_MORE_BYTE (c);
2039 if (c < 0)
2040 continue;
2041 if (c == 0x80)
2043 /* Perhaps the start of composite character. We simple skip
2044 it because analyzing it is too heavy for detecting. But,
2045 at least, we check that the composite character
2046 constitutes of more than 4 bytes. */
2047 const unsigned char *src_base;
2049 repeat:
2050 src_base = src;
2053 ONE_MORE_BYTE (c);
2055 while (c >= 0xA0);
2057 if (src - src_base <= 4)
2058 break;
2059 found = CATEGORY_MASK_EMACS_MULE;
2060 if (c == 0x80)
2061 goto repeat;
2064 if (c < 0x80)
2066 if (c < 0x20
2067 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2068 break;
2070 else
2072 int more_bytes = emacs_mule_bytes[*src_base] - 1;
2074 while (more_bytes > 0)
2076 ONE_MORE_BYTE (c);
2077 if (c < 0xA0)
2079 src--; /* Unread the last byte. */
2080 break;
2082 more_bytes--;
2084 if (more_bytes != 0)
2085 break;
2086 found = CATEGORY_MASK_EMACS_MULE;
2089 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2090 return 0;
2092 no_more_source:
2093 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2095 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2096 return 0;
2098 detect_info->found |= found;
2099 return 1;
2103 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2105 /* Decode a character represented as a component of composition
2106 sequence of Emacs 20/21 style at SRC. Set C to that character and
2107 update SRC to the head of next character (or an encoded composition
2108 rule). If SRC doesn't points a composition component, set C to -1.
2109 If SRC points an invalid byte sequence, global exit by a return
2110 value 0. */
2112 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
2113 do \
2115 int c; \
2116 int nbytes, nchars; \
2118 if (src == src_end) \
2119 break; \
2120 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
2121 if (c < 0) \
2123 if (c == -2) \
2124 break; \
2125 goto invalid_code; \
2127 *buf++ = c; \
2128 src += nbytes; \
2129 consumed_chars += nchars; \
2131 while (0)
2134 /* Decode a composition rule represented as a component of composition
2135 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
2136 and increment BUF. If SRC points an invalid byte sequence, set C
2137 to -1. */
2139 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
2140 do { \
2141 int c, gref, nref; \
2143 if (src >= src_end) \
2144 goto invalid_code; \
2145 ONE_MORE_BYTE_NO_CHECK (c); \
2146 c -= 0xA0; \
2147 if (c < 0 || c >= 81) \
2148 goto invalid_code; \
2150 gref = c / 9, nref = c % 9; \
2151 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
2152 } while (0)
2155 /* Decode a composition rule represented as a component of composition
2156 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
2157 and increment BUF. If SRC points an invalid byte sequence, set C
2158 to -1. */
2160 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
2161 do { \
2162 int gref, nref; \
2164 if (src + 1>= src_end) \
2165 goto invalid_code; \
2166 ONE_MORE_BYTE_NO_CHECK (gref); \
2167 gref -= 0x20; \
2168 ONE_MORE_BYTE_NO_CHECK (nref); \
2169 nref -= 0x20; \
2170 if (gref < 0 || gref >= 81 \
2171 || nref < 0 || nref >= 81) \
2172 goto invalid_code; \
2173 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
2174 } while (0)
2177 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
2178 do { \
2179 /* Emacs 21 style format. The first three bytes at SRC are \
2180 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
2181 the byte length of this composition information, CHARS is the \
2182 number of characters composed by this composition. */ \
2183 enum composition_method method = c - 0xF2; \
2184 int *charbuf_base = charbuf; \
2185 int consumed_chars_limit; \
2186 int nbytes, nchars; \
2188 ONE_MORE_BYTE (c); \
2189 if (c < 0) \
2190 goto invalid_code; \
2191 nbytes = c - 0xA0; \
2192 if (nbytes < 3) \
2193 goto invalid_code; \
2194 ONE_MORE_BYTE (c); \
2195 if (c < 0) \
2196 goto invalid_code; \
2197 nchars = c - 0xA0; \
2198 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
2199 consumed_chars_limit = consumed_chars_base + nbytes; \
2200 if (method != COMPOSITION_RELATIVE) \
2202 int i = 0; \
2203 while (consumed_chars < consumed_chars_limit) \
2205 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
2206 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
2207 else \
2208 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
2209 i++; \
2211 if (consumed_chars < consumed_chars_limit) \
2212 goto invalid_code; \
2213 charbuf_base[0] -= i; \
2215 } while (0)
2218 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
2219 do { \
2220 /* Emacs 20 style format for relative composition. */ \
2221 /* Store multibyte form of characters to be composed. */ \
2222 enum composition_method method = COMPOSITION_RELATIVE; \
2223 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2224 int *buf = components; \
2225 int i, j; \
2227 src = src_base; \
2228 ONE_MORE_BYTE (c); /* skip 0x80 */ \
2229 for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++) \
2230 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2231 if (i < 2) \
2232 goto invalid_code; \
2233 ADD_COMPOSITION_DATA (charbuf, i, method); \
2234 for (j = 0; j < i; j++) \
2235 *charbuf++ = components[j]; \
2236 } while (0)
2239 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
2240 do { \
2241 /* Emacs 20 style format for rule-base composition. */ \
2242 /* Store multibyte form of characters to be composed. */ \
2243 enum composition_method method = COMPOSITION_WITH_RULE; \
2244 int *charbuf_base = charbuf; \
2245 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2246 int *buf = components; \
2247 int i, j; \
2249 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2250 for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++) \
2252 if (*src < 0xA0) \
2253 break; \
2254 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
2255 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2257 if (i <= 1 || (buf - components) % 2 == 0) \
2258 goto invalid_code; \
2259 if (charbuf + i + (i / 2) + 1 >= charbuf_end) \
2260 goto no_more_source; \
2261 ADD_COMPOSITION_DATA (charbuf, i, method); \
2262 i = i * 2 - 1; \
2263 for (j = 0; j < i; j++) \
2264 *charbuf++ = components[j]; \
2265 charbuf_base[0] -= i; \
2266 for (j = 0; j < i; j += 2) \
2267 *charbuf++ = components[j]; \
2268 } while (0)
2271 static void
2272 decode_coding_emacs_mule (coding)
2273 struct coding_system *coding;
2275 const unsigned char *src = coding->source + coding->consumed;
2276 const unsigned char *src_end = coding->source + coding->src_bytes;
2277 const unsigned char *src_base;
2278 int *charbuf = coding->charbuf + coding->charbuf_used;
2279 int *charbuf_end
2280 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2281 int consumed_chars = 0, consumed_chars_base;
2282 int multibytep = coding->src_multibyte;
2283 Lisp_Object attrs, charset_list;
2284 int char_offset = coding->produced_char;
2285 int last_offset = char_offset;
2286 int last_id = charset_ascii;
2287 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2288 int byte_after_cr = -1;
2290 CODING_GET_INFO (coding, attrs, charset_list);
2292 while (1)
2294 int c;
2296 src_base = src;
2297 consumed_chars_base = consumed_chars;
2299 if (charbuf >= charbuf_end)
2301 if (byte_after_cr >= 0)
2302 src_base--;
2303 break;
2306 if (byte_after_cr >= 0)
2307 c = byte_after_cr, byte_after_cr = -1;
2308 else
2309 ONE_MORE_BYTE (c);
2310 if (c < 0)
2312 *charbuf++ = -c;
2313 char_offset++;
2315 else if (c < 0x80)
2317 if (eol_crlf && c == '\r')
2318 ONE_MORE_BYTE (byte_after_cr);
2319 *charbuf++ = c;
2320 char_offset++;
2322 else if (c == 0x80)
2324 ONE_MORE_BYTE (c);
2325 if (c < 0)
2326 goto invalid_code;
2327 if (c - 0xF2 >= COMPOSITION_RELATIVE
2328 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2329 DECODE_EMACS_MULE_21_COMPOSITION (c);
2330 else if (c < 0xC0)
2331 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2332 else if (c == 0xFF)
2333 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2334 else
2335 goto invalid_code;
2337 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2339 int nbytes, nchars;
2340 int id;
2342 src = src_base;
2343 consumed_chars = consumed_chars_base;
2344 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2345 if (c < 0)
2347 if (c == -2)
2348 break;
2349 goto invalid_code;
2351 if (last_id != id)
2353 if (last_id != charset_ascii)
2354 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2355 last_id = id;
2356 last_offset = char_offset;
2358 *charbuf++ = c;
2359 src += nbytes;
2360 consumed_chars += nchars;
2361 char_offset++;
2363 else
2364 goto invalid_code;
2365 continue;
2367 invalid_code:
2368 src = src_base;
2369 consumed_chars = consumed_chars_base;
2370 ONE_MORE_BYTE (c);
2371 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2372 char_offset++;
2373 coding->errors++;
2376 no_more_source:
2377 if (last_id != charset_ascii)
2378 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2379 coding->consumed_char += consumed_chars_base;
2380 coding->consumed = src_base - coding->source;
2381 coding->charbuf_used = charbuf - coding->charbuf;
2385 #define EMACS_MULE_LEADING_CODES(id, codes) \
2386 do { \
2387 if (id < 0xA0) \
2388 codes[0] = id, codes[1] = 0; \
2389 else if (id < 0xE0) \
2390 codes[0] = 0x9A, codes[1] = id; \
2391 else if (id < 0xF0) \
2392 codes[0] = 0x9B, codes[1] = id; \
2393 else if (id < 0xF5) \
2394 codes[0] = 0x9C, codes[1] = id; \
2395 else \
2396 codes[0] = 0x9D, codes[1] = id; \
2397 } while (0);
2400 static int
2401 encode_coding_emacs_mule (coding)
2402 struct coding_system *coding;
2404 int multibytep = coding->dst_multibyte;
2405 int *charbuf = coding->charbuf;
2406 int *charbuf_end = charbuf + coding->charbuf_used;
2407 unsigned char *dst = coding->destination + coding->produced;
2408 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2409 int safe_room = 8;
2410 int produced_chars = 0;
2411 Lisp_Object attrs, charset_list;
2412 int c;
2413 int preferred_charset_id = -1;
2415 CODING_GET_INFO (coding, attrs, charset_list);
2416 if (! EQ (charset_list, Vemacs_mule_charset_list))
2418 CODING_ATTR_CHARSET_LIST (attrs)
2419 = charset_list = Vemacs_mule_charset_list;
2422 while (charbuf < charbuf_end)
2424 ASSURE_DESTINATION (safe_room);
2425 c = *charbuf++;
2427 if (c < 0)
2429 /* Handle an annotation. */
2430 switch (*charbuf)
2432 case CODING_ANNOTATE_COMPOSITION_MASK:
2433 /* Not yet implemented. */
2434 break;
2435 case CODING_ANNOTATE_CHARSET_MASK:
2436 preferred_charset_id = charbuf[3];
2437 if (preferred_charset_id >= 0
2438 && NILP (Fmemq (make_number (preferred_charset_id),
2439 charset_list)))
2440 preferred_charset_id = -1;
2441 break;
2442 default:
2443 abort ();
2445 charbuf += -c - 1;
2446 continue;
2449 if (ASCII_CHAR_P (c))
2450 EMIT_ONE_ASCII_BYTE (c);
2451 else if (CHAR_BYTE8_P (c))
2453 c = CHAR_TO_BYTE8 (c);
2454 EMIT_ONE_BYTE (c);
2456 else
2458 struct charset *charset;
2459 unsigned code;
2460 int dimension;
2461 int emacs_mule_id;
2462 unsigned char leading_codes[2];
2464 if (preferred_charset_id >= 0)
2466 charset = CHARSET_FROM_ID (preferred_charset_id);
2467 if (CHAR_CHARSET_P (c, charset))
2468 code = ENCODE_CHAR (charset, c);
2469 else
2470 charset = char_charset (c, charset_list, &code);
2472 else
2473 charset = char_charset (c, charset_list, &code);
2474 if (! charset)
2476 c = coding->default_char;
2477 if (ASCII_CHAR_P (c))
2479 EMIT_ONE_ASCII_BYTE (c);
2480 continue;
2482 charset = char_charset (c, charset_list, &code);
2484 dimension = CHARSET_DIMENSION (charset);
2485 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2486 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2487 EMIT_ONE_BYTE (leading_codes[0]);
2488 if (leading_codes[1])
2489 EMIT_ONE_BYTE (leading_codes[1]);
2490 if (dimension == 1)
2491 EMIT_ONE_BYTE (code | 0x80);
2492 else
2494 code |= 0x8080;
2495 EMIT_ONE_BYTE (code >> 8);
2496 EMIT_ONE_BYTE (code & 0xFF);
2500 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2501 coding->produced_char += produced_chars;
2502 coding->produced = dst - coding->destination;
2503 return 0;
2507 /*** 7. ISO2022 handlers ***/
2509 /* The following note describes the coding system ISO2022 briefly.
2510 Since the intention of this note is to help understand the
2511 functions in this file, some parts are NOT ACCURATE or are OVERLY
2512 SIMPLIFIED. For thorough understanding, please refer to the
2513 original document of ISO2022. This is equivalent to the standard
2514 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2516 ISO2022 provides many mechanisms to encode several character sets
2517 in 7-bit and 8-bit environments. For 7-bit environments, all text
2518 is encoded using bytes less than 128. This may make the encoded
2519 text a little bit longer, but the text passes more easily through
2520 several types of gateway, some of which strip off the MSB (Most
2521 Significant Bit).
2523 There are two kinds of character sets: control character sets and
2524 graphic character sets. The former contain control characters such
2525 as `newline' and `escape' to provide control functions (control
2526 functions are also provided by escape sequences). The latter
2527 contain graphic characters such as 'A' and '-'. Emacs recognizes
2528 two control character sets and many graphic character sets.
2530 Graphic character sets are classified into one of the following
2531 four classes, according to the number of bytes (DIMENSION) and
2532 number of characters in one dimension (CHARS) of the set:
2533 - DIMENSION1_CHARS94
2534 - DIMENSION1_CHARS96
2535 - DIMENSION2_CHARS94
2536 - DIMENSION2_CHARS96
2538 In addition, each character set is assigned an identification tag,
2539 unique for each set, called the "final character" (denoted as <F>
2540 hereafter). The <F> of each character set is decided by ECMA(*)
2541 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2542 (0x30..0x3F are for private use only).
2544 Note (*): ECMA = European Computer Manufacturers Association
2546 Here are examples of graphic character sets [NAME(<F>)]:
2547 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2548 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2549 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2550 o DIMENSION2_CHARS96 -- none for the moment
2552 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2553 C0 [0x00..0x1F] -- control character plane 0
2554 GL [0x20..0x7F] -- graphic character plane 0
2555 C1 [0x80..0x9F] -- control character plane 1
2556 GR [0xA0..0xFF] -- graphic character plane 1
2558 A control character set is directly designated and invoked to C0 or
2559 C1 by an escape sequence. The most common case is that:
2560 - ISO646's control character set is designated/invoked to C0, and
2561 - ISO6429's control character set is designated/invoked to C1,
2562 and usually these designations/invocations are omitted in encoded
2563 text. In a 7-bit environment, only C0 can be used, and a control
2564 character for C1 is encoded by an appropriate escape sequence to
2565 fit into the environment. All control characters for C1 are
2566 defined to have corresponding escape sequences.
2568 A graphic character set is at first designated to one of four
2569 graphic registers (G0 through G3), then these graphic registers are
2570 invoked to GL or GR. These designations and invocations can be
2571 done independently. The most common case is that G0 is invoked to
2572 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2573 these invocations and designations are omitted in encoded text.
2574 In a 7-bit environment, only GL can be used.
2576 When a graphic character set of CHARS94 is invoked to GL, codes
2577 0x20 and 0x7F of the GL area work as control characters SPACE and
2578 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2579 be used.
2581 There are two ways of invocation: locking-shift and single-shift.
2582 With locking-shift, the invocation lasts until the next different
2583 invocation, whereas with single-shift, the invocation affects the
2584 following character only and doesn't affect the locking-shift
2585 state. Invocations are done by the following control characters or
2586 escape sequences:
2588 ----------------------------------------------------------------------
2589 abbrev function cntrl escape seq description
2590 ----------------------------------------------------------------------
2591 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2592 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2593 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2594 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2595 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2596 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2597 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2598 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2599 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2600 ----------------------------------------------------------------------
2601 (*) These are not used by any known coding system.
2603 Control characters for these functions are defined by macros
2604 ISO_CODE_XXX in `coding.h'.
2606 Designations are done by the following escape sequences:
2607 ----------------------------------------------------------------------
2608 escape sequence description
2609 ----------------------------------------------------------------------
2610 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2611 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2612 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2613 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2614 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2615 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2616 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2617 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2618 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2619 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2620 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2621 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2622 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2623 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2624 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2625 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2626 ----------------------------------------------------------------------
2628 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2629 of dimension 1, chars 94, and final character <F>, etc...
2631 Note (*): Although these designations are not allowed in ISO2022,
2632 Emacs accepts them on decoding, and produces them on encoding
2633 CHARS96 character sets in a coding system which is characterized as
2634 7-bit environment, non-locking-shift, and non-single-shift.
2636 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2637 '(' must be omitted. We refer to this as "short-form" hereafter.
2639 Now you may notice that there are a lot of ways of encoding the
2640 same multilingual text in ISO2022. Actually, there exist many
2641 coding systems such as Compound Text (used in X11's inter client
2642 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2643 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2644 localized platforms), and all of these are variants of ISO2022.
2646 In addition to the above, Emacs handles two more kinds of escape
2647 sequences: ISO6429's direction specification and Emacs' private
2648 sequence for specifying character composition.
2650 ISO6429's direction specification takes the following form:
2651 o CSI ']' -- end of the current direction
2652 o CSI '0' ']' -- end of the current direction
2653 o CSI '1' ']' -- start of left-to-right text
2654 o CSI '2' ']' -- start of right-to-left text
2655 The control character CSI (0x9B: control sequence introducer) is
2656 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2658 Character composition specification takes the following form:
2659 o ESC '0' -- start relative composition
2660 o ESC '1' -- end composition
2661 o ESC '2' -- start rule-base composition (*)
2662 o ESC '3' -- start relative composition with alternate chars (**)
2663 o ESC '4' -- start rule-base composition with alternate chars (**)
2664 Since these are not standard escape sequences of any ISO standard,
2665 the use of them with these meanings is restricted to Emacs only.
2667 (*) This form is used only in Emacs 20.7 and older versions,
2668 but newer versions can safely decode it.
2669 (**) This form is used only in Emacs 21.1 and newer versions,
2670 and older versions can't decode it.
2672 Here's a list of example usages of these composition escape
2673 sequences (categorized by `enum composition_method').
2675 COMPOSITION_RELATIVE:
2676 ESC 0 CHAR [ CHAR ] ESC 1
2677 COMPOSITION_WITH_RULE:
2678 ESC 2 CHAR [ RULE CHAR ] ESC 1
2679 COMPOSITION_WITH_ALTCHARS:
2680 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2681 COMPOSITION_WITH_RULE_ALTCHARS:
2682 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2684 enum iso_code_class_type iso_code_class[256];
2686 #define SAFE_CHARSET_P(coding, id) \
2687 ((id) <= (coding)->max_charset_id \
2688 && (coding)->safe_charsets[id] >= 0)
2691 #define SHIFT_OUT_OK(category) \
2692 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2694 static void
2695 setup_iso_safe_charsets (attrs)
2696 Lisp_Object attrs;
2698 Lisp_Object charset_list, safe_charsets;
2699 Lisp_Object request;
2700 Lisp_Object reg_usage;
2701 Lisp_Object tail;
2702 int reg94, reg96;
2703 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2704 int max_charset_id;
2706 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2707 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2708 && ! EQ (charset_list, Viso_2022_charset_list))
2710 CODING_ATTR_CHARSET_LIST (attrs)
2711 = charset_list = Viso_2022_charset_list;
2712 ASET (attrs, coding_attr_safe_charsets, Qnil);
2715 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2716 return;
2718 max_charset_id = 0;
2719 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2721 int id = XINT (XCAR (tail));
2722 if (max_charset_id < id)
2723 max_charset_id = id;
2726 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2727 make_number (255));
2728 request = AREF (attrs, coding_attr_iso_request);
2729 reg_usage = AREF (attrs, coding_attr_iso_usage);
2730 reg94 = XINT (XCAR (reg_usage));
2731 reg96 = XINT (XCDR (reg_usage));
2733 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2735 Lisp_Object id;
2736 Lisp_Object reg;
2737 struct charset *charset;
2739 id = XCAR (tail);
2740 charset = CHARSET_FROM_ID (XINT (id));
2741 reg = Fcdr (Fassq (id, request));
2742 if (! NILP (reg))
2743 SSET (safe_charsets, XINT (id), XINT (reg));
2744 else if (charset->iso_chars_96)
2746 if (reg96 < 4)
2747 SSET (safe_charsets, XINT (id), reg96);
2749 else
2751 if (reg94 < 4)
2752 SSET (safe_charsets, XINT (id), reg94);
2755 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2759 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2760 Check if a text is encoded in one of ISO-2022 based codig systems.
2761 If it is, return 1, else return 0. */
2763 static int
2764 detect_coding_iso_2022 (coding, detect_info)
2765 struct coding_system *coding;
2766 struct coding_detection_info *detect_info;
2768 const unsigned char *src = coding->source, *src_base = src;
2769 const unsigned char *src_end = coding->source + coding->src_bytes;
2770 int multibytep = coding->src_multibyte;
2771 int single_shifting = 0;
2772 int id;
2773 int c, c1;
2774 int consumed_chars = 0;
2775 int i;
2776 int rejected = 0;
2777 int found = 0;
2778 int composition_count = -1;
2780 detect_info->checked |= CATEGORY_MASK_ISO;
2782 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2784 struct coding_system *this = &(coding_categories[i]);
2785 Lisp_Object attrs, val;
2787 if (this->id < 0)
2788 continue;
2789 attrs = CODING_ID_ATTRS (this->id);
2790 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2791 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2792 setup_iso_safe_charsets (attrs);
2793 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2794 this->max_charset_id = SCHARS (val) - 1;
2795 this->safe_charsets = (char *) SDATA (val);
2798 /* A coding system of this category is always ASCII compatible. */
2799 src += coding->head_ascii;
2801 while (rejected != CATEGORY_MASK_ISO)
2803 src_base = src;
2804 ONE_MORE_BYTE (c);
2805 switch (c)
2807 case ISO_CODE_ESC:
2808 if (inhibit_iso_escape_detection)
2809 break;
2810 single_shifting = 0;
2811 ONE_MORE_BYTE (c);
2812 if (c >= '(' && c <= '/')
2814 /* Designation sequence for a charset of dimension 1. */
2815 ONE_MORE_BYTE (c1);
2816 if (c1 < ' ' || c1 >= 0x80
2817 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2818 /* Invalid designation sequence. Just ignore. */
2819 break;
2821 else if (c == '$')
2823 /* Designation sequence for a charset of dimension 2. */
2824 ONE_MORE_BYTE (c);
2825 if (c >= '@' && c <= 'B')
2826 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2827 id = iso_charset_table[1][0][c];
2828 else if (c >= '(' && c <= '/')
2830 ONE_MORE_BYTE (c1);
2831 if (c1 < ' ' || c1 >= 0x80
2832 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2833 /* Invalid designation sequence. Just ignore. */
2834 break;
2836 else
2837 /* Invalid designation sequence. Just ignore it. */
2838 break;
2840 else if (c == 'N' || c == 'O')
2842 /* ESC <Fe> for SS2 or SS3. */
2843 single_shifting = 1;
2844 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2845 break;
2847 else if (c == '1')
2849 /* End of composition. */
2850 if (composition_count < 0
2851 || composition_count > MAX_COMPOSITION_COMPONENTS)
2852 /* Invalid */
2853 break;
2854 composition_count = -1;
2855 found |= CATEGORY_MASK_ISO;
2857 else if (c >= '0' && c <= '4')
2859 /* ESC <Fp> for start/end composition. */
2860 composition_count = 0;
2861 break;
2863 else
2865 /* Invalid escape sequence. Just ignore it. */
2866 break;
2869 /* We found a valid designation sequence for CHARSET. */
2870 rejected |= CATEGORY_MASK_ISO_8BIT;
2871 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2872 id))
2873 found |= CATEGORY_MASK_ISO_7;
2874 else
2875 rejected |= CATEGORY_MASK_ISO_7;
2876 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2877 id))
2878 found |= CATEGORY_MASK_ISO_7_TIGHT;
2879 else
2880 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2881 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2882 id))
2883 found |= CATEGORY_MASK_ISO_7_ELSE;
2884 else
2885 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2886 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2887 id))
2888 found |= CATEGORY_MASK_ISO_8_ELSE;
2889 else
2890 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2891 break;
2893 case ISO_CODE_SO:
2894 case ISO_CODE_SI:
2895 /* Locking shift out/in. */
2896 if (inhibit_iso_escape_detection)
2897 break;
2898 single_shifting = 0;
2899 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2900 break;
2902 case ISO_CODE_CSI:
2903 /* Control sequence introducer. */
2904 single_shifting = 0;
2905 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2906 found |= CATEGORY_MASK_ISO_8_ELSE;
2907 goto check_extra_latin;
2909 case ISO_CODE_SS2:
2910 case ISO_CODE_SS3:
2911 /* Single shift. */
2912 if (inhibit_iso_escape_detection)
2913 break;
2914 single_shifting = 0;
2915 rejected |= CATEGORY_MASK_ISO_7BIT;
2916 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2917 & CODING_ISO_FLAG_SINGLE_SHIFT)
2918 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2919 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2920 & CODING_ISO_FLAG_SINGLE_SHIFT)
2921 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2922 if (single_shifting)
2923 break;
2924 goto check_extra_latin;
2926 default:
2927 if (c < 0)
2928 continue;
2929 if (c < 0x80)
2931 if (composition_count >= 0)
2932 composition_count++;
2933 single_shifting = 0;
2934 break;
2936 if (c >= 0xA0)
2938 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2939 found |= CATEGORY_MASK_ISO_8_1;
2940 /* Check the length of succeeding codes of the range
2941 0xA0..0FF. If the byte length is even, we include
2942 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2943 only when we are not single shifting. */
2944 if (! single_shifting
2945 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2947 int i = 1;
2948 while (src < src_end)
2950 ONE_MORE_BYTE (c);
2951 if (c < 0xA0)
2952 break;
2953 i++;
2956 if (i & 1 && src < src_end)
2958 rejected |= CATEGORY_MASK_ISO_8_2;
2959 if (composition_count >= 0)
2960 composition_count += i;
2962 else
2964 found |= CATEGORY_MASK_ISO_8_2;
2965 if (composition_count >= 0)
2966 composition_count += i / 2;
2969 break;
2971 check_extra_latin:
2972 single_shifting = 0;
2973 if (! VECTORP (Vlatin_extra_code_table)
2974 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2976 rejected = CATEGORY_MASK_ISO;
2977 break;
2979 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2980 & CODING_ISO_FLAG_LATIN_EXTRA)
2981 found |= CATEGORY_MASK_ISO_8_1;
2982 else
2983 rejected |= CATEGORY_MASK_ISO_8_1;
2984 rejected |= CATEGORY_MASK_ISO_8_2;
2987 detect_info->rejected |= CATEGORY_MASK_ISO;
2988 return 0;
2990 no_more_source:
2991 detect_info->rejected |= rejected;
2992 detect_info->found |= (found & ~rejected);
2993 return 1;
2997 /* Set designation state into CODING. Set CHARS_96 to -1 if the
2998 escape sequence should be kept. */
2999 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3000 do { \
3001 int id, prev; \
3003 if (final < '0' || final >= 128 \
3004 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3005 || !SAFE_CHARSET_P (coding, id)) \
3007 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3008 chars_96 = -1; \
3009 break; \
3011 prev = CODING_ISO_DESIGNATION (coding, reg); \
3012 if (id == charset_jisx0201_roman) \
3014 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3015 id = charset_ascii; \
3017 else if (id == charset_jisx0208_1978) \
3019 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3020 id = charset_jisx0208; \
3022 CODING_ISO_DESIGNATION (coding, reg) = id; \
3023 /* If there was an invalid designation to REG previously, and this \
3024 designation is ASCII to REG, we should keep this designation \
3025 sequence. */ \
3026 if (prev == -2 && id == charset_ascii) \
3027 chars_96 = -1; \
3028 } while (0)
3031 #define MAYBE_FINISH_COMPOSITION() \
3032 do { \
3033 int i; \
3034 if (composition_state == COMPOSING_NO) \
3035 break; \
3036 /* It is assured that we have enough room for producing \
3037 characters stored in the table `components'. */ \
3038 if (charbuf + component_idx > charbuf_end) \
3039 goto no_more_source; \
3040 composition_state = COMPOSING_NO; \
3041 if (method == COMPOSITION_RELATIVE \
3042 || method == COMPOSITION_WITH_ALTCHARS) \
3044 for (i = 0; i < component_idx; i++) \
3045 *charbuf++ = components[i]; \
3046 char_offset += component_idx; \
3048 else \
3050 for (i = 0; i < component_idx; i += 2) \
3051 *charbuf++ = components[i]; \
3052 char_offset += (component_idx / 2) + 1; \
3054 } while (0)
3057 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3058 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3059 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3060 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3061 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3064 #define DECODE_COMPOSITION_START(c1) \
3065 do { \
3066 if (c1 == '0' \
3067 && composition_state == COMPOSING_COMPONENT_RULE) \
3069 component_len = component_idx; \
3070 composition_state = COMPOSING_CHAR; \
3072 else \
3074 const unsigned char *p; \
3076 MAYBE_FINISH_COMPOSITION (); \
3077 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
3078 goto no_more_source; \
3079 for (p = src; p < src_end - 1; p++) \
3080 if (*p == ISO_CODE_ESC && p[1] == '1') \
3081 break; \
3082 if (p == src_end - 1) \
3084 if (coding->mode & CODING_MODE_LAST_BLOCK) \
3085 goto invalid_code; \
3086 /* The current composition doesn't end in the current \
3087 source. */ \
3088 record_conversion_result \
3089 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
3090 goto no_more_source; \
3093 /* This is surely the start of a composition. */ \
3094 method = (c1 == '0' ? COMPOSITION_RELATIVE \
3095 : c1 == '2' ? COMPOSITION_WITH_RULE \
3096 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3097 : COMPOSITION_WITH_RULE_ALTCHARS); \
3098 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
3099 : COMPOSING_COMPONENT_CHAR); \
3100 component_idx = component_len = 0; \
3102 } while (0)
3105 /* Handle compositoin end sequence ESC 1. */
3107 #define DECODE_COMPOSITION_END() \
3108 do { \
3109 int nchars = (component_len > 0 ? component_idx - component_len \
3110 : method == COMPOSITION_RELATIVE ? component_idx \
3111 : (component_idx + 1) / 2); \
3112 int i; \
3113 int *saved_charbuf = charbuf; \
3115 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
3116 if (method != COMPOSITION_RELATIVE) \
3118 if (component_len == 0) \
3119 for (i = 0; i < component_idx; i++) \
3120 *charbuf++ = components[i]; \
3121 else \
3122 for (i = 0; i < component_len; i++) \
3123 *charbuf++ = components[i]; \
3124 *saved_charbuf = saved_charbuf - charbuf; \
3126 if (method == COMPOSITION_WITH_RULE) \
3127 for (i = 0; i < component_idx; i += 2, char_offset++) \
3128 *charbuf++ = components[i]; \
3129 else \
3130 for (i = component_len; i < component_idx; i++, char_offset++) \
3131 *charbuf++ = components[i]; \
3132 coding->annotated = 1; \
3133 composition_state = COMPOSING_NO; \
3134 } while (0)
3137 /* Decode a composition rule from the byte C1 (and maybe one more byte
3138 from SRC) and store one encoded composition rule in
3139 coding->cmp_data. */
3141 #define DECODE_COMPOSITION_RULE(c1) \
3142 do { \
3143 (c1) -= 32; \
3144 if (c1 < 81) /* old format (before ver.21) */ \
3146 int gref = (c1) / 9; \
3147 int nref = (c1) % 9; \
3148 if (gref == 4) gref = 10; \
3149 if (nref == 4) nref = 10; \
3150 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
3152 else if (c1 < 93) /* new format (after ver.21) */ \
3154 ONE_MORE_BYTE (c2); \
3155 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
3157 else \
3158 c1 = 0; \
3159 } while (0)
3162 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3164 static void
3165 decode_coding_iso_2022 (coding)
3166 struct coding_system *coding;
3168 const unsigned char *src = coding->source + coding->consumed;
3169 const unsigned char *src_end = coding->source + coding->src_bytes;
3170 const unsigned char *src_base;
3171 int *charbuf = coding->charbuf + coding->charbuf_used;
3172 int *charbuf_end
3173 = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
3174 int consumed_chars = 0, consumed_chars_base;
3175 int multibytep = coding->src_multibyte;
3176 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3177 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3178 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3179 int charset_id_2, charset_id_3;
3180 struct charset *charset;
3181 int c;
3182 /* For handling composition sequence. */
3183 #define COMPOSING_NO 0
3184 #define COMPOSING_CHAR 1
3185 #define COMPOSING_RULE 2
3186 #define COMPOSING_COMPONENT_CHAR 3
3187 #define COMPOSING_COMPONENT_RULE 4
3189 int composition_state = COMPOSING_NO;
3190 enum composition_method method;
3191 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
3192 int component_idx;
3193 int component_len;
3194 Lisp_Object attrs, charset_list;
3195 int char_offset = coding->produced_char;
3196 int last_offset = char_offset;
3197 int last_id = charset_ascii;
3198 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3199 int byte_after_cr = -1;
3201 CODING_GET_INFO (coding, attrs, charset_list);
3202 setup_iso_safe_charsets (attrs);
3203 /* Charset list may have been changed. */
3204 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3205 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3207 while (1)
3209 int c1, c2;
3211 src_base = src;
3212 consumed_chars_base = consumed_chars;
3214 if (charbuf >= charbuf_end)
3216 if (byte_after_cr >= 0)
3217 src_base--;
3218 break;
3221 if (byte_after_cr >= 0)
3222 c1 = byte_after_cr, byte_after_cr = -1;
3223 else
3224 ONE_MORE_BYTE (c1);
3225 if (c1 < 0)
3226 goto invalid_code;
3228 /* We produce at most one character. */
3229 switch (iso_code_class [c1])
3231 case ISO_0x20_or_0x7F:
3232 if (composition_state != COMPOSING_NO)
3234 if (composition_state == COMPOSING_RULE
3235 || composition_state == COMPOSING_COMPONENT_RULE)
3237 if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1)
3239 DECODE_COMPOSITION_RULE (c1);
3240 components[component_idx++] = c1;
3241 composition_state--;
3242 continue;
3244 /* Too long composition. */
3245 MAYBE_FINISH_COMPOSITION ();
3248 if (charset_id_0 < 0
3249 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3250 /* This is SPACE or DEL. */
3251 charset = CHARSET_FROM_ID (charset_ascii);
3252 else
3253 charset = CHARSET_FROM_ID (charset_id_0);
3254 break;
3256 case ISO_graphic_plane_0:
3257 if (composition_state != COMPOSING_NO)
3259 if (composition_state == COMPOSING_RULE
3260 || composition_state == COMPOSING_COMPONENT_RULE)
3262 if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1)
3264 DECODE_COMPOSITION_RULE (c1);
3265 components[component_idx++] = c1;
3266 composition_state--;
3267 continue;
3269 MAYBE_FINISH_COMPOSITION ();
3272 if (charset_id_0 < 0)
3273 charset = CHARSET_FROM_ID (charset_ascii);
3274 else
3275 charset = CHARSET_FROM_ID (charset_id_0);
3276 break;
3278 case ISO_0xA0_or_0xFF:
3279 if (charset_id_1 < 0
3280 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3281 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3282 goto invalid_code;
3283 /* This is a graphic character, we fall down ... */
3285 case ISO_graphic_plane_1:
3286 if (charset_id_1 < 0)
3287 goto invalid_code;
3288 charset = CHARSET_FROM_ID (charset_id_1);
3289 break;
3291 case ISO_control_0:
3292 if (eol_crlf && c1 == '\r')
3293 ONE_MORE_BYTE (byte_after_cr);
3294 MAYBE_FINISH_COMPOSITION ();
3295 charset = CHARSET_FROM_ID (charset_ascii);
3296 break;
3298 case ISO_control_1:
3299 MAYBE_FINISH_COMPOSITION ();
3300 goto invalid_code;
3302 case ISO_shift_out:
3303 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3304 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3305 goto invalid_code;
3306 CODING_ISO_INVOCATION (coding, 0) = 1;
3307 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3308 continue;
3310 case ISO_shift_in:
3311 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3312 goto invalid_code;
3313 CODING_ISO_INVOCATION (coding, 0) = 0;
3314 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3315 continue;
3317 case ISO_single_shift_2_7:
3318 case ISO_single_shift_2:
3319 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3320 goto invalid_code;
3321 /* SS2 is handled as an escape sequence of ESC 'N' */
3322 c1 = 'N';
3323 goto label_escape_sequence;
3325 case ISO_single_shift_3:
3326 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3327 goto invalid_code;
3328 /* SS2 is handled as an escape sequence of ESC 'O' */
3329 c1 = 'O';
3330 goto label_escape_sequence;
3332 case ISO_control_sequence_introducer:
3333 /* CSI is handled as an escape sequence of ESC '[' ... */
3334 c1 = '[';
3335 goto label_escape_sequence;
3337 case ISO_escape:
3338 ONE_MORE_BYTE (c1);
3339 label_escape_sequence:
3340 /* Escape sequences handled here are invocation,
3341 designation, direction specification, and character
3342 composition specification. */
3343 switch (c1)
3345 case '&': /* revision of following character set */
3346 ONE_MORE_BYTE (c1);
3347 if (!(c1 >= '@' && c1 <= '~'))
3348 goto invalid_code;
3349 ONE_MORE_BYTE (c1);
3350 if (c1 != ISO_CODE_ESC)
3351 goto invalid_code;
3352 ONE_MORE_BYTE (c1);
3353 goto label_escape_sequence;
3355 case '$': /* designation of 2-byte character set */
3356 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3357 goto invalid_code;
3359 int reg, chars96;
3361 ONE_MORE_BYTE (c1);
3362 if (c1 >= '@' && c1 <= 'B')
3363 { /* designation of JISX0208.1978, GB2312.1980,
3364 or JISX0208.1980 */
3365 reg = 0, chars96 = 0;
3367 else if (c1 >= 0x28 && c1 <= 0x2B)
3368 { /* designation of DIMENSION2_CHARS94 character set */
3369 reg = c1 - 0x28, chars96 = 0;
3370 ONE_MORE_BYTE (c1);
3372 else if (c1 >= 0x2C && c1 <= 0x2F)
3373 { /* designation of DIMENSION2_CHARS96 character set */
3374 reg = c1 - 0x2C, chars96 = 1;
3375 ONE_MORE_BYTE (c1);
3377 else
3378 goto invalid_code;
3379 DECODE_DESIGNATION (reg, 2, chars96, c1);
3380 /* We must update these variables now. */
3381 if (reg == 0)
3382 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3383 else if (reg == 1)
3384 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3385 if (chars96 < 0)
3386 goto invalid_code;
3388 continue;
3390 case 'n': /* invocation of locking-shift-2 */
3391 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3392 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3393 goto invalid_code;
3394 CODING_ISO_INVOCATION (coding, 0) = 2;
3395 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3396 continue;
3398 case 'o': /* invocation of locking-shift-3 */
3399 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3400 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3401 goto invalid_code;
3402 CODING_ISO_INVOCATION (coding, 0) = 3;
3403 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3404 continue;
3406 case 'N': /* invocation of single-shift-2 */
3407 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3408 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3409 goto invalid_code;
3410 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3411 if (charset_id_2 < 0)
3412 charset = CHARSET_FROM_ID (charset_ascii);
3413 else
3414 charset = CHARSET_FROM_ID (charset_id_2);
3415 ONE_MORE_BYTE (c1);
3416 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3417 goto invalid_code;
3418 break;
3420 case 'O': /* invocation of single-shift-3 */
3421 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3422 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3423 goto invalid_code;
3424 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3425 if (charset_id_3 < 0)
3426 charset = CHARSET_FROM_ID (charset_ascii);
3427 else
3428 charset = CHARSET_FROM_ID (charset_id_3);
3429 ONE_MORE_BYTE (c1);
3430 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3431 goto invalid_code;
3432 break;
3434 case '0': case '2': case '3': case '4': /* start composition */
3435 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3436 goto invalid_code;
3437 DECODE_COMPOSITION_START (c1);
3438 continue;
3440 case '1': /* end composition */
3441 if (composition_state == COMPOSING_NO)
3442 goto invalid_code;
3443 DECODE_COMPOSITION_END ();
3444 continue;
3446 case '[': /* specification of direction */
3447 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3448 goto invalid_code;
3449 /* For the moment, nested direction is not supported.
3450 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3451 left-to-right, and nozero means right-to-left. */
3452 ONE_MORE_BYTE (c1);
3453 switch (c1)
3455 case ']': /* end of the current direction */
3456 coding->mode &= ~CODING_MODE_DIRECTION;
3458 case '0': /* end of the current direction */
3459 case '1': /* start of left-to-right direction */
3460 ONE_MORE_BYTE (c1);
3461 if (c1 == ']')
3462 coding->mode &= ~CODING_MODE_DIRECTION;
3463 else
3464 goto invalid_code;
3465 break;
3467 case '2': /* start of right-to-left direction */
3468 ONE_MORE_BYTE (c1);
3469 if (c1 == ']')
3470 coding->mode |= CODING_MODE_DIRECTION;
3471 else
3472 goto invalid_code;
3473 break;
3475 default:
3476 goto invalid_code;
3478 continue;
3480 case '%':
3481 ONE_MORE_BYTE (c1);
3482 if (c1 == '/')
3484 /* CTEXT extended segment:
3485 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3486 We keep these bytes as is for the moment.
3487 They may be decoded by post-read-conversion. */
3488 int dim, M, L;
3489 int size;
3491 ONE_MORE_BYTE (dim);
3492 ONE_MORE_BYTE (M);
3493 ONE_MORE_BYTE (L);
3494 size = ((M - 128) * 128) + (L - 128);
3495 if (charbuf + 8 + size > charbuf_end)
3496 goto break_loop;
3497 *charbuf++ = ISO_CODE_ESC;
3498 *charbuf++ = '%';
3499 *charbuf++ = '/';
3500 *charbuf++ = dim;
3501 *charbuf++ = BYTE8_TO_CHAR (M);
3502 *charbuf++ = BYTE8_TO_CHAR (L);
3503 while (size-- > 0)
3505 ONE_MORE_BYTE (c1);
3506 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3509 else if (c1 == 'G')
3511 /* XFree86 extension for embedding UTF-8 in CTEXT:
3512 ESC % G --UTF-8-BYTES-- ESC % @
3513 We keep these bytes as is for the moment.
3514 They may be decoded by post-read-conversion. */
3515 int *p = charbuf;
3517 if (p + 6 > charbuf_end)
3518 goto break_loop;
3519 *p++ = ISO_CODE_ESC;
3520 *p++ = '%';
3521 *p++ = 'G';
3522 while (p < charbuf_end)
3524 ONE_MORE_BYTE (c1);
3525 if (c1 == ISO_CODE_ESC
3526 && src + 1 < src_end
3527 && src[0] == '%'
3528 && src[1] == '@')
3530 src += 2;
3531 break;
3533 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3535 if (p + 3 > charbuf_end)
3536 goto break_loop;
3537 *p++ = ISO_CODE_ESC;
3538 *p++ = '%';
3539 *p++ = '@';
3540 charbuf = p;
3542 else
3543 goto invalid_code;
3544 continue;
3545 break;
3547 default:
3548 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3549 goto invalid_code;
3551 int reg, chars96;
3553 if (c1 >= 0x28 && c1 <= 0x2B)
3554 { /* designation of DIMENSION1_CHARS94 character set */
3555 reg = c1 - 0x28, chars96 = 0;
3556 ONE_MORE_BYTE (c1);
3558 else if (c1 >= 0x2C && c1 <= 0x2F)
3559 { /* designation of DIMENSION1_CHARS96 character set */
3560 reg = c1 - 0x2C, chars96 = 1;
3561 ONE_MORE_BYTE (c1);
3563 else
3564 goto invalid_code;
3565 DECODE_DESIGNATION (reg, 1, chars96, c1);
3566 /* We must update these variables now. */
3567 if (reg == 0)
3568 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3569 else if (reg == 1)
3570 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3571 if (chars96 < 0)
3572 goto invalid_code;
3574 continue;
3578 if (charset->id != charset_ascii
3579 && last_id != charset->id)
3581 if (last_id != charset_ascii)
3582 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3583 last_id = charset->id;
3584 last_offset = char_offset;
3587 /* Now we know CHARSET and 1st position code C1 of a character.
3588 Produce a decoded character while getting 2nd position code
3589 C2 if necessary. */
3590 c1 &= 0x7F;
3591 if (CHARSET_DIMENSION (charset) > 1)
3593 ONE_MORE_BYTE (c2);
3594 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3595 /* C2 is not in a valid range. */
3596 goto invalid_code;
3597 c1 = (c1 << 8) | (c2 & 0x7F);
3598 if (CHARSET_DIMENSION (charset) > 2)
3600 ONE_MORE_BYTE (c2);
3601 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3602 /* C2 is not in a valid range. */
3603 goto invalid_code;
3604 c1 = (c1 << 8) | (c2 & 0x7F);
3608 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3609 if (c < 0)
3611 MAYBE_FINISH_COMPOSITION ();
3612 for (; src_base < src; src_base++, char_offset++)
3614 if (ASCII_BYTE_P (*src_base))
3615 *charbuf++ = *src_base;
3616 else
3617 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3620 else if (composition_state == COMPOSING_NO)
3622 *charbuf++ = c;
3623 char_offset++;
3625 else
3627 if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1)
3629 components[component_idx++] = c;
3630 if (method == COMPOSITION_WITH_RULE
3631 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3632 && composition_state == COMPOSING_COMPONENT_CHAR))
3633 composition_state++;
3635 else
3637 MAYBE_FINISH_COMPOSITION ();
3638 *charbuf++ = c;
3639 char_offset++;
3642 continue;
3644 invalid_code:
3645 MAYBE_FINISH_COMPOSITION ();
3646 src = src_base;
3647 consumed_chars = consumed_chars_base;
3648 ONE_MORE_BYTE (c);
3649 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3650 char_offset++;
3651 coding->errors++;
3652 continue;
3654 break_loop:
3655 break;
3658 no_more_source:
3659 if (last_id != charset_ascii)
3660 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3661 coding->consumed_char += consumed_chars_base;
3662 coding->consumed = src_base - coding->source;
3663 coding->charbuf_used = charbuf - coding->charbuf;
3667 /* ISO2022 encoding stuff. */
3670 It is not enough to say just "ISO2022" on encoding, we have to
3671 specify more details. In Emacs, each coding system of ISO2022
3672 variant has the following specifications:
3673 1. Initial designation to G0 thru G3.
3674 2. Allows short-form designation?
3675 3. ASCII should be designated to G0 before control characters?
3676 4. ASCII should be designated to G0 at end of line?
3677 5. 7-bit environment or 8-bit environment?
3678 6. Use locking-shift?
3679 7. Use Single-shift?
3680 And the following two are only for Japanese:
3681 8. Use ASCII in place of JIS0201-1976-Roman?
3682 9. Use JISX0208-1983 in place of JISX0208-1978?
3683 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3684 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3685 details.
3688 /* Produce codes (escape sequence) for designating CHARSET to graphic
3689 register REG at DST, and increment DST. If <final-char> of CHARSET is
3690 '@', 'A', or 'B' and the coding system CODING allows, produce
3691 designation sequence of short-form. */
3693 #define ENCODE_DESIGNATION(charset, reg, coding) \
3694 do { \
3695 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3696 char *intermediate_char_94 = "()*+"; \
3697 char *intermediate_char_96 = ",-./"; \
3698 int revision = -1; \
3699 int c; \
3701 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3702 revision = CHARSET_ISO_REVISION (charset); \
3704 if (revision >= 0) \
3706 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3707 EMIT_ONE_BYTE ('@' + revision); \
3709 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3710 if (CHARSET_DIMENSION (charset) == 1) \
3712 if (! CHARSET_ISO_CHARS_96 (charset)) \
3713 c = intermediate_char_94[reg]; \
3714 else \
3715 c = intermediate_char_96[reg]; \
3716 EMIT_ONE_ASCII_BYTE (c); \
3718 else \
3720 EMIT_ONE_ASCII_BYTE ('$'); \
3721 if (! CHARSET_ISO_CHARS_96 (charset)) \
3723 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3724 || reg != 0 \
3725 || final_char < '@' || final_char > 'B') \
3726 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3728 else \
3729 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3731 EMIT_ONE_ASCII_BYTE (final_char); \
3733 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3734 } while (0)
3737 /* The following two macros produce codes (control character or escape
3738 sequence) for ISO2022 single-shift functions (single-shift-2 and
3739 single-shift-3). */
3741 #define ENCODE_SINGLE_SHIFT_2 \
3742 do { \
3743 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3744 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3745 else \
3746 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3747 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3748 } while (0)
3751 #define ENCODE_SINGLE_SHIFT_3 \
3752 do { \
3753 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3754 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3755 else \
3756 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3757 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3758 } while (0)
3761 /* The following four macros produce codes (control character or
3762 escape sequence) for ISO2022 locking-shift functions (shift-in,
3763 shift-out, locking-shift-2, and locking-shift-3). */
3765 #define ENCODE_SHIFT_IN \
3766 do { \
3767 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3768 CODING_ISO_INVOCATION (coding, 0) = 0; \
3769 } while (0)
3772 #define ENCODE_SHIFT_OUT \
3773 do { \
3774 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3775 CODING_ISO_INVOCATION (coding, 0) = 1; \
3776 } while (0)
3779 #define ENCODE_LOCKING_SHIFT_2 \
3780 do { \
3781 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3782 CODING_ISO_INVOCATION (coding, 0) = 2; \
3783 } while (0)
3786 #define ENCODE_LOCKING_SHIFT_3 \
3787 do { \
3788 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3789 CODING_ISO_INVOCATION (coding, 0) = 3; \
3790 } while (0)
3793 /* Produce codes for a DIMENSION1 character whose character set is
3794 CHARSET and whose position-code is C1. Designation and invocation
3795 sequences are also produced in advance if necessary. */
3797 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3798 do { \
3799 int id = CHARSET_ID (charset); \
3801 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3802 && id == charset_ascii) \
3804 id = charset_jisx0201_roman; \
3805 charset = CHARSET_FROM_ID (id); \
3808 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3810 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3811 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3812 else \
3813 EMIT_ONE_BYTE (c1 | 0x80); \
3814 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3815 break; \
3817 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3819 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3820 break; \
3822 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3824 EMIT_ONE_BYTE (c1 | 0x80); \
3825 break; \
3827 else \
3828 /* Since CHARSET is not yet invoked to any graphic planes, we \
3829 must invoke it, or, at first, designate it to some graphic \
3830 register. Then repeat the loop to actually produce the \
3831 character. */ \
3832 dst = encode_invocation_designation (charset, coding, dst, \
3833 &produced_chars); \
3834 } while (1)
3837 /* Produce codes for a DIMENSION2 character whose character set is
3838 CHARSET and whose position-codes are C1 and C2. Designation and
3839 invocation codes are also produced in advance if necessary. */
3841 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3842 do { \
3843 int id = CHARSET_ID (charset); \
3845 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3846 && id == charset_jisx0208) \
3848 id = charset_jisx0208_1978; \
3849 charset = CHARSET_FROM_ID (id); \
3852 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3854 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3855 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3856 else \
3857 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3858 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3859 break; \
3861 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3863 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3864 break; \
3866 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3868 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3869 break; \
3871 else \
3872 /* Since CHARSET is not yet invoked to any graphic planes, we \
3873 must invoke it, or, at first, designate it to some graphic \
3874 register. Then repeat the loop to actually produce the \
3875 character. */ \
3876 dst = encode_invocation_designation (charset, coding, dst, \
3877 &produced_chars); \
3878 } while (1)
3881 #define ENCODE_ISO_CHARACTER(charset, c) \
3882 do { \
3883 int code = ENCODE_CHAR ((charset),(c)); \
3885 if (CHARSET_DIMENSION (charset) == 1) \
3886 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3887 else \
3888 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3889 } while (0)
3892 /* Produce designation and invocation codes at a place pointed by DST
3893 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3894 Return new DST. */
3896 unsigned char *
3897 encode_invocation_designation (charset, coding, dst, p_nchars)
3898 struct charset *charset;
3899 struct coding_system *coding;
3900 unsigned char *dst;
3901 int *p_nchars;
3903 int multibytep = coding->dst_multibyte;
3904 int produced_chars = *p_nchars;
3905 int reg; /* graphic register number */
3906 int id = CHARSET_ID (charset);
3908 /* At first, check designations. */
3909 for (reg = 0; reg < 4; reg++)
3910 if (id == CODING_ISO_DESIGNATION (coding, reg))
3911 break;
3913 if (reg >= 4)
3915 /* CHARSET is not yet designated to any graphic registers. */
3916 /* At first check the requested designation. */
3917 reg = CODING_ISO_REQUEST (coding, id);
3918 if (reg < 0)
3919 /* Since CHARSET requests no special designation, designate it
3920 to graphic register 0. */
3921 reg = 0;
3923 ENCODE_DESIGNATION (charset, reg, coding);
3926 if (CODING_ISO_INVOCATION (coding, 0) != reg
3927 && CODING_ISO_INVOCATION (coding, 1) != reg)
3929 /* Since the graphic register REG is not invoked to any graphic
3930 planes, invoke it to graphic plane 0. */
3931 switch (reg)
3933 case 0: /* graphic register 0 */
3934 ENCODE_SHIFT_IN;
3935 break;
3937 case 1: /* graphic register 1 */
3938 ENCODE_SHIFT_OUT;
3939 break;
3941 case 2: /* graphic register 2 */
3942 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3943 ENCODE_SINGLE_SHIFT_2;
3944 else
3945 ENCODE_LOCKING_SHIFT_2;
3946 break;
3948 case 3: /* graphic register 3 */
3949 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3950 ENCODE_SINGLE_SHIFT_3;
3951 else
3952 ENCODE_LOCKING_SHIFT_3;
3953 break;
3957 *p_nchars = produced_chars;
3958 return dst;
3961 /* The following three macros produce codes for indicating direction
3962 of text. */
3963 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3964 do { \
3965 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3966 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3967 else \
3968 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3969 } while (0)
3972 #define ENCODE_DIRECTION_R2L() \
3973 do { \
3974 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3975 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3976 } while (0)
3979 #define ENCODE_DIRECTION_L2R() \
3980 do { \
3981 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3982 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3983 } while (0)
3986 /* Produce codes for designation and invocation to reset the graphic
3987 planes and registers to initial state. */
3988 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3989 do { \
3990 int reg; \
3991 struct charset *charset; \
3993 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3994 ENCODE_SHIFT_IN; \
3995 for (reg = 0; reg < 4; reg++) \
3996 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3997 && (CODING_ISO_DESIGNATION (coding, reg) \
3998 != CODING_ISO_INITIAL (coding, reg))) \
4000 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4001 ENCODE_DESIGNATION (charset, reg, coding); \
4003 } while (0)
4006 /* Produce designation sequences of charsets in the line started from
4007 SRC to a place pointed by DST, and return updated DST.
4009 If the current block ends before any end-of-line, we may fail to
4010 find all the necessary designations. */
4012 static unsigned char *
4013 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
4014 struct coding_system *coding;
4015 int *charbuf, *charbuf_end;
4016 unsigned char *dst;
4018 struct charset *charset;
4019 /* Table of charsets to be designated to each graphic register. */
4020 int r[4];
4021 int c, found = 0, reg;
4022 int produced_chars = 0;
4023 int multibytep = coding->dst_multibyte;
4024 Lisp_Object attrs;
4025 Lisp_Object charset_list;
4027 attrs = CODING_ID_ATTRS (coding->id);
4028 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4029 if (EQ (charset_list, Qiso_2022))
4030 charset_list = Viso_2022_charset_list;
4032 for (reg = 0; reg < 4; reg++)
4033 r[reg] = -1;
4035 while (found < 4)
4037 int id;
4039 c = *charbuf++;
4040 if (c == '\n')
4041 break;
4042 charset = char_charset (c, charset_list, NULL);
4043 id = CHARSET_ID (charset);
4044 reg = CODING_ISO_REQUEST (coding, id);
4045 if (reg >= 0 && r[reg] < 0)
4047 found++;
4048 r[reg] = id;
4052 if (found)
4054 for (reg = 0; reg < 4; reg++)
4055 if (r[reg] >= 0
4056 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4057 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4060 return dst;
4063 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4065 static int
4066 encode_coding_iso_2022 (coding)
4067 struct coding_system *coding;
4069 int multibytep = coding->dst_multibyte;
4070 int *charbuf = coding->charbuf;
4071 int *charbuf_end = charbuf + coding->charbuf_used;
4072 unsigned char *dst = coding->destination + coding->produced;
4073 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4074 int safe_room = 16;
4075 int bol_designation
4076 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4077 && CODING_ISO_BOL (coding));
4078 int produced_chars = 0;
4079 Lisp_Object attrs, eol_type, charset_list;
4080 int ascii_compatible;
4081 int c;
4082 int preferred_charset_id = -1;
4084 CODING_GET_INFO (coding, attrs, charset_list);
4085 eol_type = CODING_ID_EOL_TYPE (coding->id);
4086 if (VECTORP (eol_type))
4087 eol_type = Qunix;
4089 setup_iso_safe_charsets (attrs);
4090 /* Charset list may have been changed. */
4091 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4092 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
4094 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4096 while (charbuf < charbuf_end)
4098 ASSURE_DESTINATION (safe_room);
4100 if (bol_designation)
4102 unsigned char *dst_prev = dst;
4104 /* We have to produce designation sequences if any now. */
4105 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4106 bol_designation = 0;
4107 /* We are sure that designation sequences are all ASCII bytes. */
4108 produced_chars += dst - dst_prev;
4111 c = *charbuf++;
4113 if (c < 0)
4115 /* Handle an annotation. */
4116 switch (*charbuf)
4118 case CODING_ANNOTATE_COMPOSITION_MASK:
4119 /* Not yet implemented. */
4120 break;
4121 case CODING_ANNOTATE_CHARSET_MASK:
4122 preferred_charset_id = charbuf[2];
4123 if (preferred_charset_id >= 0
4124 && NILP (Fmemq (make_number (preferred_charset_id),
4125 charset_list)))
4126 preferred_charset_id = -1;
4127 break;
4128 default:
4129 abort ();
4131 charbuf += -c - 1;
4132 continue;
4135 /* Now encode the character C. */
4136 if (c < 0x20 || c == 0x7F)
4138 if (c == '\n'
4139 || (c == '\r' && EQ (eol_type, Qmac)))
4141 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4142 ENCODE_RESET_PLANE_AND_REGISTER ();
4143 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4145 int i;
4147 for (i = 0; i < 4; i++)
4148 CODING_ISO_DESIGNATION (coding, i)
4149 = CODING_ISO_INITIAL (coding, i);
4151 bol_designation
4152 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4154 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4155 ENCODE_RESET_PLANE_AND_REGISTER ();
4156 EMIT_ONE_ASCII_BYTE (c);
4158 else if (ASCII_CHAR_P (c))
4160 if (ascii_compatible)
4161 EMIT_ONE_ASCII_BYTE (c);
4162 else
4164 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4165 ENCODE_ISO_CHARACTER (charset, c);
4168 else if (CHAR_BYTE8_P (c))
4170 c = CHAR_TO_BYTE8 (c);
4171 EMIT_ONE_BYTE (c);
4173 else
4175 struct charset *charset;
4177 if (preferred_charset_id >= 0)
4179 charset = CHARSET_FROM_ID (preferred_charset_id);
4180 if (! CHAR_CHARSET_P (c, charset))
4181 charset = char_charset (c, charset_list, NULL);
4183 else
4184 charset = char_charset (c, charset_list, NULL);
4185 if (!charset)
4187 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4189 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4190 charset = CHARSET_FROM_ID (charset_ascii);
4192 else
4194 c = coding->default_char;
4195 charset = char_charset (c, charset_list, NULL);
4198 ENCODE_ISO_CHARACTER (charset, c);
4202 if (coding->mode & CODING_MODE_LAST_BLOCK
4203 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4205 ASSURE_DESTINATION (safe_room);
4206 ENCODE_RESET_PLANE_AND_REGISTER ();
4208 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4209 CODING_ISO_BOL (coding) = bol_designation;
4210 coding->produced_char += produced_chars;
4211 coding->produced = dst - coding->destination;
4212 return 0;
4216 /*** 8,9. SJIS and BIG5 handlers ***/
4218 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4219 quite widely. So, for the moment, Emacs supports them in the bare
4220 C code. But, in the future, they may be supported only by CCL. */
4222 /* SJIS is a coding system encoding three character sets: ASCII, right
4223 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4224 as is. A character of charset katakana-jisx0201 is encoded by
4225 "position-code + 0x80". A character of charset japanese-jisx0208
4226 is encoded in 2-byte but two position-codes are divided and shifted
4227 so that it fit in the range below.
4229 --- CODE RANGE of SJIS ---
4230 (character set) (range)
4231 ASCII 0x00 .. 0x7F
4232 KATAKANA-JISX0201 0xA0 .. 0xDF
4233 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4234 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4235 -------------------------------
4239 /* BIG5 is a coding system encoding two character sets: ASCII and
4240 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4241 character set and is encoded in two-byte.
4243 --- CODE RANGE of BIG5 ---
4244 (character set) (range)
4245 ASCII 0x00 .. 0x7F
4246 Big5 (1st byte) 0xA1 .. 0xFE
4247 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4248 --------------------------
4252 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4253 Check if a text is encoded in SJIS. If it is, return
4254 CATEGORY_MASK_SJIS, else return 0. */
4256 static int
4257 detect_coding_sjis (coding, detect_info)
4258 struct coding_system *coding;
4259 struct coding_detection_info *detect_info;
4261 const unsigned char *src = coding->source, *src_base;
4262 const unsigned char *src_end = coding->source + coding->src_bytes;
4263 int multibytep = coding->src_multibyte;
4264 int consumed_chars = 0;
4265 int found = 0;
4266 int c;
4268 detect_info->checked |= CATEGORY_MASK_SJIS;
4269 /* A coding system of this category is always ASCII compatible. */
4270 src += coding->head_ascii;
4272 while (1)
4274 src_base = src;
4275 ONE_MORE_BYTE (c);
4276 if (c < 0x80)
4277 continue;
4278 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4280 ONE_MORE_BYTE (c);
4281 if (c < 0x40 || c == 0x7F || c > 0xFC)
4282 break;
4283 found = CATEGORY_MASK_SJIS;
4285 else if (c >= 0xA0 && c < 0xE0)
4286 found = CATEGORY_MASK_SJIS;
4287 else
4288 break;
4290 detect_info->rejected |= CATEGORY_MASK_SJIS;
4291 return 0;
4293 no_more_source:
4294 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4296 detect_info->rejected |= CATEGORY_MASK_SJIS;
4297 return 0;
4299 detect_info->found |= found;
4300 return 1;
4303 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4304 Check if a text is encoded in BIG5. If it is, return
4305 CATEGORY_MASK_BIG5, else return 0. */
4307 static int
4308 detect_coding_big5 (coding, detect_info)
4309 struct coding_system *coding;
4310 struct coding_detection_info *detect_info;
4312 const unsigned char *src = coding->source, *src_base;
4313 const unsigned char *src_end = coding->source + coding->src_bytes;
4314 int multibytep = coding->src_multibyte;
4315 int consumed_chars = 0;
4316 int found = 0;
4317 int c;
4319 detect_info->checked |= CATEGORY_MASK_BIG5;
4320 /* A coding system of this category is always ASCII compatible. */
4321 src += coding->head_ascii;
4323 while (1)
4325 src_base = src;
4326 ONE_MORE_BYTE (c);
4327 if (c < 0x80)
4328 continue;
4329 if (c >= 0xA1)
4331 ONE_MORE_BYTE (c);
4332 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4333 return 0;
4334 found = CATEGORY_MASK_BIG5;
4336 else
4337 break;
4339 detect_info->rejected |= CATEGORY_MASK_BIG5;
4340 return 0;
4342 no_more_source:
4343 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4345 detect_info->rejected |= CATEGORY_MASK_BIG5;
4346 return 0;
4348 detect_info->found |= found;
4349 return 1;
4352 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4353 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
4355 static void
4356 decode_coding_sjis (coding)
4357 struct coding_system *coding;
4359 const unsigned char *src = coding->source + coding->consumed;
4360 const unsigned char *src_end = coding->source + coding->src_bytes;
4361 const unsigned char *src_base;
4362 int *charbuf = coding->charbuf + coding->charbuf_used;
4363 int *charbuf_end
4364 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4365 int consumed_chars = 0, consumed_chars_base;
4366 int multibytep = coding->src_multibyte;
4367 struct charset *charset_roman, *charset_kanji, *charset_kana;
4368 struct charset *charset_kanji2;
4369 Lisp_Object attrs, charset_list, val;
4370 int char_offset = coding->produced_char;
4371 int last_offset = char_offset;
4372 int last_id = charset_ascii;
4373 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4374 int byte_after_cr = -1;
4376 CODING_GET_INFO (coding, attrs, charset_list);
4378 val = charset_list;
4379 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4380 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4381 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4382 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4384 while (1)
4386 int c, c1;
4387 struct charset *charset;
4389 src_base = src;
4390 consumed_chars_base = consumed_chars;
4392 if (charbuf >= charbuf_end)
4394 if (byte_after_cr >= 0)
4395 src_base--;
4396 break;
4399 if (byte_after_cr >= 0)
4400 c = byte_after_cr, byte_after_cr = -1;
4401 else
4402 ONE_MORE_BYTE (c);
4403 if (c < 0)
4404 goto invalid_code;
4405 if (c < 0x80)
4407 if (eol_crlf && c == '\r')
4408 ONE_MORE_BYTE (byte_after_cr);
4409 charset = charset_roman;
4411 else if (c == 0x80 || c == 0xA0)
4412 goto invalid_code;
4413 else if (c >= 0xA1 && c <= 0xDF)
4415 /* SJIS -> JISX0201-Kana */
4416 c &= 0x7F;
4417 charset = charset_kana;
4419 else if (c <= 0xEF)
4421 /* SJIS -> JISX0208 */
4422 ONE_MORE_BYTE (c1);
4423 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4424 goto invalid_code;
4425 c = (c << 8) | c1;
4426 SJIS_TO_JIS (c);
4427 charset = charset_kanji;
4429 else if (c <= 0xFC && charset_kanji2)
4431 /* SJIS -> JISX0213-2 */
4432 ONE_MORE_BYTE (c1);
4433 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4434 goto invalid_code;
4435 c = (c << 8) | c1;
4436 SJIS_TO_JIS2 (c);
4437 charset = charset_kanji2;
4439 else
4440 goto invalid_code;
4441 if (charset->id != charset_ascii
4442 && last_id != charset->id)
4444 if (last_id != charset_ascii)
4445 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4446 last_id = charset->id;
4447 last_offset = char_offset;
4449 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4450 *charbuf++ = c;
4451 char_offset++;
4452 continue;
4454 invalid_code:
4455 src = src_base;
4456 consumed_chars = consumed_chars_base;
4457 ONE_MORE_BYTE (c);
4458 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4459 char_offset++;
4460 coding->errors++;
4463 no_more_source:
4464 if (last_id != charset_ascii)
4465 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4466 coding->consumed_char += consumed_chars_base;
4467 coding->consumed = src_base - coding->source;
4468 coding->charbuf_used = charbuf - coding->charbuf;
4471 static void
4472 decode_coding_big5 (coding)
4473 struct coding_system *coding;
4475 const unsigned char *src = coding->source + coding->consumed;
4476 const unsigned char *src_end = coding->source + coding->src_bytes;
4477 const unsigned char *src_base;
4478 int *charbuf = coding->charbuf + coding->charbuf_used;
4479 int *charbuf_end
4480 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4481 int consumed_chars = 0, consumed_chars_base;
4482 int multibytep = coding->src_multibyte;
4483 struct charset *charset_roman, *charset_big5;
4484 Lisp_Object attrs, charset_list, val;
4485 int char_offset = coding->produced_char;
4486 int last_offset = char_offset;
4487 int last_id = charset_ascii;
4488 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4489 int byte_after_cr = -1;
4491 CODING_GET_INFO (coding, attrs, charset_list);
4492 val = charset_list;
4493 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4494 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4496 while (1)
4498 int c, c1;
4499 struct charset *charset;
4501 src_base = src;
4502 consumed_chars_base = consumed_chars;
4504 if (charbuf >= charbuf_end)
4506 if (byte_after_cr >= 0)
4507 src_base--;
4508 break;
4511 if (byte_after_cr >= 0)
4512 c = byte_after_cr, byte_after_cr = -1;
4513 else
4514 ONE_MORE_BYTE (c);
4516 if (c < 0)
4517 goto invalid_code;
4518 if (c < 0x80)
4520 if (eol_crlf && c == '\r')
4521 ONE_MORE_BYTE (byte_after_cr);
4522 charset = charset_roman;
4524 else
4526 /* BIG5 -> Big5 */
4527 if (c < 0xA1 || c > 0xFE)
4528 goto invalid_code;
4529 ONE_MORE_BYTE (c1);
4530 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4531 goto invalid_code;
4532 c = c << 8 | c1;
4533 charset = charset_big5;
4535 if (charset->id != charset_ascii
4536 && last_id != charset->id)
4538 if (last_id != charset_ascii)
4539 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4540 last_id = charset->id;
4541 last_offset = char_offset;
4543 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4544 *charbuf++ = c;
4545 char_offset++;
4546 continue;
4548 invalid_code:
4549 src = src_base;
4550 consumed_chars = consumed_chars_base;
4551 ONE_MORE_BYTE (c);
4552 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4553 char_offset++;
4554 coding->errors++;
4557 no_more_source:
4558 if (last_id != charset_ascii)
4559 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4560 coding->consumed_char += consumed_chars_base;
4561 coding->consumed = src_base - coding->source;
4562 coding->charbuf_used = charbuf - coding->charbuf;
4565 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4566 This function can encode charsets `ascii', `katakana-jisx0201',
4567 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4568 are sure that all these charsets are registered as official charset
4569 (i.e. do not have extended leading-codes). Characters of other
4570 charsets are produced without any encoding. If SJIS_P is 1, encode
4571 SJIS text, else encode BIG5 text. */
4573 static int
4574 encode_coding_sjis (coding)
4575 struct coding_system *coding;
4577 int multibytep = coding->dst_multibyte;
4578 int *charbuf = coding->charbuf;
4579 int *charbuf_end = charbuf + coding->charbuf_used;
4580 unsigned char *dst = coding->destination + coding->produced;
4581 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4582 int safe_room = 4;
4583 int produced_chars = 0;
4584 Lisp_Object attrs, charset_list, val;
4585 int ascii_compatible;
4586 struct charset *charset_roman, *charset_kanji, *charset_kana;
4587 struct charset *charset_kanji2;
4588 int c;
4590 CODING_GET_INFO (coding, attrs, charset_list);
4591 val = charset_list;
4592 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4593 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4594 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4595 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4597 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4599 while (charbuf < charbuf_end)
4601 ASSURE_DESTINATION (safe_room);
4602 c = *charbuf++;
4603 /* Now encode the character C. */
4604 if (ASCII_CHAR_P (c) && ascii_compatible)
4605 EMIT_ONE_ASCII_BYTE (c);
4606 else if (CHAR_BYTE8_P (c))
4608 c = CHAR_TO_BYTE8 (c);
4609 EMIT_ONE_BYTE (c);
4611 else
4613 unsigned code;
4614 struct charset *charset = char_charset (c, charset_list, &code);
4616 if (!charset)
4618 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4620 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4621 charset = CHARSET_FROM_ID (charset_ascii);
4623 else
4625 c = coding->default_char;
4626 charset = char_charset (c, charset_list, &code);
4629 if (code == CHARSET_INVALID_CODE (charset))
4630 abort ();
4631 if (charset == charset_kanji)
4633 int c1, c2;
4634 JIS_TO_SJIS (code);
4635 c1 = code >> 8, c2 = code & 0xFF;
4636 EMIT_TWO_BYTES (c1, c2);
4638 else if (charset == charset_kana)
4639 EMIT_ONE_BYTE (code | 0x80);
4640 else if (charset_kanji2 && charset == charset_kanji2)
4642 int c1, c2;
4644 c1 = code >> 8;
4645 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4646 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4648 JIS_TO_SJIS2 (code);
4649 c1 = code >> 8, c2 = code & 0xFF;
4650 EMIT_TWO_BYTES (c1, c2);
4652 else
4653 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4655 else
4656 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4659 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4660 coding->produced_char += produced_chars;
4661 coding->produced = dst - coding->destination;
4662 return 0;
4665 static int
4666 encode_coding_big5 (coding)
4667 struct coding_system *coding;
4669 int multibytep = coding->dst_multibyte;
4670 int *charbuf = coding->charbuf;
4671 int *charbuf_end = charbuf + coding->charbuf_used;
4672 unsigned char *dst = coding->destination + coding->produced;
4673 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4674 int safe_room = 4;
4675 int produced_chars = 0;
4676 Lisp_Object attrs, charset_list, val;
4677 int ascii_compatible;
4678 struct charset *charset_roman, *charset_big5;
4679 int c;
4681 CODING_GET_INFO (coding, attrs, charset_list);
4682 val = charset_list;
4683 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4684 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4685 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4687 while (charbuf < charbuf_end)
4689 ASSURE_DESTINATION (safe_room);
4690 c = *charbuf++;
4691 /* Now encode the character C. */
4692 if (ASCII_CHAR_P (c) && ascii_compatible)
4693 EMIT_ONE_ASCII_BYTE (c);
4694 else if (CHAR_BYTE8_P (c))
4696 c = CHAR_TO_BYTE8 (c);
4697 EMIT_ONE_BYTE (c);
4699 else
4701 unsigned code;
4702 struct charset *charset = char_charset (c, charset_list, &code);
4704 if (! charset)
4706 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4708 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4709 charset = CHARSET_FROM_ID (charset_ascii);
4711 else
4713 c = coding->default_char;
4714 charset = char_charset (c, charset_list, &code);
4717 if (code == CHARSET_INVALID_CODE (charset))
4718 abort ();
4719 if (charset == charset_big5)
4721 int c1, c2;
4723 c1 = code >> 8, c2 = code & 0xFF;
4724 EMIT_TWO_BYTES (c1, c2);
4726 else
4727 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4730 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4731 coding->produced_char += produced_chars;
4732 coding->produced = dst - coding->destination;
4733 return 0;
4737 /*** 10. CCL handlers ***/
4739 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4740 Check if a text is encoded in a coding system of which
4741 encoder/decoder are written in CCL program. If it is, return
4742 CATEGORY_MASK_CCL, else return 0. */
4744 static int
4745 detect_coding_ccl (coding, detect_info)
4746 struct coding_system *coding;
4747 struct coding_detection_info *detect_info;
4749 const unsigned char *src = coding->source, *src_base;
4750 const unsigned char *src_end = coding->source + coding->src_bytes;
4751 int multibytep = coding->src_multibyte;
4752 int consumed_chars = 0;
4753 int found = 0;
4754 unsigned char *valids;
4755 int head_ascii = coding->head_ascii;
4756 Lisp_Object attrs;
4758 detect_info->checked |= CATEGORY_MASK_CCL;
4760 coding = &coding_categories[coding_category_ccl];
4761 valids = CODING_CCL_VALIDS (coding);
4762 attrs = CODING_ID_ATTRS (coding->id);
4763 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4764 src += head_ascii;
4766 while (1)
4768 int c;
4770 src_base = src;
4771 ONE_MORE_BYTE (c);
4772 if (c < 0 || ! valids[c])
4773 break;
4774 if ((valids[c] > 1))
4775 found = CATEGORY_MASK_CCL;
4777 detect_info->rejected |= CATEGORY_MASK_CCL;
4778 return 0;
4780 no_more_source:
4781 detect_info->found |= found;
4782 return 1;
4785 static void
4786 decode_coding_ccl (coding)
4787 struct coding_system *coding;
4789 const unsigned char *src = coding->source + coding->consumed;
4790 const unsigned char *src_end = coding->source + coding->src_bytes;
4791 int *charbuf = coding->charbuf + coding->charbuf_used;
4792 int *charbuf_end = coding->charbuf + coding->charbuf_size;
4793 int consumed_chars = 0;
4794 int multibytep = coding->src_multibyte;
4795 struct ccl_program ccl;
4796 int source_charbuf[1024];
4797 int source_byteidx[1024];
4798 Lisp_Object attrs, charset_list;
4800 CODING_GET_INFO (coding, attrs, charset_list);
4801 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4803 while (src < src_end)
4805 const unsigned char *p = src;
4806 int *source, *source_end;
4807 int i = 0;
4809 if (multibytep)
4810 while (i < 1024 && p < src_end)
4812 source_byteidx[i] = p - src;
4813 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4815 else
4816 while (i < 1024 && p < src_end)
4817 source_charbuf[i++] = *p++;
4819 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4820 ccl.last_block = 1;
4822 source = source_charbuf;
4823 source_end = source + i;
4824 while (source < source_end)
4826 ccl_driver (&ccl, source, charbuf,
4827 source_end - source, charbuf_end - charbuf,
4828 charset_list);
4829 source += ccl.consumed;
4830 charbuf += ccl.produced;
4831 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4832 break;
4834 if (source < source_end)
4835 src += source_byteidx[source - source_charbuf];
4836 else
4837 src = p;
4838 consumed_chars += source - source_charbuf;
4840 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4841 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4842 break;
4845 switch (ccl.status)
4847 case CCL_STAT_SUSPEND_BY_SRC:
4848 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4849 break;
4850 case CCL_STAT_SUSPEND_BY_DST:
4851 break;
4852 case CCL_STAT_QUIT:
4853 case CCL_STAT_INVALID_CMD:
4854 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4855 break;
4856 default:
4857 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4858 break;
4860 coding->consumed_char += consumed_chars;
4861 coding->consumed = src - coding->source;
4862 coding->charbuf_used = charbuf - coding->charbuf;
4865 static int
4866 encode_coding_ccl (coding)
4867 struct coding_system *coding;
4869 struct ccl_program ccl;
4870 int multibytep = coding->dst_multibyte;
4871 int *charbuf = coding->charbuf;
4872 int *charbuf_end = charbuf + coding->charbuf_used;
4873 unsigned char *dst = coding->destination + coding->produced;
4874 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4875 int destination_charbuf[1024];
4876 int i, produced_chars = 0;
4877 Lisp_Object attrs, charset_list;
4879 CODING_GET_INFO (coding, attrs, charset_list);
4880 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4882 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4883 ccl.dst_multibyte = coding->dst_multibyte;
4885 while (charbuf < charbuf_end)
4887 ccl_driver (&ccl, charbuf, destination_charbuf,
4888 charbuf_end - charbuf, 1024, charset_list);
4889 if (multibytep)
4891 ASSURE_DESTINATION (ccl.produced * 2);
4892 for (i = 0; i < ccl.produced; i++)
4893 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4895 else
4897 ASSURE_DESTINATION (ccl.produced);
4898 for (i = 0; i < ccl.produced; i++)
4899 *dst++ = destination_charbuf[i] & 0xFF;
4900 produced_chars += ccl.produced;
4902 charbuf += ccl.consumed;
4903 if (ccl.status == CCL_STAT_QUIT
4904 || ccl.status == CCL_STAT_INVALID_CMD)
4905 break;
4908 switch (ccl.status)
4910 case CCL_STAT_SUSPEND_BY_SRC:
4911 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4912 break;
4913 case CCL_STAT_SUSPEND_BY_DST:
4914 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4915 break;
4916 case CCL_STAT_QUIT:
4917 case CCL_STAT_INVALID_CMD:
4918 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4919 break;
4920 default:
4921 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4922 break;
4925 coding->produced_char += produced_chars;
4926 coding->produced = dst - coding->destination;
4927 return 0;
4932 /*** 10, 11. no-conversion handlers ***/
4934 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4936 static void
4937 decode_coding_raw_text (coding)
4938 struct coding_system *coding;
4940 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4942 coding->chars_at_source = 1;
4943 coding->consumed_char = coding->src_chars;
4944 coding->consumed = coding->src_bytes;
4945 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
4947 coding->consumed_char--;
4948 coding->consumed--;
4949 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4951 else
4952 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4955 static int
4956 encode_coding_raw_text (coding)
4957 struct coding_system *coding;
4959 int multibytep = coding->dst_multibyte;
4960 int *charbuf = coding->charbuf;
4961 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4962 unsigned char *dst = coding->destination + coding->produced;
4963 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4964 int produced_chars = 0;
4965 int c;
4967 if (multibytep)
4969 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4971 if (coding->src_multibyte)
4972 while (charbuf < charbuf_end)
4974 ASSURE_DESTINATION (safe_room);
4975 c = *charbuf++;
4976 if (ASCII_CHAR_P (c))
4977 EMIT_ONE_ASCII_BYTE (c);
4978 else if (CHAR_BYTE8_P (c))
4980 c = CHAR_TO_BYTE8 (c);
4981 EMIT_ONE_BYTE (c);
4983 else
4985 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4987 CHAR_STRING_ADVANCE (c, p1);
4988 while (p0 < p1)
4990 EMIT_ONE_BYTE (*p0);
4991 p0++;
4995 else
4996 while (charbuf < charbuf_end)
4998 ASSURE_DESTINATION (safe_room);
4999 c = *charbuf++;
5000 EMIT_ONE_BYTE (c);
5003 else
5005 if (coding->src_multibyte)
5007 int safe_room = MAX_MULTIBYTE_LENGTH;
5009 while (charbuf < charbuf_end)
5011 ASSURE_DESTINATION (safe_room);
5012 c = *charbuf++;
5013 if (ASCII_CHAR_P (c))
5014 *dst++ = c;
5015 else if (CHAR_BYTE8_P (c))
5016 *dst++ = CHAR_TO_BYTE8 (c);
5017 else
5018 CHAR_STRING_ADVANCE (c, dst);
5021 else
5023 ASSURE_DESTINATION (charbuf_end - charbuf);
5024 while (charbuf < charbuf_end && dst < dst_end)
5025 *dst++ = *charbuf++;
5027 produced_chars = dst - (coding->destination + coding->produced);
5029 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5030 coding->produced_char += produced_chars;
5031 coding->produced = dst - coding->destination;
5032 return 0;
5035 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5036 Check if a text is encoded in a charset-based coding system. If it
5037 is, return 1, else return 0. */
5039 static int
5040 detect_coding_charset (coding, detect_info)
5041 struct coding_system *coding;
5042 struct coding_detection_info *detect_info;
5044 const unsigned char *src = coding->source, *src_base;
5045 const unsigned char *src_end = coding->source + coding->src_bytes;
5046 int multibytep = coding->src_multibyte;
5047 int consumed_chars = 0;
5048 Lisp_Object attrs, valids, name;
5049 int found = 0;
5050 int head_ascii = coding->head_ascii;
5051 int check_latin_extra = 0;
5053 detect_info->checked |= CATEGORY_MASK_CHARSET;
5055 coding = &coding_categories[coding_category_charset];
5056 attrs = CODING_ID_ATTRS (coding->id);
5057 valids = AREF (attrs, coding_attr_charset_valids);
5058 name = CODING_ID_NAME (coding->id);
5059 if (VECTORP (Vlatin_extra_code_table)
5060 && strcmp ((char *) SDATA (SYMBOL_NAME (name)), "iso-8859-"))
5061 check_latin_extra = 1;
5062 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5063 src += head_ascii;
5065 while (1)
5067 int c;
5068 Lisp_Object val;
5069 struct charset *charset;
5070 int dim, idx;
5072 src_base = src;
5073 ONE_MORE_BYTE (c);
5074 if (c < 0)
5075 continue;
5076 val = AREF (valids, c);
5077 if (NILP (val))
5078 break;
5079 if (c >= 0x80)
5081 if (c < 0xA0
5082 && check_latin_extra
5083 && NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
5084 break;
5085 found = CATEGORY_MASK_CHARSET;
5087 if (INTEGERP (val))
5089 charset = CHARSET_FROM_ID (XFASTINT (val));
5090 dim = CHARSET_DIMENSION (charset);
5091 for (idx = 1; idx < dim; idx++)
5093 if (src == src_end)
5094 goto too_short;
5095 ONE_MORE_BYTE (c);
5096 if (c < charset->code_space[(dim - 1 - idx) * 2]
5097 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5098 break;
5100 if (idx < dim)
5101 break;
5103 else
5105 idx = 1;
5106 for (; CONSP (val); val = XCDR (val))
5108 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5109 dim = CHARSET_DIMENSION (charset);
5110 while (idx < dim)
5112 if (src == src_end)
5113 goto too_short;
5114 ONE_MORE_BYTE (c);
5115 if (c < charset->code_space[(dim - 1 - idx) * 4]
5116 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5117 break;
5118 idx++;
5120 if (idx == dim)
5122 val = Qnil;
5123 break;
5126 if (CONSP (val))
5127 break;
5130 too_short:
5131 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5132 return 0;
5134 no_more_source:
5135 detect_info->found |= found;
5136 return 1;
5139 static void
5140 decode_coding_charset (coding)
5141 struct coding_system *coding;
5143 const unsigned char *src = coding->source + coding->consumed;
5144 const unsigned char *src_end = coding->source + coding->src_bytes;
5145 const unsigned char *src_base;
5146 int *charbuf = coding->charbuf + coding->charbuf_used;
5147 int *charbuf_end
5148 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
5149 int consumed_chars = 0, consumed_chars_base;
5150 int multibytep = coding->src_multibyte;
5151 Lisp_Object attrs, charset_list, valids;
5152 int char_offset = coding->produced_char;
5153 int last_offset = char_offset;
5154 int last_id = charset_ascii;
5155 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5156 int byte_after_cr = -1;
5158 CODING_GET_INFO (coding, attrs, charset_list);
5159 valids = AREF (attrs, coding_attr_charset_valids);
5161 while (1)
5163 int c;
5164 Lisp_Object val;
5165 struct charset *charset;
5166 int dim;
5167 int len = 1;
5168 unsigned code;
5170 src_base = src;
5171 consumed_chars_base = consumed_chars;
5173 if (charbuf >= charbuf_end)
5175 if (byte_after_cr >= 0)
5176 src_base--;
5177 break;
5180 if (byte_after_cr >= 0)
5182 c = byte_after_cr;
5183 byte_after_cr = -1;
5185 else
5187 ONE_MORE_BYTE (c);
5188 if (eol_crlf && c == '\r')
5189 ONE_MORE_BYTE (byte_after_cr);
5191 if (c < 0)
5192 goto invalid_code;
5193 code = c;
5195 val = AREF (valids, c);
5196 if (! INTEGERP (val) && ! CONSP (val))
5197 goto invalid_code;
5198 if (INTEGERP (val))
5200 charset = CHARSET_FROM_ID (XFASTINT (val));
5201 dim = CHARSET_DIMENSION (charset);
5202 while (len < dim)
5204 ONE_MORE_BYTE (c);
5205 code = (code << 8) | c;
5206 len++;
5208 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5209 charset, code, c);
5211 else
5213 /* VAL is a list of charset IDs. It is assured that the
5214 list is sorted by charset dimensions (smaller one
5215 comes first). */
5216 while (CONSP (val))
5218 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5219 dim = CHARSET_DIMENSION (charset);
5220 while (len < dim)
5222 ONE_MORE_BYTE (c);
5223 code = (code << 8) | c;
5224 len++;
5226 CODING_DECODE_CHAR (coding, src, src_base,
5227 src_end, charset, code, c);
5228 if (c >= 0)
5229 break;
5230 val = XCDR (val);
5233 if (c < 0)
5234 goto invalid_code;
5235 if (charset->id != charset_ascii
5236 && last_id != charset->id)
5238 if (last_id != charset_ascii)
5239 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5240 last_id = charset->id;
5241 last_offset = char_offset;
5244 *charbuf++ = c;
5245 char_offset++;
5246 continue;
5248 invalid_code:
5249 src = src_base;
5250 consumed_chars = consumed_chars_base;
5251 ONE_MORE_BYTE (c);
5252 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5253 char_offset++;
5254 coding->errors++;
5257 no_more_source:
5258 if (last_id != charset_ascii)
5259 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5260 coding->consumed_char += consumed_chars_base;
5261 coding->consumed = src_base - coding->source;
5262 coding->charbuf_used = charbuf - coding->charbuf;
5265 static int
5266 encode_coding_charset (coding)
5267 struct coding_system *coding;
5269 int multibytep = coding->dst_multibyte;
5270 int *charbuf = coding->charbuf;
5271 int *charbuf_end = charbuf + coding->charbuf_used;
5272 unsigned char *dst = coding->destination + coding->produced;
5273 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5274 int safe_room = MAX_MULTIBYTE_LENGTH;
5275 int produced_chars = 0;
5276 Lisp_Object attrs, charset_list;
5277 int ascii_compatible;
5278 int c;
5280 CODING_GET_INFO (coding, attrs, charset_list);
5281 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5283 while (charbuf < charbuf_end)
5285 struct charset *charset;
5286 unsigned code;
5288 ASSURE_DESTINATION (safe_room);
5289 c = *charbuf++;
5290 if (ascii_compatible && ASCII_CHAR_P (c))
5291 EMIT_ONE_ASCII_BYTE (c);
5292 else if (CHAR_BYTE8_P (c))
5294 c = CHAR_TO_BYTE8 (c);
5295 EMIT_ONE_BYTE (c);
5297 else
5299 charset = char_charset (c, charset_list, &code);
5300 if (charset)
5302 if (CHARSET_DIMENSION (charset) == 1)
5303 EMIT_ONE_BYTE (code);
5304 else if (CHARSET_DIMENSION (charset) == 2)
5305 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5306 else if (CHARSET_DIMENSION (charset) == 3)
5307 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5308 else
5309 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5310 (code >> 8) & 0xFF, code & 0xFF);
5312 else
5314 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5315 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5316 else
5317 c = coding->default_char;
5318 EMIT_ONE_BYTE (c);
5323 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5324 coding->produced_char += produced_chars;
5325 coding->produced = dst - coding->destination;
5326 return 0;
5330 /*** 7. C library functions ***/
5332 /* Setup coding context CODING from information about CODING_SYSTEM.
5333 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5334 CODING_SYSTEM is invalid, signal an error. */
5336 void
5337 setup_coding_system (coding_system, coding)
5338 Lisp_Object coding_system;
5339 struct coding_system *coding;
5341 Lisp_Object attrs;
5342 Lisp_Object eol_type;
5343 Lisp_Object coding_type;
5344 Lisp_Object val;
5346 if (NILP (coding_system))
5347 coding_system = Qundecided;
5349 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5351 attrs = CODING_ID_ATTRS (coding->id);
5352 eol_type = CODING_ID_EOL_TYPE (coding->id);
5354 coding->mode = 0;
5355 coding->head_ascii = -1;
5356 if (VECTORP (eol_type))
5357 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5358 | CODING_REQUIRE_DETECTION_MASK);
5359 else if (! EQ (eol_type, Qunix))
5360 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5361 | CODING_REQUIRE_ENCODING_MASK);
5362 else
5363 coding->common_flags = 0;
5364 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5365 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5366 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5367 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5368 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5369 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5371 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5372 coding->max_charset_id = SCHARS (val) - 1;
5373 coding->safe_charsets = (char *) SDATA (val);
5374 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5376 coding_type = CODING_ATTR_TYPE (attrs);
5377 if (EQ (coding_type, Qundecided))
5379 coding->detector = NULL;
5380 coding->decoder = decode_coding_raw_text;
5381 coding->encoder = encode_coding_raw_text;
5382 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5384 else if (EQ (coding_type, Qiso_2022))
5386 int i;
5387 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5389 /* Invoke graphic register 0 to plane 0. */
5390 CODING_ISO_INVOCATION (coding, 0) = 0;
5391 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5392 CODING_ISO_INVOCATION (coding, 1)
5393 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5394 /* Setup the initial status of designation. */
5395 for (i = 0; i < 4; i++)
5396 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5397 /* Not single shifting initially. */
5398 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5399 /* Beginning of buffer should also be regarded as bol. */
5400 CODING_ISO_BOL (coding) = 1;
5401 coding->detector = detect_coding_iso_2022;
5402 coding->decoder = decode_coding_iso_2022;
5403 coding->encoder = encode_coding_iso_2022;
5404 if (flags & CODING_ISO_FLAG_SAFE)
5405 coding->mode |= CODING_MODE_SAFE_ENCODING;
5406 coding->common_flags
5407 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5408 | CODING_REQUIRE_FLUSHING_MASK);
5409 if (flags & CODING_ISO_FLAG_COMPOSITION)
5410 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5411 if (flags & CODING_ISO_FLAG_DESIGNATION)
5412 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5413 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5415 setup_iso_safe_charsets (attrs);
5416 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5417 coding->max_charset_id = SCHARS (val) - 1;
5418 coding->safe_charsets = (char *) SDATA (val);
5420 CODING_ISO_FLAGS (coding) = flags;
5422 else if (EQ (coding_type, Qcharset))
5424 coding->detector = detect_coding_charset;
5425 coding->decoder = decode_coding_charset;
5426 coding->encoder = encode_coding_charset;
5427 coding->common_flags
5428 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5430 else if (EQ (coding_type, Qutf_8))
5432 val = AREF (attrs, coding_attr_utf_bom);
5433 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5434 : EQ (val, Qt) ? utf_with_bom
5435 : utf_without_bom);
5436 coding->detector = detect_coding_utf_8;
5437 coding->decoder = decode_coding_utf_8;
5438 coding->encoder = encode_coding_utf_8;
5439 coding->common_flags
5440 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5441 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5442 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5444 else if (EQ (coding_type, Qutf_16))
5446 val = AREF (attrs, coding_attr_utf_bom);
5447 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5448 : EQ (val, Qt) ? utf_with_bom
5449 : utf_without_bom);
5450 val = AREF (attrs, coding_attr_utf_16_endian);
5451 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5452 : utf_16_little_endian);
5453 CODING_UTF_16_SURROGATE (coding) = 0;
5454 coding->detector = detect_coding_utf_16;
5455 coding->decoder = decode_coding_utf_16;
5456 coding->encoder = encode_coding_utf_16;
5457 coding->common_flags
5458 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5459 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5460 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5462 else if (EQ (coding_type, Qccl))
5464 coding->detector = detect_coding_ccl;
5465 coding->decoder = decode_coding_ccl;
5466 coding->encoder = encode_coding_ccl;
5467 coding->common_flags
5468 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5469 | CODING_REQUIRE_FLUSHING_MASK);
5471 else if (EQ (coding_type, Qemacs_mule))
5473 coding->detector = detect_coding_emacs_mule;
5474 coding->decoder = decode_coding_emacs_mule;
5475 coding->encoder = encode_coding_emacs_mule;
5476 coding->common_flags
5477 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5478 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5479 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5481 Lisp_Object tail, safe_charsets;
5482 int max_charset_id = 0;
5484 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5485 tail = XCDR (tail))
5486 if (max_charset_id < XFASTINT (XCAR (tail)))
5487 max_charset_id = XFASTINT (XCAR (tail));
5488 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5489 make_number (255));
5490 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5491 tail = XCDR (tail))
5492 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5493 coding->max_charset_id = max_charset_id;
5494 coding->safe_charsets = (char *) SDATA (safe_charsets);
5497 else if (EQ (coding_type, Qshift_jis))
5499 coding->detector = detect_coding_sjis;
5500 coding->decoder = decode_coding_sjis;
5501 coding->encoder = encode_coding_sjis;
5502 coding->common_flags
5503 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5505 else if (EQ (coding_type, Qbig5))
5507 coding->detector = detect_coding_big5;
5508 coding->decoder = decode_coding_big5;
5509 coding->encoder = encode_coding_big5;
5510 coding->common_flags
5511 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5513 else /* EQ (coding_type, Qraw_text) */
5515 coding->detector = NULL;
5516 coding->decoder = decode_coding_raw_text;
5517 coding->encoder = encode_coding_raw_text;
5518 if (! EQ (eol_type, Qunix))
5520 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5521 if (! VECTORP (eol_type))
5522 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5527 return;
5530 /* Return a list of charsets supported by CODING. */
5532 Lisp_Object
5533 coding_charset_list (coding)
5534 struct coding_system *coding;
5536 Lisp_Object attrs, charset_list;
5538 CODING_GET_INFO (coding, attrs, charset_list);
5539 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5541 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5543 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5544 charset_list = Viso_2022_charset_list;
5546 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5548 charset_list = Vemacs_mule_charset_list;
5550 return charset_list;
5554 /* Return raw-text or one of its subsidiaries that has the same
5555 eol_type as CODING-SYSTEM. */
5557 Lisp_Object
5558 raw_text_coding_system (coding_system)
5559 Lisp_Object coding_system;
5561 Lisp_Object spec, attrs;
5562 Lisp_Object eol_type, raw_text_eol_type;
5564 if (NILP (coding_system))
5565 return Qraw_text;
5566 spec = CODING_SYSTEM_SPEC (coding_system);
5567 attrs = AREF (spec, 0);
5569 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5570 return coding_system;
5572 eol_type = AREF (spec, 2);
5573 if (VECTORP (eol_type))
5574 return Qraw_text;
5575 spec = CODING_SYSTEM_SPEC (Qraw_text);
5576 raw_text_eol_type = AREF (spec, 2);
5577 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5578 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5579 : AREF (raw_text_eol_type, 2));
5583 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5584 does, return one of the subsidiary that has the same eol-spec as
5585 PARENT. Otherwise, return CODING_SYSTEM. If PARENT is nil,
5586 inherit end-of-line format from the system's setting
5587 (system_eol_type). */
5589 Lisp_Object
5590 coding_inherit_eol_type (coding_system, parent)
5591 Lisp_Object coding_system, parent;
5593 Lisp_Object spec, eol_type;
5595 if (NILP (coding_system))
5596 coding_system = Qraw_text;
5597 spec = CODING_SYSTEM_SPEC (coding_system);
5598 eol_type = AREF (spec, 2);
5599 if (VECTORP (eol_type))
5601 Lisp_Object parent_eol_type;
5603 if (! NILP (parent))
5605 Lisp_Object parent_spec;
5607 parent_spec = CODING_SYSTEM_SPEC (parent);
5608 parent_eol_type = AREF (parent_spec, 2);
5610 else
5611 parent_eol_type = system_eol_type;
5612 if (EQ (parent_eol_type, Qunix))
5613 coding_system = AREF (eol_type, 0);
5614 else if (EQ (parent_eol_type, Qdos))
5615 coding_system = AREF (eol_type, 1);
5616 else if (EQ (parent_eol_type, Qmac))
5617 coding_system = AREF (eol_type, 2);
5619 return coding_system;
5622 /* Emacs has a mechanism to automatically detect a coding system if it
5623 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5624 it's impossible to distinguish some coding systems accurately
5625 because they use the same range of codes. So, at first, coding
5626 systems are categorized into 7, those are:
5628 o coding-category-emacs-mule
5630 The category for a coding system which has the same code range
5631 as Emacs' internal format. Assigned the coding-system (Lisp
5632 symbol) `emacs-mule' by default.
5634 o coding-category-sjis
5636 The category for a coding system which has the same code range
5637 as SJIS. Assigned the coding-system (Lisp
5638 symbol) `japanese-shift-jis' by default.
5640 o coding-category-iso-7
5642 The category for a coding system which has the same code range
5643 as ISO2022 of 7-bit environment. This doesn't use any locking
5644 shift and single shift functions. This can encode/decode all
5645 charsets. Assigned the coding-system (Lisp symbol)
5646 `iso-2022-7bit' by default.
5648 o coding-category-iso-7-tight
5650 Same as coding-category-iso-7 except that this can
5651 encode/decode only the specified charsets.
5653 o coding-category-iso-8-1
5655 The category for a coding system which has the same code range
5656 as ISO2022 of 8-bit environment and graphic plane 1 used only
5657 for DIMENSION1 charset. This doesn't use any locking shift
5658 and single shift functions. Assigned the coding-system (Lisp
5659 symbol) `iso-latin-1' by default.
5661 o coding-category-iso-8-2
5663 The category for a coding system which has the same code range
5664 as ISO2022 of 8-bit environment and graphic plane 1 used only
5665 for DIMENSION2 charset. This doesn't use any locking shift
5666 and single shift functions. Assigned the coding-system (Lisp
5667 symbol) `japanese-iso-8bit' by default.
5669 o coding-category-iso-7-else
5671 The category for a coding system which has the same code range
5672 as ISO2022 of 7-bit environemnt but uses locking shift or
5673 single shift functions. Assigned the coding-system (Lisp
5674 symbol) `iso-2022-7bit-lock' by default.
5676 o coding-category-iso-8-else
5678 The category for a coding system which has the same code range
5679 as ISO2022 of 8-bit environemnt but uses locking shift or
5680 single shift functions. Assigned the coding-system (Lisp
5681 symbol) `iso-2022-8bit-ss2' by default.
5683 o coding-category-big5
5685 The category for a coding system which has the same code range
5686 as BIG5. Assigned the coding-system (Lisp symbol)
5687 `cn-big5' by default.
5689 o coding-category-utf-8
5691 The category for a coding system which has the same code range
5692 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
5693 symbol) `utf-8' by default.
5695 o coding-category-utf-16-be
5697 The category for a coding system in which a text has an
5698 Unicode signature (cf. Unicode Standard) in the order of BIG
5699 endian at the head. Assigned the coding-system (Lisp symbol)
5700 `utf-16-be' by default.
5702 o coding-category-utf-16-le
5704 The category for a coding system in which a text has an
5705 Unicode signature (cf. Unicode Standard) in the order of
5706 LITTLE endian at the head. Assigned the coding-system (Lisp
5707 symbol) `utf-16-le' by default.
5709 o coding-category-ccl
5711 The category for a coding system of which encoder/decoder is
5712 written in CCL programs. The default value is nil, i.e., no
5713 coding system is assigned.
5715 o coding-category-binary
5717 The category for a coding system not categorized in any of the
5718 above. Assigned the coding-system (Lisp symbol)
5719 `no-conversion' by default.
5721 Each of them is a Lisp symbol and the value is an actual
5722 `coding-system's (this is also a Lisp symbol) assigned by a user.
5723 What Emacs does actually is to detect a category of coding system.
5724 Then, it uses a `coding-system' assigned to it. If Emacs can't
5725 decide only one possible category, it selects a category of the
5726 highest priority. Priorities of categories are also specified by a
5727 user in a Lisp variable `coding-category-list'.
5731 #define EOL_SEEN_NONE 0
5732 #define EOL_SEEN_LF 1
5733 #define EOL_SEEN_CR 2
5734 #define EOL_SEEN_CRLF 4
5736 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5737 SOURCE is encoded. If CATEGORY is one of
5738 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5739 two-byte, else they are encoded by one-byte.
5741 Return one of EOL_SEEN_XXX. */
5743 #define MAX_EOL_CHECK_COUNT 3
5745 static int
5746 detect_eol (source, src_bytes, category)
5747 const unsigned char *source;
5748 EMACS_INT src_bytes;
5749 enum coding_category category;
5751 const unsigned char *src = source, *src_end = src + src_bytes;
5752 unsigned char c;
5753 int total = 0;
5754 int eol_seen = EOL_SEEN_NONE;
5756 if ((1 << category) & CATEGORY_MASK_UTF_16)
5758 int msb, lsb;
5760 msb = category == (coding_category_utf_16_le
5761 | coding_category_utf_16_le_nosig);
5762 lsb = 1 - msb;
5764 while (src + 1 < src_end)
5766 c = src[lsb];
5767 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5769 int this_eol;
5771 if (c == '\n')
5772 this_eol = EOL_SEEN_LF;
5773 else if (src + 3 >= src_end
5774 || src[msb + 2] != 0
5775 || src[lsb + 2] != '\n')
5776 this_eol = EOL_SEEN_CR;
5777 else
5778 this_eol = EOL_SEEN_CRLF;
5780 if (eol_seen == EOL_SEEN_NONE)
5781 /* This is the first end-of-line. */
5782 eol_seen = this_eol;
5783 else if (eol_seen != this_eol)
5785 /* The found type is different from what found before. */
5786 eol_seen = EOL_SEEN_LF;
5787 break;
5789 if (++total == MAX_EOL_CHECK_COUNT)
5790 break;
5792 src += 2;
5795 else
5797 while (src < src_end)
5799 c = *src++;
5800 if (c == '\n' || c == '\r')
5802 int this_eol;
5804 if (c == '\n')
5805 this_eol = EOL_SEEN_LF;
5806 else if (src >= src_end || *src != '\n')
5807 this_eol = EOL_SEEN_CR;
5808 else
5809 this_eol = EOL_SEEN_CRLF, src++;
5811 if (eol_seen == EOL_SEEN_NONE)
5812 /* This is the first end-of-line. */
5813 eol_seen = this_eol;
5814 else if (eol_seen != this_eol)
5816 /* The found type is different from what found before. */
5817 eol_seen = EOL_SEEN_LF;
5818 break;
5820 if (++total == MAX_EOL_CHECK_COUNT)
5821 break;
5825 return eol_seen;
5829 static Lisp_Object
5830 adjust_coding_eol_type (coding, eol_seen)
5831 struct coding_system *coding;
5832 int eol_seen;
5834 Lisp_Object eol_type;
5836 eol_type = CODING_ID_EOL_TYPE (coding->id);
5837 if (eol_seen & EOL_SEEN_LF)
5839 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5840 eol_type = Qunix;
5842 else if (eol_seen & EOL_SEEN_CRLF)
5844 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5845 eol_type = Qdos;
5847 else if (eol_seen & EOL_SEEN_CR)
5849 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5850 eol_type = Qmac;
5852 return eol_type;
5855 /* Detect how a text specified in CODING is encoded. If a coding
5856 system is detected, update fields of CODING by the detected coding
5857 system. */
5859 void
5860 detect_coding (coding)
5861 struct coding_system *coding;
5863 const unsigned char *src, *src_end;
5865 coding->consumed = coding->consumed_char = 0;
5866 coding->produced = coding->produced_char = 0;
5867 coding_set_source (coding);
5869 src_end = coding->source + coding->src_bytes;
5870 coding->head_ascii = 0;
5872 /* If we have not yet decided the text encoding type, detect it
5873 now. */
5874 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5876 int c, i;
5877 struct coding_detection_info detect_info;
5878 int null_byte_found = 0, eight_bit_found = 0;
5880 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5881 for (src = coding->source; src < src_end; src++)
5883 c = *src;
5884 if (c & 0x80)
5886 eight_bit_found = 1;
5887 if (null_byte_found)
5888 break;
5890 else if (c < 0x20)
5892 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5893 && ! inhibit_iso_escape_detection
5894 && ! detect_info.checked)
5896 if (detect_coding_iso_2022 (coding, &detect_info))
5898 /* We have scanned the whole data. */
5899 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5901 /* We didn't find an 8-bit code. We may
5902 have found a null-byte, but it's very
5903 rare that a binary file confirm to
5904 ISO-2022. */
5905 src = src_end;
5906 coding->head_ascii = src - coding->source;
5908 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
5909 break;
5912 else if (! c && !inhibit_null_byte_detection)
5914 null_byte_found = 1;
5915 if (eight_bit_found)
5916 break;
5918 if (! eight_bit_found)
5919 coding->head_ascii++;
5921 else if (! eight_bit_found)
5922 coding->head_ascii++;
5925 if (null_byte_found || eight_bit_found
5926 || coding->head_ascii < coding->src_bytes
5927 || detect_info.found)
5929 enum coding_category category;
5930 struct coding_system *this;
5932 if (coding->head_ascii == coding->src_bytes)
5933 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
5934 for (i = 0; i < coding_category_raw_text; i++)
5936 category = coding_priorities[i];
5937 this = coding_categories + category;
5938 if (detect_info.found & (1 << category))
5939 break;
5941 else
5943 if (null_byte_found)
5945 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
5946 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
5948 for (i = 0; i < coding_category_raw_text; i++)
5950 category = coding_priorities[i];
5951 this = coding_categories + category;
5952 if (this->id < 0)
5954 /* No coding system of this category is defined. */
5955 detect_info.rejected |= (1 << category);
5957 else if (category >= coding_category_raw_text)
5958 continue;
5959 else if (detect_info.checked & (1 << category))
5961 if (detect_info.found & (1 << category))
5962 break;
5964 else if ((*(this->detector)) (coding, &detect_info)
5965 && detect_info.found & (1 << category))
5967 if (category == coding_category_utf_16_auto)
5969 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5970 category = coding_category_utf_16_le;
5971 else
5972 category = coding_category_utf_16_be;
5974 break;
5979 if (i < coding_category_raw_text)
5980 setup_coding_system (CODING_ID_NAME (this->id), coding);
5981 else if (null_byte_found)
5982 setup_coding_system (Qno_conversion, coding);
5983 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
5984 == CATEGORY_MASK_ANY)
5985 setup_coding_system (Qraw_text, coding);
5986 else if (detect_info.rejected)
5987 for (i = 0; i < coding_category_raw_text; i++)
5988 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5990 this = coding_categories + coding_priorities[i];
5991 setup_coding_system (CODING_ID_NAME (this->id), coding);
5992 break;
5996 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5997 == coding_category_utf_8_auto)
5999 Lisp_Object coding_systems;
6000 struct coding_detection_info detect_info;
6002 coding_systems
6003 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6004 detect_info.found = detect_info.rejected = 0;
6005 coding->head_ascii = 0;
6006 if (CONSP (coding_systems)
6007 && detect_coding_utf_8 (coding, &detect_info))
6009 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6010 setup_coding_system (XCAR (coding_systems), coding);
6011 else
6012 setup_coding_system (XCDR (coding_systems), coding);
6015 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6016 == coding_category_utf_16_auto)
6018 Lisp_Object coding_systems;
6019 struct coding_detection_info detect_info;
6021 coding_systems
6022 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6023 detect_info.found = detect_info.rejected = 0;
6024 coding->head_ascii = 0;
6025 if (CONSP (coding_systems)
6026 && detect_coding_utf_16 (coding, &detect_info))
6028 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6029 setup_coding_system (XCAR (coding_systems), coding);
6030 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6031 setup_coding_system (XCDR (coding_systems), coding);
6037 static void
6038 decode_eol (coding)
6039 struct coding_system *coding;
6041 Lisp_Object eol_type;
6042 unsigned char *p, *pbeg, *pend;
6044 eol_type = CODING_ID_EOL_TYPE (coding->id);
6045 if (EQ (eol_type, Qunix))
6046 return;
6048 if (NILP (coding->dst_object))
6049 pbeg = coding->destination;
6050 else
6051 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6052 pend = pbeg + coding->produced;
6054 if (VECTORP (eol_type))
6056 int eol_seen = EOL_SEEN_NONE;
6058 for (p = pbeg; p < pend; p++)
6060 if (*p == '\n')
6061 eol_seen |= EOL_SEEN_LF;
6062 else if (*p == '\r')
6064 if (p + 1 < pend && *(p + 1) == '\n')
6066 eol_seen |= EOL_SEEN_CRLF;
6067 p++;
6069 else
6070 eol_seen |= EOL_SEEN_CR;
6073 if (eol_seen != EOL_SEEN_NONE
6074 && eol_seen != EOL_SEEN_LF
6075 && eol_seen != EOL_SEEN_CRLF
6076 && eol_seen != EOL_SEEN_CR)
6077 eol_seen = EOL_SEEN_LF;
6078 if (eol_seen != EOL_SEEN_NONE)
6079 eol_type = adjust_coding_eol_type (coding, eol_seen);
6082 if (EQ (eol_type, Qmac))
6084 for (p = pbeg; p < pend; p++)
6085 if (*p == '\r')
6086 *p = '\n';
6088 else if (EQ (eol_type, Qdos))
6090 int n = 0;
6092 if (NILP (coding->dst_object))
6094 /* Start deleting '\r' from the tail to minimize the memory
6095 movement. */
6096 for (p = pend - 2; p >= pbeg; p--)
6097 if (*p == '\r')
6099 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6100 n++;
6103 else
6105 int pos_byte = coding->dst_pos_byte;
6106 int pos = coding->dst_pos;
6107 int pos_end = pos + coding->produced_char - 1;
6109 while (pos < pos_end)
6111 p = BYTE_POS_ADDR (pos_byte);
6112 if (*p == '\r' && p[1] == '\n')
6114 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6115 n++;
6116 pos_end--;
6118 pos++;
6119 if (coding->dst_multibyte)
6120 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6121 else
6122 pos_byte++;
6125 coding->produced -= n;
6126 coding->produced_char -= n;
6131 /* Return a translation table (or list of them) from coding system
6132 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6133 decoding (ENCODEP is zero). */
6135 static Lisp_Object
6136 get_translation_table (attrs, encodep, max_lookup)
6137 Lisp_Object attrs;
6138 int encodep, *max_lookup;
6140 Lisp_Object standard, translation_table;
6141 Lisp_Object val;
6143 if (encodep)
6144 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6145 standard = Vstandard_translation_table_for_encode;
6146 else
6147 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6148 standard = Vstandard_translation_table_for_decode;
6149 if (NILP (translation_table))
6150 translation_table = standard;
6151 else
6153 if (SYMBOLP (translation_table))
6154 translation_table = Fget (translation_table, Qtranslation_table);
6155 else if (CONSP (translation_table))
6157 translation_table = Fcopy_sequence (translation_table);
6158 for (val = translation_table; CONSP (val); val = XCDR (val))
6159 if (SYMBOLP (XCAR (val)))
6160 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6162 if (CHAR_TABLE_P (standard))
6164 if (CONSP (translation_table))
6165 translation_table = nconc2 (translation_table,
6166 Fcons (standard, Qnil));
6167 else
6168 translation_table = Fcons (translation_table,
6169 Fcons (standard, Qnil));
6173 if (max_lookup)
6175 *max_lookup = 1;
6176 if (CHAR_TABLE_P (translation_table)
6177 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6179 val = XCHAR_TABLE (translation_table)->extras[1];
6180 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6181 *max_lookup = XFASTINT (val);
6183 else if (CONSP (translation_table))
6185 Lisp_Object tail, val;
6187 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6188 if (CHAR_TABLE_P (XCAR (tail))
6189 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6191 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6192 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6193 *max_lookup = XFASTINT (val);
6197 return translation_table;
6200 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6201 do { \
6202 trans = Qnil; \
6203 if (CHAR_TABLE_P (table)) \
6205 trans = CHAR_TABLE_REF (table, c); \
6206 if (CHARACTERP (trans)) \
6207 c = XFASTINT (trans), trans = Qnil; \
6209 else if (CONSP (table)) \
6211 Lisp_Object tail; \
6213 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6214 if (CHAR_TABLE_P (XCAR (tail))) \
6216 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6217 if (CHARACTERP (trans)) \
6218 c = XFASTINT (trans), trans = Qnil; \
6219 else if (! NILP (trans)) \
6220 break; \
6223 } while (0)
6226 static Lisp_Object
6227 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
6228 Lisp_Object val;
6229 int *buf, *buf_end;
6230 int last_block;
6231 int *from_nchars, *to_nchars;
6233 /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or
6234 [TO-CHAR ...]. */
6235 if (CONSP (val))
6237 Lisp_Object from, tail;
6238 int i, len;
6240 for (tail = val; CONSP (tail); tail = XCDR (tail))
6242 val = XCAR (tail);
6243 from = XCAR (val);
6244 len = ASIZE (from);
6245 for (i = 0; i < len; i++)
6247 if (buf + i == buf_end)
6249 if (! last_block)
6250 return Qt;
6251 break;
6253 if (XINT (AREF (from, i)) != buf[i])
6254 break;
6256 if (i == len)
6258 val = XCDR (val);
6259 *from_nchars = len;
6260 break;
6263 if (! CONSP (tail))
6264 return Qnil;
6266 if (VECTORP (val))
6267 *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
6268 else
6269 *buf = XINT (val);
6270 return val;
6274 static int
6275 produce_chars (coding, translation_table, last_block)
6276 struct coding_system *coding;
6277 Lisp_Object translation_table;
6278 int last_block;
6280 unsigned char *dst = coding->destination + coding->produced;
6281 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6282 EMACS_INT produced;
6283 EMACS_INT produced_chars = 0;
6284 int carryover = 0;
6286 if (! coding->chars_at_source)
6288 /* Source characters are in coding->charbuf. */
6289 int *buf = coding->charbuf;
6290 int *buf_end = buf + coding->charbuf_used;
6292 if (EQ (coding->src_object, coding->dst_object))
6294 coding_set_source (coding);
6295 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6298 while (buf < buf_end)
6300 int c = *buf, i;
6302 if (c >= 0)
6304 int from_nchars = 1, to_nchars = 1;
6305 Lisp_Object trans = Qnil;
6307 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6308 if (! NILP (trans))
6310 trans = get_translation (trans, buf, buf_end, last_block,
6311 &from_nchars, &to_nchars);
6312 if (EQ (trans, Qt))
6313 break;
6314 c = *buf;
6317 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6319 dst = alloc_destination (coding,
6320 buf_end - buf
6321 + MAX_MULTIBYTE_LENGTH * to_nchars,
6322 dst);
6323 if (EQ (coding->src_object, coding->dst_object))
6325 coding_set_source (coding);
6326 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6328 else
6329 dst_end = coding->destination + coding->dst_bytes;
6332 for (i = 0; i < to_nchars; i++)
6334 if (i > 0)
6335 c = XINT (AREF (trans, i));
6336 if (coding->dst_multibyte
6337 || ! CHAR_BYTE8_P (c))
6338 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6339 else
6340 *dst++ = CHAR_TO_BYTE8 (c);
6342 produced_chars += to_nchars;
6343 *buf++ = to_nchars;
6344 while (--from_nchars > 0)
6345 *buf++ = 0;
6347 else
6348 /* This is an annotation datum. (-C) is the length. */
6349 buf += -c;
6351 carryover = buf_end - buf;
6353 else
6355 /* Source characters are at coding->source. */
6356 const unsigned char *src = coding->source;
6357 const unsigned char *src_end = src + coding->consumed;
6359 if (EQ (coding->dst_object, coding->src_object))
6360 dst_end = (unsigned char *) src;
6361 if (coding->src_multibyte != coding->dst_multibyte)
6363 if (coding->src_multibyte)
6365 int multibytep = 1;
6366 EMACS_INT consumed_chars = 0;
6368 while (1)
6370 const unsigned char *src_base = src;
6371 int c;
6373 ONE_MORE_BYTE (c);
6374 if (dst == dst_end)
6376 if (EQ (coding->src_object, coding->dst_object))
6377 dst_end = (unsigned char *) src;
6378 if (dst == dst_end)
6380 EMACS_INT offset = src - coding->source;
6382 dst = alloc_destination (coding, src_end - src + 1,
6383 dst);
6384 dst_end = coding->destination + coding->dst_bytes;
6385 coding_set_source (coding);
6386 src = coding->source + offset;
6387 src_end = coding->source + coding->src_bytes;
6388 if (EQ (coding->src_object, coding->dst_object))
6389 dst_end = (unsigned char *) src;
6392 *dst++ = c;
6393 produced_chars++;
6395 no_more_source:
6398 else
6399 while (src < src_end)
6401 int multibytep = 1;
6402 int c = *src++;
6404 if (dst >= dst_end - 1)
6406 if (EQ (coding->src_object, coding->dst_object))
6407 dst_end = (unsigned char *) src;
6408 if (dst >= dst_end - 1)
6410 EMACS_INT offset = src - coding->source;
6411 EMACS_INT more_bytes;
6413 if (EQ (coding->src_object, coding->dst_object))
6414 more_bytes = ((src_end - src) / 2) + 2;
6415 else
6416 more_bytes = src_end - src + 2;
6417 dst = alloc_destination (coding, more_bytes, dst);
6418 dst_end = coding->destination + coding->dst_bytes;
6419 coding_set_source (coding);
6420 src = coding->source + offset;
6421 src_end = coding->source + coding->src_bytes;
6422 if (EQ (coding->src_object, coding->dst_object))
6423 dst_end = (unsigned char *) src;
6426 EMIT_ONE_BYTE (c);
6429 else
6431 if (!EQ (coding->src_object, coding->dst_object))
6433 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6435 if (require > 0)
6437 EMACS_INT offset = src - coding->source;
6439 dst = alloc_destination (coding, require, dst);
6440 coding_set_source (coding);
6441 src = coding->source + offset;
6442 src_end = coding->source + coding->src_bytes;
6445 produced_chars = coding->consumed_char;
6446 while (src < src_end)
6447 *dst++ = *src++;
6451 produced = dst - (coding->destination + coding->produced);
6452 if (BUFFERP (coding->dst_object) && produced_chars > 0)
6453 insert_from_gap (produced_chars, produced);
6454 coding->produced += produced;
6455 coding->produced_char += produced_chars;
6456 return carryover;
6459 /* Compose text in CODING->object according to the annotation data at
6460 CHARBUF. CHARBUF is an array:
6461 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
6464 static INLINE void
6465 produce_composition (coding, charbuf, pos)
6466 struct coding_system *coding;
6467 int *charbuf;
6468 EMACS_INT pos;
6470 int len;
6471 EMACS_INT to;
6472 enum composition_method method;
6473 Lisp_Object components;
6475 len = -charbuf[0];
6476 to = pos + charbuf[2];
6477 if (to <= pos)
6478 return;
6479 method = (enum composition_method) (charbuf[3]);
6481 if (method == COMPOSITION_RELATIVE)
6482 components = Qnil;
6483 else if (method >= COMPOSITION_WITH_RULE
6484 && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6486 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6487 int i;
6489 len -= 4;
6490 charbuf += 4;
6491 for (i = 0; i < len; i++)
6493 args[i] = make_number (charbuf[i]);
6494 if (charbuf[i] < 0)
6495 return;
6497 components = (method == COMPOSITION_WITH_ALTCHARS
6498 ? Fstring (len, args) : Fvector (len, args));
6500 else
6501 return;
6502 compose_text (pos, to, components, Qnil, coding->dst_object);
6506 /* Put `charset' property on text in CODING->object according to
6507 the annotation data at CHARBUF. CHARBUF is an array:
6508 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6511 static INLINE void
6512 produce_charset (coding, charbuf, pos)
6513 struct coding_system *coding;
6514 int *charbuf;
6515 EMACS_INT pos;
6517 EMACS_INT from = pos - charbuf[2];
6518 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6520 Fput_text_property (make_number (from), make_number (pos),
6521 Qcharset, CHARSET_NAME (charset),
6522 coding->dst_object);
6526 #define CHARBUF_SIZE 0x4000
6528 #define ALLOC_CONVERSION_WORK_AREA(coding) \
6529 do { \
6530 int size = CHARBUF_SIZE;; \
6532 coding->charbuf = NULL; \
6533 while (size > 1024) \
6535 coding->charbuf = (int *) alloca (sizeof (int) * size); \
6536 if (coding->charbuf) \
6537 break; \
6538 size >>= 1; \
6540 if (! coding->charbuf) \
6542 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6543 return coding->result; \
6545 coding->charbuf_size = size; \
6546 } while (0)
6549 static void
6550 produce_annotation (coding, pos)
6551 struct coding_system *coding;
6552 EMACS_INT pos;
6554 int *charbuf = coding->charbuf;
6555 int *charbuf_end = charbuf + coding->charbuf_used;
6557 if (NILP (coding->dst_object))
6558 return;
6560 while (charbuf < charbuf_end)
6562 if (*charbuf >= 0)
6563 pos += *charbuf++;
6564 else
6566 int len = -*charbuf;
6567 switch (charbuf[1])
6569 case CODING_ANNOTATE_COMPOSITION_MASK:
6570 produce_composition (coding, charbuf, pos);
6571 break;
6572 case CODING_ANNOTATE_CHARSET_MASK:
6573 produce_charset (coding, charbuf, pos);
6574 break;
6575 default:
6576 abort ();
6578 charbuf += len;
6583 /* Decode the data at CODING->src_object into CODING->dst_object.
6584 CODING->src_object is a buffer, a string, or nil.
6585 CODING->dst_object is a buffer.
6587 If CODING->src_object is a buffer, it must be the current buffer.
6588 In this case, if CODING->src_pos is positive, it is a position of
6589 the source text in the buffer, otherwise, the source text is in the
6590 gap area of the buffer, and CODING->src_pos specifies the offset of
6591 the text from GPT (which must be the same as PT). If this is the
6592 same buffer as CODING->dst_object, CODING->src_pos must be
6593 negative.
6595 If CODING->src_object is a string, CODING->src_pos is an index to
6596 that string.
6598 If CODING->src_object is nil, CODING->source must already point to
6599 the non-relocatable memory area. In this case, CODING->src_pos is
6600 an offset from CODING->source.
6602 The decoded data is inserted at the current point of the buffer
6603 CODING->dst_object.
6606 static int
6607 decode_coding (coding)
6608 struct coding_system *coding;
6610 Lisp_Object attrs;
6611 Lisp_Object undo_list;
6612 Lisp_Object translation_table;
6613 int carryover;
6614 int i;
6616 if (BUFFERP (coding->src_object)
6617 && coding->src_pos > 0
6618 && coding->src_pos < GPT
6619 && coding->src_pos + coding->src_chars > GPT)
6620 move_gap_both (coding->src_pos, coding->src_pos_byte);
6622 undo_list = Qt;
6623 if (BUFFERP (coding->dst_object))
6625 if (current_buffer != XBUFFER (coding->dst_object))
6626 set_buffer_internal (XBUFFER (coding->dst_object));
6627 if (GPT != PT)
6628 move_gap_both (PT, PT_BYTE);
6629 undo_list = current_buffer->undo_list;
6630 current_buffer->undo_list = Qt;
6633 coding->consumed = coding->consumed_char = 0;
6634 coding->produced = coding->produced_char = 0;
6635 coding->chars_at_source = 0;
6636 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6637 coding->errors = 0;
6639 ALLOC_CONVERSION_WORK_AREA (coding);
6641 attrs = CODING_ID_ATTRS (coding->id);
6642 translation_table = get_translation_table (attrs, 0, NULL);
6644 carryover = 0;
6647 EMACS_INT pos = coding->dst_pos + coding->produced_char;
6649 coding_set_source (coding);
6650 coding->annotated = 0;
6651 coding->charbuf_used = carryover;
6652 (*(coding->decoder)) (coding);
6653 coding_set_destination (coding);
6654 carryover = produce_chars (coding, translation_table, 0);
6655 if (coding->annotated)
6656 produce_annotation (coding, pos);
6657 for (i = 0; i < carryover; i++)
6658 coding->charbuf[i]
6659 = coding->charbuf[coding->charbuf_used - carryover + i];
6661 while (coding->consumed < coding->src_bytes
6662 && (coding->result == CODING_RESULT_SUCCESS
6663 || coding->result == CODING_RESULT_INVALID_SRC));
6665 if (carryover > 0)
6667 coding_set_destination (coding);
6668 coding->charbuf_used = carryover;
6669 produce_chars (coding, translation_table, 1);
6672 coding->carryover_bytes = 0;
6673 if (coding->consumed < coding->src_bytes)
6675 int nbytes = coding->src_bytes - coding->consumed;
6676 const unsigned char *src;
6678 coding_set_source (coding);
6679 coding_set_destination (coding);
6680 src = coding->source + coding->consumed;
6682 if (coding->mode & CODING_MODE_LAST_BLOCK)
6684 /* Flush out unprocessed data as binary chars. We are sure
6685 that the number of data is less than the size of
6686 coding->charbuf. */
6687 coding->charbuf_used = 0;
6688 coding->chars_at_source = 0;
6690 while (nbytes-- > 0)
6692 int c = *src++;
6694 if (c & 0x80)
6695 c = BYTE8_TO_CHAR (c);
6696 coding->charbuf[coding->charbuf_used++] = c;
6698 produce_chars (coding, Qnil, 1);
6700 else
6702 /* Record unprocessed bytes in coding->carryover. We are
6703 sure that the number of data is less than the size of
6704 coding->carryover. */
6705 unsigned char *p = coding->carryover;
6707 coding->carryover_bytes = nbytes;
6708 while (nbytes-- > 0)
6709 *p++ = *src++;
6711 coding->consumed = coding->src_bytes;
6714 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6715 decode_eol (coding);
6716 if (BUFFERP (coding->dst_object))
6718 current_buffer->undo_list = undo_list;
6719 record_insert (coding->dst_pos, coding->produced_char);
6721 return coding->result;
6725 /* Extract an annotation datum from a composition starting at POS and
6726 ending before LIMIT of CODING->src_object (buffer or string), store
6727 the data in BUF, set *STOP to a starting position of the next
6728 composition (if any) or to LIMIT, and return the address of the
6729 next element of BUF.
6731 If such an annotation is not found, set *STOP to a starting
6732 position of a composition after POS (if any) or to LIMIT, and
6733 return BUF. */
6735 static INLINE int *
6736 handle_composition_annotation (pos, limit, coding, buf, stop)
6737 EMACS_INT pos, limit;
6738 struct coding_system *coding;
6739 int *buf;
6740 EMACS_INT *stop;
6742 EMACS_INT start, end;
6743 Lisp_Object prop;
6745 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6746 || end > limit)
6747 *stop = limit;
6748 else if (start > pos)
6749 *stop = start;
6750 else
6752 if (start == pos)
6754 /* We found a composition. Store the corresponding
6755 annotation data in BUF. */
6756 int *head = buf;
6757 enum composition_method method = COMPOSITION_METHOD (prop);
6758 int nchars = COMPOSITION_LENGTH (prop);
6760 ADD_COMPOSITION_DATA (buf, nchars, method);
6761 if (method != COMPOSITION_RELATIVE)
6763 Lisp_Object components;
6764 int len, i, i_byte;
6766 components = COMPOSITION_COMPONENTS (prop);
6767 if (VECTORP (components))
6769 len = XVECTOR (components)->size;
6770 for (i = 0; i < len; i++)
6771 *buf++ = XINT (AREF (components, i));
6773 else if (STRINGP (components))
6775 len = SCHARS (components);
6776 i = i_byte = 0;
6777 while (i < len)
6779 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6780 buf++;
6783 else if (INTEGERP (components))
6785 len = 1;
6786 *buf++ = XINT (components);
6788 else if (CONSP (components))
6790 for (len = 0; CONSP (components);
6791 len++, components = XCDR (components))
6792 *buf++ = XINT (XCAR (components));
6794 else
6795 abort ();
6796 *head -= len;
6800 if (find_composition (end, limit, &start, &end, &prop,
6801 coding->src_object)
6802 && end <= limit)
6803 *stop = start;
6804 else
6805 *stop = limit;
6807 return buf;
6811 /* Extract an annotation datum from a text property `charset' at POS of
6812 CODING->src_object (buffer of string), store the data in BUF, set
6813 *STOP to the position where the value of `charset' property changes
6814 (limiting by LIMIT), and return the address of the next element of
6815 BUF.
6817 If the property value is nil, set *STOP to the position where the
6818 property value is non-nil (limiting by LIMIT), and return BUF. */
6820 static INLINE int *
6821 handle_charset_annotation (pos, limit, coding, buf, stop)
6822 EMACS_INT pos, limit;
6823 struct coding_system *coding;
6824 int *buf;
6825 EMACS_INT *stop;
6827 Lisp_Object val, next;
6828 int id;
6830 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6831 if (! NILP (val) && CHARSETP (val))
6832 id = XINT (CHARSET_SYMBOL_ID (val));
6833 else
6834 id = -1;
6835 ADD_CHARSET_DATA (buf, 0, id);
6836 next = Fnext_single_property_change (make_number (pos), Qcharset,
6837 coding->src_object,
6838 make_number (limit));
6839 *stop = XINT (next);
6840 return buf;
6844 static void
6845 consume_chars (coding, translation_table, max_lookup)
6846 struct coding_system *coding;
6847 Lisp_Object translation_table;
6848 int max_lookup;
6850 int *buf = coding->charbuf;
6851 int *buf_end = coding->charbuf + coding->charbuf_size;
6852 const unsigned char *src = coding->source + coding->consumed;
6853 const unsigned char *src_end = coding->source + coding->src_bytes;
6854 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6855 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6856 int multibytep = coding->src_multibyte;
6857 Lisp_Object eol_type;
6858 int c;
6859 EMACS_INT stop, stop_composition, stop_charset;
6860 int *lookup_buf = NULL;
6862 if (! NILP (translation_table))
6863 lookup_buf = alloca (sizeof (int) * max_lookup);
6865 eol_type = CODING_ID_EOL_TYPE (coding->id);
6866 if (VECTORP (eol_type))
6867 eol_type = Qunix;
6869 /* Note: composition handling is not yet implemented. */
6870 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6872 if (NILP (coding->src_object))
6873 stop = stop_composition = stop_charset = end_pos;
6874 else
6876 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6877 stop = stop_composition = pos;
6878 else
6879 stop = stop_composition = end_pos;
6880 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6881 stop = stop_charset = pos;
6882 else
6883 stop_charset = end_pos;
6886 /* Compensate for CRLF and conversion. */
6887 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6888 while (buf < buf_end)
6890 Lisp_Object trans;
6892 if (pos == stop)
6894 if (pos == end_pos)
6895 break;
6896 if (pos == stop_composition)
6897 buf = handle_composition_annotation (pos, end_pos, coding,
6898 buf, &stop_composition);
6899 if (pos == stop_charset)
6900 buf = handle_charset_annotation (pos, end_pos, coding,
6901 buf, &stop_charset);
6902 stop = (stop_composition < stop_charset
6903 ? stop_composition : stop_charset);
6906 if (! multibytep)
6908 EMACS_INT bytes;
6910 if (coding->encoder == encode_coding_raw_text)
6911 c = *src++, pos++;
6912 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6913 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
6914 else
6915 c = BYTE8_TO_CHAR (*src), src++, pos++;
6917 else
6918 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
6919 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6920 c = '\n';
6921 if (! EQ (eol_type, Qunix))
6923 if (c == '\n')
6925 if (EQ (eol_type, Qdos))
6926 *buf++ = '\r';
6927 else
6928 c = '\r';
6932 trans = Qnil;
6933 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6934 if (NILP (trans))
6935 *buf++ = c;
6936 else
6938 int from_nchars = 1, to_nchars = 1;
6939 int *lookup_buf_end;
6940 const unsigned char *p = src;
6941 int i;
6943 lookup_buf[0] = c;
6944 for (i = 1; i < max_lookup && p < src_end; i++)
6945 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6946 lookup_buf_end = lookup_buf + i;
6947 trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6948 &from_nchars, &to_nchars);
6949 if (EQ (trans, Qt)
6950 || buf + to_nchars > buf_end)
6951 break;
6952 *buf++ = *lookup_buf;
6953 for (i = 1; i < to_nchars; i++)
6954 *buf++ = XINT (AREF (trans, i));
6955 for (i = 1; i < from_nchars; i++, pos++)
6956 src += MULTIBYTE_LENGTH_NO_CHECK (src);
6960 coding->consumed = src - coding->source;
6961 coding->consumed_char = pos - coding->src_pos;
6962 coding->charbuf_used = buf - coding->charbuf;
6963 coding->chars_at_source = 0;
6967 /* Encode the text at CODING->src_object into CODING->dst_object.
6968 CODING->src_object is a buffer or a string.
6969 CODING->dst_object is a buffer or nil.
6971 If CODING->src_object is a buffer, it must be the current buffer.
6972 In this case, if CODING->src_pos is positive, it is a position of
6973 the source text in the buffer, otherwise. the source text is in the
6974 gap area of the buffer, and coding->src_pos specifies the offset of
6975 the text from GPT (which must be the same as PT). If this is the
6976 same buffer as CODING->dst_object, CODING->src_pos must be
6977 negative and CODING should not have `pre-write-conversion'.
6979 If CODING->src_object is a string, CODING should not have
6980 `pre-write-conversion'.
6982 If CODING->dst_object is a buffer, the encoded data is inserted at
6983 the current point of that buffer.
6985 If CODING->dst_object is nil, the encoded data is placed at the
6986 memory area specified by CODING->destination. */
6988 static int
6989 encode_coding (coding)
6990 struct coding_system *coding;
6992 Lisp_Object attrs;
6993 Lisp_Object translation_table;
6994 int max_lookup;
6996 attrs = CODING_ID_ATTRS (coding->id);
6997 if (coding->encoder == encode_coding_raw_text)
6998 translation_table = Qnil, max_lookup = 0;
6999 else
7000 translation_table = get_translation_table (attrs, 1, &max_lookup);
7002 if (BUFFERP (coding->dst_object))
7004 set_buffer_internal (XBUFFER (coding->dst_object));
7005 coding->dst_multibyte
7006 = ! NILP (current_buffer->enable_multibyte_characters);
7009 coding->consumed = coding->consumed_char = 0;
7010 coding->produced = coding->produced_char = 0;
7011 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7012 coding->errors = 0;
7014 ALLOC_CONVERSION_WORK_AREA (coding);
7016 do {
7017 coding_set_source (coding);
7018 consume_chars (coding, translation_table, max_lookup);
7019 coding_set_destination (coding);
7020 (*(coding->encoder)) (coding);
7021 } while (coding->consumed_char < coding->src_chars);
7023 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7024 insert_from_gap (coding->produced_char, coding->produced);
7026 return (coding->result);
7030 /* Name (or base name) of work buffer for code conversion. */
7031 static Lisp_Object Vcode_conversion_workbuf_name;
7033 /* A working buffer used by the top level conversion. Once it is
7034 created, it is never destroyed. It has the name
7035 Vcode_conversion_workbuf_name. The other working buffers are
7036 destroyed after the use is finished, and their names are modified
7037 versions of Vcode_conversion_workbuf_name. */
7038 static Lisp_Object Vcode_conversion_reused_workbuf;
7040 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
7041 static int reused_workbuf_in_use;
7044 /* Return a working buffer of code convesion. MULTIBYTE specifies the
7045 multibyteness of returning buffer. */
7047 static Lisp_Object
7048 make_conversion_work_buffer (multibyte)
7049 int multibyte;
7051 Lisp_Object name, workbuf;
7052 struct buffer *current;
7054 if (reused_workbuf_in_use++)
7056 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7057 workbuf = Fget_buffer_create (name);
7059 else
7061 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7062 Vcode_conversion_reused_workbuf
7063 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7064 workbuf = Vcode_conversion_reused_workbuf;
7066 current = current_buffer;
7067 set_buffer_internal (XBUFFER (workbuf));
7068 /* We can't allow modification hooks to run in the work buffer. For
7069 instance, directory_files_internal assumes that file decoding
7070 doesn't compile new regexps. */
7071 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7072 Ferase_buffer ();
7073 current_buffer->undo_list = Qt;
7074 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7075 set_buffer_internal (current);
7076 return workbuf;
7080 static Lisp_Object
7081 code_conversion_restore (arg)
7082 Lisp_Object arg;
7084 Lisp_Object current, workbuf;
7085 struct gcpro gcpro1;
7087 GCPRO1 (arg);
7088 current = XCAR (arg);
7089 workbuf = XCDR (arg);
7090 if (! NILP (workbuf))
7092 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7093 reused_workbuf_in_use = 0;
7094 else if (! NILP (Fbuffer_live_p (workbuf)))
7095 Fkill_buffer (workbuf);
7097 set_buffer_internal (XBUFFER (current));
7098 UNGCPRO;
7099 return Qnil;
7102 Lisp_Object
7103 code_conversion_save (with_work_buf, multibyte)
7104 int with_work_buf, multibyte;
7106 Lisp_Object workbuf = Qnil;
7108 if (with_work_buf)
7109 workbuf = make_conversion_work_buffer (multibyte);
7110 record_unwind_protect (code_conversion_restore,
7111 Fcons (Fcurrent_buffer (), workbuf));
7112 return workbuf;
7116 decode_coding_gap (coding, chars, bytes)
7117 struct coding_system *coding;
7118 EMACS_INT chars, bytes;
7120 int count = specpdl_ptr - specpdl;
7121 Lisp_Object attrs;
7123 code_conversion_save (0, 0);
7125 coding->src_object = Fcurrent_buffer ();
7126 coding->src_chars = chars;
7127 coding->src_bytes = bytes;
7128 coding->src_pos = -chars;
7129 coding->src_pos_byte = -bytes;
7130 coding->src_multibyte = chars < bytes;
7131 coding->dst_object = coding->src_object;
7132 coding->dst_pos = PT;
7133 coding->dst_pos_byte = PT_BYTE;
7134 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7136 if (CODING_REQUIRE_DETECTION (coding))
7137 detect_coding (coding);
7139 coding->mode |= CODING_MODE_LAST_BLOCK;
7140 current_buffer->text->inhibit_shrinking = 1;
7141 decode_coding (coding);
7142 current_buffer->text->inhibit_shrinking = 0;
7144 attrs = CODING_ID_ATTRS (coding->id);
7145 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7147 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7148 Lisp_Object val;
7150 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7151 val = call1 (CODING_ATTR_POST_READ (attrs),
7152 make_number (coding->produced_char));
7153 CHECK_NATNUM (val);
7154 coding->produced_char += Z - prev_Z;
7155 coding->produced += Z_BYTE - prev_Z_BYTE;
7158 unbind_to (count, Qnil);
7159 return coding->result;
7163 encode_coding_gap (coding, chars, bytes)
7164 struct coding_system *coding;
7165 EMACS_INT chars, bytes;
7167 int count = specpdl_ptr - specpdl;
7169 code_conversion_save (0, 0);
7171 coding->src_object = Fcurrent_buffer ();
7172 coding->src_chars = chars;
7173 coding->src_bytes = bytes;
7174 coding->src_pos = -chars;
7175 coding->src_pos_byte = -bytes;
7176 coding->src_multibyte = chars < bytes;
7177 coding->dst_object = coding->src_object;
7178 coding->dst_pos = PT;
7179 coding->dst_pos_byte = PT_BYTE;
7181 encode_coding (coding);
7183 unbind_to (count, Qnil);
7184 return coding->result;
7188 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7189 SRC_OBJECT into DST_OBJECT by coding context CODING.
7191 SRC_OBJECT is a buffer, a string, or Qnil.
7193 If it is a buffer, the text is at point of the buffer. FROM and TO
7194 are positions in the buffer.
7196 If it is a string, the text is at the beginning of the string.
7197 FROM and TO are indices to the string.
7199 If it is nil, the text is at coding->source. FROM and TO are
7200 indices to coding->source.
7202 DST_OBJECT is a buffer, Qt, or Qnil.
7204 If it is a buffer, the decoded text is inserted at point of the
7205 buffer. If the buffer is the same as SRC_OBJECT, the source text
7206 is deleted.
7208 If it is Qt, a string is made from the decoded text, and
7209 set in CODING->dst_object.
7211 If it is Qnil, the decoded text is stored at CODING->destination.
7212 The caller must allocate CODING->dst_bytes bytes at
7213 CODING->destination by xmalloc. If the decoded text is longer than
7214 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7217 void
7218 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7219 dst_object)
7220 struct coding_system *coding;
7221 Lisp_Object src_object;
7222 EMACS_INT from, from_byte, to, to_byte;
7223 Lisp_Object dst_object;
7225 int count = specpdl_ptr - specpdl;
7226 unsigned char *destination;
7227 EMACS_INT dst_bytes;
7228 EMACS_INT chars = to - from;
7229 EMACS_INT bytes = to_byte - from_byte;
7230 Lisp_Object attrs;
7231 int saved_pt = -1, saved_pt_byte;
7232 int need_marker_adjustment = 0;
7233 Lisp_Object old_deactivate_mark;
7235 old_deactivate_mark = Vdeactivate_mark;
7237 if (NILP (dst_object))
7239 destination = coding->destination;
7240 dst_bytes = coding->dst_bytes;
7243 coding->src_object = src_object;
7244 coding->src_chars = chars;
7245 coding->src_bytes = bytes;
7246 coding->src_multibyte = chars < bytes;
7248 if (STRINGP (src_object))
7250 coding->src_pos = from;
7251 coding->src_pos_byte = from_byte;
7253 else if (BUFFERP (src_object))
7255 set_buffer_internal (XBUFFER (src_object));
7256 if (from != GPT)
7257 move_gap_both (from, from_byte);
7258 if (EQ (src_object, dst_object))
7260 struct Lisp_Marker *tail;
7262 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7264 tail->need_adjustment
7265 = tail->charpos == (tail->insertion_type ? from : to);
7266 need_marker_adjustment |= tail->need_adjustment;
7268 saved_pt = PT, saved_pt_byte = PT_BYTE;
7269 TEMP_SET_PT_BOTH (from, from_byte);
7270 current_buffer->text->inhibit_shrinking = 1;
7271 del_range_both (from, from_byte, to, to_byte, 1);
7272 coding->src_pos = -chars;
7273 coding->src_pos_byte = -bytes;
7275 else
7277 coding->src_pos = from;
7278 coding->src_pos_byte = from_byte;
7282 if (CODING_REQUIRE_DETECTION (coding))
7283 detect_coding (coding);
7284 attrs = CODING_ID_ATTRS (coding->id);
7286 if (EQ (dst_object, Qt)
7287 || (! NILP (CODING_ATTR_POST_READ (attrs))
7288 && NILP (dst_object)))
7290 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7291 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7292 coding->dst_pos = BEG;
7293 coding->dst_pos_byte = BEG_BYTE;
7295 else if (BUFFERP (dst_object))
7297 code_conversion_save (0, 0);
7298 coding->dst_object = dst_object;
7299 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7300 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7301 coding->dst_multibyte
7302 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7304 else
7306 code_conversion_save (0, 0);
7307 coding->dst_object = Qnil;
7308 /* Most callers presume this will return a multibyte result, and they
7309 won't use `binary' or `raw-text' anyway, so let's not worry about
7310 CODING_FOR_UNIBYTE. */
7311 coding->dst_multibyte = 1;
7314 decode_coding (coding);
7316 if (BUFFERP (coding->dst_object))
7317 set_buffer_internal (XBUFFER (coding->dst_object));
7319 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7321 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7322 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7323 Lisp_Object val;
7325 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7326 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7327 old_deactivate_mark);
7328 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7329 make_number (coding->produced_char));
7330 UNGCPRO;
7331 CHECK_NATNUM (val);
7332 coding->produced_char += Z - prev_Z;
7333 coding->produced += Z_BYTE - prev_Z_BYTE;
7336 if (EQ (dst_object, Qt))
7338 coding->dst_object = Fbuffer_string ();
7340 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7342 set_buffer_internal (XBUFFER (coding->dst_object));
7343 if (dst_bytes < coding->produced)
7345 destination = xrealloc (destination, coding->produced);
7346 if (! destination)
7348 record_conversion_result (coding,
7349 CODING_RESULT_INSUFFICIENT_DST);
7350 unbind_to (count, Qnil);
7351 return;
7353 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7354 move_gap_both (BEGV, BEGV_BYTE);
7355 bcopy (BEGV_ADDR, destination, coding->produced);
7356 coding->destination = destination;
7360 if (saved_pt >= 0)
7362 /* This is the case of:
7363 (BUFFERP (src_object) && EQ (src_object, dst_object))
7364 As we have moved PT while replacing the original buffer
7365 contents, we must recover it now. */
7366 set_buffer_internal (XBUFFER (src_object));
7367 current_buffer->text->inhibit_shrinking = 0;
7368 if (saved_pt < from)
7369 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7370 else if (saved_pt < from + chars)
7371 TEMP_SET_PT_BOTH (from, from_byte);
7372 else if (! NILP (current_buffer->enable_multibyte_characters))
7373 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7374 saved_pt_byte + (coding->produced - bytes));
7375 else
7376 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7377 saved_pt_byte + (coding->produced - bytes));
7379 if (need_marker_adjustment)
7381 struct Lisp_Marker *tail;
7383 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7384 if (tail->need_adjustment)
7386 tail->need_adjustment = 0;
7387 if (tail->insertion_type)
7389 tail->bytepos = from_byte;
7390 tail->charpos = from;
7392 else
7394 tail->bytepos = from_byte + coding->produced;
7395 tail->charpos
7396 = (NILP (current_buffer->enable_multibyte_characters)
7397 ? tail->bytepos : from + coding->produced_char);
7403 Vdeactivate_mark = old_deactivate_mark;
7404 unbind_to (count, coding->dst_object);
7408 void
7409 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7410 dst_object)
7411 struct coding_system *coding;
7412 Lisp_Object src_object;
7413 EMACS_INT from, from_byte, to, to_byte;
7414 Lisp_Object dst_object;
7416 int count = specpdl_ptr - specpdl;
7417 EMACS_INT chars = to - from;
7418 EMACS_INT bytes = to_byte - from_byte;
7419 Lisp_Object attrs;
7420 int saved_pt = -1, saved_pt_byte;
7421 int need_marker_adjustment = 0;
7422 int kill_src_buffer = 0;
7423 Lisp_Object old_deactivate_mark;
7425 old_deactivate_mark = Vdeactivate_mark;
7427 coding->src_object = src_object;
7428 coding->src_chars = chars;
7429 coding->src_bytes = bytes;
7430 coding->src_multibyte = chars < bytes;
7432 attrs = CODING_ID_ATTRS (coding->id);
7434 if (EQ (src_object, dst_object))
7436 struct Lisp_Marker *tail;
7438 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7440 tail->need_adjustment
7441 = tail->charpos == (tail->insertion_type ? from : to);
7442 need_marker_adjustment |= tail->need_adjustment;
7446 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7448 coding->src_object = code_conversion_save (1, coding->src_multibyte);
7449 set_buffer_internal (XBUFFER (coding->src_object));
7450 if (STRINGP (src_object))
7451 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7452 else if (BUFFERP (src_object))
7453 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7454 else
7455 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7457 if (EQ (src_object, dst_object))
7459 set_buffer_internal (XBUFFER (src_object));
7460 saved_pt = PT, saved_pt_byte = PT_BYTE;
7461 del_range_both (from, from_byte, to, to_byte, 1);
7462 set_buffer_internal (XBUFFER (coding->src_object));
7466 Lisp_Object args[3];
7467 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7469 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7470 old_deactivate_mark);
7471 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7472 args[1] = make_number (BEG);
7473 args[2] = make_number (Z);
7474 safe_call (3, args);
7475 UNGCPRO;
7477 if (XBUFFER (coding->src_object) != current_buffer)
7478 kill_src_buffer = 1;
7479 coding->src_object = Fcurrent_buffer ();
7480 if (BEG != GPT)
7481 move_gap_both (BEG, BEG_BYTE);
7482 coding->src_chars = Z - BEG;
7483 coding->src_bytes = Z_BYTE - BEG_BYTE;
7484 coding->src_pos = BEG;
7485 coding->src_pos_byte = BEG_BYTE;
7486 coding->src_multibyte = Z < Z_BYTE;
7488 else if (STRINGP (src_object))
7490 code_conversion_save (0, 0);
7491 coding->src_pos = from;
7492 coding->src_pos_byte = from_byte;
7494 else if (BUFFERP (src_object))
7496 code_conversion_save (0, 0);
7497 set_buffer_internal (XBUFFER (src_object));
7498 if (EQ (src_object, dst_object))
7500 saved_pt = PT, saved_pt_byte = PT_BYTE;
7501 coding->src_object = del_range_1 (from, to, 1, 1);
7502 coding->src_pos = 0;
7503 coding->src_pos_byte = 0;
7505 else
7507 if (from < GPT && to >= GPT)
7508 move_gap_both (from, from_byte);
7509 coding->src_pos = from;
7510 coding->src_pos_byte = from_byte;
7513 else
7514 code_conversion_save (0, 0);
7516 if (BUFFERP (dst_object))
7518 coding->dst_object = dst_object;
7519 if (EQ (src_object, dst_object))
7521 coding->dst_pos = from;
7522 coding->dst_pos_byte = from_byte;
7524 else
7526 struct buffer *current = current_buffer;
7528 set_buffer_temp (XBUFFER (dst_object));
7529 coding->dst_pos = PT;
7530 coding->dst_pos_byte = PT_BYTE;
7531 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7532 set_buffer_temp (current);
7534 coding->dst_multibyte
7535 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7537 else if (EQ (dst_object, Qt))
7539 coding->dst_object = Qnil;
7540 coding->dst_bytes = coding->src_chars;
7541 if (coding->dst_bytes == 0)
7542 coding->dst_bytes = 1;
7543 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7544 coding->dst_multibyte = 0;
7546 else
7548 coding->dst_object = Qnil;
7549 coding->dst_multibyte = 0;
7552 encode_coding (coding);
7554 if (EQ (dst_object, Qt))
7556 if (BUFFERP (coding->dst_object))
7557 coding->dst_object = Fbuffer_string ();
7558 else
7560 coding->dst_object
7561 = make_unibyte_string ((char *) coding->destination,
7562 coding->produced);
7563 xfree (coding->destination);
7567 if (saved_pt >= 0)
7569 /* This is the case of:
7570 (BUFFERP (src_object) && EQ (src_object, dst_object))
7571 As we have moved PT while replacing the original buffer
7572 contents, we must recover it now. */
7573 set_buffer_internal (XBUFFER (src_object));
7574 if (saved_pt < from)
7575 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7576 else if (saved_pt < from + chars)
7577 TEMP_SET_PT_BOTH (from, from_byte);
7578 else if (! NILP (current_buffer->enable_multibyte_characters))
7579 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7580 saved_pt_byte + (coding->produced - bytes));
7581 else
7582 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7583 saved_pt_byte + (coding->produced - bytes));
7585 if (need_marker_adjustment)
7587 struct Lisp_Marker *tail;
7589 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7590 if (tail->need_adjustment)
7592 tail->need_adjustment = 0;
7593 if (tail->insertion_type)
7595 tail->bytepos = from_byte;
7596 tail->charpos = from;
7598 else
7600 tail->bytepos = from_byte + coding->produced;
7601 tail->charpos
7602 = (NILP (current_buffer->enable_multibyte_characters)
7603 ? tail->bytepos : from + coding->produced_char);
7609 if (kill_src_buffer)
7610 Fkill_buffer (coding->src_object);
7612 Vdeactivate_mark = old_deactivate_mark;
7613 unbind_to (count, Qnil);
7617 Lisp_Object
7618 preferred_coding_system ()
7620 int id = coding_categories[coding_priorities[0]].id;
7622 return CODING_ID_NAME (id);
7626 #ifdef emacs
7627 /*** 8. Emacs Lisp library functions ***/
7629 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7630 doc: /* Return t if OBJECT is nil or a coding-system.
7631 See the documentation of `define-coding-system' for information
7632 about coding-system objects. */)
7633 (object)
7634 Lisp_Object object;
7636 if (NILP (object)
7637 || CODING_SYSTEM_ID (object) >= 0)
7638 return Qt;
7639 if (! SYMBOLP (object)
7640 || NILP (Fget (object, Qcoding_system_define_form)))
7641 return Qnil;
7642 return Qt;
7645 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7646 Sread_non_nil_coding_system, 1, 1, 0,
7647 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
7648 (prompt)
7649 Lisp_Object prompt;
7651 Lisp_Object val;
7654 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7655 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7657 while (SCHARS (val) == 0);
7658 return (Fintern (val, Qnil));
7661 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7662 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7663 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
7664 Ignores case when completing coding systems (all Emacs coding systems
7665 are lower-case). */)
7666 (prompt, default_coding_system)
7667 Lisp_Object prompt, default_coding_system;
7669 Lisp_Object val;
7670 int count = SPECPDL_INDEX ();
7672 if (SYMBOLP (default_coding_system))
7673 default_coding_system = SYMBOL_NAME (default_coding_system);
7674 specbind (Qcompletion_ignore_case, Qt);
7675 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7676 Qt, Qnil, Qcoding_system_history,
7677 default_coding_system, Qnil);
7678 unbind_to (count, Qnil);
7679 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7682 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7683 1, 1, 0,
7684 doc: /* Check validity of CODING-SYSTEM.
7685 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7686 It is valid if it is nil or a symbol defined as a coding system by the
7687 function `define-coding-system'. */)
7688 (coding_system)
7689 Lisp_Object coding_system;
7691 Lisp_Object define_form;
7693 define_form = Fget (coding_system, Qcoding_system_define_form);
7694 if (! NILP (define_form))
7696 Fput (coding_system, Qcoding_system_define_form, Qnil);
7697 safe_eval (define_form);
7699 if (!NILP (Fcoding_system_p (coding_system)))
7700 return coding_system;
7701 xsignal1 (Qcoding_system_error, coding_system);
7705 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
7706 HIGHEST is nonzero, return the coding system of the highest
7707 priority among the detected coding systems. Otherwize return a
7708 list of detected coding systems sorted by their priorities. If
7709 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7710 multibyte form but contains only ASCII and eight-bit chars.
7711 Otherwise, the bytes are raw bytes.
7713 CODING-SYSTEM controls the detection as below:
7715 If it is nil, detect both text-format and eol-format. If the
7716 text-format part of CODING-SYSTEM is already specified
7717 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
7718 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7719 detect only text-format. */
7721 Lisp_Object
7722 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7723 coding_system)
7724 const unsigned char *src;
7725 EMACS_INT src_chars, src_bytes;
7726 int highest;
7727 int multibytep;
7728 Lisp_Object coding_system;
7730 const unsigned char *src_end = src + src_bytes;
7731 Lisp_Object attrs, eol_type;
7732 Lisp_Object val = Qnil;
7733 struct coding_system coding;
7734 int id;
7735 struct coding_detection_info detect_info;
7736 enum coding_category base_category;
7737 int null_byte_found = 0, eight_bit_found = 0;
7739 if (NILP (coding_system))
7740 coding_system = Qundecided;
7741 setup_coding_system (coding_system, &coding);
7742 attrs = CODING_ID_ATTRS (coding.id);
7743 eol_type = CODING_ID_EOL_TYPE (coding.id);
7744 coding_system = CODING_ATTR_BASE_NAME (attrs);
7746 coding.source = src;
7747 coding.src_chars = src_chars;
7748 coding.src_bytes = src_bytes;
7749 coding.src_multibyte = multibytep;
7750 coding.consumed = 0;
7751 coding.mode |= CODING_MODE_LAST_BLOCK;
7752 coding.head_ascii = 0;
7754 detect_info.checked = detect_info.found = detect_info.rejected = 0;
7756 /* At first, detect text-format if necessary. */
7757 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7758 if (base_category == coding_category_undecided)
7760 enum coding_category category;
7761 struct coding_system *this;
7762 int c, i;
7764 /* Skip all ASCII bytes except for a few ISO2022 controls. */
7765 for (; src < src_end; src++)
7767 c = *src;
7768 if (c & 0x80)
7770 eight_bit_found = 1;
7771 if (null_byte_found)
7772 break;
7774 else if (c < 0x20)
7776 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7777 && ! inhibit_iso_escape_detection
7778 && ! detect_info.checked)
7780 if (detect_coding_iso_2022 (&coding, &detect_info))
7782 /* We have scanned the whole data. */
7783 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7785 /* We didn't find an 8-bit code. We may
7786 have found a null-byte, but it's very
7787 rare that a binary file confirm to
7788 ISO-2022. */
7789 src = src_end;
7790 coding.head_ascii = src - coding.source;
7792 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
7793 break;
7796 else if (! c && !inhibit_null_byte_detection)
7798 null_byte_found = 1;
7799 if (eight_bit_found)
7800 break;
7802 if (! eight_bit_found)
7803 coding.head_ascii++;
7805 else if (! eight_bit_found)
7806 coding.head_ascii++;
7809 if (null_byte_found || eight_bit_found
7810 || coding.head_ascii < coding.src_bytes
7811 || detect_info.found)
7813 if (coding.head_ascii == coding.src_bytes)
7814 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
7815 for (i = 0; i < coding_category_raw_text; i++)
7817 category = coding_priorities[i];
7818 this = coding_categories + category;
7819 if (detect_info.found & (1 << category))
7820 break;
7822 else
7824 if (null_byte_found)
7826 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
7827 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
7829 for (i = 0; i < coding_category_raw_text; i++)
7831 category = coding_priorities[i];
7832 this = coding_categories + category;
7834 if (this->id < 0)
7836 /* No coding system of this category is defined. */
7837 detect_info.rejected |= (1 << category);
7839 else if (category >= coding_category_raw_text)
7840 continue;
7841 else if (detect_info.checked & (1 << category))
7843 if (highest
7844 && (detect_info.found & (1 << category)))
7845 break;
7847 else if ((*(this->detector)) (&coding, &detect_info)
7848 && highest
7849 && (detect_info.found & (1 << category)))
7851 if (category == coding_category_utf_16_auto)
7853 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7854 category = coding_category_utf_16_le;
7855 else
7856 category = coding_category_utf_16_be;
7858 break;
7864 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY)
7866 detect_info.found = CATEGORY_MASK_RAW_TEXT;
7867 id = coding_categories[coding_category_raw_text].id;
7868 val = Fcons (make_number (id), Qnil);
7870 else if (! detect_info.rejected && ! detect_info.found)
7872 detect_info.found = CATEGORY_MASK_ANY;
7873 id = coding_categories[coding_category_undecided].id;
7874 val = Fcons (make_number (id), Qnil);
7876 else if (highest)
7878 if (detect_info.found)
7880 detect_info.found = 1 << category;
7881 val = Fcons (make_number (this->id), Qnil);
7883 else
7884 for (i = 0; i < coding_category_raw_text; i++)
7885 if (! (detect_info.rejected & (1 << coding_priorities[i])))
7887 detect_info.found = 1 << coding_priorities[i];
7888 id = coding_categories[coding_priorities[i]].id;
7889 val = Fcons (make_number (id), Qnil);
7890 break;
7893 else
7895 int mask = detect_info.rejected | detect_info.found;
7896 int found = 0;
7898 for (i = coding_category_raw_text - 1; i >= 0; i--)
7900 category = coding_priorities[i];
7901 if (! (mask & (1 << category)))
7903 found |= 1 << category;
7904 id = coding_categories[category].id;
7905 if (id >= 0)
7906 val = Fcons (make_number (id), val);
7909 for (i = coding_category_raw_text - 1; i >= 0; i--)
7911 category = coding_priorities[i];
7912 if (detect_info.found & (1 << category))
7914 id = coding_categories[category].id;
7915 val = Fcons (make_number (id), val);
7918 detect_info.found |= found;
7921 else if (base_category == coding_category_utf_8_auto)
7923 if (detect_coding_utf_8 (&coding, &detect_info))
7925 struct coding_system *this;
7927 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
7928 this = coding_categories + coding_category_utf_8_sig;
7929 else
7930 this = coding_categories + coding_category_utf_8_nosig;
7931 val = Fcons (make_number (this->id), Qnil);
7934 else if (base_category == coding_category_utf_16_auto)
7936 if (detect_coding_utf_16 (&coding, &detect_info))
7938 struct coding_system *this;
7940 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7941 this = coding_categories + coding_category_utf_16_le;
7942 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7943 this = coding_categories + coding_category_utf_16_be;
7944 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7945 this = coding_categories + coding_category_utf_16_be_nosig;
7946 else
7947 this = coding_categories + coding_category_utf_16_le_nosig;
7948 val = Fcons (make_number (this->id), Qnil);
7951 else
7953 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7954 val = Fcons (make_number (coding.id), Qnil);
7957 /* Then, detect eol-format if necessary. */
7959 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
7960 Lisp_Object tail;
7962 if (VECTORP (eol_type))
7964 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7966 if (null_byte_found)
7967 normal_eol = EOL_SEEN_LF;
7968 else
7969 normal_eol = detect_eol (coding.source, src_bytes,
7970 coding_category_raw_text);
7972 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7973 | CATEGORY_MASK_UTF_16_BE_NOSIG))
7974 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7975 coding_category_utf_16_be);
7976 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7977 | CATEGORY_MASK_UTF_16_LE_NOSIG))
7978 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7979 coding_category_utf_16_le);
7981 else
7983 if (EQ (eol_type, Qunix))
7984 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7985 else if (EQ (eol_type, Qdos))
7986 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7987 else
7988 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7991 for (tail = val; CONSP (tail); tail = XCDR (tail))
7993 enum coding_category category;
7994 int this_eol;
7996 id = XINT (XCAR (tail));
7997 attrs = CODING_ID_ATTRS (id);
7998 category = XINT (CODING_ATTR_CATEGORY (attrs));
7999 eol_type = CODING_ID_EOL_TYPE (id);
8000 if (VECTORP (eol_type))
8002 if (category == coding_category_utf_16_be
8003 || category == coding_category_utf_16_be_nosig)
8004 this_eol = utf_16_be_eol;
8005 else if (category == coding_category_utf_16_le
8006 || category == coding_category_utf_16_le_nosig)
8007 this_eol = utf_16_le_eol;
8008 else
8009 this_eol = normal_eol;
8011 if (this_eol == EOL_SEEN_LF)
8012 XSETCAR (tail, AREF (eol_type, 0));
8013 else if (this_eol == EOL_SEEN_CRLF)
8014 XSETCAR (tail, AREF (eol_type, 1));
8015 else if (this_eol == EOL_SEEN_CR)
8016 XSETCAR (tail, AREF (eol_type, 2));
8017 else
8018 XSETCAR (tail, CODING_ID_NAME (id));
8020 else
8021 XSETCAR (tail, CODING_ID_NAME (id));
8025 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8029 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8030 2, 3, 0,
8031 doc: /* Detect coding system of the text in the region between START and END.
8032 Return a list of possible coding systems ordered by priority.
8034 If only ASCII characters are found (except for such ISO-2022 control
8035 characters as ESC), it returns a list of single element `undecided'
8036 or its subsidiary coding system according to a detected end-of-line
8037 format.
8039 If optional argument HIGHEST is non-nil, return the coding system of
8040 highest priority. */)
8041 (start, end, highest)
8042 Lisp_Object start, end, highest;
8044 int from, to;
8045 int from_byte, to_byte;
8047 CHECK_NUMBER_COERCE_MARKER (start);
8048 CHECK_NUMBER_COERCE_MARKER (end);
8050 validate_region (&start, &end);
8051 from = XINT (start), to = XINT (end);
8052 from_byte = CHAR_TO_BYTE (from);
8053 to_byte = CHAR_TO_BYTE (to);
8055 if (from < GPT && to >= GPT)
8056 move_gap_both (to, to_byte);
8058 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8059 to - from, to_byte - from_byte,
8060 !NILP (highest),
8061 !NILP (current_buffer
8062 ->enable_multibyte_characters),
8063 Qnil);
8066 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8067 1, 2, 0,
8068 doc: /* Detect coding system of the text in STRING.
8069 Return a list of possible coding systems ordered by priority.
8071 If only ASCII characters are found (except for such ISO-2022 control
8072 characters as ESC), it returns a list of single element `undecided'
8073 or its subsidiary coding system according to a detected end-of-line
8074 format.
8076 If optional argument HIGHEST is non-nil, return the coding system of
8077 highest priority. */)
8078 (string, highest)
8079 Lisp_Object string, highest;
8081 CHECK_STRING (string);
8083 return detect_coding_system (SDATA (string),
8084 SCHARS (string), SBYTES (string),
8085 !NILP (highest), STRING_MULTIBYTE (string),
8086 Qnil);
8090 static INLINE int
8091 char_encodable_p (c, attrs)
8092 int c;
8093 Lisp_Object attrs;
8095 Lisp_Object tail;
8096 struct charset *charset;
8097 Lisp_Object translation_table;
8099 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8100 if (! NILP (translation_table))
8101 c = translate_char (translation_table, c);
8102 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8103 CONSP (tail); tail = XCDR (tail))
8105 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8106 if (CHAR_CHARSET_P (c, charset))
8107 break;
8109 return (! NILP (tail));
8113 /* Return a list of coding systems that safely encode the text between
8114 START and END. If EXCLUDE is non-nil, it is a list of coding
8115 systems not to check. The returned list doesn't contain any such
8116 coding systems. In any case, if the text contains only ASCII or is
8117 unibyte, return t. */
8119 DEFUN ("find-coding-systems-region-internal",
8120 Ffind_coding_systems_region_internal,
8121 Sfind_coding_systems_region_internal, 2, 3, 0,
8122 doc: /* Internal use only. */)
8123 (start, end, exclude)
8124 Lisp_Object start, end, exclude;
8126 Lisp_Object coding_attrs_list, safe_codings;
8127 EMACS_INT start_byte, end_byte;
8128 const unsigned char *p, *pbeg, *pend;
8129 int c;
8130 Lisp_Object tail, elt;
8132 if (STRINGP (start))
8134 if (!STRING_MULTIBYTE (start)
8135 || SCHARS (start) == SBYTES (start))
8136 return Qt;
8137 start_byte = 0;
8138 end_byte = SBYTES (start);
8140 else
8142 CHECK_NUMBER_COERCE_MARKER (start);
8143 CHECK_NUMBER_COERCE_MARKER (end);
8144 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8145 args_out_of_range (start, end);
8146 if (NILP (current_buffer->enable_multibyte_characters))
8147 return Qt;
8148 start_byte = CHAR_TO_BYTE (XINT (start));
8149 end_byte = CHAR_TO_BYTE (XINT (end));
8150 if (XINT (end) - XINT (start) == end_byte - start_byte)
8151 return Qt;
8153 if (XINT (start) < GPT && XINT (end) > GPT)
8155 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8156 move_gap_both (XINT (start), start_byte);
8157 else
8158 move_gap_both (XINT (end), end_byte);
8162 coding_attrs_list = Qnil;
8163 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8164 if (NILP (exclude)
8165 || NILP (Fmemq (XCAR (tail), exclude)))
8167 Lisp_Object attrs;
8169 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8170 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8171 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8173 ASET (attrs, coding_attr_trans_tbl,
8174 get_translation_table (attrs, 1, NULL));
8175 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8179 if (STRINGP (start))
8180 p = pbeg = SDATA (start);
8181 else
8182 p = pbeg = BYTE_POS_ADDR (start_byte);
8183 pend = p + (end_byte - start_byte);
8185 while (p < pend && ASCII_BYTE_P (*p)) p++;
8186 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8188 while (p < pend)
8190 if (ASCII_BYTE_P (*p))
8191 p++;
8192 else
8194 c = STRING_CHAR_ADVANCE (p);
8196 charset_map_loaded = 0;
8197 for (tail = coding_attrs_list; CONSP (tail);)
8199 elt = XCAR (tail);
8200 if (NILP (elt))
8201 tail = XCDR (tail);
8202 else if (char_encodable_p (c, elt))
8203 tail = XCDR (tail);
8204 else if (CONSP (XCDR (tail)))
8206 XSETCAR (tail, XCAR (XCDR (tail)));
8207 XSETCDR (tail, XCDR (XCDR (tail)));
8209 else
8211 XSETCAR (tail, Qnil);
8212 tail = XCDR (tail);
8215 if (charset_map_loaded)
8217 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8219 if (STRINGP (start))
8220 pbeg = SDATA (start);
8221 else
8222 pbeg = BYTE_POS_ADDR (start_byte);
8223 p = pbeg + p_offset;
8224 pend = pbeg + pend_offset;
8229 safe_codings = list2 (Qraw_text, Qno_conversion);
8230 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8231 if (! NILP (XCAR (tail)))
8232 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8234 return safe_codings;
8238 DEFUN ("unencodable-char-position", Funencodable_char_position,
8239 Sunencodable_char_position, 3, 5, 0,
8240 doc: /*
8241 Return position of first un-encodable character in a region.
8242 START and END specify the region and CODING-SYSTEM specifies the
8243 encoding to check. Return nil if CODING-SYSTEM does encode the region.
8245 If optional 4th argument COUNT is non-nil, it specifies at most how
8246 many un-encodable characters to search. In this case, the value is a
8247 list of positions.
8249 If optional 5th argument STRING is non-nil, it is a string to search
8250 for un-encodable characters. In that case, START and END are indexes
8251 to the string. */)
8252 (start, end, coding_system, count, string)
8253 Lisp_Object start, end, coding_system, count, string;
8255 int n;
8256 struct coding_system coding;
8257 Lisp_Object attrs, charset_list, translation_table;
8258 Lisp_Object positions;
8259 int from, to;
8260 const unsigned char *p, *stop, *pend;
8261 int ascii_compatible;
8263 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8264 attrs = CODING_ID_ATTRS (coding.id);
8265 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8266 return Qnil;
8267 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8268 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8269 translation_table = get_translation_table (attrs, 1, NULL);
8271 if (NILP (string))
8273 validate_region (&start, &end);
8274 from = XINT (start);
8275 to = XINT (end);
8276 if (NILP (current_buffer->enable_multibyte_characters)
8277 || (ascii_compatible
8278 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8279 return Qnil;
8280 p = CHAR_POS_ADDR (from);
8281 pend = CHAR_POS_ADDR (to);
8282 if (from < GPT && to >= GPT)
8283 stop = GPT_ADDR;
8284 else
8285 stop = pend;
8287 else
8289 CHECK_STRING (string);
8290 CHECK_NATNUM (start);
8291 CHECK_NATNUM (end);
8292 from = XINT (start);
8293 to = XINT (end);
8294 if (from > to
8295 || to > SCHARS (string))
8296 args_out_of_range_3 (string, start, end);
8297 if (! STRING_MULTIBYTE (string))
8298 return Qnil;
8299 p = SDATA (string) + string_char_to_byte (string, from);
8300 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8301 if (ascii_compatible && (to - from) == (pend - p))
8302 return Qnil;
8305 if (NILP (count))
8306 n = 1;
8307 else
8309 CHECK_NATNUM (count);
8310 n = XINT (count);
8313 positions = Qnil;
8314 while (1)
8316 int c;
8318 if (ascii_compatible)
8319 while (p < stop && ASCII_BYTE_P (*p))
8320 p++, from++;
8321 if (p >= stop)
8323 if (p >= pend)
8324 break;
8325 stop = pend;
8326 p = GAP_END_ADDR;
8329 c = STRING_CHAR_ADVANCE (p);
8330 if (! (ASCII_CHAR_P (c) && ascii_compatible)
8331 && ! char_charset (translate_char (translation_table, c),
8332 charset_list, NULL))
8334 positions = Fcons (make_number (from), positions);
8335 n--;
8336 if (n == 0)
8337 break;
8340 from++;
8343 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8347 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8348 Scheck_coding_systems_region, 3, 3, 0,
8349 doc: /* Check if the region is encodable by coding systems.
8351 START and END are buffer positions specifying the region.
8352 CODING-SYSTEM-LIST is a list of coding systems to check.
8354 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8355 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8356 whole region, POS0, POS1, ... are buffer positions where non-encodable
8357 characters are found.
8359 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8360 value is nil.
8362 START may be a string. In that case, check if the string is
8363 encodable, and the value contains indices to the string instead of
8364 buffer positions. END is ignored. */)
8365 (start, end, coding_system_list)
8366 Lisp_Object start, end, coding_system_list;
8368 Lisp_Object list;
8369 EMACS_INT start_byte, end_byte;
8370 int pos;
8371 const unsigned char *p, *pbeg, *pend;
8372 int c;
8373 Lisp_Object tail, elt, attrs;
8375 if (STRINGP (start))
8377 if (!STRING_MULTIBYTE (start)
8378 && SCHARS (start) != SBYTES (start))
8379 return Qnil;
8380 start_byte = 0;
8381 end_byte = SBYTES (start);
8382 pos = 0;
8384 else
8386 CHECK_NUMBER_COERCE_MARKER (start);
8387 CHECK_NUMBER_COERCE_MARKER (end);
8388 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8389 args_out_of_range (start, end);
8390 if (NILP (current_buffer->enable_multibyte_characters))
8391 return Qnil;
8392 start_byte = CHAR_TO_BYTE (XINT (start));
8393 end_byte = CHAR_TO_BYTE (XINT (end));
8394 if (XINT (end) - XINT (start) == end_byte - start_byte)
8395 return Qt;
8397 if (XINT (start) < GPT && XINT (end) > GPT)
8399 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8400 move_gap_both (XINT (start), start_byte);
8401 else
8402 move_gap_both (XINT (end), end_byte);
8404 pos = XINT (start);
8407 list = Qnil;
8408 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8410 elt = XCAR (tail);
8411 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8412 ASET (attrs, coding_attr_trans_tbl,
8413 get_translation_table (attrs, 1, NULL));
8414 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8417 if (STRINGP (start))
8418 p = pbeg = SDATA (start);
8419 else
8420 p = pbeg = BYTE_POS_ADDR (start_byte);
8421 pend = p + (end_byte - start_byte);
8423 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8424 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8426 while (p < pend)
8428 if (ASCII_BYTE_P (*p))
8429 p++;
8430 else
8432 c = STRING_CHAR_ADVANCE (p);
8434 charset_map_loaded = 0;
8435 for (tail = list; CONSP (tail); tail = XCDR (tail))
8437 elt = XCDR (XCAR (tail));
8438 if (! char_encodable_p (c, XCAR (elt)))
8439 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8441 if (charset_map_loaded)
8443 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8445 if (STRINGP (start))
8446 pbeg = SDATA (start);
8447 else
8448 pbeg = BYTE_POS_ADDR (start_byte);
8449 p = pbeg + p_offset;
8450 pend = pbeg + pend_offset;
8453 pos++;
8456 tail = list;
8457 list = Qnil;
8458 for (; CONSP (tail); tail = XCDR (tail))
8460 elt = XCAR (tail);
8461 if (CONSP (XCDR (XCDR (elt))))
8462 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8463 list);
8466 return list;
8470 Lisp_Object
8471 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
8472 Lisp_Object start, end, coding_system, dst_object;
8473 int encodep, norecord;
8475 struct coding_system coding;
8476 EMACS_INT from, from_byte, to, to_byte;
8477 Lisp_Object src_object;
8479 CHECK_NUMBER_COERCE_MARKER (start);
8480 CHECK_NUMBER_COERCE_MARKER (end);
8481 if (NILP (coding_system))
8482 coding_system = Qno_conversion;
8483 else
8484 CHECK_CODING_SYSTEM (coding_system);
8485 src_object = Fcurrent_buffer ();
8486 if (NILP (dst_object))
8487 dst_object = src_object;
8488 else if (! EQ (dst_object, Qt))
8489 CHECK_BUFFER (dst_object);
8491 validate_region (&start, &end);
8492 from = XFASTINT (start);
8493 from_byte = CHAR_TO_BYTE (from);
8494 to = XFASTINT (end);
8495 to_byte = CHAR_TO_BYTE (to);
8497 setup_coding_system (coding_system, &coding);
8498 coding.mode |= CODING_MODE_LAST_BLOCK;
8500 if (encodep)
8501 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8502 dst_object);
8503 else
8504 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8505 dst_object);
8506 if (! norecord)
8507 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8509 return (BUFFERP (dst_object)
8510 ? make_number (coding.produced_char)
8511 : coding.dst_object);
8515 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8516 3, 4, "r\nzCoding system: ",
8517 doc: /* Decode the current region from the specified coding system.
8518 When called from a program, takes four arguments:
8519 START, END, CODING-SYSTEM, and DESTINATION.
8520 START and END are buffer positions.
8522 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8523 If nil, the region between START and END is replaced by the decoded text.
8524 If buffer, the decoded text is inserted in that buffer after point (point
8525 does not move).
8526 In those cases, the length of the decoded text is returned.
8527 If DESTINATION is t, the decoded text is returned.
8529 This function sets `last-coding-system-used' to the precise coding system
8530 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8531 not fully specified.) */)
8532 (start, end, coding_system, destination)
8533 Lisp_Object start, end, coding_system, destination;
8535 return code_convert_region (start, end, coding_system, destination, 0, 0);
8538 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8539 3, 4, "r\nzCoding system: ",
8540 doc: /* Encode the current region by specified coding system.
8541 When called from a program, takes four arguments:
8542 START, END, CODING-SYSTEM and DESTINATION.
8543 START and END are buffer positions.
8545 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8546 If nil, the region between START and END is replace by the encoded text.
8547 If buffer, the encoded text is inserted in that buffer after point (point
8548 does not move).
8549 In those cases, the length of the encoded text is returned.
8550 If DESTINATION is t, the encoded text is returned.
8552 This function sets `last-coding-system-used' to the precise coding system
8553 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8554 not fully specified.) */)
8555 (start, end, coding_system, destination)
8556 Lisp_Object start, end, coding_system, destination;
8558 return code_convert_region (start, end, coding_system, destination, 1, 0);
8561 Lisp_Object
8562 code_convert_string (string, coding_system, dst_object,
8563 encodep, nocopy, norecord)
8564 Lisp_Object string, coding_system, dst_object;
8565 int encodep, nocopy, norecord;
8567 struct coding_system coding;
8568 EMACS_INT chars, bytes;
8570 CHECK_STRING (string);
8571 if (NILP (coding_system))
8573 if (! norecord)
8574 Vlast_coding_system_used = Qno_conversion;
8575 if (NILP (dst_object))
8576 return (nocopy ? Fcopy_sequence (string) : string);
8579 if (NILP (coding_system))
8580 coding_system = Qno_conversion;
8581 else
8582 CHECK_CODING_SYSTEM (coding_system);
8583 if (NILP (dst_object))
8584 dst_object = Qt;
8585 else if (! EQ (dst_object, Qt))
8586 CHECK_BUFFER (dst_object);
8588 setup_coding_system (coding_system, &coding);
8589 coding.mode |= CODING_MODE_LAST_BLOCK;
8590 chars = SCHARS (string);
8591 bytes = SBYTES (string);
8592 if (encodep)
8593 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8594 else
8595 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8596 if (! norecord)
8597 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8599 return (BUFFERP (dst_object)
8600 ? make_number (coding.produced_char)
8601 : coding.dst_object);
8605 /* Encode or decode STRING according to CODING_SYSTEM.
8606 Do not set Vlast_coding_system_used.
8608 This function is called only from macros DECODE_FILE and
8609 ENCODE_FILE, thus we ignore character composition. */
8611 Lisp_Object
8612 code_convert_string_norecord (string, coding_system, encodep)
8613 Lisp_Object string, coding_system;
8614 int encodep;
8616 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8620 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8621 2, 4, 0,
8622 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8624 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8625 if the decoding operation is trivial.
8627 Optional fourth arg BUFFER non-nil means that the decoded text is
8628 inserted in that buffer after point (point does not move). In this
8629 case, the return value is the length of the decoded text.
8631 This function sets `last-coding-system-used' to the precise coding system
8632 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8633 not fully specified.) */)
8634 (string, coding_system, nocopy, buffer)
8635 Lisp_Object string, coding_system, nocopy, buffer;
8637 return code_convert_string (string, coding_system, buffer,
8638 0, ! NILP (nocopy), 0);
8641 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8642 2, 4, 0,
8643 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8645 Optional third arg NOCOPY non-nil means it is OK to return STRING
8646 itself if the encoding operation is trivial.
8648 Optional fourth arg BUFFER non-nil means that the encoded text is
8649 inserted in that buffer after point (point does not move). In this
8650 case, the return value is the length of the encoded text.
8652 This function sets `last-coding-system-used' to the precise coding system
8653 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8654 not fully specified.) */)
8655 (string, coding_system, nocopy, buffer)
8656 Lisp_Object string, coding_system, nocopy, buffer;
8658 return code_convert_string (string, coding_system, buffer,
8659 1, ! NILP (nocopy), 1);
8663 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8664 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8665 Return the corresponding character. */)
8666 (code)
8667 Lisp_Object code;
8669 Lisp_Object spec, attrs, val;
8670 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8671 int c;
8673 CHECK_NATNUM (code);
8674 c = XFASTINT (code);
8675 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8676 attrs = AREF (spec, 0);
8678 if (ASCII_BYTE_P (c)
8679 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8680 return code;
8682 val = CODING_ATTR_CHARSET_LIST (attrs);
8683 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8684 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8685 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8687 if (c <= 0x7F)
8688 charset = charset_roman;
8689 else if (c >= 0xA0 && c < 0xDF)
8691 charset = charset_kana;
8692 c -= 0x80;
8694 else
8696 int s1 = c >> 8, s2 = c & 0xFF;
8698 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8699 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8700 error ("Invalid code: %d", code);
8701 SJIS_TO_JIS (c);
8702 charset = charset_kanji;
8704 c = DECODE_CHAR (charset, c);
8705 if (c < 0)
8706 error ("Invalid code: %d", code);
8707 return make_number (c);
8711 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8712 doc: /* Encode a Japanese character CH to shift_jis encoding.
8713 Return the corresponding code in SJIS. */)
8714 (ch)
8715 Lisp_Object ch;
8717 Lisp_Object spec, attrs, charset_list;
8718 int c;
8719 struct charset *charset;
8720 unsigned code;
8722 CHECK_CHARACTER (ch);
8723 c = XFASTINT (ch);
8724 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8725 attrs = AREF (spec, 0);
8727 if (ASCII_CHAR_P (c)
8728 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8729 return ch;
8731 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8732 charset = char_charset (c, charset_list, &code);
8733 if (code == CHARSET_INVALID_CODE (charset))
8734 error ("Can't encode by shift_jis encoding: %d", c);
8735 JIS_TO_SJIS (code);
8737 return make_number (code);
8740 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8741 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8742 Return the corresponding character. */)
8743 (code)
8744 Lisp_Object code;
8746 Lisp_Object spec, attrs, val;
8747 struct charset *charset_roman, *charset_big5, *charset;
8748 int c;
8750 CHECK_NATNUM (code);
8751 c = XFASTINT (code);
8752 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8753 attrs = AREF (spec, 0);
8755 if (ASCII_BYTE_P (c)
8756 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8757 return code;
8759 val = CODING_ATTR_CHARSET_LIST (attrs);
8760 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8761 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8763 if (c <= 0x7F)
8764 charset = charset_roman;
8765 else
8767 int b1 = c >> 8, b2 = c & 0x7F;
8768 if (b1 < 0xA1 || b1 > 0xFE
8769 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8770 error ("Invalid code: %d", code);
8771 charset = charset_big5;
8773 c = DECODE_CHAR (charset, (unsigned )c);
8774 if (c < 0)
8775 error ("Invalid code: %d", code);
8776 return make_number (c);
8779 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8780 doc: /* Encode the Big5 character CH to BIG5 coding system.
8781 Return the corresponding character code in Big5. */)
8782 (ch)
8783 Lisp_Object ch;
8785 Lisp_Object spec, attrs, charset_list;
8786 struct charset *charset;
8787 int c;
8788 unsigned code;
8790 CHECK_CHARACTER (ch);
8791 c = XFASTINT (ch);
8792 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8793 attrs = AREF (spec, 0);
8794 if (ASCII_CHAR_P (c)
8795 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8796 return ch;
8798 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8799 charset = char_charset (c, charset_list, &code);
8800 if (code == CHARSET_INVALID_CODE (charset))
8801 error ("Can't encode by Big5 encoding: %d", c);
8803 return make_number (code);
8807 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
8808 Sset_terminal_coding_system_internal, 1, 2, 0,
8809 doc: /* Internal use only. */)
8810 (coding_system, terminal)
8811 Lisp_Object coding_system;
8812 Lisp_Object terminal;
8814 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8815 CHECK_SYMBOL (coding_system);
8816 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
8817 /* We had better not send unsafe characters to terminal. */
8818 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
8819 /* Characer composition should be disabled. */
8820 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8821 terminal_coding->src_multibyte = 1;
8822 terminal_coding->dst_multibyte = 0;
8823 return Qnil;
8826 DEFUN ("set-safe-terminal-coding-system-internal",
8827 Fset_safe_terminal_coding_system_internal,
8828 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8829 doc: /* Internal use only. */)
8830 (coding_system)
8831 Lisp_Object coding_system;
8833 CHECK_SYMBOL (coding_system);
8834 setup_coding_system (Fcheck_coding_system (coding_system),
8835 &safe_terminal_coding);
8836 /* Characer composition should be disabled. */
8837 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8838 safe_terminal_coding.src_multibyte = 1;
8839 safe_terminal_coding.dst_multibyte = 0;
8840 return Qnil;
8843 DEFUN ("terminal-coding-system", Fterminal_coding_system,
8844 Sterminal_coding_system, 0, 1, 0,
8845 doc: /* Return coding system specified for terminal output on the given terminal.
8846 TERMINAL may be a terminal id, a frame, or nil for the selected
8847 frame's terminal device. */)
8848 (terminal)
8849 Lisp_Object terminal;
8851 struct coding_system *terminal_coding
8852 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8853 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
8855 /* For backward compatibility, return nil if it is `undecided'. */
8856 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
8859 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
8860 Sset_keyboard_coding_system_internal, 1, 2, 0,
8861 doc: /* Internal use only. */)
8862 (coding_system, terminal)
8863 Lisp_Object coding_system;
8864 Lisp_Object terminal;
8866 struct terminal *t = get_terminal (terminal, 1);
8867 CHECK_SYMBOL (coding_system);
8868 setup_coding_system (Fcheck_coding_system (coding_system),
8869 TERMINAL_KEYBOARD_CODING (t));
8870 /* Characer composition should be disabled. */
8871 TERMINAL_KEYBOARD_CODING (t)->common_flags
8872 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8873 return Qnil;
8876 DEFUN ("keyboard-coding-system",
8877 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
8878 doc: /* Return coding system specified for decoding keyboard input. */)
8879 (terminal)
8880 Lisp_Object terminal;
8882 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
8883 (get_terminal (terminal, 1))->id);
8887 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8888 Sfind_operation_coding_system, 1, MANY, 0,
8889 doc: /* Choose a coding system for an operation based on the target name.
8890 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8891 DECODING-SYSTEM is the coding system to use for decoding
8892 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8893 for encoding (in case OPERATION does encoding).
8895 The first argument OPERATION specifies an I/O primitive:
8896 For file I/O, `insert-file-contents' or `write-region'.
8897 For process I/O, `call-process', `call-process-region', or `start-process'.
8898 For network I/O, `open-network-stream'.
8900 The remaining arguments should be the same arguments that were passed
8901 to the primitive. Depending on which primitive, one of those arguments
8902 is selected as the TARGET. For example, if OPERATION does file I/O,
8903 whichever argument specifies the file name is TARGET.
8905 TARGET has a meaning which depends on OPERATION:
8906 For file I/O, TARGET is a file name (except for the special case below).
8907 For process I/O, TARGET is a process name.
8908 For network I/O, TARGET is a service name or a port number.
8910 This function looks up what is specified for TARGET in
8911 `file-coding-system-alist', `process-coding-system-alist',
8912 or `network-coding-system-alist' depending on OPERATION.
8913 They may specify a coding system, a cons of coding systems,
8914 or a function symbol to call.
8915 In the last case, we call the function with one argument,
8916 which is a list of all the arguments given to this function.
8917 If the function can't decide a coding system, it can return
8918 `undecided' so that the normal code-detection is performed.
8920 If OPERATION is `insert-file-contents', the argument corresponding to
8921 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
8922 file name to look up, and BUFFER is a buffer that contains the file's
8923 contents (not yet decoded). If `file-coding-system-alist' specifies a
8924 function to call for FILENAME, that function should examine the
8925 contents of BUFFER instead of reading the file.
8927 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
8928 (nargs, args)
8929 int nargs;
8930 Lisp_Object *args;
8932 Lisp_Object operation, target_idx, target, val;
8933 register Lisp_Object chain;
8935 if (nargs < 2)
8936 error ("Too few arguments");
8937 operation = args[0];
8938 if (!SYMBOLP (operation)
8939 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8940 error ("Invalid first argument");
8941 if (nargs < 1 + XINT (target_idx))
8942 error ("Too few arguments for operation: %s",
8943 SDATA (SYMBOL_NAME (operation)));
8944 target = args[XINT (target_idx) + 1];
8945 if (!(STRINGP (target)
8946 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
8947 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
8948 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8949 error ("Invalid %dth argument", XINT (target_idx) + 1);
8950 if (CONSP (target))
8951 target = XCAR (target);
8953 chain = ((EQ (operation, Qinsert_file_contents)
8954 || EQ (operation, Qwrite_region))
8955 ? Vfile_coding_system_alist
8956 : (EQ (operation, Qopen_network_stream)
8957 ? Vnetwork_coding_system_alist
8958 : Vprocess_coding_system_alist));
8959 if (NILP (chain))
8960 return Qnil;
8962 for (; CONSP (chain); chain = XCDR (chain))
8964 Lisp_Object elt;
8966 elt = XCAR (chain);
8967 if (CONSP (elt)
8968 && ((STRINGP (target)
8969 && STRINGP (XCAR (elt))
8970 && fast_string_match (XCAR (elt), target) >= 0)
8971 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8973 val = XCDR (elt);
8974 /* Here, if VAL is both a valid coding system and a valid
8975 function symbol, we return VAL as a coding system. */
8976 if (CONSP (val))
8977 return val;
8978 if (! SYMBOLP (val))
8979 return Qnil;
8980 if (! NILP (Fcoding_system_p (val)))
8981 return Fcons (val, val);
8982 if (! NILP (Ffboundp (val)))
8984 /* We use call1 rather than safe_call1
8985 so as to get bug reports about functions called here
8986 which don't handle the current interface. */
8987 val = call1 (val, Flist (nargs, args));
8988 if (CONSP (val))
8989 return val;
8990 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8991 return Fcons (val, val);
8993 return Qnil;
8996 return Qnil;
8999 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9000 Sset_coding_system_priority, 0, MANY, 0,
9001 doc: /* Assign higher priority to the coding systems given as arguments.
9002 If multiple coding systems belong to the same category,
9003 all but the first one are ignored.
9005 usage: (set-coding-system-priority &rest coding-systems) */)
9006 (nargs, args)
9007 int nargs;
9008 Lisp_Object *args;
9010 int i, j;
9011 int changed[coding_category_max];
9012 enum coding_category priorities[coding_category_max];
9014 bzero (changed, sizeof changed);
9016 for (i = j = 0; i < nargs; i++)
9018 enum coding_category category;
9019 Lisp_Object spec, attrs;
9021 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9022 attrs = AREF (spec, 0);
9023 category = XINT (CODING_ATTR_CATEGORY (attrs));
9024 if (changed[category])
9025 /* Ignore this coding system because a coding system of the
9026 same category already had a higher priority. */
9027 continue;
9028 changed[category] = 1;
9029 priorities[j++] = category;
9030 if (coding_categories[category].id >= 0
9031 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9032 setup_coding_system (args[i], &coding_categories[category]);
9033 Fset (AREF (Vcoding_category_table, category), args[i]);
9036 /* Now we have decided top J priorities. Reflect the order of the
9037 original priorities to the remaining priorities. */
9039 for (i = j, j = 0; i < coding_category_max; i++, j++)
9041 while (j < coding_category_max
9042 && changed[coding_priorities[j]])
9043 j++;
9044 if (j == coding_category_max)
9045 abort ();
9046 priorities[i] = coding_priorities[j];
9049 bcopy (priorities, coding_priorities, sizeof priorities);
9051 /* Update `coding-category-list'. */
9052 Vcoding_category_list = Qnil;
9053 for (i = coding_category_max - 1; i >= 0; i--)
9054 Vcoding_category_list
9055 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9056 Vcoding_category_list);
9058 return Qnil;
9061 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9062 Scoding_system_priority_list, 0, 1, 0,
9063 doc: /* Return a list of coding systems ordered by their priorities.
9064 HIGHESTP non-nil means just return the highest priority one. */)
9065 (highestp)
9066 Lisp_Object highestp;
9068 int i;
9069 Lisp_Object val;
9071 for (i = 0, val = Qnil; i < coding_category_max; i++)
9073 enum coding_category category = coding_priorities[i];
9074 int id = coding_categories[category].id;
9075 Lisp_Object attrs;
9077 if (id < 0)
9078 continue;
9079 attrs = CODING_ID_ATTRS (id);
9080 if (! NILP (highestp))
9081 return CODING_ATTR_BASE_NAME (attrs);
9082 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9084 return Fnreverse (val);
9087 static char *suffixes[] = { "-unix", "-dos", "-mac" };
9089 static Lisp_Object
9090 make_subsidiaries (base)
9091 Lisp_Object base;
9093 Lisp_Object subsidiaries;
9094 int base_name_len = SBYTES (SYMBOL_NAME (base));
9095 char *buf = (char *) alloca (base_name_len + 6);
9096 int i;
9098 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
9099 subsidiaries = Fmake_vector (make_number (3), Qnil);
9100 for (i = 0; i < 3; i++)
9102 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9103 ASET (subsidiaries, i, intern (buf));
9105 return subsidiaries;
9109 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9110 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9111 doc: /* For internal use only.
9112 usage: (define-coding-system-internal ...) */)
9113 (nargs, args)
9114 int nargs;
9115 Lisp_Object *args;
9117 Lisp_Object name;
9118 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9119 Lisp_Object attrs; /* Vector of attributes. */
9120 Lisp_Object eol_type;
9121 Lisp_Object aliases;
9122 Lisp_Object coding_type, charset_list, safe_charsets;
9123 enum coding_category category;
9124 Lisp_Object tail, val;
9125 int max_charset_id = 0;
9126 int i;
9128 if (nargs < coding_arg_max)
9129 goto short_args;
9131 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9133 name = args[coding_arg_name];
9134 CHECK_SYMBOL (name);
9135 CODING_ATTR_BASE_NAME (attrs) = name;
9137 val = args[coding_arg_mnemonic];
9138 if (! STRINGP (val))
9139 CHECK_CHARACTER (val);
9140 CODING_ATTR_MNEMONIC (attrs) = val;
9142 coding_type = args[coding_arg_coding_type];
9143 CHECK_SYMBOL (coding_type);
9144 CODING_ATTR_TYPE (attrs) = coding_type;
9146 charset_list = args[coding_arg_charset_list];
9147 if (SYMBOLP (charset_list))
9149 if (EQ (charset_list, Qiso_2022))
9151 if (! EQ (coding_type, Qiso_2022))
9152 error ("Invalid charset-list");
9153 charset_list = Viso_2022_charset_list;
9155 else if (EQ (charset_list, Qemacs_mule))
9157 if (! EQ (coding_type, Qemacs_mule))
9158 error ("Invalid charset-list");
9159 charset_list = Vemacs_mule_charset_list;
9161 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9162 if (max_charset_id < XFASTINT (XCAR (tail)))
9163 max_charset_id = XFASTINT (XCAR (tail));
9165 else
9167 charset_list = Fcopy_sequence (charset_list);
9168 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9170 struct charset *charset;
9172 val = XCAR (tail);
9173 CHECK_CHARSET_GET_CHARSET (val, charset);
9174 if (EQ (coding_type, Qiso_2022)
9175 ? CHARSET_ISO_FINAL (charset) < 0
9176 : EQ (coding_type, Qemacs_mule)
9177 ? CHARSET_EMACS_MULE_ID (charset) < 0
9178 : 0)
9179 error ("Can't handle charset `%s'",
9180 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9182 XSETCAR (tail, make_number (charset->id));
9183 if (max_charset_id < charset->id)
9184 max_charset_id = charset->id;
9187 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9189 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
9190 make_number (255));
9191 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9192 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9193 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9195 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9197 val = args[coding_arg_decode_translation_table];
9198 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9199 CHECK_SYMBOL (val);
9200 CODING_ATTR_DECODE_TBL (attrs) = val;
9202 val = args[coding_arg_encode_translation_table];
9203 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9204 CHECK_SYMBOL (val);
9205 CODING_ATTR_ENCODE_TBL (attrs) = val;
9207 val = args[coding_arg_post_read_conversion];
9208 CHECK_SYMBOL (val);
9209 CODING_ATTR_POST_READ (attrs) = val;
9211 val = args[coding_arg_pre_write_conversion];
9212 CHECK_SYMBOL (val);
9213 CODING_ATTR_PRE_WRITE (attrs) = val;
9215 val = args[coding_arg_default_char];
9216 if (NILP (val))
9217 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9218 else
9220 CHECK_CHARACTER (val);
9221 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9224 val = args[coding_arg_for_unibyte];
9225 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9227 val = args[coding_arg_plist];
9228 CHECK_LIST (val);
9229 CODING_ATTR_PLIST (attrs) = val;
9231 if (EQ (coding_type, Qcharset))
9233 /* Generate a lisp vector of 256 elements. Each element is nil,
9234 integer, or a list of charset IDs.
9236 If Nth element is nil, the byte code N is invalid in this
9237 coding system.
9239 If Nth element is a number NUM, N is the first byte of a
9240 charset whose ID is NUM.
9242 If Nth element is a list of charset IDs, N is the first byte
9243 of one of them. The list is sorted by dimensions of the
9244 charsets. A charset of smaller dimension comes firtst. */
9245 val = Fmake_vector (make_number (256), Qnil);
9247 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9249 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9250 int dim = CHARSET_DIMENSION (charset);
9251 int idx = (dim - 1) * 4;
9253 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9254 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9256 for (i = charset->code_space[idx];
9257 i <= charset->code_space[idx + 1]; i++)
9259 Lisp_Object tmp, tmp2;
9260 int dim2;
9262 tmp = AREF (val, i);
9263 if (NILP (tmp))
9264 tmp = XCAR (tail);
9265 else if (NUMBERP (tmp))
9267 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9268 if (dim < dim2)
9269 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9270 else
9271 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9273 else
9275 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9277 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9278 if (dim < dim2)
9279 break;
9281 if (NILP (tmp2))
9282 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9283 else
9285 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9286 XSETCAR (tmp2, XCAR (tail));
9289 ASET (val, i, tmp);
9292 ASET (attrs, coding_attr_charset_valids, val);
9293 category = coding_category_charset;
9295 else if (EQ (coding_type, Qccl))
9297 Lisp_Object valids;
9299 if (nargs < coding_arg_ccl_max)
9300 goto short_args;
9302 val = args[coding_arg_ccl_decoder];
9303 CHECK_CCL_PROGRAM (val);
9304 if (VECTORP (val))
9305 val = Fcopy_sequence (val);
9306 ASET (attrs, coding_attr_ccl_decoder, val);
9308 val = args[coding_arg_ccl_encoder];
9309 CHECK_CCL_PROGRAM (val);
9310 if (VECTORP (val))
9311 val = Fcopy_sequence (val);
9312 ASET (attrs, coding_attr_ccl_encoder, val);
9314 val = args[coding_arg_ccl_valids];
9315 valids = Fmake_string (make_number (256), make_number (0));
9316 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9318 int from, to;
9320 val = Fcar (tail);
9321 if (INTEGERP (val))
9323 from = to = XINT (val);
9324 if (from < 0 || from > 255)
9325 args_out_of_range_3 (val, make_number (0), make_number (255));
9327 else
9329 CHECK_CONS (val);
9330 CHECK_NATNUM_CAR (val);
9331 CHECK_NATNUM_CDR (val);
9332 from = XINT (XCAR (val));
9333 if (from > 255)
9334 args_out_of_range_3 (XCAR (val),
9335 make_number (0), make_number (255));
9336 to = XINT (XCDR (val));
9337 if (to < from || to > 255)
9338 args_out_of_range_3 (XCDR (val),
9339 XCAR (val), make_number (255));
9341 for (i = from; i <= to; i++)
9342 SSET (valids, i, 1);
9344 ASET (attrs, coding_attr_ccl_valids, valids);
9346 category = coding_category_ccl;
9348 else if (EQ (coding_type, Qutf_16))
9350 Lisp_Object bom, endian;
9352 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9354 if (nargs < coding_arg_utf16_max)
9355 goto short_args;
9357 bom = args[coding_arg_utf16_bom];
9358 if (! NILP (bom) && ! EQ (bom, Qt))
9360 CHECK_CONS (bom);
9361 val = XCAR (bom);
9362 CHECK_CODING_SYSTEM (val);
9363 val = XCDR (bom);
9364 CHECK_CODING_SYSTEM (val);
9366 ASET (attrs, coding_attr_utf_bom, bom);
9368 endian = args[coding_arg_utf16_endian];
9369 CHECK_SYMBOL (endian);
9370 if (NILP (endian))
9371 endian = Qbig;
9372 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9373 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9374 ASET (attrs, coding_attr_utf_16_endian, endian);
9376 category = (CONSP (bom)
9377 ? coding_category_utf_16_auto
9378 : NILP (bom)
9379 ? (EQ (endian, Qbig)
9380 ? coding_category_utf_16_be_nosig
9381 : coding_category_utf_16_le_nosig)
9382 : (EQ (endian, Qbig)
9383 ? coding_category_utf_16_be
9384 : coding_category_utf_16_le));
9386 else if (EQ (coding_type, Qiso_2022))
9388 Lisp_Object initial, reg_usage, request, flags;
9389 int i;
9391 if (nargs < coding_arg_iso2022_max)
9392 goto short_args;
9394 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9395 CHECK_VECTOR (initial);
9396 for (i = 0; i < 4; i++)
9398 val = Faref (initial, make_number (i));
9399 if (! NILP (val))
9401 struct charset *charset;
9403 CHECK_CHARSET_GET_CHARSET (val, charset);
9404 ASET (initial, i, make_number (CHARSET_ID (charset)));
9405 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9406 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9408 else
9409 ASET (initial, i, make_number (-1));
9412 reg_usage = args[coding_arg_iso2022_reg_usage];
9413 CHECK_CONS (reg_usage);
9414 CHECK_NUMBER_CAR (reg_usage);
9415 CHECK_NUMBER_CDR (reg_usage);
9417 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9418 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9420 int id;
9421 Lisp_Object tmp;
9423 val = Fcar (tail);
9424 CHECK_CONS (val);
9425 tmp = XCAR (val);
9426 CHECK_CHARSET_GET_ID (tmp, id);
9427 CHECK_NATNUM_CDR (val);
9428 if (XINT (XCDR (val)) >= 4)
9429 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9430 XSETCAR (val, make_number (id));
9433 flags = args[coding_arg_iso2022_flags];
9434 CHECK_NATNUM (flags);
9435 i = XINT (flags);
9436 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9437 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9439 ASET (attrs, coding_attr_iso_initial, initial);
9440 ASET (attrs, coding_attr_iso_usage, reg_usage);
9441 ASET (attrs, coding_attr_iso_request, request);
9442 ASET (attrs, coding_attr_iso_flags, flags);
9443 setup_iso_safe_charsets (attrs);
9445 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9446 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9447 | CODING_ISO_FLAG_SINGLE_SHIFT))
9448 ? coding_category_iso_7_else
9449 : EQ (args[coding_arg_charset_list], Qiso_2022)
9450 ? coding_category_iso_7
9451 : coding_category_iso_7_tight);
9452 else
9454 int id = XINT (AREF (initial, 1));
9456 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9457 || EQ (args[coding_arg_charset_list], Qiso_2022)
9458 || id < 0)
9459 ? coding_category_iso_8_else
9460 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9461 ? coding_category_iso_8_1
9462 : coding_category_iso_8_2);
9464 if (category != coding_category_iso_8_1
9465 && category != coding_category_iso_8_2)
9466 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9468 else if (EQ (coding_type, Qemacs_mule))
9470 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9471 ASET (attrs, coding_attr_emacs_mule_full, Qt);
9472 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9473 category = coding_category_emacs_mule;
9475 else if (EQ (coding_type, Qshift_jis))
9478 struct charset *charset;
9480 if (XINT (Flength (charset_list)) != 3
9481 && XINT (Flength (charset_list)) != 4)
9482 error ("There should be three or four charsets");
9484 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9485 if (CHARSET_DIMENSION (charset) != 1)
9486 error ("Dimension of charset %s is not one",
9487 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9488 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9489 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9491 charset_list = XCDR (charset_list);
9492 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9493 if (CHARSET_DIMENSION (charset) != 1)
9494 error ("Dimension of charset %s is not one",
9495 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9497 charset_list = XCDR (charset_list);
9498 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9499 if (CHARSET_DIMENSION (charset) != 2)
9500 error ("Dimension of charset %s is not two",
9501 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9503 charset_list = XCDR (charset_list);
9504 if (! NILP (charset_list))
9506 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9507 if (CHARSET_DIMENSION (charset) != 2)
9508 error ("Dimension of charset %s is not two",
9509 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9512 category = coding_category_sjis;
9513 Vsjis_coding_system = name;
9515 else if (EQ (coding_type, Qbig5))
9517 struct charset *charset;
9519 if (XINT (Flength (charset_list)) != 2)
9520 error ("There should be just two charsets");
9522 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9523 if (CHARSET_DIMENSION (charset) != 1)
9524 error ("Dimension of charset %s is not one",
9525 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9526 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9527 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9529 charset_list = XCDR (charset_list);
9530 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9531 if (CHARSET_DIMENSION (charset) != 2)
9532 error ("Dimension of charset %s is not two",
9533 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9535 category = coding_category_big5;
9536 Vbig5_coding_system = name;
9538 else if (EQ (coding_type, Qraw_text))
9540 category = coding_category_raw_text;
9541 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9543 else if (EQ (coding_type, Qutf_8))
9545 Lisp_Object bom;
9547 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9549 if (nargs < coding_arg_utf8_max)
9550 goto short_args;
9552 bom = args[coding_arg_utf8_bom];
9553 if (! NILP (bom) && ! EQ (bom, Qt))
9555 CHECK_CONS (bom);
9556 val = XCAR (bom);
9557 CHECK_CODING_SYSTEM (val);
9558 val = XCDR (bom);
9559 CHECK_CODING_SYSTEM (val);
9561 ASET (attrs, coding_attr_utf_bom, bom);
9563 category = (CONSP (bom) ? coding_category_utf_8_auto
9564 : NILP (bom) ? coding_category_utf_8_nosig
9565 : coding_category_utf_8_sig);
9567 else if (EQ (coding_type, Qundecided))
9568 category = coding_category_undecided;
9569 else
9570 error ("Invalid coding system type: %s",
9571 SDATA (SYMBOL_NAME (coding_type)));
9573 CODING_ATTR_CATEGORY (attrs) = make_number (category);
9574 CODING_ATTR_PLIST (attrs)
9575 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9576 CODING_ATTR_PLIST (attrs)));
9577 CODING_ATTR_PLIST (attrs)
9578 = Fcons (QCascii_compatible_p,
9579 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9580 CODING_ATTR_PLIST (attrs)));
9582 eol_type = args[coding_arg_eol_type];
9583 if (! NILP (eol_type)
9584 && ! EQ (eol_type, Qunix)
9585 && ! EQ (eol_type, Qdos)
9586 && ! EQ (eol_type, Qmac))
9587 error ("Invalid eol-type");
9589 aliases = Fcons (name, Qnil);
9591 if (NILP (eol_type))
9593 eol_type = make_subsidiaries (name);
9594 for (i = 0; i < 3; i++)
9596 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9598 this_name = AREF (eol_type, i);
9599 this_aliases = Fcons (this_name, Qnil);
9600 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9601 this_spec = Fmake_vector (make_number (3), attrs);
9602 ASET (this_spec, 1, this_aliases);
9603 ASET (this_spec, 2, this_eol_type);
9604 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9605 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9606 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9607 if (NILP (val))
9608 Vcoding_system_alist
9609 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9610 Vcoding_system_alist);
9614 spec_vec = Fmake_vector (make_number (3), attrs);
9615 ASET (spec_vec, 1, aliases);
9616 ASET (spec_vec, 2, eol_type);
9618 Fputhash (name, spec_vec, Vcoding_system_hash_table);
9619 Vcoding_system_list = Fcons (name, Vcoding_system_list);
9620 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9621 if (NILP (val))
9622 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9623 Vcoding_system_alist);
9626 int id = coding_categories[category].id;
9628 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9629 setup_coding_system (name, &coding_categories[category]);
9632 return Qnil;
9634 short_args:
9635 return Fsignal (Qwrong_number_of_arguments,
9636 Fcons (intern ("define-coding-system-internal"),
9637 make_number (nargs)));
9641 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9642 3, 3, 0,
9643 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
9644 (coding_system, prop, val)
9645 Lisp_Object coding_system, prop, val;
9647 Lisp_Object spec, attrs;
9649 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9650 attrs = AREF (spec, 0);
9651 if (EQ (prop, QCmnemonic))
9653 if (! STRINGP (val))
9654 CHECK_CHARACTER (val);
9655 CODING_ATTR_MNEMONIC (attrs) = val;
9657 else if (EQ (prop, QCdefault_char))
9659 if (NILP (val))
9660 val = make_number (' ');
9661 else
9662 CHECK_CHARACTER (val);
9663 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9665 else if (EQ (prop, QCdecode_translation_table))
9667 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9668 CHECK_SYMBOL (val);
9669 CODING_ATTR_DECODE_TBL (attrs) = val;
9671 else if (EQ (prop, QCencode_translation_table))
9673 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9674 CHECK_SYMBOL (val);
9675 CODING_ATTR_ENCODE_TBL (attrs) = val;
9677 else if (EQ (prop, QCpost_read_conversion))
9679 CHECK_SYMBOL (val);
9680 CODING_ATTR_POST_READ (attrs) = val;
9682 else if (EQ (prop, QCpre_write_conversion))
9684 CHECK_SYMBOL (val);
9685 CODING_ATTR_PRE_WRITE (attrs) = val;
9687 else if (EQ (prop, QCascii_compatible_p))
9689 CODING_ATTR_ASCII_COMPAT (attrs) = val;
9692 CODING_ATTR_PLIST (attrs)
9693 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9694 return val;
9698 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9699 Sdefine_coding_system_alias, 2, 2, 0,
9700 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
9701 (alias, coding_system)
9702 Lisp_Object alias, coding_system;
9704 Lisp_Object spec, aliases, eol_type, val;
9706 CHECK_SYMBOL (alias);
9707 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9708 aliases = AREF (spec, 1);
9709 /* ALIASES should be a list of length more than zero, and the first
9710 element is a base coding system. Append ALIAS at the tail of the
9711 list. */
9712 while (!NILP (XCDR (aliases)))
9713 aliases = XCDR (aliases);
9714 XSETCDR (aliases, Fcons (alias, Qnil));
9716 eol_type = AREF (spec, 2);
9717 if (VECTORP (eol_type))
9719 Lisp_Object subsidiaries;
9720 int i;
9722 subsidiaries = make_subsidiaries (alias);
9723 for (i = 0; i < 3; i++)
9724 Fdefine_coding_system_alias (AREF (subsidiaries, i),
9725 AREF (eol_type, i));
9728 Fputhash (alias, spec, Vcoding_system_hash_table);
9729 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9730 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9731 if (NILP (val))
9732 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9733 Vcoding_system_alist);
9735 return Qnil;
9738 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9739 1, 1, 0,
9740 doc: /* Return the base of CODING-SYSTEM.
9741 Any alias or subsidiary coding system is not a base coding system. */)
9742 (coding_system)
9743 Lisp_Object coding_system;
9745 Lisp_Object spec, attrs;
9747 if (NILP (coding_system))
9748 return (Qno_conversion);
9749 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9750 attrs = AREF (spec, 0);
9751 return CODING_ATTR_BASE_NAME (attrs);
9754 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9755 1, 1, 0,
9756 doc: "Return the property list of CODING-SYSTEM.")
9757 (coding_system)
9758 Lisp_Object coding_system;
9760 Lisp_Object spec, attrs;
9762 if (NILP (coding_system))
9763 coding_system = Qno_conversion;
9764 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9765 attrs = AREF (spec, 0);
9766 return CODING_ATTR_PLIST (attrs);
9770 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9771 1, 1, 0,
9772 doc: /* Return the list of aliases of CODING-SYSTEM. */)
9773 (coding_system)
9774 Lisp_Object coding_system;
9776 Lisp_Object spec;
9778 if (NILP (coding_system))
9779 coding_system = Qno_conversion;
9780 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9781 return AREF (spec, 1);
9784 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9785 Scoding_system_eol_type, 1, 1, 0,
9786 doc: /* Return eol-type of CODING-SYSTEM.
9787 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
9789 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9790 and CR respectively.
9792 A vector value indicates that a format of end-of-line should be
9793 detected automatically. Nth element of the vector is the subsidiary
9794 coding system whose eol-type is N. */)
9795 (coding_system)
9796 Lisp_Object coding_system;
9798 Lisp_Object spec, eol_type;
9799 int n;
9801 if (NILP (coding_system))
9802 coding_system = Qno_conversion;
9803 if (! CODING_SYSTEM_P (coding_system))
9804 return Qnil;
9805 spec = CODING_SYSTEM_SPEC (coding_system);
9806 eol_type = AREF (spec, 2);
9807 if (VECTORP (eol_type))
9808 return Fcopy_sequence (eol_type);
9809 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9810 return make_number (n);
9813 #endif /* emacs */
9816 /*** 9. Post-amble ***/
9818 void
9819 init_coding_once ()
9821 int i;
9823 for (i = 0; i < coding_category_max; i++)
9825 coding_categories[i].id = -1;
9826 coding_priorities[i] = i;
9829 /* ISO2022 specific initialize routine. */
9830 for (i = 0; i < 0x20; i++)
9831 iso_code_class[i] = ISO_control_0;
9832 for (i = 0x21; i < 0x7F; i++)
9833 iso_code_class[i] = ISO_graphic_plane_0;
9834 for (i = 0x80; i < 0xA0; i++)
9835 iso_code_class[i] = ISO_control_1;
9836 for (i = 0xA1; i < 0xFF; i++)
9837 iso_code_class[i] = ISO_graphic_plane_1;
9838 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9839 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9840 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9841 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9842 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9843 iso_code_class[ISO_CODE_ESC] = ISO_escape;
9844 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9845 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9846 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9848 for (i = 0; i < 256; i++)
9850 emacs_mule_bytes[i] = 1;
9852 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9853 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9854 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9855 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9858 #ifdef emacs
9860 void
9861 syms_of_coding ()
9863 staticpro (&Vcoding_system_hash_table);
9865 Lisp_Object args[2];
9866 args[0] = QCtest;
9867 args[1] = Qeq;
9868 Vcoding_system_hash_table = Fmake_hash_table (2, args);
9871 staticpro (&Vsjis_coding_system);
9872 Vsjis_coding_system = Qnil;
9874 staticpro (&Vbig5_coding_system);
9875 Vbig5_coding_system = Qnil;
9877 staticpro (&Vcode_conversion_reused_workbuf);
9878 Vcode_conversion_reused_workbuf = Qnil;
9880 staticpro (&Vcode_conversion_workbuf_name);
9881 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9883 reused_workbuf_in_use = 0;
9885 DEFSYM (Qcharset, "charset");
9886 DEFSYM (Qtarget_idx, "target-idx");
9887 DEFSYM (Qcoding_system_history, "coding-system-history");
9888 Fset (Qcoding_system_history, Qnil);
9890 /* Target FILENAME is the first argument. */
9891 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9892 /* Target FILENAME is the third argument. */
9893 Fput (Qwrite_region, Qtarget_idx, make_number (2));
9895 DEFSYM (Qcall_process, "call-process");
9896 /* Target PROGRAM is the first argument. */
9897 Fput (Qcall_process, Qtarget_idx, make_number (0));
9899 DEFSYM (Qcall_process_region, "call-process-region");
9900 /* Target PROGRAM is the third argument. */
9901 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9903 DEFSYM (Qstart_process, "start-process");
9904 /* Target PROGRAM is the third argument. */
9905 Fput (Qstart_process, Qtarget_idx, make_number (2));
9907 DEFSYM (Qopen_network_stream, "open-network-stream");
9908 /* Target SERVICE is the fourth argument. */
9909 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9911 DEFSYM (Qcoding_system, "coding-system");
9912 DEFSYM (Qcoding_aliases, "coding-aliases");
9914 DEFSYM (Qeol_type, "eol-type");
9915 DEFSYM (Qunix, "unix");
9916 DEFSYM (Qdos, "dos");
9918 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9919 DEFSYM (Qpost_read_conversion, "post-read-conversion");
9920 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9921 DEFSYM (Qdefault_char, "default-char");
9922 DEFSYM (Qundecided, "undecided");
9923 DEFSYM (Qno_conversion, "no-conversion");
9924 DEFSYM (Qraw_text, "raw-text");
9926 DEFSYM (Qiso_2022, "iso-2022");
9928 DEFSYM (Qutf_8, "utf-8");
9929 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9931 DEFSYM (Qutf_16, "utf-16");
9932 DEFSYM (Qbig, "big");
9933 DEFSYM (Qlittle, "little");
9935 DEFSYM (Qshift_jis, "shift-jis");
9936 DEFSYM (Qbig5, "big5");
9938 DEFSYM (Qcoding_system_p, "coding-system-p");
9940 DEFSYM (Qcoding_system_error, "coding-system-error");
9941 Fput (Qcoding_system_error, Qerror_conditions,
9942 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9943 Fput (Qcoding_system_error, Qerror_message,
9944 build_string ("Invalid coding system"));
9946 /* Intern this now in case it isn't already done.
9947 Setting this variable twice is harmless.
9948 But don't staticpro it here--that is done in alloc.c. */
9949 Qchar_table_extra_slots = intern ("char-table-extra-slots");
9951 DEFSYM (Qtranslation_table, "translation-table");
9952 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9953 DEFSYM (Qtranslation_table_id, "translation-table-id");
9954 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9955 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9957 DEFSYM (Qvalid_codes, "valid-codes");
9959 DEFSYM (Qemacs_mule, "emacs-mule");
9961 DEFSYM (QCcategory, ":category");
9962 DEFSYM (QCmnemonic, ":mnemonic");
9963 DEFSYM (QCdefault_char, ":default-char");
9964 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9965 DEFSYM (QCencode_translation_table, ":encode-translation-table");
9966 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9967 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9968 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9970 Vcoding_category_table
9971 = Fmake_vector (make_number (coding_category_max), Qnil);
9972 staticpro (&Vcoding_category_table);
9973 /* Followings are target of code detection. */
9974 ASET (Vcoding_category_table, coding_category_iso_7,
9975 intern ("coding-category-iso-7"));
9976 ASET (Vcoding_category_table, coding_category_iso_7_tight,
9977 intern ("coding-category-iso-7-tight"));
9978 ASET (Vcoding_category_table, coding_category_iso_8_1,
9979 intern ("coding-category-iso-8-1"));
9980 ASET (Vcoding_category_table, coding_category_iso_8_2,
9981 intern ("coding-category-iso-8-2"));
9982 ASET (Vcoding_category_table, coding_category_iso_7_else,
9983 intern ("coding-category-iso-7-else"));
9984 ASET (Vcoding_category_table, coding_category_iso_8_else,
9985 intern ("coding-category-iso-8-else"));
9986 ASET (Vcoding_category_table, coding_category_utf_8_auto,
9987 intern ("coding-category-utf-8-auto"));
9988 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
9989 intern ("coding-category-utf-8"));
9990 ASET (Vcoding_category_table, coding_category_utf_8_sig,
9991 intern ("coding-category-utf-8-sig"));
9992 ASET (Vcoding_category_table, coding_category_utf_16_be,
9993 intern ("coding-category-utf-16-be"));
9994 ASET (Vcoding_category_table, coding_category_utf_16_auto,
9995 intern ("coding-category-utf-16-auto"));
9996 ASET (Vcoding_category_table, coding_category_utf_16_le,
9997 intern ("coding-category-utf-16-le"));
9998 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9999 intern ("coding-category-utf-16-be-nosig"));
10000 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10001 intern ("coding-category-utf-16-le-nosig"));
10002 ASET (Vcoding_category_table, coding_category_charset,
10003 intern ("coding-category-charset"));
10004 ASET (Vcoding_category_table, coding_category_sjis,
10005 intern ("coding-category-sjis"));
10006 ASET (Vcoding_category_table, coding_category_big5,
10007 intern ("coding-category-big5"));
10008 ASET (Vcoding_category_table, coding_category_ccl,
10009 intern ("coding-category-ccl"));
10010 ASET (Vcoding_category_table, coding_category_emacs_mule,
10011 intern ("coding-category-emacs-mule"));
10012 /* Followings are NOT target of code detection. */
10013 ASET (Vcoding_category_table, coding_category_raw_text,
10014 intern ("coding-category-raw-text"));
10015 ASET (Vcoding_category_table, coding_category_undecided,
10016 intern ("coding-category-undecided"));
10018 DEFSYM (Qinsufficient_source, "insufficient-source");
10019 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10020 DEFSYM (Qinvalid_source, "invalid-source");
10021 DEFSYM (Qinterrupted, "interrupted");
10022 DEFSYM (Qinsufficient_memory, "insufficient-memory");
10023 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10025 defsubr (&Scoding_system_p);
10026 defsubr (&Sread_coding_system);
10027 defsubr (&Sread_non_nil_coding_system);
10028 defsubr (&Scheck_coding_system);
10029 defsubr (&Sdetect_coding_region);
10030 defsubr (&Sdetect_coding_string);
10031 defsubr (&Sfind_coding_systems_region_internal);
10032 defsubr (&Sunencodable_char_position);
10033 defsubr (&Scheck_coding_systems_region);
10034 defsubr (&Sdecode_coding_region);
10035 defsubr (&Sencode_coding_region);
10036 defsubr (&Sdecode_coding_string);
10037 defsubr (&Sencode_coding_string);
10038 defsubr (&Sdecode_sjis_char);
10039 defsubr (&Sencode_sjis_char);
10040 defsubr (&Sdecode_big5_char);
10041 defsubr (&Sencode_big5_char);
10042 defsubr (&Sset_terminal_coding_system_internal);
10043 defsubr (&Sset_safe_terminal_coding_system_internal);
10044 defsubr (&Sterminal_coding_system);
10045 defsubr (&Sset_keyboard_coding_system_internal);
10046 defsubr (&Skeyboard_coding_system);
10047 defsubr (&Sfind_operation_coding_system);
10048 defsubr (&Sset_coding_system_priority);
10049 defsubr (&Sdefine_coding_system_internal);
10050 defsubr (&Sdefine_coding_system_alias);
10051 defsubr (&Scoding_system_put);
10052 defsubr (&Scoding_system_base);
10053 defsubr (&Scoding_system_plist);
10054 defsubr (&Scoding_system_aliases);
10055 defsubr (&Scoding_system_eol_type);
10056 defsubr (&Scoding_system_priority_list);
10058 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10059 doc: /* List of coding systems.
10061 Do not alter the value of this variable manually. This variable should be
10062 updated by the functions `define-coding-system' and
10063 `define-coding-system-alias'. */);
10064 Vcoding_system_list = Qnil;
10066 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10067 doc: /* Alist of coding system names.
10068 Each element is one element list of coding system name.
10069 This variable is given to `completing-read' as COLLECTION argument.
10071 Do not alter the value of this variable manually. This variable should be
10072 updated by the functions `make-coding-system' and
10073 `define-coding-system-alias'. */);
10074 Vcoding_system_alist = Qnil;
10076 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10077 doc: /* List of coding-categories (symbols) ordered by priority.
10079 On detecting a coding system, Emacs tries code detection algorithms
10080 associated with each coding-category one by one in this order. When
10081 one algorithm agrees with a byte sequence of source text, the coding
10082 system bound to the corresponding coding-category is selected.
10084 Don't modify this variable directly, but use `set-coding-priority'. */);
10086 int i;
10088 Vcoding_category_list = Qnil;
10089 for (i = coding_category_max - 1; i >= 0; i--)
10090 Vcoding_category_list
10091 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10092 Vcoding_category_list);
10095 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10096 doc: /* Specify the coding system for read operations.
10097 It is useful to bind this variable with `let', but do not set it globally.
10098 If the value is a coding system, it is used for decoding on read operation.
10099 If not, an appropriate element is used from one of the coding system alists.
10100 There are three such tables: `file-coding-system-alist',
10101 `process-coding-system-alist', and `network-coding-system-alist'. */);
10102 Vcoding_system_for_read = Qnil;
10104 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10105 doc: /* Specify the coding system for write operations.
10106 Programs bind this variable with `let', but you should not set it globally.
10107 If the value is a coding system, it is used for encoding of output,
10108 when writing it to a file and when sending it to a file or subprocess.
10110 If this does not specify a coding system, an appropriate element
10111 is used from one of the coding system alists.
10112 There are three such tables: `file-coding-system-alist',
10113 `process-coding-system-alist', and `network-coding-system-alist'.
10114 For output to files, if the above procedure does not specify a coding system,
10115 the value of `buffer-file-coding-system' is used. */);
10116 Vcoding_system_for_write = Qnil;
10118 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10119 doc: /*
10120 Coding system used in the latest file or process I/O. */);
10121 Vlast_coding_system_used = Qnil;
10123 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10124 doc: /*
10125 Error status of the last code conversion.
10127 When an error was detected in the last code conversion, this variable
10128 is set to one of the following symbols.
10129 `insufficient-source'
10130 `inconsistent-eol'
10131 `invalid-source'
10132 `interrupted'
10133 `insufficient-memory'
10134 When no error was detected, the value doesn't change. So, to check
10135 the error status of a code conversion by this variable, you must
10136 explicitly set this variable to nil before performing code
10137 conversion. */);
10138 Vlast_code_conversion_error = Qnil;
10140 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10141 doc: /*
10142 *Non-nil means always inhibit code conversion of end-of-line format.
10143 See info node `Coding Systems' and info node `Text and Binary' concerning
10144 such conversion. */);
10145 inhibit_eol_conversion = 0;
10147 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10148 doc: /*
10149 Non-nil means process buffer inherits coding system of process output.
10150 Bind it to t if the process output is to be treated as if it were a file
10151 read from some filesystem. */);
10152 inherit_process_coding_system = 0;
10154 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10155 doc: /*
10156 Alist to decide a coding system to use for a file I/O operation.
10157 The format is ((PATTERN . VAL) ...),
10158 where PATTERN is a regular expression matching a file name,
10159 VAL is a coding system, a cons of coding systems, or a function symbol.
10160 If VAL is a coding system, it is used for both decoding and encoding
10161 the file contents.
10162 If VAL is a cons of coding systems, the car part is used for decoding,
10163 and the cdr part is used for encoding.
10164 If VAL is a function symbol, the function must return a coding system
10165 or a cons of coding systems which are used as above. The function is
10166 called with an argument that is a list of the arguments with which
10167 `find-operation-coding-system' was called. If the function can't decide
10168 a coding system, it can return `undecided' so that the normal
10169 code-detection is performed.
10171 See also the function `find-operation-coding-system'
10172 and the variable `auto-coding-alist'. */);
10173 Vfile_coding_system_alist = Qnil;
10175 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10176 doc: /*
10177 Alist to decide a coding system to use for a process I/O operation.
10178 The format is ((PATTERN . VAL) ...),
10179 where PATTERN is a regular expression matching a program name,
10180 VAL is a coding system, a cons of coding systems, or a function symbol.
10181 If VAL is a coding system, it is used for both decoding what received
10182 from the program and encoding what sent to the program.
10183 If VAL is a cons of coding systems, the car part is used for decoding,
10184 and the cdr part is used for encoding.
10185 If VAL is a function symbol, the function must return a coding system
10186 or a cons of coding systems which are used as above.
10188 See also the function `find-operation-coding-system'. */);
10189 Vprocess_coding_system_alist = Qnil;
10191 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10192 doc: /*
10193 Alist to decide a coding system to use for a network I/O operation.
10194 The format is ((PATTERN . VAL) ...),
10195 where PATTERN is a regular expression matching a network service name
10196 or is a port number to connect to,
10197 VAL is a coding system, a cons of coding systems, or a function symbol.
10198 If VAL is a coding system, it is used for both decoding what received
10199 from the network stream and encoding what sent to the network stream.
10200 If VAL is a cons of coding systems, the car part is used for decoding,
10201 and the cdr part is used for encoding.
10202 If VAL is a function symbol, the function must return a coding system
10203 or a cons of coding systems which are used as above.
10205 See also the function `find-operation-coding-system'. */);
10206 Vnetwork_coding_system_alist = Qnil;
10208 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10209 doc: /* Coding system to use with system messages.
10210 Also used for decoding keyboard input on X Window system. */);
10211 Vlocale_coding_system = Qnil;
10213 /* The eol mnemonics are reset in startup.el system-dependently. */
10214 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10215 doc: /*
10216 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
10217 eol_mnemonic_unix = build_string (":");
10219 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10220 doc: /*
10221 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
10222 eol_mnemonic_dos = build_string ("\\");
10224 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10225 doc: /*
10226 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
10227 eol_mnemonic_mac = build_string ("/");
10229 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10230 doc: /*
10231 *String displayed in mode line when end-of-line format is not yet determined. */);
10232 eol_mnemonic_undecided = build_string (":");
10234 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10235 doc: /*
10236 *Non-nil enables character translation while encoding and decoding. */);
10237 Venable_character_translation = Qt;
10239 DEFVAR_LISP ("standard-translation-table-for-decode",
10240 &Vstandard_translation_table_for_decode,
10241 doc: /* Table for translating characters while decoding. */);
10242 Vstandard_translation_table_for_decode = Qnil;
10244 DEFVAR_LISP ("standard-translation-table-for-encode",
10245 &Vstandard_translation_table_for_encode,
10246 doc: /* Table for translating characters while encoding. */);
10247 Vstandard_translation_table_for_encode = Qnil;
10249 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10250 doc: /* Alist of charsets vs revision numbers.
10251 While encoding, if a charset (car part of an element) is found,
10252 designate it with the escape sequence identifying revision (cdr part
10253 of the element). */);
10254 Vcharset_revision_table = Qnil;
10256 DEFVAR_LISP ("default-process-coding-system",
10257 &Vdefault_process_coding_system,
10258 doc: /* Cons of coding systems used for process I/O by default.
10259 The car part is used for decoding a process output,
10260 the cdr part is used for encoding a text to be sent to a process. */);
10261 Vdefault_process_coding_system = Qnil;
10263 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10264 doc: /*
10265 Table of extra Latin codes in the range 128..159 (inclusive).
10266 This is a vector of length 256.
10267 If Nth element is non-nil, the existence of code N in a file
10268 \(or output of subprocess) doesn't prevent it to be detected as
10269 a coding system of ISO 2022 variant which has a flag
10270 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10271 or reading output of a subprocess.
10272 Only 128th through 159th elements have a meaning. */);
10273 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10275 DEFVAR_LISP ("select-safe-coding-system-function",
10276 &Vselect_safe_coding_system_function,
10277 doc: /*
10278 Function to call to select safe coding system for encoding a text.
10280 If set, this function is called to force a user to select a proper
10281 coding system which can encode the text in the case that a default
10282 coding system used in each operation can't encode the text. The
10283 function should take care that the buffer is not modified while
10284 the coding system is being selected.
10286 The default value is `select-safe-coding-system' (which see). */);
10287 Vselect_safe_coding_system_function = Qnil;
10289 DEFVAR_BOOL ("coding-system-require-warning",
10290 &coding_system_require_warning,
10291 doc: /* Internal use only.
10292 If non-nil, on writing a file, `select-safe-coding-system-function' is
10293 called even if `coding-system-for-write' is non-nil. The command
10294 `universal-coding-system-argument' binds this variable to t temporarily. */);
10295 coding_system_require_warning = 0;
10298 DEFVAR_BOOL ("inhibit-iso-escape-detection",
10299 &inhibit_iso_escape_detection,
10300 doc: /*
10301 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10303 When Emacs reads text, it tries to detect how the text is encoded.
10304 This code detection is sensitive to escape sequences. If Emacs sees
10305 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10306 of the ISO2022 encodings, and decodes text by the corresponding coding
10307 system (e.g. `iso-2022-7bit').
10309 However, there may be a case that you want to read escape sequences in
10310 a file as is. In such a case, you can set this variable to non-nil.
10311 Then the code detection will ignore any escape sequences, and no text is
10312 detected as encoded in some ISO-2022 encoding. The result is that all
10313 escape sequences become visible in a buffer.
10315 The default value is nil, and it is strongly recommended not to change
10316 it. That is because many Emacs Lisp source files that contain
10317 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10318 in Emacs's distribution, and they won't be decoded correctly on
10319 reading if you suppress escape sequence detection.
10321 The other way to read escape sequences in a file without decoding is
10322 to explicitly specify some coding system that doesn't use ISO-2022
10323 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
10324 inhibit_iso_escape_detection = 0;
10326 DEFVAR_BOOL ("inhibit-null-byte-detection",
10327 &inhibit_null_byte_detection,
10328 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10329 By default, Emacs treats it as binary data, and does not attempt to
10330 decode it. The effect is as if you specified `no-conversion' for
10331 reading that text.
10333 Set this to non-nil when a regular text happens to include null bytes.
10334 Examples are Index nodes of Info files and null-byte delimited output
10335 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10336 decode text as usual. */);
10337 inhibit_null_byte_detection = 0;
10339 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10340 doc: /* Char table for translating self-inserting characters.
10341 This is applied to the result of input methods, not their input.
10342 See also `keyboard-translate-table'. */);
10343 Vtranslation_table_for_input = Qnil;
10346 Lisp_Object args[coding_arg_max];
10347 Lisp_Object plist[16];
10348 int i;
10350 for (i = 0; i < coding_arg_max; i++)
10351 args[i] = Qnil;
10353 plist[0] = intern (":name");
10354 plist[1] = args[coding_arg_name] = Qno_conversion;
10355 plist[2] = intern (":mnemonic");
10356 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10357 plist[4] = intern (":coding-type");
10358 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10359 plist[6] = intern (":ascii-compatible-p");
10360 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10361 plist[8] = intern (":default-char");
10362 plist[9] = args[coding_arg_default_char] = make_number (0);
10363 plist[10] = intern (":for-unibyte");
10364 plist[11] = args[coding_arg_for_unibyte] = Qt;
10365 plist[12] = intern (":docstring");
10366 plist[13] = build_string ("Do no conversion.\n\
10368 When you visit a file with this coding, the file is read into a\n\
10369 unibyte buffer as is, thus each byte of a file is treated as a\n\
10370 character.");
10371 plist[14] = intern (":eol-type");
10372 plist[15] = args[coding_arg_eol_type] = Qunix;
10373 args[coding_arg_plist] = Flist (16, plist);
10374 Fdefine_coding_system_internal (coding_arg_max, args);
10376 plist[1] = args[coding_arg_name] = Qundecided;
10377 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10378 plist[5] = args[coding_arg_coding_type] = Qundecided;
10379 /* This is already set.
10380 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10381 plist[8] = intern (":charset-list");
10382 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10383 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10384 plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
10385 plist[15] = args[coding_arg_eol_type] = Qnil;
10386 args[coding_arg_plist] = Flist (16, plist);
10387 Fdefine_coding_system_internal (coding_arg_max, args);
10390 setup_coding_system (Qno_conversion, &safe_terminal_coding);
10393 int i;
10395 for (i = 0; i < coding_category_max; i++)
10396 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10398 #if defined (MSDOS) || defined (WINDOWSNT)
10399 system_eol_type = Qdos;
10400 #else
10401 system_eol_type = Qunix;
10402 #endif
10403 staticpro (&system_eol_type);
10406 char *
10407 emacs_strerror (error_number)
10408 int error_number;
10410 char *str;
10412 synchronize_system_messages_locale ();
10413 str = strerror (error_number);
10415 if (! NILP (Vlocale_coding_system))
10417 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10418 Vlocale_coding_system,
10420 str = (char *) SDATA (dec);
10423 return str;
10426 #endif /* emacs */
10428 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10429 (do not change this comment) */