Add coding cookie to lisp/emacs-lisp/lisp-mode.el.
[emacs.git] / src / coding.c
blob42fd81b6322507001fcc2cbfda10d10067897bb1
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2013 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
11 This file is part of GNU Emacs.
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
26 /*** TABLE OF CONTENTS ***
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
44 /*** 0. General comments ***
47 CODING SYSTEM
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
67 o UTF-8
69 o UTF-16
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
94 o BIG5
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
102 o CCL
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
109 o Raw-text
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
115 o No-conversion
117 Like raw text, but don't do end-of-line conversion.
120 END-OF-LINE FORMAT
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
132 STRUCT CODING_SYSTEM
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
141 /* COMMON MACROS */
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
150 Return true if the byte sequence conforms to XXX.
152 Below is the template of these functions. */
154 #if 0
155 static bool
156 detect_coding_XXX (struct coding_system *coding,
157 struct coding_detection_info *detect_info)
159 const unsigned char *src = coding->source;
160 const unsigned char *src_end = coding->source + coding->src_bytes;
161 bool multibytep = coding->src_multibyte;
162 ptrdiff_t consumed_chars = 0;
163 int found = 0;
164 ...;
166 while (1)
168 /* Get one byte from the source. If the source is exhausted, jump
169 to no_more_source:. */
170 ONE_MORE_BYTE (c);
172 if (! __C_conforms_to_XXX___ (c))
173 break;
174 if (! __C_strongly_suggests_XXX__ (c))
175 found = CATEGORY_MASK_XXX;
177 /* The byte sequence is invalid for XXX. */
178 detect_info->rejected |= CATEGORY_MASK_XXX;
179 return 0;
181 no_more_source:
182 /* The source exhausted successfully. */
183 detect_info->found |= found;
184 return 1;
186 #endif
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
200 Below is the template of these functions. */
202 #if 0
203 static void
204 decode_coding_XXXX (struct coding_system *coding)
206 const unsigned char *src = coding->source + coding->consumed;
207 const unsigned char *src_end = coding->source + coding->src_bytes;
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
211 const unsigned char *src_base;
212 /* A buffer to produce decoded characters. */
213 int *charbuf = coding->charbuf + coding->charbuf_used;
214 int *charbuf_end = coding->charbuf + coding->charbuf_size;
215 bool multibytep = coding->src_multibyte;
217 while (1)
219 src_base = src;
220 if (charbuf < charbuf_end)
221 /* No more room to produce a decoded character. */
222 break;
223 ONE_MORE_BYTE (c);
224 /* Decode it. */
227 no_more_source:
228 if (src_base < src_end
229 && coding->mode & CODING_MODE_LAST_BLOCK)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base < src_end && charbuf < charbuf_end)
233 *charbuf++ = *src_base++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding->consumed = coding->consumed_char = src_base - coding->source;
237 /* Remember how many characters we produced. */
238 coding->charbuf_used = charbuf - coding->charbuf;
240 #endif
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce a encoded text until it
256 reaches at the head of not-yet-encoded source text.
258 Below is a template of these functions. */
259 #if 0
260 static void
261 encode_coding_XXX (struct coding_system *coding)
263 bool multibytep = coding->dst_multibyte;
264 int *charbuf = coding->charbuf;
265 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
266 unsigned char *dst = coding->destination + coding->produced;
267 unsigned char *dst_end = coding->destination + coding->dst_bytes;
268 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
269 ptrdiff_t produced_chars = 0;
271 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
273 int c = *charbuf;
274 /* Encode C into DST, and increment DST. */
276 label_no_more_destination:
277 /* How many chars and bytes we produced. */
278 coding->produced_char += produced_chars;
279 coding->produced = dst - coding->destination;
281 #endif
284 /*** 1. Preamble ***/
286 #include <config.h>
287 #include <stdio.h>
289 #ifdef HAVE_WCHAR_H
290 #include <wchar.h>
291 #endif /* HAVE_WCHAR_H */
293 #include "lisp.h"
294 #include "character.h"
295 #include "buffer.h"
296 #include "charset.h"
297 #include "ccl.h"
298 #include "composite.h"
299 #include "coding.h"
300 #include "window.h"
301 #include "frame.h"
302 #include "termhooks.h"
304 Lisp_Object Vcoding_system_hash_table;
306 static Lisp_Object Qcoding_system, Qeol_type;
307 static Lisp_Object Qcoding_aliases;
308 Lisp_Object Qunix, Qdos;
309 static Lisp_Object Qmac;
310 Lisp_Object Qbuffer_file_coding_system;
311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
312 static Lisp_Object Qdefault_char;
313 Lisp_Object Qno_conversion, Qundecided;
314 Lisp_Object Qcharset, Qutf_8;
315 static Lisp_Object Qiso_2022;
316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
317 static Lisp_Object Qbig, Qlittle;
318 static Lisp_Object Qcoding_system_history;
319 static Lisp_Object Qvalid_codes;
320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
323 static Lisp_Object QCascii_compatible_p;
325 Lisp_Object Qcall_process, Qcall_process_region;
326 Lisp_Object Qstart_process, Qopen_network_stream;
327 static Lisp_Object Qtarget_idx;
329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
331 /* If a symbol has this property, evaluate the value to define the
332 symbol as a coding system. */
333 static Lisp_Object Qcoding_system_define_form;
335 /* Format of end-of-line decided by system. This is Qunix on
336 Unix and Mac, Qdos on DOS/Windows.
337 This has an effect only for external encoding (i.e. for output to
338 file and process), not for in-buffer or Lisp string encoding. */
339 static Lisp_Object system_eol_type;
341 #ifdef emacs
343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
345 /* Coding system emacs-mule and raw-text are for converting only
346 end-of-line format. */
347 Lisp_Object Qemacs_mule, Qraw_text;
348 Lisp_Object Qutf_8_emacs;
350 #if defined (WINDOWSNT) || defined (CYGWIN)
351 static Lisp_Object Qutf_16le;
352 #endif
354 /* Coding-systems are handed between Emacs Lisp programs and C internal
355 routines by the following three variables. */
356 /* Coding system to be used to encode text for terminal display when
357 terminal coding system is nil. */
358 struct coding_system safe_terminal_coding;
360 #endif /* emacs */
362 Lisp_Object Qtranslation_table;
363 Lisp_Object Qtranslation_table_id;
364 static Lisp_Object Qtranslation_table_for_decode;
365 static Lisp_Object Qtranslation_table_for_encode;
367 /* Two special coding systems. */
368 static Lisp_Object Vsjis_coding_system;
369 static Lisp_Object Vbig5_coding_system;
371 /* ISO2022 section */
373 #define CODING_ISO_INITIAL(coding, reg) \
374 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
375 coding_attr_iso_initial), \
376 reg)))
379 #define CODING_ISO_REQUEST(coding, charset_id) \
380 (((charset_id) <= (coding)->max_charset_id \
381 ? ((coding)->safe_charsets[charset_id] != 255 \
382 ? (coding)->safe_charsets[charset_id] \
383 : -1) \
384 : -1))
387 #define CODING_ISO_FLAGS(coding) \
388 ((coding)->spec.iso_2022.flags)
389 #define CODING_ISO_DESIGNATION(coding, reg) \
390 ((coding)->spec.iso_2022.current_designation[reg])
391 #define CODING_ISO_INVOCATION(coding, plane) \
392 ((coding)->spec.iso_2022.current_invocation[plane])
393 #define CODING_ISO_SINGLE_SHIFTING(coding) \
394 ((coding)->spec.iso_2022.single_shifting)
395 #define CODING_ISO_BOL(coding) \
396 ((coding)->spec.iso_2022.bol)
397 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
398 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
399 #define CODING_ISO_CMP_STATUS(coding) \
400 (&(coding)->spec.iso_2022.cmp_status)
401 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
402 ((coding)->spec.iso_2022.ctext_extended_segment_len)
403 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
404 ((coding)->spec.iso_2022.embedded_utf_8)
406 /* Control characters of ISO2022. */
407 /* code */ /* function */
408 #define ISO_CODE_SO 0x0E /* shift-out */
409 #define ISO_CODE_SI 0x0F /* shift-in */
410 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
411 #define ISO_CODE_ESC 0x1B /* escape */
412 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
413 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
414 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
416 /* All code (1-byte) of ISO2022 is classified into one of the
417 followings. */
418 enum iso_code_class_type
420 ISO_control_0, /* Control codes in the range
421 0x00..0x1F and 0x7F, except for the
422 following 5 codes. */
423 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
424 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
425 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
426 ISO_escape, /* ISO_CODE_ESC (0x1B) */
427 ISO_control_1, /* Control codes in the range
428 0x80..0x9F, except for the
429 following 3 codes. */
430 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
431 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
432 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
433 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
434 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
435 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
436 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
440 `iso-flags' attribute of an iso2022 coding system. */
442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
443 instead of the correct short-form sequence (e.g. ESC $ A). */
444 #define CODING_ISO_FLAG_LONG_FORM 0x0001
446 /* If set, reset graphic planes and registers at end-of-line to the
447 initial state. */
448 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
450 /* If set, reset graphic planes and registers before any control
451 characters to the initial state. */
452 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
454 /* If set, encode by 7-bit environment. */
455 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
457 /* If set, use locking-shift function. */
458 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
460 /* If set, use single-shift function. Overwrite
461 CODING_ISO_FLAG_LOCKING_SHIFT. */
462 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
464 /* If set, use designation escape sequence. */
465 #define CODING_ISO_FLAG_DESIGNATION 0x0040
467 /* If set, produce revision number sequence. */
468 #define CODING_ISO_FLAG_REVISION 0x0080
470 /* If set, produce ISO6429's direction specifying sequence. */
471 #define CODING_ISO_FLAG_DIRECTION 0x0100
473 /* If set, assume designation states are reset at beginning of line on
474 output. */
475 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
477 /* If set, designation sequence should be placed at beginning of line
478 on output. */
479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
481 /* If set, do not encode unsafe characters on output. */
482 #define CODING_ISO_FLAG_SAFE 0x0800
484 /* If set, extra latin codes (128..159) are accepted as a valid code
485 on input. */
486 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
488 #define CODING_ISO_FLAG_COMPOSITION 0x2000
490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
492 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
494 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
496 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
498 /* A character to be produced on output if encoding of the original
499 character is prohibited by CODING_ISO_FLAG_SAFE. */
500 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
502 /* UTF-8 section */
503 #define CODING_UTF_8_BOM(coding) \
504 ((coding)->spec.utf_8_bom)
506 /* UTF-16 section */
507 #define CODING_UTF_16_BOM(coding) \
508 ((coding)->spec.utf_16.bom)
510 #define CODING_UTF_16_ENDIAN(coding) \
511 ((coding)->spec.utf_16.endian)
513 #define CODING_UTF_16_SURROGATE(coding) \
514 ((coding)->spec.utf_16.surrogate)
517 /* CCL section */
518 #define CODING_CCL_DECODER(coding) \
519 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
520 #define CODING_CCL_ENCODER(coding) \
521 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
522 #define CODING_CCL_VALIDS(coding) \
523 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
525 /* Index for each coding category in `coding_categories' */
527 enum coding_category
529 coding_category_iso_7,
530 coding_category_iso_7_tight,
531 coding_category_iso_8_1,
532 coding_category_iso_8_2,
533 coding_category_iso_7_else,
534 coding_category_iso_8_else,
535 coding_category_utf_8_auto,
536 coding_category_utf_8_nosig,
537 coding_category_utf_8_sig,
538 coding_category_utf_16_auto,
539 coding_category_utf_16_be,
540 coding_category_utf_16_le,
541 coding_category_utf_16_be_nosig,
542 coding_category_utf_16_le_nosig,
543 coding_category_charset,
544 coding_category_sjis,
545 coding_category_big5,
546 coding_category_ccl,
547 coding_category_emacs_mule,
548 /* All above are targets of code detection. */
549 coding_category_raw_text,
550 coding_category_undecided,
551 coding_category_max
554 /* Definitions of flag bits used in detect_coding_XXXX. */
555 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
556 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
557 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
558 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
559 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
560 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
561 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
562 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
563 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
564 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
565 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
566 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
567 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
568 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
569 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
570 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
571 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
572 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
573 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
574 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
576 /* This value is returned if detect_coding_mask () find nothing other
577 than ASCII characters. */
578 #define CATEGORY_MASK_ANY \
579 (CATEGORY_MASK_ISO_7 \
580 | CATEGORY_MASK_ISO_7_TIGHT \
581 | CATEGORY_MASK_ISO_8_1 \
582 | CATEGORY_MASK_ISO_8_2 \
583 | CATEGORY_MASK_ISO_7_ELSE \
584 | CATEGORY_MASK_ISO_8_ELSE \
585 | CATEGORY_MASK_UTF_8_AUTO \
586 | CATEGORY_MASK_UTF_8_NOSIG \
587 | CATEGORY_MASK_UTF_8_SIG \
588 | CATEGORY_MASK_UTF_16_AUTO \
589 | CATEGORY_MASK_UTF_16_BE \
590 | CATEGORY_MASK_UTF_16_LE \
591 | CATEGORY_MASK_UTF_16_BE_NOSIG \
592 | CATEGORY_MASK_UTF_16_LE_NOSIG \
593 | CATEGORY_MASK_CHARSET \
594 | CATEGORY_MASK_SJIS \
595 | CATEGORY_MASK_BIG5 \
596 | CATEGORY_MASK_CCL \
597 | CATEGORY_MASK_EMACS_MULE)
600 #define CATEGORY_MASK_ISO_7BIT \
601 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
603 #define CATEGORY_MASK_ISO_8BIT \
604 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
606 #define CATEGORY_MASK_ISO_ELSE \
607 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
609 #define CATEGORY_MASK_ISO_ESCAPE \
610 (CATEGORY_MASK_ISO_7 \
611 | CATEGORY_MASK_ISO_7_TIGHT \
612 | CATEGORY_MASK_ISO_7_ELSE \
613 | CATEGORY_MASK_ISO_8_ELSE)
615 #define CATEGORY_MASK_ISO \
616 ( CATEGORY_MASK_ISO_7BIT \
617 | CATEGORY_MASK_ISO_8BIT \
618 | CATEGORY_MASK_ISO_ELSE)
620 #define CATEGORY_MASK_UTF_16 \
621 (CATEGORY_MASK_UTF_16_AUTO \
622 | CATEGORY_MASK_UTF_16_BE \
623 | CATEGORY_MASK_UTF_16_LE \
624 | CATEGORY_MASK_UTF_16_BE_NOSIG \
625 | CATEGORY_MASK_UTF_16_LE_NOSIG)
627 #define CATEGORY_MASK_UTF_8 \
628 (CATEGORY_MASK_UTF_8_AUTO \
629 | CATEGORY_MASK_UTF_8_NOSIG \
630 | CATEGORY_MASK_UTF_8_SIG)
632 /* Table of coding categories (Lisp symbols). This variable is for
633 internal use only. */
634 static Lisp_Object Vcoding_category_table;
636 /* Table of coding-categories ordered by priority. */
637 static enum coding_category coding_priorities[coding_category_max];
639 /* Nth element is a coding context for the coding system bound to the
640 Nth coding category. */
641 static struct coding_system coding_categories[coding_category_max];
643 /*** Commonly used macros and functions ***/
645 #ifndef min
646 #define min(a, b) ((a) < (b) ? (a) : (b))
647 #endif
648 #ifndef max
649 #define max(a, b) ((a) > (b) ? (a) : (b))
650 #endif
652 #define CODING_GET_INFO(coding, attrs, charset_list) \
653 do { \
654 (attrs) = CODING_ID_ATTRS ((coding)->id); \
655 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
656 } while (0)
659 /* Safely get one byte from the source text pointed by SRC which ends
660 at SRC_END, and set C to that byte. If there are not enough bytes
661 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
662 and a multibyte character is found at SRC, set C to the
663 negative value of the character code. The caller should declare
664 and set these variables appropriately in advance:
665 src, src_end, multibytep */
667 #define ONE_MORE_BYTE(c) \
668 do { \
669 if (src == src_end) \
671 if (src_base < src) \
672 record_conversion_result \
673 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
674 goto no_more_source; \
676 c = *src++; \
677 if (multibytep && (c & 0x80)) \
679 if ((c & 0xFE) == 0xC0) \
680 c = ((c & 1) << 6) | *src++; \
681 else \
683 src--; \
684 c = - string_char (src, &src, NULL); \
685 record_conversion_result \
686 (coding, CODING_RESULT_INVALID_SRC); \
689 consumed_chars++; \
690 } while (0)
692 /* Safely get two bytes from the source text pointed by SRC which ends
693 at SRC_END, and set C1 and C2 to those bytes while skipping the
694 heading multibyte characters. If there are not enough bytes in the
695 source, it jumps to 'no_more_source'. If MULTIBYTEP and
696 a multibyte character is found for C2, set C2 to the negative value
697 of the character code. The caller should declare and set these
698 variables appropriately in advance:
699 src, src_end, multibytep
700 It is intended that this macro is used in detect_coding_utf_16. */
702 #define TWO_MORE_BYTES(c1, c2) \
703 do { \
704 do { \
705 if (src == src_end) \
706 goto no_more_source; \
707 c1 = *src++; \
708 if (multibytep && (c1 & 0x80)) \
710 if ((c1 & 0xFE) == 0xC0) \
711 c1 = ((c1 & 1) << 6) | *src++; \
712 else \
714 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
715 c1 = -1; \
718 } while (c1 < 0); \
719 if (src == src_end) \
720 goto no_more_source; \
721 c2 = *src++; \
722 if (multibytep && (c2 & 0x80)) \
724 if ((c2 & 0xFE) == 0xC0) \
725 c2 = ((c2 & 1) << 6) | *src++; \
726 else \
727 c2 = -1; \
729 } while (0)
732 /* Store a byte C in the place pointed by DST and increment DST to the
733 next free point, and increment PRODUCED_CHARS. The caller should
734 assure that C is 0..127, and declare and set the variable `dst'
735 appropriately in advance.
739 #define EMIT_ONE_ASCII_BYTE(c) \
740 do { \
741 produced_chars++; \
742 *dst++ = (c); \
743 } while (0)
746 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
748 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
749 do { \
750 produced_chars += 2; \
751 *dst++ = (c1), *dst++ = (c2); \
752 } while (0)
755 /* Store a byte C in the place pointed by DST and increment DST to the
756 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
757 store in an appropriate multibyte form. The caller should
758 declare and set the variables `dst' and `multibytep' appropriately
759 in advance. */
761 #define EMIT_ONE_BYTE(c) \
762 do { \
763 produced_chars++; \
764 if (multibytep) \
766 unsigned ch = (c); \
767 if (ch >= 0x80) \
768 ch = BYTE8_TO_CHAR (ch); \
769 CHAR_STRING_ADVANCE (ch, dst); \
771 else \
772 *dst++ = (c); \
773 } while (0)
776 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
778 #define EMIT_TWO_BYTES(c1, c2) \
779 do { \
780 produced_chars += 2; \
781 if (multibytep) \
783 unsigned ch; \
785 ch = (c1); \
786 if (ch >= 0x80) \
787 ch = BYTE8_TO_CHAR (ch); \
788 CHAR_STRING_ADVANCE (ch, dst); \
789 ch = (c2); \
790 if (ch >= 0x80) \
791 ch = BYTE8_TO_CHAR (ch); \
792 CHAR_STRING_ADVANCE (ch, dst); \
794 else \
796 *dst++ = (c1); \
797 *dst++ = (c2); \
799 } while (0)
802 #define EMIT_THREE_BYTES(c1, c2, c3) \
803 do { \
804 EMIT_ONE_BYTE (c1); \
805 EMIT_TWO_BYTES (c2, c3); \
806 } while (0)
809 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
810 do { \
811 EMIT_TWO_BYTES (c1, c2); \
812 EMIT_TWO_BYTES (c3, c4); \
813 } while (0)
816 static void
817 record_conversion_result (struct coding_system *coding,
818 enum coding_result_code result)
820 coding->result = result;
821 switch (result)
823 case CODING_RESULT_INSUFFICIENT_SRC:
824 Vlast_code_conversion_error = Qinsufficient_source;
825 break;
826 case CODING_RESULT_INVALID_SRC:
827 Vlast_code_conversion_error = Qinvalid_source;
828 break;
829 case CODING_RESULT_INTERRUPT:
830 Vlast_code_conversion_error = Qinterrupted;
831 break;
832 case CODING_RESULT_INSUFFICIENT_DST:
833 /* Don't record this error in Vlast_code_conversion_error
834 because it happens just temporarily and is resolved when the
835 whole conversion is finished. */
836 break;
837 case CODING_RESULT_SUCCESS:
838 break;
839 default:
840 Vlast_code_conversion_error = intern ("Unknown error");
844 /* These wrapper macros are used to preserve validity of pointers into
845 buffer text across calls to decode_char, encode_char, etc, which
846 could cause relocation of buffers if it loads a charset map,
847 because loading a charset map allocates large structures. */
849 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
850 do { \
851 ptrdiff_t offset; \
853 charset_map_loaded = 0; \
854 c = DECODE_CHAR (charset, code); \
855 if (charset_map_loaded \
856 && (offset = coding_change_source (coding))) \
858 src += offset; \
859 src_base += offset; \
860 src_end += offset; \
862 } while (0)
864 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
865 do { \
866 ptrdiff_t offset; \
868 charset_map_loaded = 0; \
869 code = ENCODE_CHAR (charset, c); \
870 if (charset_map_loaded \
871 && (offset = coding_change_destination (coding))) \
873 dst += offset; \
874 dst_end += offset; \
876 } while (0)
878 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
879 do { \
880 ptrdiff_t offset; \
882 charset_map_loaded = 0; \
883 charset = char_charset (c, charset_list, code_return); \
884 if (charset_map_loaded \
885 && (offset = coding_change_destination (coding))) \
887 dst += offset; \
888 dst_end += offset; \
890 } while (0)
892 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
893 do { \
894 ptrdiff_t offset; \
896 charset_map_loaded = 0; \
897 result = CHAR_CHARSET_P (c, charset); \
898 if (charset_map_loaded \
899 && (offset = coding_change_destination (coding))) \
901 dst += offset; \
902 dst_end += offset; \
904 } while (0)
907 /* If there are at least BYTES length of room at dst, allocate memory
908 for coding->destination and update dst and dst_end. We don't have
909 to take care of coding->source which will be relocated. It is
910 handled by calling coding_set_source in encode_coding. */
912 #define ASSURE_DESTINATION(bytes) \
913 do { \
914 if (dst + (bytes) >= dst_end) \
916 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
918 dst = alloc_destination (coding, more_bytes, dst); \
919 dst_end = coding->destination + coding->dst_bytes; \
921 } while (0)
924 /* Store multibyte form of the character C in P, and advance P to the
925 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
926 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
927 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
929 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
931 /* Return the character code of character whose multibyte form is at
932 P, and advance P to the end of the multibyte form. This used to be
933 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
934 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
936 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
938 /* Set coding->source from coding->src_object. */
940 static void
941 coding_set_source (struct coding_system *coding)
943 if (BUFFERP (coding->src_object))
945 struct buffer *buf = XBUFFER (coding->src_object);
947 if (coding->src_pos < 0)
948 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
949 else
950 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
952 else if (STRINGP (coding->src_object))
954 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
956 else
958 /* Otherwise, the source is C string and is never relocated
959 automatically. Thus we don't have to update anything. */
964 /* Set coding->source from coding->src_object, and return how many
965 bytes coding->source was changed. */
967 static ptrdiff_t
968 coding_change_source (struct coding_system *coding)
970 const unsigned char *orig = coding->source;
971 coding_set_source (coding);
972 return coding->source - orig;
976 /* Set coding->destination from coding->dst_object. */
978 static void
979 coding_set_destination (struct coding_system *coding)
981 if (BUFFERP (coding->dst_object))
983 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
985 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
986 coding->dst_bytes = (GAP_END_ADDR
987 - (coding->src_bytes - coding->consumed)
988 - coding->destination);
990 else
992 /* We are sure that coding->dst_pos_byte is before the gap
993 of the buffer. */
994 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
995 + coding->dst_pos_byte - BEG_BYTE);
996 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
997 - coding->destination);
1000 else
1002 /* Otherwise, the destination is C string and is never relocated
1003 automatically. Thus we don't have to update anything. */
1008 /* Set coding->destination from coding->dst_object, and return how
1009 many bytes coding->destination was changed. */
1011 static ptrdiff_t
1012 coding_change_destination (struct coding_system *coding)
1014 const unsigned char *orig = coding->destination;
1015 coding_set_destination (coding);
1016 return coding->destination - orig;
1020 static void
1021 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1023 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1024 string_overflow ();
1025 coding->destination = xrealloc (coding->destination,
1026 coding->dst_bytes + bytes);
1027 coding->dst_bytes += bytes;
1030 static void
1031 coding_alloc_by_making_gap (struct coding_system *coding,
1032 ptrdiff_t gap_head_used, ptrdiff_t bytes)
1034 if (EQ (coding->src_object, coding->dst_object))
1036 /* The gap may contain the produced data at the head and not-yet
1037 consumed data at the tail. To preserve those data, we at
1038 first make the gap size to zero, then increase the gap
1039 size. */
1040 ptrdiff_t add = GAP_SIZE;
1042 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1043 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1044 make_gap (bytes);
1045 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1046 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1048 else
1049 make_gap_1 (XBUFFER (coding->dst_object), bytes);
1053 static unsigned char *
1054 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1055 unsigned char *dst)
1057 ptrdiff_t offset = dst - coding->destination;
1059 if (BUFFERP (coding->dst_object))
1061 struct buffer *buf = XBUFFER (coding->dst_object);
1063 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1065 else
1066 coding_alloc_by_realloc (coding, nbytes);
1067 coding_set_destination (coding);
1068 dst = coding->destination + offset;
1069 return dst;
1072 /** Macros for annotations. */
1074 /* An annotation data is stored in the array coding->charbuf in this
1075 format:
1076 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1077 LENGTH is the number of elements in the annotation.
1078 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1079 NCHARS is the number of characters in the text annotated.
1081 The format of the following elements depend on ANNOTATION_MASK.
1083 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1084 follows:
1085 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1087 NBYTES is the number of bytes specified in the header part of
1088 old-style emacs-mule encoding, or 0 for the other kind of
1089 composition.
1091 METHOD is one of enum composition_method.
1093 Optional COMPOSITION-COMPONENTS are characters and composition
1094 rules.
1096 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1097 follows.
1099 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1100 recover from an invalid annotation, and should be skipped by
1101 produce_annotation. */
1103 /* Maximum length of the header of annotation data. */
1104 #define MAX_ANNOTATION_LENGTH 5
1106 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1107 do { \
1108 *(buf)++ = -(len); \
1109 *(buf)++ = (mask); \
1110 *(buf)++ = (nchars); \
1111 coding->annotated = 1; \
1112 } while (0);
1114 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1115 do { \
1116 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1117 *buf++ = nbytes; \
1118 *buf++ = method; \
1119 } while (0)
1122 #define ADD_CHARSET_DATA(buf, nchars, id) \
1123 do { \
1124 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1125 *buf++ = id; \
1126 } while (0)
1129 /* Bitmasks for coding->eol_seen. */
1131 #define EOL_SEEN_NONE 0
1132 #define EOL_SEEN_LF 1
1133 #define EOL_SEEN_CR 2
1134 #define EOL_SEEN_CRLF 4
1137 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1142 /*** 3. UTF-8 ***/
1144 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1145 Return true if a text is encoded in UTF-8. */
1147 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1148 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1149 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1150 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1151 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1152 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1154 #define UTF_8_BOM_1 0xEF
1155 #define UTF_8_BOM_2 0xBB
1156 #define UTF_8_BOM_3 0xBF
1158 /* Unlike the other detect_coding_XXX, this function counts number of
1159 characters and check EOL format. */
1161 static bool
1162 detect_coding_utf_8 (struct coding_system *coding,
1163 struct coding_detection_info *detect_info)
1165 const unsigned char *src = coding->source, *src_base;
1166 const unsigned char *src_end = coding->source + coding->src_bytes;
1167 bool multibytep = coding->src_multibyte;
1168 ptrdiff_t consumed_chars = 0;
1169 bool bom_found = 0;
1170 int nchars = coding->head_ascii;
1171 int eol_seen = coding->eol_seen;
1173 detect_info->checked |= CATEGORY_MASK_UTF_8;
1174 /* A coding system of this category is always ASCII compatible. */
1175 src += nchars;
1177 if (src == coding->source /* BOM should be at the head. */
1178 && src + 3 < src_end /* BOM is 3-byte long. */
1179 && src[0] == UTF_8_BOM_1
1180 && src[1] == UTF_8_BOM_2
1181 && src[2] == UTF_8_BOM_3)
1183 bom_found = 1;
1184 src += 3;
1185 nchars++;
1188 while (1)
1190 int c, c1, c2, c3, c4;
1192 src_base = src;
1193 ONE_MORE_BYTE (c);
1194 if (c < 0 || UTF_8_1_OCTET_P (c))
1196 nchars++;
1197 if (c == '\r')
1199 if (src < src_end && *src == '\n')
1201 eol_seen |= EOL_SEEN_CRLF;
1202 src++;
1203 nchars++;
1205 else
1206 eol_seen |= EOL_SEEN_CR;
1208 else if (c == '\n')
1209 eol_seen |= EOL_SEEN_LF;
1210 continue;
1212 ONE_MORE_BYTE (c1);
1213 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1214 break;
1215 if (UTF_8_2_OCTET_LEADING_P (c))
1217 nchars++;
1218 continue;
1220 ONE_MORE_BYTE (c2);
1221 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1222 break;
1223 if (UTF_8_3_OCTET_LEADING_P (c))
1225 nchars++;
1226 continue;
1228 ONE_MORE_BYTE (c3);
1229 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1230 break;
1231 if (UTF_8_4_OCTET_LEADING_P (c))
1233 nchars++;
1234 continue;
1236 ONE_MORE_BYTE (c4);
1237 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1238 break;
1239 if (UTF_8_5_OCTET_LEADING_P (c))
1241 nchars++;
1242 continue;
1244 break;
1246 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1247 return 0;
1249 no_more_source:
1250 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1252 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1253 return 0;
1255 if (bom_found)
1257 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1258 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1260 else
1262 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1263 if (nchars < src_end - coding->source)
1264 /* The found characters are less than source bytes, which
1265 means that we found a valid non-ASCII characters. */
1266 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1268 coding->detected_utf8_chars = nchars;
1269 return 1;
1273 static void
1274 decode_coding_utf_8 (struct coding_system *coding)
1276 const unsigned char *src = coding->source + coding->consumed;
1277 const unsigned char *src_end = coding->source + coding->src_bytes;
1278 const unsigned char *src_base;
1279 int *charbuf = coding->charbuf + coding->charbuf_used;
1280 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1281 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1282 bool multibytep = coding->src_multibyte;
1283 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1284 bool eol_dos
1285 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1286 int byte_after_cr = -1;
1288 if (bom != utf_without_bom)
1290 int c1, c2, c3;
1292 src_base = src;
1293 ONE_MORE_BYTE (c1);
1294 if (! UTF_8_3_OCTET_LEADING_P (c1))
1295 src = src_base;
1296 else
1298 ONE_MORE_BYTE (c2);
1299 if (! UTF_8_EXTRA_OCTET_P (c2))
1300 src = src_base;
1301 else
1303 ONE_MORE_BYTE (c3);
1304 if (! UTF_8_EXTRA_OCTET_P (c3))
1305 src = src_base;
1306 else
1308 if ((c1 != UTF_8_BOM_1)
1309 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1310 src = src_base;
1311 else
1312 CODING_UTF_8_BOM (coding) = utf_without_bom;
1317 CODING_UTF_8_BOM (coding) = utf_without_bom;
1319 while (1)
1321 int c, c1, c2, c3, c4, c5;
1323 src_base = src;
1324 consumed_chars_base = consumed_chars;
1326 if (charbuf >= charbuf_end)
1328 if (byte_after_cr >= 0)
1329 src_base--;
1330 break;
1333 if (byte_after_cr >= 0)
1334 c1 = byte_after_cr, byte_after_cr = -1;
1335 else
1336 ONE_MORE_BYTE (c1);
1337 if (c1 < 0)
1339 c = - c1;
1341 else if (UTF_8_1_OCTET_P (c1))
1343 if (eol_dos && c1 == '\r')
1344 ONE_MORE_BYTE (byte_after_cr);
1345 c = c1;
1347 else
1349 ONE_MORE_BYTE (c2);
1350 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1351 goto invalid_code;
1352 if (UTF_8_2_OCTET_LEADING_P (c1))
1354 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1355 /* Reject overlong sequences here and below. Encoders
1356 producing them are incorrect, they can be misleading,
1357 and they mess up read/write invariance. */
1358 if (c < 128)
1359 goto invalid_code;
1361 else
1363 ONE_MORE_BYTE (c3);
1364 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1365 goto invalid_code;
1366 if (UTF_8_3_OCTET_LEADING_P (c1))
1368 c = (((c1 & 0xF) << 12)
1369 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1370 if (c < 0x800
1371 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1372 goto invalid_code;
1374 else
1376 ONE_MORE_BYTE (c4);
1377 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1378 goto invalid_code;
1379 if (UTF_8_4_OCTET_LEADING_P (c1))
1381 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1382 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1383 if (c < 0x10000)
1384 goto invalid_code;
1386 else
1388 ONE_MORE_BYTE (c5);
1389 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1390 goto invalid_code;
1391 if (UTF_8_5_OCTET_LEADING_P (c1))
1393 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1394 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1395 | (c5 & 0x3F));
1396 if ((c > MAX_CHAR) || (c < 0x200000))
1397 goto invalid_code;
1399 else
1400 goto invalid_code;
1406 *charbuf++ = c;
1407 continue;
1409 invalid_code:
1410 src = src_base;
1411 consumed_chars = consumed_chars_base;
1412 ONE_MORE_BYTE (c);
1413 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1414 coding->errors++;
1417 no_more_source:
1418 coding->consumed_char += consumed_chars_base;
1419 coding->consumed = src_base - coding->source;
1420 coding->charbuf_used = charbuf - coding->charbuf;
1424 static bool
1425 encode_coding_utf_8 (struct coding_system *coding)
1427 bool multibytep = coding->dst_multibyte;
1428 int *charbuf = coding->charbuf;
1429 int *charbuf_end = charbuf + coding->charbuf_used;
1430 unsigned char *dst = coding->destination + coding->produced;
1431 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1432 ptrdiff_t produced_chars = 0;
1433 int c;
1435 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1437 ASSURE_DESTINATION (3);
1438 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1439 CODING_UTF_8_BOM (coding) = utf_without_bom;
1442 if (multibytep)
1444 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1446 while (charbuf < charbuf_end)
1448 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1450 ASSURE_DESTINATION (safe_room);
1451 c = *charbuf++;
1452 if (CHAR_BYTE8_P (c))
1454 c = CHAR_TO_BYTE8 (c);
1455 EMIT_ONE_BYTE (c);
1457 else
1459 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1460 for (p = str; p < pend; p++)
1461 EMIT_ONE_BYTE (*p);
1465 else
1467 int safe_room = MAX_MULTIBYTE_LENGTH;
1469 while (charbuf < charbuf_end)
1471 ASSURE_DESTINATION (safe_room);
1472 c = *charbuf++;
1473 if (CHAR_BYTE8_P (c))
1474 *dst++ = CHAR_TO_BYTE8 (c);
1475 else
1476 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1477 produced_chars++;
1480 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1481 coding->produced_char += produced_chars;
1482 coding->produced = dst - coding->destination;
1483 return 0;
1487 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1488 Return true if a text is encoded in one of UTF-16 based coding systems. */
1490 #define UTF_16_HIGH_SURROGATE_P(val) \
1491 (((val) & 0xFC00) == 0xD800)
1493 #define UTF_16_LOW_SURROGATE_P(val) \
1494 (((val) & 0xFC00) == 0xDC00)
1497 static bool
1498 detect_coding_utf_16 (struct coding_system *coding,
1499 struct coding_detection_info *detect_info)
1501 const unsigned char *src = coding->source;
1502 const unsigned char *src_end = coding->source + coding->src_bytes;
1503 bool multibytep = coding->src_multibyte;
1504 int c1, c2;
1506 detect_info->checked |= CATEGORY_MASK_UTF_16;
1507 if (coding->mode & CODING_MODE_LAST_BLOCK
1508 && (coding->src_chars & 1))
1510 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1511 return 0;
1514 TWO_MORE_BYTES (c1, c2);
1515 if ((c1 == 0xFF) && (c2 == 0xFE))
1517 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1518 | CATEGORY_MASK_UTF_16_AUTO);
1519 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1520 | CATEGORY_MASK_UTF_16_BE_NOSIG
1521 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1523 else if ((c1 == 0xFE) && (c2 == 0xFF))
1525 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1526 | CATEGORY_MASK_UTF_16_AUTO);
1527 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1528 | CATEGORY_MASK_UTF_16_BE_NOSIG
1529 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1531 else if (c2 < 0)
1533 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1534 return 0;
1536 else
1538 /* We check the dispersion of Eth and Oth bytes where E is even and
1539 O is odd. If both are high, we assume binary data.*/
1540 unsigned char e[256], o[256];
1541 unsigned e_num = 1, o_num = 1;
1543 memset (e, 0, 256);
1544 memset (o, 0, 256);
1545 e[c1] = 1;
1546 o[c2] = 1;
1548 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1549 |CATEGORY_MASK_UTF_16_BE
1550 | CATEGORY_MASK_UTF_16_LE);
1552 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1553 != CATEGORY_MASK_UTF_16)
1555 TWO_MORE_BYTES (c1, c2);
1556 if (c2 < 0)
1557 break;
1558 if (! e[c1])
1560 e[c1] = 1;
1561 e_num++;
1562 if (e_num >= 128)
1563 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1565 if (! o[c2])
1567 o[c2] = 1;
1568 o_num++;
1569 if (o_num >= 128)
1570 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1573 return 0;
1576 no_more_source:
1577 return 1;
1580 static void
1581 decode_coding_utf_16 (struct coding_system *coding)
1583 const unsigned char *src = coding->source + coding->consumed;
1584 const unsigned char *src_end = coding->source + coding->src_bytes;
1585 const unsigned char *src_base;
1586 int *charbuf = coding->charbuf + coding->charbuf_used;
1587 /* We may produces at most 3 chars in one loop. */
1588 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1589 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1590 bool multibytep = coding->src_multibyte;
1591 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1592 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1593 int surrogate = CODING_UTF_16_SURROGATE (coding);
1594 bool eol_dos
1595 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1596 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1598 if (bom == utf_with_bom)
1600 int c, c1, c2;
1602 src_base = src;
1603 ONE_MORE_BYTE (c1);
1604 ONE_MORE_BYTE (c2);
1605 c = (c1 << 8) | c2;
1607 if (endian == utf_16_big_endian
1608 ? c != 0xFEFF : c != 0xFFFE)
1610 /* The first two bytes are not BOM. Treat them as bytes
1611 for a normal character. */
1612 src = src_base;
1613 coding->errors++;
1615 CODING_UTF_16_BOM (coding) = utf_without_bom;
1617 else if (bom == utf_detect_bom)
1619 /* We have already tried to detect BOM and failed in
1620 detect_coding. */
1621 CODING_UTF_16_BOM (coding) = utf_without_bom;
1624 while (1)
1626 int c, c1, c2;
1628 src_base = src;
1629 consumed_chars_base = consumed_chars;
1631 if (charbuf >= charbuf_end)
1633 if (byte_after_cr1 >= 0)
1634 src_base -= 2;
1635 break;
1638 if (byte_after_cr1 >= 0)
1639 c1 = byte_after_cr1, byte_after_cr1 = -1;
1640 else
1641 ONE_MORE_BYTE (c1);
1642 if (c1 < 0)
1644 *charbuf++ = -c1;
1645 continue;
1647 if (byte_after_cr2 >= 0)
1648 c2 = byte_after_cr2, byte_after_cr2 = -1;
1649 else
1650 ONE_MORE_BYTE (c2);
1651 if (c2 < 0)
1653 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1654 *charbuf++ = -c2;
1655 continue;
1657 c = (endian == utf_16_big_endian
1658 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1660 if (surrogate)
1662 if (! UTF_16_LOW_SURROGATE_P (c))
1664 if (endian == utf_16_big_endian)
1665 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1666 else
1667 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1668 *charbuf++ = c1;
1669 *charbuf++ = c2;
1670 coding->errors++;
1671 if (UTF_16_HIGH_SURROGATE_P (c))
1672 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1673 else
1674 *charbuf++ = c;
1676 else
1678 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1679 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1680 *charbuf++ = 0x10000 + c;
1683 else
1685 if (UTF_16_HIGH_SURROGATE_P (c))
1686 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1687 else
1689 if (eol_dos && c == '\r')
1691 ONE_MORE_BYTE (byte_after_cr1);
1692 ONE_MORE_BYTE (byte_after_cr2);
1694 *charbuf++ = c;
1699 no_more_source:
1700 coding->consumed_char += consumed_chars_base;
1701 coding->consumed = src_base - coding->source;
1702 coding->charbuf_used = charbuf - coding->charbuf;
1705 static bool
1706 encode_coding_utf_16 (struct coding_system *coding)
1708 bool multibytep = coding->dst_multibyte;
1709 int *charbuf = coding->charbuf;
1710 int *charbuf_end = charbuf + coding->charbuf_used;
1711 unsigned char *dst = coding->destination + coding->produced;
1712 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1713 int safe_room = 8;
1714 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1715 bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1716 ptrdiff_t produced_chars = 0;
1717 int c;
1719 if (bom != utf_without_bom)
1721 ASSURE_DESTINATION (safe_room);
1722 if (big_endian)
1723 EMIT_TWO_BYTES (0xFE, 0xFF);
1724 else
1725 EMIT_TWO_BYTES (0xFF, 0xFE);
1726 CODING_UTF_16_BOM (coding) = utf_without_bom;
1729 while (charbuf < charbuf_end)
1731 ASSURE_DESTINATION (safe_room);
1732 c = *charbuf++;
1733 if (c > MAX_UNICODE_CHAR)
1734 c = coding->default_char;
1736 if (c < 0x10000)
1738 if (big_endian)
1739 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1740 else
1741 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1743 else
1745 int c1, c2;
1747 c -= 0x10000;
1748 c1 = (c >> 10) + 0xD800;
1749 c2 = (c & 0x3FF) + 0xDC00;
1750 if (big_endian)
1751 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1752 else
1753 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1756 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1757 coding->produced = dst - coding->destination;
1758 coding->produced_char += produced_chars;
1759 return 0;
1763 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1765 /* Emacs' internal format for representation of multiple character
1766 sets is a kind of multi-byte encoding, i.e. characters are
1767 represented by variable-length sequences of one-byte codes.
1769 ASCII characters and control characters (e.g. `tab', `newline') are
1770 represented by one-byte sequences which are their ASCII codes, in
1771 the range 0x00 through 0x7F.
1773 8-bit characters of the range 0x80..0x9F are represented by
1774 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1775 code + 0x20).
1777 8-bit characters of the range 0xA0..0xFF are represented by
1778 one-byte sequences which are their 8-bit code.
1780 The other characters are represented by a sequence of `base
1781 leading-code', optional `extended leading-code', and one or two
1782 `position-code's. The length of the sequence is determined by the
1783 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1784 whereas extended leading-code and position-code take the range 0xA0
1785 through 0xFF. See `charset.h' for more details about leading-code
1786 and position-code.
1788 --- CODE RANGE of Emacs' internal format ---
1789 character set range
1790 ------------- -----
1791 ascii 0x00..0x7F
1792 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1793 eight-bit-graphic 0xA0..0xBF
1794 ELSE 0x81..0x9D + [0xA0..0xFF]+
1795 ---------------------------------------------
1797 As this is the internal character representation, the format is
1798 usually not used externally (i.e. in a file or in a data sent to a
1799 process). But, it is possible to have a text externally in this
1800 format (i.e. by encoding by the coding system `emacs-mule').
1802 In that case, a sequence of one-byte codes has a slightly different
1803 form.
1805 At first, all characters in eight-bit-control are represented by
1806 one-byte sequences which are their 8-bit code.
1808 Next, character composition data are represented by the byte
1809 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1810 where,
1811 METHOD is 0xF2 plus one of composition method (enum
1812 composition_method),
1814 BYTES is 0xA0 plus a byte length of this composition data,
1816 CHARS is 0xA0 plus a number of characters composed by this
1817 data,
1819 COMPONENTs are characters of multibyte form or composition
1820 rules encoded by two-byte of ASCII codes.
1822 In addition, for backward compatibility, the following formats are
1823 also recognized as composition data on decoding.
1825 0x80 MSEQ ...
1826 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1828 Here,
1829 MSEQ is a multibyte form but in these special format:
1830 ASCII: 0xA0 ASCII_CODE+0x80,
1831 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1832 RULE is a one byte code of the range 0xA0..0xF0 that
1833 represents a composition rule.
1836 char emacs_mule_bytes[256];
1839 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1840 Return true if a text is encoded in 'emacs-mule'. */
1842 static bool
1843 detect_coding_emacs_mule (struct coding_system *coding,
1844 struct coding_detection_info *detect_info)
1846 const unsigned char *src = coding->source, *src_base;
1847 const unsigned char *src_end = coding->source + coding->src_bytes;
1848 bool multibytep = coding->src_multibyte;
1849 ptrdiff_t consumed_chars = 0;
1850 int c;
1851 int found = 0;
1853 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1854 /* A coding system of this category is always ASCII compatible. */
1855 src += coding->head_ascii;
1857 while (1)
1859 src_base = src;
1860 ONE_MORE_BYTE (c);
1861 if (c < 0)
1862 continue;
1863 if (c == 0x80)
1865 /* Perhaps the start of composite character. We simply skip
1866 it because analyzing it is too heavy for detecting. But,
1867 at least, we check that the composite character
1868 constitutes of more than 4 bytes. */
1869 const unsigned char *src_start;
1871 repeat:
1872 src_start = src;
1875 ONE_MORE_BYTE (c);
1877 while (c >= 0xA0);
1879 if (src - src_start <= 4)
1880 break;
1881 found = CATEGORY_MASK_EMACS_MULE;
1882 if (c == 0x80)
1883 goto repeat;
1886 if (c < 0x80)
1888 if (c < 0x20
1889 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1890 break;
1892 else
1894 int more_bytes = emacs_mule_bytes[c] - 1;
1896 while (more_bytes > 0)
1898 ONE_MORE_BYTE (c);
1899 if (c < 0xA0)
1901 src--; /* Unread the last byte. */
1902 break;
1904 more_bytes--;
1906 if (more_bytes != 0)
1907 break;
1908 found = CATEGORY_MASK_EMACS_MULE;
1911 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1912 return 0;
1914 no_more_source:
1915 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1917 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1918 return 0;
1920 detect_info->found |= found;
1921 return 1;
1925 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1926 character. If CMP_STATUS indicates that we must expect MSEQ or
1927 RULE described above, decode it and return the negative value of
1928 the decoded character or rule. If an invalid byte is found, return
1929 -1. If SRC is too short, return -2. */
1931 static int
1932 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1933 int *nbytes, int *nchars, int *id,
1934 struct composition_status *cmp_status)
1936 const unsigned char *src_end = coding->source + coding->src_bytes;
1937 const unsigned char *src_base = src;
1938 bool multibytep = coding->src_multibyte;
1939 int charset_ID;
1940 unsigned code;
1941 int c;
1942 int consumed_chars = 0;
1943 bool mseq_found = 0;
1945 ONE_MORE_BYTE (c);
1946 if (c < 0)
1948 c = -c;
1949 charset_ID = emacs_mule_charset[0];
1951 else
1953 if (c >= 0xA0)
1955 if (cmp_status->state != COMPOSING_NO
1956 && cmp_status->old_form)
1958 if (cmp_status->state == COMPOSING_CHAR)
1960 if (c == 0xA0)
1962 ONE_MORE_BYTE (c);
1963 c -= 0x80;
1964 if (c < 0)
1965 goto invalid_code;
1967 else
1968 c -= 0x20;
1969 mseq_found = 1;
1971 else
1973 *nbytes = src - src_base;
1974 *nchars = consumed_chars;
1975 return -c;
1978 else
1979 goto invalid_code;
1982 switch (emacs_mule_bytes[c])
1984 case 2:
1985 if ((charset_ID = emacs_mule_charset[c]) < 0)
1986 goto invalid_code;
1987 ONE_MORE_BYTE (c);
1988 if (c < 0xA0)
1989 goto invalid_code;
1990 code = c & 0x7F;
1991 break;
1993 case 3:
1994 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1995 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1997 ONE_MORE_BYTE (c);
1998 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1999 goto invalid_code;
2000 ONE_MORE_BYTE (c);
2001 if (c < 0xA0)
2002 goto invalid_code;
2003 code = c & 0x7F;
2005 else
2007 if ((charset_ID = emacs_mule_charset[c]) < 0)
2008 goto invalid_code;
2009 ONE_MORE_BYTE (c);
2010 if (c < 0xA0)
2011 goto invalid_code;
2012 code = (c & 0x7F) << 8;
2013 ONE_MORE_BYTE (c);
2014 if (c < 0xA0)
2015 goto invalid_code;
2016 code |= c & 0x7F;
2018 break;
2020 case 4:
2021 ONE_MORE_BYTE (c);
2022 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2023 goto invalid_code;
2024 ONE_MORE_BYTE (c);
2025 if (c < 0xA0)
2026 goto invalid_code;
2027 code = (c & 0x7F) << 8;
2028 ONE_MORE_BYTE (c);
2029 if (c < 0xA0)
2030 goto invalid_code;
2031 code |= c & 0x7F;
2032 break;
2034 case 1:
2035 code = c;
2036 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2037 break;
2039 default:
2040 emacs_abort ();
2042 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2043 CHARSET_FROM_ID (charset_ID), code, c);
2044 if (c < 0)
2045 goto invalid_code;
2047 *nbytes = src - src_base;
2048 *nchars = consumed_chars;
2049 if (id)
2050 *id = charset_ID;
2051 return (mseq_found ? -c : c);
2053 no_more_source:
2054 return -2;
2056 invalid_code:
2057 return -1;
2061 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2063 /* Handle these composition sequence ('|': the end of header elements,
2064 BYTES and CHARS >= 0xA0):
2066 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2067 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2068 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2070 and these old form:
2072 (4) relative composition: 0x80 | MSEQ ... MSEQ
2073 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2075 When the starter 0x80 and the following header elements are found,
2076 this annotation header is produced.
2078 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2080 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2081 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2083 Then, upon reading the following elements, these codes are produced
2084 until the composition end is found:
2086 (1) CHAR ... CHAR
2087 (2) ALT ... ALT CHAR ... CHAR
2088 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2089 (4) CHAR ... CHAR
2090 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2092 When the composition end is found, LENGTH and NCHARS in the
2093 annotation header is updated as below:
2095 (1) LENGTH: unchanged, NCHARS: unchanged
2096 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2097 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2098 (4) LENGTH: unchanged, NCHARS: number of CHARs
2099 (5) LENGTH: unchanged, NCHARS: number of CHARs
2101 If an error is found while composing, the annotation header is
2102 changed to the original composition header (plus filler -1s) as
2103 below:
2105 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2106 (5) [ 0x80 0xFF -1 -1- -1 ]
2108 and the sequence [ -2 DECODED-RULE ] is changed to the original
2109 byte sequence as below:
2110 o the original byte sequence is B: [ B -1 ]
2111 o the original byte sequence is B1 B2: [ B1 B2 ]
2113 Most of the routines are implemented by macros because many
2114 variables and labels in the caller decode_coding_emacs_mule must be
2115 accessible, and they are usually called just once (thus doesn't
2116 increase the size of compiled object). */
2118 /* Decode a composition rule represented by C as a component of
2119 composition sequence of Emacs 20 style. Set RULE to the decoded
2120 rule. */
2122 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2123 do { \
2124 int gref, nref; \
2126 c -= 0xA0; \
2127 if (c < 0 || c >= 81) \
2128 goto invalid_code; \
2129 gref = c / 9, nref = c % 9; \
2130 if (gref == 4) gref = 10; \
2131 if (nref == 4) nref = 10; \
2132 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2133 } while (0)
2136 /* Decode a composition rule represented by C and the following byte
2137 at SRC as a component of composition sequence of Emacs 21 style.
2138 Set RULE to the decoded rule. */
2140 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2141 do { \
2142 int gref, nref; \
2144 gref = c - 0x20; \
2145 if (gref < 0 || gref >= 81) \
2146 goto invalid_code; \
2147 ONE_MORE_BYTE (c); \
2148 nref = c - 0x20; \
2149 if (nref < 0 || nref >= 81) \
2150 goto invalid_code; \
2151 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2152 } while (0)
2155 /* Start of Emacs 21 style format. The first three bytes at SRC are
2156 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2157 byte length of this composition information, CHARS is the number of
2158 characters composed by this composition. */
2160 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2161 do { \
2162 enum composition_method method = c - 0xF2; \
2163 int nbytes, nchars; \
2165 ONE_MORE_BYTE (c); \
2166 if (c < 0) \
2167 goto invalid_code; \
2168 nbytes = c - 0xA0; \
2169 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2170 goto invalid_code; \
2171 ONE_MORE_BYTE (c); \
2172 nchars = c - 0xA0; \
2173 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2174 goto invalid_code; \
2175 cmp_status->old_form = 0; \
2176 cmp_status->method = method; \
2177 if (method == COMPOSITION_RELATIVE) \
2178 cmp_status->state = COMPOSING_CHAR; \
2179 else \
2180 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2181 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2182 cmp_status->nchars = nchars; \
2183 cmp_status->ncomps = nbytes - 4; \
2184 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2185 } while (0)
2188 /* Start of Emacs 20 style format for relative composition. */
2190 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2191 do { \
2192 cmp_status->old_form = 1; \
2193 cmp_status->method = COMPOSITION_RELATIVE; \
2194 cmp_status->state = COMPOSING_CHAR; \
2195 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2196 cmp_status->nchars = cmp_status->ncomps = 0; \
2197 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2198 } while (0)
2201 /* Start of Emacs 20 style format for rule-base composition. */
2203 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2204 do { \
2205 cmp_status->old_form = 1; \
2206 cmp_status->method = COMPOSITION_WITH_RULE; \
2207 cmp_status->state = COMPOSING_CHAR; \
2208 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2209 cmp_status->nchars = cmp_status->ncomps = 0; \
2210 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2211 } while (0)
2214 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2215 do { \
2216 const unsigned char *current_src = src; \
2218 ONE_MORE_BYTE (c); \
2219 if (c < 0) \
2220 goto invalid_code; \
2221 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2222 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2223 DECODE_EMACS_MULE_21_COMPOSITION (); \
2224 else if (c < 0xA0) \
2225 goto invalid_code; \
2226 else if (c < 0xC0) \
2228 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2229 /* Re-read C as a composition component. */ \
2230 src = current_src; \
2232 else if (c == 0xFF) \
2233 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2234 else \
2235 goto invalid_code; \
2236 } while (0)
2238 #define EMACS_MULE_COMPOSITION_END() \
2239 do { \
2240 int idx = - cmp_status->length; \
2242 if (cmp_status->old_form) \
2243 charbuf[idx + 2] = cmp_status->nchars; \
2244 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2245 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2246 cmp_status->state = COMPOSING_NO; \
2247 } while (0)
2250 static int
2251 emacs_mule_finish_composition (int *charbuf,
2252 struct composition_status *cmp_status)
2254 int idx = - cmp_status->length;
2255 int new_chars;
2257 if (cmp_status->old_form && cmp_status->nchars > 0)
2259 charbuf[idx + 2] = cmp_status->nchars;
2260 new_chars = 0;
2261 if (cmp_status->method == COMPOSITION_WITH_RULE
2262 && cmp_status->state == COMPOSING_CHAR)
2264 /* The last rule was invalid. */
2265 int rule = charbuf[-1] + 0xA0;
2267 charbuf[-2] = BYTE8_TO_CHAR (rule);
2268 charbuf[-1] = -1;
2269 new_chars = 1;
2272 else
2274 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2276 if (cmp_status->method == COMPOSITION_WITH_RULE)
2278 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2279 charbuf[idx++] = -3;
2280 charbuf[idx++] = 0;
2281 new_chars = 1;
2283 else
2285 int nchars = charbuf[idx + 1] + 0xA0;
2286 int nbytes = charbuf[idx + 2] + 0xA0;
2288 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2289 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2290 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2291 charbuf[idx++] = -1;
2292 new_chars = 4;
2295 cmp_status->state = COMPOSING_NO;
2296 return new_chars;
2299 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2300 do { \
2301 if (cmp_status->state != COMPOSING_NO) \
2302 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2303 } while (0)
2306 static void
2307 decode_coding_emacs_mule (struct coding_system *coding)
2309 const unsigned char *src = coding->source + coding->consumed;
2310 const unsigned char *src_end = coding->source + coding->src_bytes;
2311 const unsigned char *src_base;
2312 int *charbuf = coding->charbuf + coding->charbuf_used;
2313 /* We may produce two annotations (charset and composition) in one
2314 loop and one more charset annotation at the end. */
2315 int *charbuf_end
2316 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2317 /* We can produce up to 2 characters in a loop. */
2318 - 1;
2319 ptrdiff_t consumed_chars = 0, consumed_chars_base;
2320 bool multibytep = coding->src_multibyte;
2321 ptrdiff_t char_offset = coding->produced_char;
2322 ptrdiff_t last_offset = char_offset;
2323 int last_id = charset_ascii;
2324 bool eol_dos
2325 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2326 int byte_after_cr = -1;
2327 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2329 if (cmp_status->state != COMPOSING_NO)
2331 int i;
2333 if (charbuf_end - charbuf < cmp_status->length)
2334 emacs_abort ();
2335 for (i = 0; i < cmp_status->length; i++)
2336 *charbuf++ = cmp_status->carryover[i];
2337 coding->annotated = 1;
2340 while (1)
2342 int c, id IF_LINT (= 0);
2344 src_base = src;
2345 consumed_chars_base = consumed_chars;
2347 if (charbuf >= charbuf_end)
2349 if (byte_after_cr >= 0)
2350 src_base--;
2351 break;
2354 if (byte_after_cr >= 0)
2355 c = byte_after_cr, byte_after_cr = -1;
2356 else
2357 ONE_MORE_BYTE (c);
2359 if (c < 0 || c == 0x80)
2361 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2362 if (c < 0)
2364 *charbuf++ = -c;
2365 char_offset++;
2367 else
2368 DECODE_EMACS_MULE_COMPOSITION_START ();
2369 continue;
2372 if (c < 0x80)
2374 if (eol_dos && c == '\r')
2375 ONE_MORE_BYTE (byte_after_cr);
2376 id = charset_ascii;
2377 if (cmp_status->state != COMPOSING_NO)
2379 if (cmp_status->old_form)
2380 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2381 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2382 cmp_status->ncomps--;
2385 else
2387 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2388 /* emacs_mule_char can load a charset map from a file, which
2389 allocates a large structure and might cause buffer text
2390 to be relocated as result. Thus, we need to remember the
2391 original pointer to buffer text, and fix up all related
2392 pointers after the call. */
2393 const unsigned char *orig = coding->source;
2394 ptrdiff_t offset;
2396 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2397 cmp_status);
2398 offset = coding->source - orig;
2399 if (offset)
2401 src += offset;
2402 src_base += offset;
2403 src_end += offset;
2405 if (c < 0)
2407 if (c == -1)
2408 goto invalid_code;
2409 if (c == -2)
2410 break;
2412 src = src_base + nbytes;
2413 consumed_chars = consumed_chars_base + nchars;
2414 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2415 cmp_status->ncomps -= nchars;
2418 /* Now if C >= 0, we found a normally encoded character, if C <
2419 0, we found an old-style composition component character or
2420 rule. */
2422 if (cmp_status->state == COMPOSING_NO)
2424 if (last_id != id)
2426 if (last_id != charset_ascii)
2427 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2428 last_id);
2429 last_id = id;
2430 last_offset = char_offset;
2432 *charbuf++ = c;
2433 char_offset++;
2435 else if (cmp_status->state == COMPOSING_CHAR)
2437 if (cmp_status->old_form)
2439 if (c >= 0)
2441 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2442 *charbuf++ = c;
2443 char_offset++;
2445 else
2447 *charbuf++ = -c;
2448 cmp_status->nchars++;
2449 cmp_status->length++;
2450 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2451 EMACS_MULE_COMPOSITION_END ();
2452 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2453 cmp_status->state = COMPOSING_RULE;
2456 else
2458 *charbuf++ = c;
2459 cmp_status->length++;
2460 cmp_status->nchars--;
2461 if (cmp_status->nchars == 0)
2462 EMACS_MULE_COMPOSITION_END ();
2465 else if (cmp_status->state == COMPOSING_RULE)
2467 int rule;
2469 if (c >= 0)
2471 EMACS_MULE_COMPOSITION_END ();
2472 *charbuf++ = c;
2473 char_offset++;
2475 else
2477 c = -c;
2478 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2479 if (rule < 0)
2480 goto invalid_code;
2481 *charbuf++ = -2;
2482 *charbuf++ = rule;
2483 cmp_status->length += 2;
2484 cmp_status->state = COMPOSING_CHAR;
2487 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2489 *charbuf++ = c;
2490 cmp_status->length++;
2491 if (cmp_status->ncomps == 0)
2492 cmp_status->state = COMPOSING_CHAR;
2493 else if (cmp_status->ncomps > 0)
2495 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2496 cmp_status->state = COMPOSING_COMPONENT_RULE;
2498 else
2499 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2501 else /* COMPOSING_COMPONENT_RULE */
2503 int rule;
2505 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2506 if (rule < 0)
2507 goto invalid_code;
2508 *charbuf++ = -2;
2509 *charbuf++ = rule;
2510 cmp_status->length += 2;
2511 cmp_status->ncomps--;
2512 if (cmp_status->ncomps > 0)
2513 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2514 else
2515 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2517 continue;
2519 invalid_code:
2520 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2521 src = src_base;
2522 consumed_chars = consumed_chars_base;
2523 ONE_MORE_BYTE (c);
2524 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2525 char_offset++;
2526 coding->errors++;
2529 no_more_source:
2530 if (cmp_status->state != COMPOSING_NO)
2532 if (coding->mode & CODING_MODE_LAST_BLOCK)
2533 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2534 else
2536 int i;
2538 charbuf -= cmp_status->length;
2539 for (i = 0; i < cmp_status->length; i++)
2540 cmp_status->carryover[i] = charbuf[i];
2543 if (last_id != charset_ascii)
2544 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2545 coding->consumed_char += consumed_chars_base;
2546 coding->consumed = src_base - coding->source;
2547 coding->charbuf_used = charbuf - coding->charbuf;
2551 #define EMACS_MULE_LEADING_CODES(id, codes) \
2552 do { \
2553 if (id < 0xA0) \
2554 codes[0] = id, codes[1] = 0; \
2555 else if (id < 0xE0) \
2556 codes[0] = 0x9A, codes[1] = id; \
2557 else if (id < 0xF0) \
2558 codes[0] = 0x9B, codes[1] = id; \
2559 else if (id < 0xF5) \
2560 codes[0] = 0x9C, codes[1] = id; \
2561 else \
2562 codes[0] = 0x9D, codes[1] = id; \
2563 } while (0);
2566 static bool
2567 encode_coding_emacs_mule (struct coding_system *coding)
2569 bool multibytep = coding->dst_multibyte;
2570 int *charbuf = coding->charbuf;
2571 int *charbuf_end = charbuf + coding->charbuf_used;
2572 unsigned char *dst = coding->destination + coding->produced;
2573 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2574 int safe_room = 8;
2575 ptrdiff_t produced_chars = 0;
2576 Lisp_Object attrs, charset_list;
2577 int c;
2578 int preferred_charset_id = -1;
2580 CODING_GET_INFO (coding, attrs, charset_list);
2581 if (! EQ (charset_list, Vemacs_mule_charset_list))
2583 charset_list = Vemacs_mule_charset_list;
2584 ASET (attrs, coding_attr_charset_list, charset_list);
2587 while (charbuf < charbuf_end)
2589 ASSURE_DESTINATION (safe_room);
2590 c = *charbuf++;
2592 if (c < 0)
2594 /* Handle an annotation. */
2595 switch (*charbuf)
2597 case CODING_ANNOTATE_COMPOSITION_MASK:
2598 /* Not yet implemented. */
2599 break;
2600 case CODING_ANNOTATE_CHARSET_MASK:
2601 preferred_charset_id = charbuf[3];
2602 if (preferred_charset_id >= 0
2603 && NILP (Fmemq (make_number (preferred_charset_id),
2604 charset_list)))
2605 preferred_charset_id = -1;
2606 break;
2607 default:
2608 emacs_abort ();
2610 charbuf += -c - 1;
2611 continue;
2614 if (ASCII_CHAR_P (c))
2615 EMIT_ONE_ASCII_BYTE (c);
2616 else if (CHAR_BYTE8_P (c))
2618 c = CHAR_TO_BYTE8 (c);
2619 EMIT_ONE_BYTE (c);
2621 else
2623 struct charset *charset;
2624 unsigned code;
2625 int dimension;
2626 int emacs_mule_id;
2627 unsigned char leading_codes[2];
2629 if (preferred_charset_id >= 0)
2631 bool result;
2633 charset = CHARSET_FROM_ID (preferred_charset_id);
2634 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2635 if (result)
2636 code = ENCODE_CHAR (charset, c);
2637 else
2638 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2639 &code, charset);
2641 else
2642 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2643 &code, charset);
2644 if (! charset)
2646 c = coding->default_char;
2647 if (ASCII_CHAR_P (c))
2649 EMIT_ONE_ASCII_BYTE (c);
2650 continue;
2652 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2653 &code, charset);
2655 dimension = CHARSET_DIMENSION (charset);
2656 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2657 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2658 EMIT_ONE_BYTE (leading_codes[0]);
2659 if (leading_codes[1])
2660 EMIT_ONE_BYTE (leading_codes[1]);
2661 if (dimension == 1)
2662 EMIT_ONE_BYTE (code | 0x80);
2663 else
2665 code |= 0x8080;
2666 EMIT_ONE_BYTE (code >> 8);
2667 EMIT_ONE_BYTE (code & 0xFF);
2671 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2672 coding->produced_char += produced_chars;
2673 coding->produced = dst - coding->destination;
2674 return 0;
2678 /*** 7. ISO2022 handlers ***/
2680 /* The following note describes the coding system ISO2022 briefly.
2681 Since the intention of this note is to help understand the
2682 functions in this file, some parts are NOT ACCURATE or are OVERLY
2683 SIMPLIFIED. For thorough understanding, please refer to the
2684 original document of ISO2022. This is equivalent to the standard
2685 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2687 ISO2022 provides many mechanisms to encode several character sets
2688 in 7-bit and 8-bit environments. For 7-bit environments, all text
2689 is encoded using bytes less than 128. This may make the encoded
2690 text a little bit longer, but the text passes more easily through
2691 several types of gateway, some of which strip off the MSB (Most
2692 Significant Bit).
2694 There are two kinds of character sets: control character sets and
2695 graphic character sets. The former contain control characters such
2696 as `newline' and `escape' to provide control functions (control
2697 functions are also provided by escape sequences). The latter
2698 contain graphic characters such as 'A' and '-'. Emacs recognizes
2699 two control character sets and many graphic character sets.
2701 Graphic character sets are classified into one of the following
2702 four classes, according to the number of bytes (DIMENSION) and
2703 number of characters in one dimension (CHARS) of the set:
2704 - DIMENSION1_CHARS94
2705 - DIMENSION1_CHARS96
2706 - DIMENSION2_CHARS94
2707 - DIMENSION2_CHARS96
2709 In addition, each character set is assigned an identification tag,
2710 unique for each set, called the "final character" (denoted as <F>
2711 hereafter). The <F> of each character set is decided by ECMA(*)
2712 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2713 (0x30..0x3F are for private use only).
2715 Note (*): ECMA = European Computer Manufacturers Association
2717 Here are examples of graphic character sets [NAME(<F>)]:
2718 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2719 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2720 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2721 o DIMENSION2_CHARS96 -- none for the moment
2723 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2724 C0 [0x00..0x1F] -- control character plane 0
2725 GL [0x20..0x7F] -- graphic character plane 0
2726 C1 [0x80..0x9F] -- control character plane 1
2727 GR [0xA0..0xFF] -- graphic character plane 1
2729 A control character set is directly designated and invoked to C0 or
2730 C1 by an escape sequence. The most common case is that:
2731 - ISO646's control character set is designated/invoked to C0, and
2732 - ISO6429's control character set is designated/invoked to C1,
2733 and usually these designations/invocations are omitted in encoded
2734 text. In a 7-bit environment, only C0 can be used, and a control
2735 character for C1 is encoded by an appropriate escape sequence to
2736 fit into the environment. All control characters for C1 are
2737 defined to have corresponding escape sequences.
2739 A graphic character set is at first designated to one of four
2740 graphic registers (G0 through G3), then these graphic registers are
2741 invoked to GL or GR. These designations and invocations can be
2742 done independently. The most common case is that G0 is invoked to
2743 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2744 these invocations and designations are omitted in encoded text.
2745 In a 7-bit environment, only GL can be used.
2747 When a graphic character set of CHARS94 is invoked to GL, codes
2748 0x20 and 0x7F of the GL area work as control characters SPACE and
2749 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2750 be used.
2752 There are two ways of invocation: locking-shift and single-shift.
2753 With locking-shift, the invocation lasts until the next different
2754 invocation, whereas with single-shift, the invocation affects the
2755 following character only and doesn't affect the locking-shift
2756 state. Invocations are done by the following control characters or
2757 escape sequences:
2759 ----------------------------------------------------------------------
2760 abbrev function cntrl escape seq description
2761 ----------------------------------------------------------------------
2762 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2763 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2764 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2765 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2766 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2767 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2768 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2769 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2770 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2771 ----------------------------------------------------------------------
2772 (*) These are not used by any known coding system.
2774 Control characters for these functions are defined by macros
2775 ISO_CODE_XXX in `coding.h'.
2777 Designations are done by the following escape sequences:
2778 ----------------------------------------------------------------------
2779 escape sequence description
2780 ----------------------------------------------------------------------
2781 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2782 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2783 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2784 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2785 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2786 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2787 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2788 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2789 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2790 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2791 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2792 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2793 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2794 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2795 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2796 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2797 ----------------------------------------------------------------------
2799 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2800 of dimension 1, chars 94, and final character <F>, etc...
2802 Note (*): Although these designations are not allowed in ISO2022,
2803 Emacs accepts them on decoding, and produces them on encoding
2804 CHARS96 character sets in a coding system which is characterized as
2805 7-bit environment, non-locking-shift, and non-single-shift.
2807 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2808 '(' must be omitted. We refer to this as "short-form" hereafter.
2810 Now you may notice that there are a lot of ways of encoding the
2811 same multilingual text in ISO2022. Actually, there exist many
2812 coding systems such as Compound Text (used in X11's inter client
2813 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2814 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2815 localized platforms), and all of these are variants of ISO2022.
2817 In addition to the above, Emacs handles two more kinds of escape
2818 sequences: ISO6429's direction specification and Emacs' private
2819 sequence for specifying character composition.
2821 ISO6429's direction specification takes the following form:
2822 o CSI ']' -- end of the current direction
2823 o CSI '0' ']' -- end of the current direction
2824 o CSI '1' ']' -- start of left-to-right text
2825 o CSI '2' ']' -- start of right-to-left text
2826 The control character CSI (0x9B: control sequence introducer) is
2827 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2829 Character composition specification takes the following form:
2830 o ESC '0' -- start relative composition
2831 o ESC '1' -- end composition
2832 o ESC '2' -- start rule-base composition (*)
2833 o ESC '3' -- start relative composition with alternate chars (**)
2834 o ESC '4' -- start rule-base composition with alternate chars (**)
2835 Since these are not standard escape sequences of any ISO standard,
2836 the use of them with these meanings is restricted to Emacs only.
2838 (*) This form is used only in Emacs 20.7 and older versions,
2839 but newer versions can safely decode it.
2840 (**) This form is used only in Emacs 21.1 and newer versions,
2841 and older versions can't decode it.
2843 Here's a list of example usages of these composition escape
2844 sequences (categorized by `enum composition_method').
2846 COMPOSITION_RELATIVE:
2847 ESC 0 CHAR [ CHAR ] ESC 1
2848 COMPOSITION_WITH_RULE:
2849 ESC 2 CHAR [ RULE CHAR ] ESC 1
2850 COMPOSITION_WITH_ALTCHARS:
2851 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2852 COMPOSITION_WITH_RULE_ALTCHARS:
2853 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2855 static enum iso_code_class_type iso_code_class[256];
2857 #define SAFE_CHARSET_P(coding, id) \
2858 ((id) <= (coding)->max_charset_id \
2859 && (coding)->safe_charsets[id] != 255)
2861 static void
2862 setup_iso_safe_charsets (Lisp_Object attrs)
2864 Lisp_Object charset_list, safe_charsets;
2865 Lisp_Object request;
2866 Lisp_Object reg_usage;
2867 Lisp_Object tail;
2868 EMACS_INT reg94, reg96;
2869 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2870 int max_charset_id;
2872 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2873 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2874 && ! EQ (charset_list, Viso_2022_charset_list))
2876 charset_list = Viso_2022_charset_list;
2877 ASET (attrs, coding_attr_charset_list, charset_list);
2878 ASET (attrs, coding_attr_safe_charsets, Qnil);
2881 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2882 return;
2884 max_charset_id = 0;
2885 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2887 int id = XINT (XCAR (tail));
2888 if (max_charset_id < id)
2889 max_charset_id = id;
2892 safe_charsets = make_uninit_string (max_charset_id + 1);
2893 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2894 request = AREF (attrs, coding_attr_iso_request);
2895 reg_usage = AREF (attrs, coding_attr_iso_usage);
2896 reg94 = XINT (XCAR (reg_usage));
2897 reg96 = XINT (XCDR (reg_usage));
2899 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2901 Lisp_Object id;
2902 Lisp_Object reg;
2903 struct charset *charset;
2905 id = XCAR (tail);
2906 charset = CHARSET_FROM_ID (XINT (id));
2907 reg = Fcdr (Fassq (id, request));
2908 if (! NILP (reg))
2909 SSET (safe_charsets, XINT (id), XINT (reg));
2910 else if (charset->iso_chars_96)
2912 if (reg96 < 4)
2913 SSET (safe_charsets, XINT (id), reg96);
2915 else
2917 if (reg94 < 4)
2918 SSET (safe_charsets, XINT (id), reg94);
2921 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2925 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2926 Return true if a text is encoded in one of ISO-2022 based coding
2927 systems. */
2929 static bool
2930 detect_coding_iso_2022 (struct coding_system *coding,
2931 struct coding_detection_info *detect_info)
2933 const unsigned char *src = coding->source, *src_base = src;
2934 const unsigned char *src_end = coding->source + coding->src_bytes;
2935 bool multibytep = coding->src_multibyte;
2936 bool single_shifting = 0;
2937 int id;
2938 int c, c1;
2939 ptrdiff_t consumed_chars = 0;
2940 int i;
2941 int rejected = 0;
2942 int found = 0;
2943 int composition_count = -1;
2945 detect_info->checked |= CATEGORY_MASK_ISO;
2947 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2949 struct coding_system *this = &(coding_categories[i]);
2950 Lisp_Object attrs, val;
2952 if (this->id < 0)
2953 continue;
2954 attrs = CODING_ID_ATTRS (this->id);
2955 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2956 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2957 setup_iso_safe_charsets (attrs);
2958 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2959 this->max_charset_id = SCHARS (val) - 1;
2960 this->safe_charsets = SDATA (val);
2963 /* A coding system of this category is always ASCII compatible. */
2964 src += coding->head_ascii;
2966 while (rejected != CATEGORY_MASK_ISO)
2968 src_base = src;
2969 ONE_MORE_BYTE (c);
2970 switch (c)
2972 case ISO_CODE_ESC:
2973 if (inhibit_iso_escape_detection)
2974 break;
2975 single_shifting = 0;
2976 ONE_MORE_BYTE (c);
2977 if (c == 'N' || c == 'O')
2979 /* ESC <Fe> for SS2 or SS3. */
2980 single_shifting = 1;
2981 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2983 else if (c == '1')
2985 /* End of composition. */
2986 if (composition_count < 0
2987 || composition_count > MAX_COMPOSITION_COMPONENTS)
2988 /* Invalid */
2989 break;
2990 composition_count = -1;
2991 found |= CATEGORY_MASK_ISO;
2993 else if (c >= '0' && c <= '4')
2995 /* ESC <Fp> for start/end composition. */
2996 composition_count = 0;
2998 else
3000 if (c >= '(' && c <= '/')
3002 /* Designation sequence for a charset of dimension 1. */
3003 ONE_MORE_BYTE (c1);
3004 if (c1 < ' ' || c1 >= 0x80
3005 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3006 /* Invalid designation sequence. Just ignore. */
3007 break;
3009 else if (c == '$')
3011 /* Designation sequence for a charset of dimension 2. */
3012 ONE_MORE_BYTE (c);
3013 if (c >= '@' && c <= 'B')
3014 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3015 id = iso_charset_table[1][0][c];
3016 else if (c >= '(' && c <= '/')
3018 ONE_MORE_BYTE (c1);
3019 if (c1 < ' ' || c1 >= 0x80
3020 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3021 /* Invalid designation sequence. Just ignore. */
3022 break;
3024 else
3025 /* Invalid designation sequence. Just ignore it. */
3026 break;
3028 else
3030 /* Invalid escape sequence. Just ignore it. */
3031 break;
3034 /* We found a valid designation sequence for CHARSET. */
3035 rejected |= CATEGORY_MASK_ISO_8BIT;
3036 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3037 id))
3038 found |= CATEGORY_MASK_ISO_7;
3039 else
3040 rejected |= CATEGORY_MASK_ISO_7;
3041 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3042 id))
3043 found |= CATEGORY_MASK_ISO_7_TIGHT;
3044 else
3045 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3046 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3047 id))
3048 found |= CATEGORY_MASK_ISO_7_ELSE;
3049 else
3050 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3051 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3052 id))
3053 found |= CATEGORY_MASK_ISO_8_ELSE;
3054 else
3055 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3057 break;
3059 case ISO_CODE_SO:
3060 case ISO_CODE_SI:
3061 /* Locking shift out/in. */
3062 if (inhibit_iso_escape_detection)
3063 break;
3064 single_shifting = 0;
3065 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3066 break;
3068 case ISO_CODE_CSI:
3069 /* Control sequence introducer. */
3070 single_shifting = 0;
3071 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3072 found |= CATEGORY_MASK_ISO_8_ELSE;
3073 goto check_extra_latin;
3075 case ISO_CODE_SS2:
3076 case ISO_CODE_SS3:
3077 /* Single shift. */
3078 if (inhibit_iso_escape_detection)
3079 break;
3080 single_shifting = 0;
3081 rejected |= CATEGORY_MASK_ISO_7BIT;
3082 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3083 & CODING_ISO_FLAG_SINGLE_SHIFT)
3085 found |= CATEGORY_MASK_ISO_8_1;
3086 single_shifting = 1;
3088 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3089 & CODING_ISO_FLAG_SINGLE_SHIFT)
3091 found |= CATEGORY_MASK_ISO_8_2;
3092 single_shifting = 1;
3094 if (single_shifting)
3095 break;
3096 goto check_extra_latin;
3098 default:
3099 if (c < 0)
3100 continue;
3101 if (c < 0x80)
3103 if (composition_count >= 0)
3104 composition_count++;
3105 single_shifting = 0;
3106 break;
3108 if (c >= 0xA0)
3110 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3111 found |= CATEGORY_MASK_ISO_8_1;
3112 /* Check the length of succeeding codes of the range
3113 0xA0..0FF. If the byte length is even, we include
3114 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3115 only when we are not single shifting. */
3116 if (! single_shifting
3117 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3119 int len = 1;
3120 while (src < src_end)
3122 src_base = src;
3123 ONE_MORE_BYTE (c);
3124 if (c < 0xA0)
3126 src = src_base;
3127 break;
3129 len++;
3132 if (len & 1 && src < src_end)
3134 rejected |= CATEGORY_MASK_ISO_8_2;
3135 if (composition_count >= 0)
3136 composition_count += len;
3138 else
3140 found |= CATEGORY_MASK_ISO_8_2;
3141 if (composition_count >= 0)
3142 composition_count += len / 2;
3145 break;
3147 check_extra_latin:
3148 if (! VECTORP (Vlatin_extra_code_table)
3149 || NILP (AREF (Vlatin_extra_code_table, c)))
3151 rejected = CATEGORY_MASK_ISO;
3152 break;
3154 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3155 & CODING_ISO_FLAG_LATIN_EXTRA)
3156 found |= CATEGORY_MASK_ISO_8_1;
3157 else
3158 rejected |= CATEGORY_MASK_ISO_8_1;
3159 rejected |= CATEGORY_MASK_ISO_8_2;
3160 break;
3163 detect_info->rejected |= CATEGORY_MASK_ISO;
3164 return 0;
3166 no_more_source:
3167 detect_info->rejected |= rejected;
3168 detect_info->found |= (found & ~rejected);
3169 return 1;
3173 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3174 escape sequence should be kept. */
3175 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3176 do { \
3177 int id, prev; \
3179 if (final < '0' || final >= 128 \
3180 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3181 || !SAFE_CHARSET_P (coding, id)) \
3183 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3184 chars_96 = -1; \
3185 break; \
3187 prev = CODING_ISO_DESIGNATION (coding, reg); \
3188 if (id == charset_jisx0201_roman) \
3190 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3191 id = charset_ascii; \
3193 else if (id == charset_jisx0208_1978) \
3195 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3196 id = charset_jisx0208; \
3198 CODING_ISO_DESIGNATION (coding, reg) = id; \
3199 /* If there was an invalid designation to REG previously, and this \
3200 designation is ASCII to REG, we should keep this designation \
3201 sequence. */ \
3202 if (prev == -2 && id == charset_ascii) \
3203 chars_96 = -1; \
3204 } while (0)
3207 /* Handle these composition sequence (ALT: alternate char):
3209 (1) relative composition: ESC 0 CHAR ... ESC 1
3210 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3211 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3212 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3214 When the start sequence (ESC 0/2/3/4) is found, this annotation
3215 header is produced.
3217 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3219 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3220 produced until the end sequence (ESC 1) is found:
3222 (1) CHAR ... CHAR
3223 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3224 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3225 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3227 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3228 annotation header is updated as below:
3230 (1) LENGTH: unchanged, NCHARS: number of CHARs
3231 (2) LENGTH: unchanged, NCHARS: number of CHARs
3232 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3233 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3235 If an error is found while composing, the annotation header is
3236 changed to:
3238 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3240 and the sequence [ -2 DECODED-RULE ] is changed to the original
3241 byte sequence as below:
3242 o the original byte sequence is B: [ B -1 ]
3243 o the original byte sequence is B1 B2: [ B1 B2 ]
3244 and the sequence [ -1 -1 ] is changed to the original byte
3245 sequence:
3246 [ ESC '0' ]
3249 /* Decode a composition rule C1 and maybe one more byte from the
3250 source, and set RULE to the encoded composition rule. If the rule
3251 is invalid, goto invalid_code. */
3253 #define DECODE_COMPOSITION_RULE(rule) \
3254 do { \
3255 rule = c1 - 32; \
3256 if (rule < 0) \
3257 goto invalid_code; \
3258 if (rule < 81) /* old format (before ver.21) */ \
3260 int gref = (rule) / 9; \
3261 int nref = (rule) % 9; \
3262 if (gref == 4) gref = 10; \
3263 if (nref == 4) nref = 10; \
3264 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3266 else /* new format (after ver.21) */ \
3268 int b; \
3270 ONE_MORE_BYTE (b); \
3271 if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32)) \
3272 goto invalid_code; \
3273 rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32); \
3274 rule += 0x100; /* Distinguish it from the old format. */ \
3276 } while (0)
3278 #define ENCODE_COMPOSITION_RULE(rule) \
3279 do { \
3280 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3282 if (rule < 0x100) /* old format */ \
3284 if (gref == 10) gref = 4; \
3285 if (nref == 10) nref = 4; \
3286 charbuf[idx] = 32 + gref * 9 + nref; \
3287 charbuf[idx + 1] = -1; \
3288 new_chars++; \
3290 else /* new format */ \
3292 charbuf[idx] = 32 + 81 + gref; \
3293 charbuf[idx + 1] = 32 + nref; \
3294 new_chars += 2; \
3296 } while (0)
3298 /* Finish the current composition as invalid. */
3300 static int
3301 finish_composition (int *charbuf, struct composition_status *cmp_status)
3303 int idx = - cmp_status->length;
3304 int new_chars;
3306 /* Recover the original ESC sequence */
3307 charbuf[idx++] = ISO_CODE_ESC;
3308 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3309 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3310 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3311 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3312 : '4');
3313 charbuf[idx++] = -2;
3314 charbuf[idx++] = 0;
3315 charbuf[idx++] = -1;
3316 new_chars = cmp_status->nchars;
3317 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3318 for (; idx < 0; idx++)
3320 int elt = charbuf[idx];
3322 if (elt == -2)
3324 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3325 idx++;
3327 else if (elt == -1)
3329 charbuf[idx++] = ISO_CODE_ESC;
3330 charbuf[idx] = '0';
3331 new_chars += 2;
3334 cmp_status->state = COMPOSING_NO;
3335 return new_chars;
3338 /* If characters are under composition, finish the composition. */
3339 #define MAYBE_FINISH_COMPOSITION() \
3340 do { \
3341 if (cmp_status->state != COMPOSING_NO) \
3342 char_offset += finish_composition (charbuf, cmp_status); \
3343 } while (0)
3345 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3347 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3348 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3349 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3350 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3352 Produce this annotation sequence now:
3354 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3357 #define DECODE_COMPOSITION_START(c1) \
3358 do { \
3359 if (c1 == '0' \
3360 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3361 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3362 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3363 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3365 *charbuf++ = -1; \
3366 *charbuf++= -1; \
3367 cmp_status->state = COMPOSING_CHAR; \
3368 cmp_status->length += 2; \
3370 else \
3372 MAYBE_FINISH_COMPOSITION (); \
3373 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3374 : c1 == '2' ? COMPOSITION_WITH_RULE \
3375 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3376 : COMPOSITION_WITH_RULE_ALTCHARS); \
3377 cmp_status->state \
3378 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3379 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3380 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3381 cmp_status->nchars = cmp_status->ncomps = 0; \
3382 coding->annotated = 1; \
3384 } while (0)
3387 /* Handle composition end sequence ESC 1. */
3389 #define DECODE_COMPOSITION_END() \
3390 do { \
3391 if (cmp_status->nchars == 0 \
3392 || ((cmp_status->state == COMPOSING_CHAR) \
3393 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3395 MAYBE_FINISH_COMPOSITION (); \
3396 goto invalid_code; \
3398 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3399 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3400 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3401 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3402 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3403 char_offset += cmp_status->nchars; \
3404 cmp_status->state = COMPOSING_NO; \
3405 } while (0)
3407 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3409 #define STORE_COMPOSITION_RULE(rule) \
3410 do { \
3411 *charbuf++ = -2; \
3412 *charbuf++ = rule; \
3413 cmp_status->length += 2; \
3414 cmp_status->state--; \
3415 } while (0)
3417 /* Store a composed char or a component char C in charbuf, and update
3418 cmp_status. */
3420 #define STORE_COMPOSITION_CHAR(c) \
3421 do { \
3422 *charbuf++ = (c); \
3423 cmp_status->length++; \
3424 if (cmp_status->state == COMPOSING_CHAR) \
3425 cmp_status->nchars++; \
3426 else \
3427 cmp_status->ncomps++; \
3428 if (cmp_status->method == COMPOSITION_WITH_RULE \
3429 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3430 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3431 cmp_status->state++; \
3432 } while (0)
3435 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3437 static void
3438 decode_coding_iso_2022 (struct coding_system *coding)
3440 const unsigned char *src = coding->source + coding->consumed;
3441 const unsigned char *src_end = coding->source + coding->src_bytes;
3442 const unsigned char *src_base;
3443 int *charbuf = coding->charbuf + coding->charbuf_used;
3444 /* We may produce two annotations (charset and composition) in one
3445 loop and one more charset annotation at the end. */
3446 int *charbuf_end
3447 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3448 ptrdiff_t consumed_chars = 0, consumed_chars_base;
3449 bool multibytep = coding->src_multibyte;
3450 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3451 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3452 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3453 int charset_id_2, charset_id_3;
3454 struct charset *charset;
3455 int c;
3456 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3457 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3458 ptrdiff_t char_offset = coding->produced_char;
3459 ptrdiff_t last_offset = char_offset;
3460 int last_id = charset_ascii;
3461 bool eol_dos
3462 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3463 int byte_after_cr = -1;
3464 int i;
3466 setup_iso_safe_charsets (attrs);
3467 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3469 if (cmp_status->state != COMPOSING_NO)
3471 if (charbuf_end - charbuf < cmp_status->length)
3472 emacs_abort ();
3473 for (i = 0; i < cmp_status->length; i++)
3474 *charbuf++ = cmp_status->carryover[i];
3475 coding->annotated = 1;
3478 while (1)
3480 int c1, c2, c3;
3482 src_base = src;
3483 consumed_chars_base = consumed_chars;
3485 if (charbuf >= charbuf_end)
3487 if (byte_after_cr >= 0)
3488 src_base--;
3489 break;
3492 if (byte_after_cr >= 0)
3493 c1 = byte_after_cr, byte_after_cr = -1;
3494 else
3495 ONE_MORE_BYTE (c1);
3496 if (c1 < 0)
3497 goto invalid_code;
3499 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3501 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3502 char_offset++;
3503 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3504 continue;
3507 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3509 if (c1 == ISO_CODE_ESC)
3511 if (src + 1 >= src_end)
3512 goto no_more_source;
3513 *charbuf++ = ISO_CODE_ESC;
3514 char_offset++;
3515 if (src[0] == '%' && src[1] == '@')
3517 src += 2;
3518 consumed_chars += 2;
3519 char_offset += 2;
3520 /* We are sure charbuf can contain two more chars. */
3521 *charbuf++ = '%';
3522 *charbuf++ = '@';
3523 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3526 else
3528 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3529 char_offset++;
3531 continue;
3534 if ((cmp_status->state == COMPOSING_RULE
3535 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3536 && c1 != ISO_CODE_ESC)
3538 int rule;
3540 DECODE_COMPOSITION_RULE (rule);
3541 STORE_COMPOSITION_RULE (rule);
3542 continue;
3545 /* We produce at most one character. */
3546 switch (iso_code_class [c1])
3548 case ISO_0x20_or_0x7F:
3549 if (charset_id_0 < 0
3550 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3551 /* This is SPACE or DEL. */
3552 charset = CHARSET_FROM_ID (charset_ascii);
3553 else
3554 charset = CHARSET_FROM_ID (charset_id_0);
3555 break;
3557 case ISO_graphic_plane_0:
3558 if (charset_id_0 < 0)
3559 charset = CHARSET_FROM_ID (charset_ascii);
3560 else
3561 charset = CHARSET_FROM_ID (charset_id_0);
3562 break;
3564 case ISO_0xA0_or_0xFF:
3565 if (charset_id_1 < 0
3566 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3567 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3568 goto invalid_code;
3569 /* This is a graphic character, we fall down ... */
3571 case ISO_graphic_plane_1:
3572 if (charset_id_1 < 0)
3573 goto invalid_code;
3574 charset = CHARSET_FROM_ID (charset_id_1);
3575 break;
3577 case ISO_control_0:
3578 if (eol_dos && c1 == '\r')
3579 ONE_MORE_BYTE (byte_after_cr);
3580 MAYBE_FINISH_COMPOSITION ();
3581 charset = CHARSET_FROM_ID (charset_ascii);
3582 break;
3584 case ISO_control_1:
3585 goto invalid_code;
3587 case ISO_shift_out:
3588 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3589 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3590 goto invalid_code;
3591 CODING_ISO_INVOCATION (coding, 0) = 1;
3592 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3593 continue;
3595 case ISO_shift_in:
3596 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3597 goto invalid_code;
3598 CODING_ISO_INVOCATION (coding, 0) = 0;
3599 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3600 continue;
3602 case ISO_single_shift_2_7:
3603 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3604 goto invalid_code;
3605 case ISO_single_shift_2:
3606 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3607 goto invalid_code;
3608 /* SS2 is handled as an escape sequence of ESC 'N' */
3609 c1 = 'N';
3610 goto label_escape_sequence;
3612 case ISO_single_shift_3:
3613 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3614 goto invalid_code;
3615 /* SS2 is handled as an escape sequence of ESC 'O' */
3616 c1 = 'O';
3617 goto label_escape_sequence;
3619 case ISO_control_sequence_introducer:
3620 /* CSI is handled as an escape sequence of ESC '[' ... */
3621 c1 = '[';
3622 goto label_escape_sequence;
3624 case ISO_escape:
3625 ONE_MORE_BYTE (c1);
3626 label_escape_sequence:
3627 /* Escape sequences handled here are invocation,
3628 designation, direction specification, and character
3629 composition specification. */
3630 switch (c1)
3632 case '&': /* revision of following character set */
3633 ONE_MORE_BYTE (c1);
3634 if (!(c1 >= '@' && c1 <= '~'))
3635 goto invalid_code;
3636 ONE_MORE_BYTE (c1);
3637 if (c1 != ISO_CODE_ESC)
3638 goto invalid_code;
3639 ONE_MORE_BYTE (c1);
3640 goto label_escape_sequence;
3642 case '$': /* designation of 2-byte character set */
3643 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3644 goto invalid_code;
3646 int reg, chars96;
3648 ONE_MORE_BYTE (c1);
3649 if (c1 >= '@' && c1 <= 'B')
3650 { /* designation of JISX0208.1978, GB2312.1980,
3651 or JISX0208.1980 */
3652 reg = 0, chars96 = 0;
3654 else if (c1 >= 0x28 && c1 <= 0x2B)
3655 { /* designation of DIMENSION2_CHARS94 character set */
3656 reg = c1 - 0x28, chars96 = 0;
3657 ONE_MORE_BYTE (c1);
3659 else if (c1 >= 0x2C && c1 <= 0x2F)
3660 { /* designation of DIMENSION2_CHARS96 character set */
3661 reg = c1 - 0x2C, chars96 = 1;
3662 ONE_MORE_BYTE (c1);
3664 else
3665 goto invalid_code;
3666 DECODE_DESIGNATION (reg, 2, chars96, c1);
3667 /* We must update these variables now. */
3668 if (reg == 0)
3669 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3670 else if (reg == 1)
3671 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3672 if (chars96 < 0)
3673 goto invalid_code;
3675 continue;
3677 case 'n': /* invocation of locking-shift-2 */
3678 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3679 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3680 goto invalid_code;
3681 CODING_ISO_INVOCATION (coding, 0) = 2;
3682 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3683 continue;
3685 case 'o': /* invocation of locking-shift-3 */
3686 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3687 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3688 goto invalid_code;
3689 CODING_ISO_INVOCATION (coding, 0) = 3;
3690 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3691 continue;
3693 case 'N': /* invocation of single-shift-2 */
3694 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3695 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3696 goto invalid_code;
3697 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3698 if (charset_id_2 < 0)
3699 charset = CHARSET_FROM_ID (charset_ascii);
3700 else
3701 charset = CHARSET_FROM_ID (charset_id_2);
3702 ONE_MORE_BYTE (c1);
3703 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3704 goto invalid_code;
3705 break;
3707 case 'O': /* invocation of single-shift-3 */
3708 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3709 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3710 goto invalid_code;
3711 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3712 if (charset_id_3 < 0)
3713 charset = CHARSET_FROM_ID (charset_ascii);
3714 else
3715 charset = CHARSET_FROM_ID (charset_id_3);
3716 ONE_MORE_BYTE (c1);
3717 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3718 goto invalid_code;
3719 break;
3721 case '0': case '2': case '3': case '4': /* start composition */
3722 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3723 goto invalid_code;
3724 if (last_id != charset_ascii)
3726 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3727 last_id = charset_ascii;
3728 last_offset = char_offset;
3730 DECODE_COMPOSITION_START (c1);
3731 continue;
3733 case '1': /* end composition */
3734 if (cmp_status->state == COMPOSING_NO)
3735 goto invalid_code;
3736 DECODE_COMPOSITION_END ();
3737 continue;
3739 case '[': /* specification of direction */
3740 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3741 goto invalid_code;
3742 /* For the moment, nested direction is not supported.
3743 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3744 left-to-right, and nonzero means right-to-left. */
3745 ONE_MORE_BYTE (c1);
3746 switch (c1)
3748 case ']': /* end of the current direction */
3749 coding->mode &= ~CODING_MODE_DIRECTION;
3751 case '0': /* end of the current direction */
3752 case '1': /* start of left-to-right direction */
3753 ONE_MORE_BYTE (c1);
3754 if (c1 == ']')
3755 coding->mode &= ~CODING_MODE_DIRECTION;
3756 else
3757 goto invalid_code;
3758 break;
3760 case '2': /* start of right-to-left direction */
3761 ONE_MORE_BYTE (c1);
3762 if (c1 == ']')
3763 coding->mode |= CODING_MODE_DIRECTION;
3764 else
3765 goto invalid_code;
3766 break;
3768 default:
3769 goto invalid_code;
3771 continue;
3773 case '%':
3774 ONE_MORE_BYTE (c1);
3775 if (c1 == '/')
3777 /* CTEXT extended segment:
3778 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3779 We keep these bytes as is for the moment.
3780 They may be decoded by post-read-conversion. */
3781 int dim, M, L;
3782 int size;
3784 ONE_MORE_BYTE (dim);
3785 if (dim < '0' || dim > '4')
3786 goto invalid_code;
3787 ONE_MORE_BYTE (M);
3788 if (M < 128)
3789 goto invalid_code;
3790 ONE_MORE_BYTE (L);
3791 if (L < 128)
3792 goto invalid_code;
3793 size = ((M - 128) * 128) + (L - 128);
3794 if (charbuf + 6 > charbuf_end)
3795 goto break_loop;
3796 *charbuf++ = ISO_CODE_ESC;
3797 *charbuf++ = '%';
3798 *charbuf++ = '/';
3799 *charbuf++ = dim;
3800 *charbuf++ = BYTE8_TO_CHAR (M);
3801 *charbuf++ = BYTE8_TO_CHAR (L);
3802 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3804 else if (c1 == 'G')
3806 /* XFree86 extension for embedding UTF-8 in CTEXT:
3807 ESC % G --UTF-8-BYTES-- ESC % @
3808 We keep these bytes as is for the moment.
3809 They may be decoded by post-read-conversion. */
3810 if (charbuf + 3 > charbuf_end)
3811 goto break_loop;
3812 *charbuf++ = ISO_CODE_ESC;
3813 *charbuf++ = '%';
3814 *charbuf++ = 'G';
3815 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3817 else
3818 goto invalid_code;
3819 continue;
3820 break;
3822 default:
3823 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3824 goto invalid_code;
3826 int reg, chars96;
3828 if (c1 >= 0x28 && c1 <= 0x2B)
3829 { /* designation of DIMENSION1_CHARS94 character set */
3830 reg = c1 - 0x28, chars96 = 0;
3831 ONE_MORE_BYTE (c1);
3833 else if (c1 >= 0x2C && c1 <= 0x2F)
3834 { /* designation of DIMENSION1_CHARS96 character set */
3835 reg = c1 - 0x2C, chars96 = 1;
3836 ONE_MORE_BYTE (c1);
3838 else
3839 goto invalid_code;
3840 DECODE_DESIGNATION (reg, 1, chars96, c1);
3841 /* We must update these variables now. */
3842 if (reg == 0)
3843 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3844 else if (reg == 1)
3845 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3846 if (chars96 < 0)
3847 goto invalid_code;
3849 continue;
3851 break;
3853 default:
3854 emacs_abort ();
3857 if (cmp_status->state == COMPOSING_NO
3858 && charset->id != charset_ascii
3859 && last_id != charset->id)
3861 if (last_id != charset_ascii)
3862 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3863 last_id = charset->id;
3864 last_offset = char_offset;
3867 /* Now we know CHARSET and 1st position code C1 of a character.
3868 Produce a decoded character while getting 2nd and 3rd
3869 position codes C2, C3 if necessary. */
3870 if (CHARSET_DIMENSION (charset) > 1)
3872 ONE_MORE_BYTE (c2);
3873 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3874 || ((c1 & 0x80) != (c2 & 0x80)))
3875 /* C2 is not in a valid range. */
3876 goto invalid_code;
3877 if (CHARSET_DIMENSION (charset) == 2)
3878 c1 = (c1 << 8) | c2;
3879 else
3881 ONE_MORE_BYTE (c3);
3882 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3883 || ((c1 & 0x80) != (c3 & 0x80)))
3884 /* C3 is not in a valid range. */
3885 goto invalid_code;
3886 c1 = (c1 << 16) | (c2 << 8) | c2;
3889 c1 &= 0x7F7F7F;
3890 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3891 if (c < 0)
3893 MAYBE_FINISH_COMPOSITION ();
3894 for (; src_base < src; src_base++, char_offset++)
3896 if (ASCII_BYTE_P (*src_base))
3897 *charbuf++ = *src_base;
3898 else
3899 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3902 else if (cmp_status->state == COMPOSING_NO)
3904 *charbuf++ = c;
3905 char_offset++;
3907 else if ((cmp_status->state == COMPOSING_CHAR
3908 ? cmp_status->nchars
3909 : cmp_status->ncomps)
3910 >= MAX_COMPOSITION_COMPONENTS)
3912 /* Too long composition. */
3913 MAYBE_FINISH_COMPOSITION ();
3914 *charbuf++ = c;
3915 char_offset++;
3917 else
3918 STORE_COMPOSITION_CHAR (c);
3919 continue;
3921 invalid_code:
3922 MAYBE_FINISH_COMPOSITION ();
3923 src = src_base;
3924 consumed_chars = consumed_chars_base;
3925 ONE_MORE_BYTE (c);
3926 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3927 char_offset++;
3928 coding->errors++;
3929 /* Reset the invocation and designation status to the safest
3930 one; i.e. designate ASCII to the graphic register 0, and
3931 invoke that register to the graphic plane 0. This typically
3932 helps the case that an designation sequence for ASCII "ESC (
3933 B" is somehow broken (e.g. broken by a newline). */
3934 CODING_ISO_INVOCATION (coding, 0) = 0;
3935 CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3936 charset_id_0 = charset_ascii;
3937 continue;
3939 break_loop:
3940 break;
3943 no_more_source:
3944 if (cmp_status->state != COMPOSING_NO)
3946 if (coding->mode & CODING_MODE_LAST_BLOCK)
3947 MAYBE_FINISH_COMPOSITION ();
3948 else
3950 charbuf -= cmp_status->length;
3951 for (i = 0; i < cmp_status->length; i++)
3952 cmp_status->carryover[i] = charbuf[i];
3955 else if (last_id != charset_ascii)
3956 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3957 coding->consumed_char += consumed_chars_base;
3958 coding->consumed = src_base - coding->source;
3959 coding->charbuf_used = charbuf - coding->charbuf;
3963 /* ISO2022 encoding stuff. */
3966 It is not enough to say just "ISO2022" on encoding, we have to
3967 specify more details. In Emacs, each coding system of ISO2022
3968 variant has the following specifications:
3969 1. Initial designation to G0 thru G3.
3970 2. Allows short-form designation?
3971 3. ASCII should be designated to G0 before control characters?
3972 4. ASCII should be designated to G0 at end of line?
3973 5. 7-bit environment or 8-bit environment?
3974 6. Use locking-shift?
3975 7. Use Single-shift?
3976 And the following two are only for Japanese:
3977 8. Use ASCII in place of JIS0201-1976-Roman?
3978 9. Use JISX0208-1983 in place of JISX0208-1978?
3979 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3980 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3981 details.
3984 /* Produce codes (escape sequence) for designating CHARSET to graphic
3985 register REG at DST, and increment DST. If <final-char> of CHARSET is
3986 '@', 'A', or 'B' and the coding system CODING allows, produce
3987 designation sequence of short-form. */
3989 #define ENCODE_DESIGNATION(charset, reg, coding) \
3990 do { \
3991 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3992 const char *intermediate_char_94 = "()*+"; \
3993 const char *intermediate_char_96 = ",-./"; \
3994 int revision = -1; \
3996 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3997 revision = CHARSET_ISO_REVISION (charset); \
3999 if (revision >= 0) \
4001 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
4002 EMIT_ONE_BYTE ('@' + revision); \
4004 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
4005 if (CHARSET_DIMENSION (charset) == 1) \
4007 int b; \
4008 if (! CHARSET_ISO_CHARS_96 (charset)) \
4009 b = intermediate_char_94[reg]; \
4010 else \
4011 b = intermediate_char_96[reg]; \
4012 EMIT_ONE_ASCII_BYTE (b); \
4014 else \
4016 EMIT_ONE_ASCII_BYTE ('$'); \
4017 if (! CHARSET_ISO_CHARS_96 (charset)) \
4019 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
4020 || reg != 0 \
4021 || final_char < '@' || final_char > 'B') \
4022 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
4024 else \
4025 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
4027 EMIT_ONE_ASCII_BYTE (final_char); \
4029 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
4030 } while (0)
4033 /* The following two macros produce codes (control character or escape
4034 sequence) for ISO2022 single-shift functions (single-shift-2 and
4035 single-shift-3). */
4037 #define ENCODE_SINGLE_SHIFT_2 \
4038 do { \
4039 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4040 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4041 else \
4042 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4043 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4044 } while (0)
4047 #define ENCODE_SINGLE_SHIFT_3 \
4048 do { \
4049 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4050 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4051 else \
4052 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4053 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4054 } while (0)
4057 /* The following four macros produce codes (control character or
4058 escape sequence) for ISO2022 locking-shift functions (shift-in,
4059 shift-out, locking-shift-2, and locking-shift-3). */
4061 #define ENCODE_SHIFT_IN \
4062 do { \
4063 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4064 CODING_ISO_INVOCATION (coding, 0) = 0; \
4065 } while (0)
4068 #define ENCODE_SHIFT_OUT \
4069 do { \
4070 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4071 CODING_ISO_INVOCATION (coding, 0) = 1; \
4072 } while (0)
4075 #define ENCODE_LOCKING_SHIFT_2 \
4076 do { \
4077 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4078 CODING_ISO_INVOCATION (coding, 0) = 2; \
4079 } while (0)
4082 #define ENCODE_LOCKING_SHIFT_3 \
4083 do { \
4084 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4085 CODING_ISO_INVOCATION (coding, 0) = 3; \
4086 } while (0)
4089 /* Produce codes for a DIMENSION1 character whose character set is
4090 CHARSET and whose position-code is C1. Designation and invocation
4091 sequences are also produced in advance if necessary. */
4093 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4094 do { \
4095 int id = CHARSET_ID (charset); \
4097 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4098 && id == charset_ascii) \
4100 id = charset_jisx0201_roman; \
4101 charset = CHARSET_FROM_ID (id); \
4104 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4106 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4107 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4108 else \
4109 EMIT_ONE_BYTE (c1 | 0x80); \
4110 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4111 break; \
4113 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4115 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4116 break; \
4118 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4120 EMIT_ONE_BYTE (c1 | 0x80); \
4121 break; \
4123 else \
4124 /* Since CHARSET is not yet invoked to any graphic planes, we \
4125 must invoke it, or, at first, designate it to some graphic \
4126 register. Then repeat the loop to actually produce the \
4127 character. */ \
4128 dst = encode_invocation_designation (charset, coding, dst, \
4129 &produced_chars); \
4130 } while (1)
4133 /* Produce codes for a DIMENSION2 character whose character set is
4134 CHARSET and whose position-codes are C1 and C2. Designation and
4135 invocation codes are also produced in advance if necessary. */
4137 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4138 do { \
4139 int id = CHARSET_ID (charset); \
4141 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4142 && id == charset_jisx0208) \
4144 id = charset_jisx0208_1978; \
4145 charset = CHARSET_FROM_ID (id); \
4148 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4150 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4151 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4152 else \
4153 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4154 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4155 break; \
4157 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4159 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4160 break; \
4162 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4164 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4165 break; \
4167 else \
4168 /* Since CHARSET is not yet invoked to any graphic planes, we \
4169 must invoke it, or, at first, designate it to some graphic \
4170 register. Then repeat the loop to actually produce the \
4171 character. */ \
4172 dst = encode_invocation_designation (charset, coding, dst, \
4173 &produced_chars); \
4174 } while (1)
4177 #define ENCODE_ISO_CHARACTER(charset, c) \
4178 do { \
4179 unsigned code; \
4180 CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code); \
4182 if (CHARSET_DIMENSION (charset) == 1) \
4183 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4184 else \
4185 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4186 } while (0)
4189 /* Produce designation and invocation codes at a place pointed by DST
4190 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4191 Return new DST. */
4193 static unsigned char *
4194 encode_invocation_designation (struct charset *charset,
4195 struct coding_system *coding,
4196 unsigned char *dst, ptrdiff_t *p_nchars)
4198 bool multibytep = coding->dst_multibyte;
4199 ptrdiff_t produced_chars = *p_nchars;
4200 int reg; /* graphic register number */
4201 int id = CHARSET_ID (charset);
4203 /* At first, check designations. */
4204 for (reg = 0; reg < 4; reg++)
4205 if (id == CODING_ISO_DESIGNATION (coding, reg))
4206 break;
4208 if (reg >= 4)
4210 /* CHARSET is not yet designated to any graphic registers. */
4211 /* At first check the requested designation. */
4212 reg = CODING_ISO_REQUEST (coding, id);
4213 if (reg < 0)
4214 /* Since CHARSET requests no special designation, designate it
4215 to graphic register 0. */
4216 reg = 0;
4218 ENCODE_DESIGNATION (charset, reg, coding);
4221 if (CODING_ISO_INVOCATION (coding, 0) != reg
4222 && CODING_ISO_INVOCATION (coding, 1) != reg)
4224 /* Since the graphic register REG is not invoked to any graphic
4225 planes, invoke it to graphic plane 0. */
4226 switch (reg)
4228 case 0: /* graphic register 0 */
4229 ENCODE_SHIFT_IN;
4230 break;
4232 case 1: /* graphic register 1 */
4233 ENCODE_SHIFT_OUT;
4234 break;
4236 case 2: /* graphic register 2 */
4237 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4238 ENCODE_SINGLE_SHIFT_2;
4239 else
4240 ENCODE_LOCKING_SHIFT_2;
4241 break;
4243 case 3: /* graphic register 3 */
4244 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4245 ENCODE_SINGLE_SHIFT_3;
4246 else
4247 ENCODE_LOCKING_SHIFT_3;
4248 break;
4252 *p_nchars = produced_chars;
4253 return dst;
4257 /* Produce codes for designation and invocation to reset the graphic
4258 planes and registers to initial state. */
4259 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4260 do { \
4261 int reg; \
4262 struct charset *charset; \
4264 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4265 ENCODE_SHIFT_IN; \
4266 for (reg = 0; reg < 4; reg++) \
4267 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4268 && (CODING_ISO_DESIGNATION (coding, reg) \
4269 != CODING_ISO_INITIAL (coding, reg))) \
4271 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4272 ENCODE_DESIGNATION (charset, reg, coding); \
4274 } while (0)
4277 /* Produce designation sequences of charsets in the line started from
4278 CHARBUF to a place pointed by DST, and return the number of
4279 produced bytes. DST should not directly point a buffer text area
4280 which may be relocated by char_charset call.
4282 If the current block ends before any end-of-line, we may fail to
4283 find all the necessary designations. */
4285 static ptrdiff_t
4286 encode_designation_at_bol (struct coding_system *coding,
4287 int *charbuf, int *charbuf_end,
4288 unsigned char *dst)
4290 unsigned char *orig = dst;
4291 struct charset *charset;
4292 /* Table of charsets to be designated to each graphic register. */
4293 int r[4];
4294 int c, found = 0, reg;
4295 ptrdiff_t produced_chars = 0;
4296 bool multibytep = coding->dst_multibyte;
4297 Lisp_Object attrs;
4298 Lisp_Object charset_list;
4300 attrs = CODING_ID_ATTRS (coding->id);
4301 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4302 if (EQ (charset_list, Qiso_2022))
4303 charset_list = Viso_2022_charset_list;
4305 for (reg = 0; reg < 4; reg++)
4306 r[reg] = -1;
4308 while (charbuf < charbuf_end && found < 4)
4310 int id;
4312 c = *charbuf++;
4313 if (c == '\n')
4314 break;
4315 charset = char_charset (c, charset_list, NULL);
4316 id = CHARSET_ID (charset);
4317 reg = CODING_ISO_REQUEST (coding, id);
4318 if (reg >= 0 && r[reg] < 0)
4320 found++;
4321 r[reg] = id;
4325 if (found)
4327 for (reg = 0; reg < 4; reg++)
4328 if (r[reg] >= 0
4329 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4330 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4333 return dst - orig;
4336 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4338 static bool
4339 encode_coding_iso_2022 (struct coding_system *coding)
4341 bool multibytep = coding->dst_multibyte;
4342 int *charbuf = coding->charbuf;
4343 int *charbuf_end = charbuf + coding->charbuf_used;
4344 unsigned char *dst = coding->destination + coding->produced;
4345 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4346 int safe_room = 16;
4347 bool bol_designation
4348 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4349 && CODING_ISO_BOL (coding));
4350 ptrdiff_t produced_chars = 0;
4351 Lisp_Object attrs, eol_type, charset_list;
4352 bool ascii_compatible;
4353 int c;
4354 int preferred_charset_id = -1;
4356 CODING_GET_INFO (coding, attrs, charset_list);
4357 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4358 if (VECTORP (eol_type))
4359 eol_type = Qunix;
4361 setup_iso_safe_charsets (attrs);
4362 /* Charset list may have been changed. */
4363 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4364 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4366 ascii_compatible
4367 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4368 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4369 | CODING_ISO_FLAG_LOCKING_SHIFT)));
4371 while (charbuf < charbuf_end)
4373 ASSURE_DESTINATION (safe_room);
4375 if (bol_designation)
4377 /* We have to produce designation sequences if any now. */
4378 unsigned char desig_buf[16];
4379 int nbytes;
4380 ptrdiff_t offset;
4382 charset_map_loaded = 0;
4383 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4384 desig_buf);
4385 if (charset_map_loaded
4386 && (offset = coding_change_destination (coding)))
4388 dst += offset;
4389 dst_end += offset;
4391 memcpy (dst, desig_buf, nbytes);
4392 dst += nbytes;
4393 /* We are sure that designation sequences are all ASCII bytes. */
4394 produced_chars += nbytes;
4395 bol_designation = 0;
4396 ASSURE_DESTINATION (safe_room);
4399 c = *charbuf++;
4401 if (c < 0)
4403 /* Handle an annotation. */
4404 switch (*charbuf)
4406 case CODING_ANNOTATE_COMPOSITION_MASK:
4407 /* Not yet implemented. */
4408 break;
4409 case CODING_ANNOTATE_CHARSET_MASK:
4410 preferred_charset_id = charbuf[2];
4411 if (preferred_charset_id >= 0
4412 && NILP (Fmemq (make_number (preferred_charset_id),
4413 charset_list)))
4414 preferred_charset_id = -1;
4415 break;
4416 default:
4417 emacs_abort ();
4419 charbuf += -c - 1;
4420 continue;
4423 /* Now encode the character C. */
4424 if (c < 0x20 || c == 0x7F)
4426 if (c == '\n'
4427 || (c == '\r' && EQ (eol_type, Qmac)))
4429 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4430 ENCODE_RESET_PLANE_AND_REGISTER ();
4431 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4433 int i;
4435 for (i = 0; i < 4; i++)
4436 CODING_ISO_DESIGNATION (coding, i)
4437 = CODING_ISO_INITIAL (coding, i);
4439 bol_designation = ((CODING_ISO_FLAGS (coding)
4440 & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4441 != 0);
4443 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4444 ENCODE_RESET_PLANE_AND_REGISTER ();
4445 EMIT_ONE_ASCII_BYTE (c);
4447 else if (ASCII_CHAR_P (c))
4449 if (ascii_compatible)
4450 EMIT_ONE_ASCII_BYTE (c);
4451 else
4453 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4454 ENCODE_ISO_CHARACTER (charset, c);
4457 else if (CHAR_BYTE8_P (c))
4459 c = CHAR_TO_BYTE8 (c);
4460 EMIT_ONE_BYTE (c);
4462 else
4464 struct charset *charset;
4466 if (preferred_charset_id >= 0)
4468 bool result;
4470 charset = CHARSET_FROM_ID (preferred_charset_id);
4471 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4472 if (! result)
4473 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4474 NULL, charset);
4476 else
4477 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4478 NULL, charset);
4479 if (!charset)
4481 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4483 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4484 charset = CHARSET_FROM_ID (charset_ascii);
4486 else
4488 c = coding->default_char;
4489 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4490 charset_list, NULL, charset);
4493 ENCODE_ISO_CHARACTER (charset, c);
4497 if (coding->mode & CODING_MODE_LAST_BLOCK
4498 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4500 ASSURE_DESTINATION (safe_room);
4501 ENCODE_RESET_PLANE_AND_REGISTER ();
4503 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4504 CODING_ISO_BOL (coding) = bol_designation;
4505 coding->produced_char += produced_chars;
4506 coding->produced = dst - coding->destination;
4507 return 0;
4511 /*** 8,9. SJIS and BIG5 handlers ***/
4513 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4514 quite widely. So, for the moment, Emacs supports them in the bare
4515 C code. But, in the future, they may be supported only by CCL. */
4517 /* SJIS is a coding system encoding three character sets: ASCII, right
4518 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4519 as is. A character of charset katakana-jisx0201 is encoded by
4520 "position-code + 0x80". A character of charset japanese-jisx0208
4521 is encoded in 2-byte but two position-codes are divided and shifted
4522 so that it fit in the range below.
4524 --- CODE RANGE of SJIS ---
4525 (character set) (range)
4526 ASCII 0x00 .. 0x7F
4527 KATAKANA-JISX0201 0xA0 .. 0xDF
4528 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4529 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4530 -------------------------------
4534 /* BIG5 is a coding system encoding two character sets: ASCII and
4535 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4536 character set and is encoded in two-byte.
4538 --- CODE RANGE of BIG5 ---
4539 (character set) (range)
4540 ASCII 0x00 .. 0x7F
4541 Big5 (1st byte) 0xA1 .. 0xFE
4542 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4543 --------------------------
4547 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4548 Return true if a text is encoded in SJIS. */
4550 static bool
4551 detect_coding_sjis (struct coding_system *coding,
4552 struct coding_detection_info *detect_info)
4554 const unsigned char *src = coding->source, *src_base;
4555 const unsigned char *src_end = coding->source + coding->src_bytes;
4556 bool multibytep = coding->src_multibyte;
4557 ptrdiff_t consumed_chars = 0;
4558 int found = 0;
4559 int c;
4560 Lisp_Object attrs, charset_list;
4561 int max_first_byte_of_2_byte_code;
4563 CODING_GET_INFO (coding, attrs, charset_list);
4564 max_first_byte_of_2_byte_code
4565 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4567 detect_info->checked |= CATEGORY_MASK_SJIS;
4568 /* A coding system of this category is always ASCII compatible. */
4569 src += coding->head_ascii;
4571 while (1)
4573 src_base = src;
4574 ONE_MORE_BYTE (c);
4575 if (c < 0x80)
4576 continue;
4577 if ((c >= 0x81 && c <= 0x9F)
4578 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4580 ONE_MORE_BYTE (c);
4581 if (c < 0x40 || c == 0x7F || c > 0xFC)
4582 break;
4583 found = CATEGORY_MASK_SJIS;
4585 else if (c >= 0xA0 && c < 0xE0)
4586 found = CATEGORY_MASK_SJIS;
4587 else
4588 break;
4590 detect_info->rejected |= CATEGORY_MASK_SJIS;
4591 return 0;
4593 no_more_source:
4594 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4596 detect_info->rejected |= CATEGORY_MASK_SJIS;
4597 return 0;
4599 detect_info->found |= found;
4600 return 1;
4603 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4604 Return true if a text is encoded in BIG5. */
4606 static bool
4607 detect_coding_big5 (struct coding_system *coding,
4608 struct coding_detection_info *detect_info)
4610 const unsigned char *src = coding->source, *src_base;
4611 const unsigned char *src_end = coding->source + coding->src_bytes;
4612 bool multibytep = coding->src_multibyte;
4613 ptrdiff_t consumed_chars = 0;
4614 int found = 0;
4615 int c;
4617 detect_info->checked |= CATEGORY_MASK_BIG5;
4618 /* A coding system of this category is always ASCII compatible. */
4619 src += coding->head_ascii;
4621 while (1)
4623 src_base = src;
4624 ONE_MORE_BYTE (c);
4625 if (c < 0x80)
4626 continue;
4627 if (c >= 0xA1)
4629 ONE_MORE_BYTE (c);
4630 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4631 return 0;
4632 found = CATEGORY_MASK_BIG5;
4634 else
4635 break;
4637 detect_info->rejected |= CATEGORY_MASK_BIG5;
4638 return 0;
4640 no_more_source:
4641 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4643 detect_info->rejected |= CATEGORY_MASK_BIG5;
4644 return 0;
4646 detect_info->found |= found;
4647 return 1;
4650 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4652 static void
4653 decode_coding_sjis (struct coding_system *coding)
4655 const unsigned char *src = coding->source + coding->consumed;
4656 const unsigned char *src_end = coding->source + coding->src_bytes;
4657 const unsigned char *src_base;
4658 int *charbuf = coding->charbuf + coding->charbuf_used;
4659 /* We may produce one charset annotation in one loop and one more at
4660 the end. */
4661 int *charbuf_end
4662 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4663 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4664 bool multibytep = coding->src_multibyte;
4665 struct charset *charset_roman, *charset_kanji, *charset_kana;
4666 struct charset *charset_kanji2;
4667 Lisp_Object attrs, charset_list, val;
4668 ptrdiff_t char_offset = coding->produced_char;
4669 ptrdiff_t last_offset = char_offset;
4670 int last_id = charset_ascii;
4671 bool eol_dos
4672 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4673 int byte_after_cr = -1;
4675 CODING_GET_INFO (coding, attrs, charset_list);
4677 val = charset_list;
4678 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4679 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4680 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4681 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4683 while (1)
4685 int c, c1;
4686 struct charset *charset;
4688 src_base = src;
4689 consumed_chars_base = consumed_chars;
4691 if (charbuf >= charbuf_end)
4693 if (byte_after_cr >= 0)
4694 src_base--;
4695 break;
4698 if (byte_after_cr >= 0)
4699 c = byte_after_cr, byte_after_cr = -1;
4700 else
4701 ONE_MORE_BYTE (c);
4702 if (c < 0)
4703 goto invalid_code;
4704 if (c < 0x80)
4706 if (eol_dos && c == '\r')
4707 ONE_MORE_BYTE (byte_after_cr);
4708 charset = charset_roman;
4710 else if (c == 0x80 || c == 0xA0)
4711 goto invalid_code;
4712 else if (c >= 0xA1 && c <= 0xDF)
4714 /* SJIS -> JISX0201-Kana */
4715 c &= 0x7F;
4716 charset = charset_kana;
4718 else if (c <= 0xEF)
4720 /* SJIS -> JISX0208 */
4721 ONE_MORE_BYTE (c1);
4722 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4723 goto invalid_code;
4724 c = (c << 8) | c1;
4725 SJIS_TO_JIS (c);
4726 charset = charset_kanji;
4728 else if (c <= 0xFC && charset_kanji2)
4730 /* SJIS -> JISX0213-2 */
4731 ONE_MORE_BYTE (c1);
4732 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4733 goto invalid_code;
4734 c = (c << 8) | c1;
4735 SJIS_TO_JIS2 (c);
4736 charset = charset_kanji2;
4738 else
4739 goto invalid_code;
4740 if (charset->id != charset_ascii
4741 && last_id != charset->id)
4743 if (last_id != charset_ascii)
4744 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4745 last_id = charset->id;
4746 last_offset = char_offset;
4748 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4749 *charbuf++ = c;
4750 char_offset++;
4751 continue;
4753 invalid_code:
4754 src = src_base;
4755 consumed_chars = consumed_chars_base;
4756 ONE_MORE_BYTE (c);
4757 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4758 char_offset++;
4759 coding->errors++;
4762 no_more_source:
4763 if (last_id != charset_ascii)
4764 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4765 coding->consumed_char += consumed_chars_base;
4766 coding->consumed = src_base - coding->source;
4767 coding->charbuf_used = charbuf - coding->charbuf;
4770 static void
4771 decode_coding_big5 (struct coding_system *coding)
4773 const unsigned char *src = coding->source + coding->consumed;
4774 const unsigned char *src_end = coding->source + coding->src_bytes;
4775 const unsigned char *src_base;
4776 int *charbuf = coding->charbuf + coding->charbuf_used;
4777 /* We may produce one charset annotation in one loop and one more at
4778 the end. */
4779 int *charbuf_end
4780 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4781 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4782 bool multibytep = coding->src_multibyte;
4783 struct charset *charset_roman, *charset_big5;
4784 Lisp_Object attrs, charset_list, val;
4785 ptrdiff_t char_offset = coding->produced_char;
4786 ptrdiff_t last_offset = char_offset;
4787 int last_id = charset_ascii;
4788 bool eol_dos
4789 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4790 int byte_after_cr = -1;
4792 CODING_GET_INFO (coding, attrs, charset_list);
4793 val = charset_list;
4794 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4795 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4797 while (1)
4799 int c, c1;
4800 struct charset *charset;
4802 src_base = src;
4803 consumed_chars_base = consumed_chars;
4805 if (charbuf >= charbuf_end)
4807 if (byte_after_cr >= 0)
4808 src_base--;
4809 break;
4812 if (byte_after_cr >= 0)
4813 c = byte_after_cr, byte_after_cr = -1;
4814 else
4815 ONE_MORE_BYTE (c);
4817 if (c < 0)
4818 goto invalid_code;
4819 if (c < 0x80)
4821 if (eol_dos && c == '\r')
4822 ONE_MORE_BYTE (byte_after_cr);
4823 charset = charset_roman;
4825 else
4827 /* BIG5 -> Big5 */
4828 if (c < 0xA1 || c > 0xFE)
4829 goto invalid_code;
4830 ONE_MORE_BYTE (c1);
4831 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4832 goto invalid_code;
4833 c = c << 8 | c1;
4834 charset = charset_big5;
4836 if (charset->id != charset_ascii
4837 && last_id != charset->id)
4839 if (last_id != charset_ascii)
4840 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4841 last_id = charset->id;
4842 last_offset = char_offset;
4844 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4845 *charbuf++ = c;
4846 char_offset++;
4847 continue;
4849 invalid_code:
4850 src = src_base;
4851 consumed_chars = consumed_chars_base;
4852 ONE_MORE_BYTE (c);
4853 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4854 char_offset++;
4855 coding->errors++;
4858 no_more_source:
4859 if (last_id != charset_ascii)
4860 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4861 coding->consumed_char += consumed_chars_base;
4862 coding->consumed = src_base - coding->source;
4863 coding->charbuf_used = charbuf - coding->charbuf;
4866 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4867 This function can encode charsets `ascii', `katakana-jisx0201',
4868 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4869 are sure that all these charsets are registered as official charset
4870 (i.e. do not have extended leading-codes). Characters of other
4871 charsets are produced without any encoding. */
4873 static bool
4874 encode_coding_sjis (struct coding_system *coding)
4876 bool multibytep = coding->dst_multibyte;
4877 int *charbuf = coding->charbuf;
4878 int *charbuf_end = charbuf + coding->charbuf_used;
4879 unsigned char *dst = coding->destination + coding->produced;
4880 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4881 int safe_room = 4;
4882 ptrdiff_t produced_chars = 0;
4883 Lisp_Object attrs, charset_list, val;
4884 bool ascii_compatible;
4885 struct charset *charset_kanji, *charset_kana;
4886 struct charset *charset_kanji2;
4887 int c;
4889 CODING_GET_INFO (coding, attrs, charset_list);
4890 val = XCDR (charset_list);
4891 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4892 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4893 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4895 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4897 while (charbuf < charbuf_end)
4899 ASSURE_DESTINATION (safe_room);
4900 c = *charbuf++;
4901 /* Now encode the character C. */
4902 if (ASCII_CHAR_P (c) && ascii_compatible)
4903 EMIT_ONE_ASCII_BYTE (c);
4904 else if (CHAR_BYTE8_P (c))
4906 c = CHAR_TO_BYTE8 (c);
4907 EMIT_ONE_BYTE (c);
4909 else
4911 unsigned code;
4912 struct charset *charset;
4913 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4914 &code, charset);
4916 if (!charset)
4918 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4920 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4921 charset = CHARSET_FROM_ID (charset_ascii);
4923 else
4925 c = coding->default_char;
4926 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4927 charset_list, &code, charset);
4930 if (code == CHARSET_INVALID_CODE (charset))
4931 emacs_abort ();
4932 if (charset == charset_kanji)
4934 int c1, c2;
4935 JIS_TO_SJIS (code);
4936 c1 = code >> 8, c2 = code & 0xFF;
4937 EMIT_TWO_BYTES (c1, c2);
4939 else if (charset == charset_kana)
4940 EMIT_ONE_BYTE (code | 0x80);
4941 else if (charset_kanji2 && charset == charset_kanji2)
4943 int c1, c2;
4945 c1 = code >> 8;
4946 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4947 || c1 == 0x28
4948 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4950 JIS_TO_SJIS2 (code);
4951 c1 = code >> 8, c2 = code & 0xFF;
4952 EMIT_TWO_BYTES (c1, c2);
4954 else
4955 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4957 else
4958 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4961 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4962 coding->produced_char += produced_chars;
4963 coding->produced = dst - coding->destination;
4964 return 0;
4967 static bool
4968 encode_coding_big5 (struct coding_system *coding)
4970 bool multibytep = coding->dst_multibyte;
4971 int *charbuf = coding->charbuf;
4972 int *charbuf_end = charbuf + coding->charbuf_used;
4973 unsigned char *dst = coding->destination + coding->produced;
4974 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4975 int safe_room = 4;
4976 ptrdiff_t produced_chars = 0;
4977 Lisp_Object attrs, charset_list, val;
4978 bool ascii_compatible;
4979 struct charset *charset_big5;
4980 int c;
4982 CODING_GET_INFO (coding, attrs, charset_list);
4983 val = XCDR (charset_list);
4984 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4985 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4987 while (charbuf < charbuf_end)
4989 ASSURE_DESTINATION (safe_room);
4990 c = *charbuf++;
4991 /* Now encode the character C. */
4992 if (ASCII_CHAR_P (c) && ascii_compatible)
4993 EMIT_ONE_ASCII_BYTE (c);
4994 else if (CHAR_BYTE8_P (c))
4996 c = CHAR_TO_BYTE8 (c);
4997 EMIT_ONE_BYTE (c);
4999 else
5001 unsigned code;
5002 struct charset *charset;
5003 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5004 &code, charset);
5006 if (! charset)
5008 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5010 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5011 charset = CHARSET_FROM_ID (charset_ascii);
5013 else
5015 c = coding->default_char;
5016 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5017 charset_list, &code, charset);
5020 if (code == CHARSET_INVALID_CODE (charset))
5021 emacs_abort ();
5022 if (charset == charset_big5)
5024 int c1, c2;
5026 c1 = code >> 8, c2 = code & 0xFF;
5027 EMIT_TWO_BYTES (c1, c2);
5029 else
5030 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5033 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5034 coding->produced_char += produced_chars;
5035 coding->produced = dst - coding->destination;
5036 return 0;
5040 /*** 10. CCL handlers ***/
5042 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5043 Return true if a text is encoded in a coding system of which
5044 encoder/decoder are written in CCL program. */
5046 static bool
5047 detect_coding_ccl (struct coding_system *coding,
5048 struct coding_detection_info *detect_info)
5050 const unsigned char *src = coding->source, *src_base;
5051 const unsigned char *src_end = coding->source + coding->src_bytes;
5052 bool multibytep = coding->src_multibyte;
5053 ptrdiff_t consumed_chars = 0;
5054 int found = 0;
5055 unsigned char *valids;
5056 ptrdiff_t head_ascii = coding->head_ascii;
5057 Lisp_Object attrs;
5059 detect_info->checked |= CATEGORY_MASK_CCL;
5061 coding = &coding_categories[coding_category_ccl];
5062 valids = CODING_CCL_VALIDS (coding);
5063 attrs = CODING_ID_ATTRS (coding->id);
5064 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5065 src += head_ascii;
5067 while (1)
5069 int c;
5071 src_base = src;
5072 ONE_MORE_BYTE (c);
5073 if (c < 0 || ! valids[c])
5074 break;
5075 if ((valids[c] > 1))
5076 found = CATEGORY_MASK_CCL;
5078 detect_info->rejected |= CATEGORY_MASK_CCL;
5079 return 0;
5081 no_more_source:
5082 detect_info->found |= found;
5083 return 1;
5086 static void
5087 decode_coding_ccl (struct coding_system *coding)
5089 const unsigned char *src = coding->source + coding->consumed;
5090 const unsigned char *src_end = coding->source + coding->src_bytes;
5091 int *charbuf = coding->charbuf + coding->charbuf_used;
5092 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5093 ptrdiff_t consumed_chars = 0;
5094 bool multibytep = coding->src_multibyte;
5095 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5096 int source_charbuf[1024];
5097 int source_byteidx[1025];
5098 Lisp_Object attrs, charset_list;
5100 CODING_GET_INFO (coding, attrs, charset_list);
5102 while (1)
5104 const unsigned char *p = src;
5105 ptrdiff_t offset;
5106 int i = 0;
5108 if (multibytep)
5110 while (i < 1024 && p < src_end)
5112 source_byteidx[i] = p - src;
5113 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5115 source_byteidx[i] = p - src;
5117 else
5118 while (i < 1024 && p < src_end)
5119 source_charbuf[i++] = *p++;
5121 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5122 ccl->last_block = 1;
5123 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5124 charset_map_loaded = 0;
5125 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5126 charset_list);
5127 if (charset_map_loaded
5128 && (offset = coding_change_source (coding)))
5130 p += offset;
5131 src += offset;
5132 src_end += offset;
5134 charbuf += ccl->produced;
5135 if (multibytep)
5136 src += source_byteidx[ccl->consumed];
5137 else
5138 src += ccl->consumed;
5139 consumed_chars += ccl->consumed;
5140 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5141 break;
5144 switch (ccl->status)
5146 case CCL_STAT_SUSPEND_BY_SRC:
5147 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5148 break;
5149 case CCL_STAT_SUSPEND_BY_DST:
5150 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5151 break;
5152 case CCL_STAT_QUIT:
5153 case CCL_STAT_INVALID_CMD:
5154 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5155 break;
5156 default:
5157 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5158 break;
5160 coding->consumed_char += consumed_chars;
5161 coding->consumed = src - coding->source;
5162 coding->charbuf_used = charbuf - coding->charbuf;
5165 static bool
5166 encode_coding_ccl (struct coding_system *coding)
5168 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5169 bool multibytep = coding->dst_multibyte;
5170 int *charbuf = coding->charbuf;
5171 int *charbuf_end = charbuf + coding->charbuf_used;
5172 unsigned char *dst = coding->destination + coding->produced;
5173 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5174 int destination_charbuf[1024];
5175 ptrdiff_t produced_chars = 0;
5176 int i;
5177 Lisp_Object attrs, charset_list;
5179 CODING_GET_INFO (coding, attrs, charset_list);
5180 if (coding->consumed_char == coding->src_chars
5181 && coding->mode & CODING_MODE_LAST_BLOCK)
5182 ccl->last_block = 1;
5186 ptrdiff_t offset;
5188 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5189 charset_map_loaded = 0;
5190 ccl_driver (ccl, charbuf, destination_charbuf,
5191 charbuf_end - charbuf, 1024, charset_list);
5192 if (charset_map_loaded
5193 && (offset = coding_change_destination (coding)))
5194 dst += offset;
5195 if (multibytep)
5197 ASSURE_DESTINATION (ccl->produced * 2);
5198 for (i = 0; i < ccl->produced; i++)
5199 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5201 else
5203 ASSURE_DESTINATION (ccl->produced);
5204 for (i = 0; i < ccl->produced; i++)
5205 *dst++ = destination_charbuf[i] & 0xFF;
5206 produced_chars += ccl->produced;
5208 charbuf += ccl->consumed;
5209 if (ccl->status == CCL_STAT_QUIT
5210 || ccl->status == CCL_STAT_INVALID_CMD)
5211 break;
5213 while (charbuf < charbuf_end);
5215 switch (ccl->status)
5217 case CCL_STAT_SUSPEND_BY_SRC:
5218 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5219 break;
5220 case CCL_STAT_SUSPEND_BY_DST:
5221 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5222 break;
5223 case CCL_STAT_QUIT:
5224 case CCL_STAT_INVALID_CMD:
5225 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5226 break;
5227 default:
5228 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5229 break;
5232 coding->produced_char += produced_chars;
5233 coding->produced = dst - coding->destination;
5234 return 0;
5238 /*** 10, 11. no-conversion handlers ***/
5240 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5242 static void
5243 decode_coding_raw_text (struct coding_system *coding)
5245 bool eol_dos
5246 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5248 coding->chars_at_source = 1;
5249 coding->consumed_char = coding->src_chars;
5250 coding->consumed = coding->src_bytes;
5251 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5253 coding->consumed_char--;
5254 coding->consumed--;
5255 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5257 else
5258 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5261 static bool
5262 encode_coding_raw_text (struct coding_system *coding)
5264 bool multibytep = coding->dst_multibyte;
5265 int *charbuf = coding->charbuf;
5266 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5267 unsigned char *dst = coding->destination + coding->produced;
5268 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5269 ptrdiff_t produced_chars = 0;
5270 int c;
5272 if (multibytep)
5274 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5276 if (coding->src_multibyte)
5277 while (charbuf < charbuf_end)
5279 ASSURE_DESTINATION (safe_room);
5280 c = *charbuf++;
5281 if (ASCII_CHAR_P (c))
5282 EMIT_ONE_ASCII_BYTE (c);
5283 else if (CHAR_BYTE8_P (c))
5285 c = CHAR_TO_BYTE8 (c);
5286 EMIT_ONE_BYTE (c);
5288 else
5290 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5292 CHAR_STRING_ADVANCE (c, p1);
5295 EMIT_ONE_BYTE (*p0);
5296 p0++;
5298 while (p0 < p1);
5301 else
5302 while (charbuf < charbuf_end)
5304 ASSURE_DESTINATION (safe_room);
5305 c = *charbuf++;
5306 EMIT_ONE_BYTE (c);
5309 else
5311 if (coding->src_multibyte)
5313 int safe_room = MAX_MULTIBYTE_LENGTH;
5315 while (charbuf < charbuf_end)
5317 ASSURE_DESTINATION (safe_room);
5318 c = *charbuf++;
5319 if (ASCII_CHAR_P (c))
5320 *dst++ = c;
5321 else if (CHAR_BYTE8_P (c))
5322 *dst++ = CHAR_TO_BYTE8 (c);
5323 else
5324 CHAR_STRING_ADVANCE (c, dst);
5327 else
5329 ASSURE_DESTINATION (charbuf_end - charbuf);
5330 while (charbuf < charbuf_end && dst < dst_end)
5331 *dst++ = *charbuf++;
5333 produced_chars = dst - (coding->destination + coding->produced);
5335 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5336 coding->produced_char += produced_chars;
5337 coding->produced = dst - coding->destination;
5338 return 0;
5341 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5342 Return true if a text is encoded in a charset-based coding system. */
5344 static bool
5345 detect_coding_charset (struct coding_system *coding,
5346 struct coding_detection_info *detect_info)
5348 const unsigned char *src = coding->source, *src_base;
5349 const unsigned char *src_end = coding->source + coding->src_bytes;
5350 bool multibytep = coding->src_multibyte;
5351 ptrdiff_t consumed_chars = 0;
5352 Lisp_Object attrs, valids, name;
5353 int found = 0;
5354 ptrdiff_t head_ascii = coding->head_ascii;
5355 bool check_latin_extra = 0;
5357 detect_info->checked |= CATEGORY_MASK_CHARSET;
5359 coding = &coding_categories[coding_category_charset];
5360 attrs = CODING_ID_ATTRS (coding->id);
5361 valids = AREF (attrs, coding_attr_charset_valids);
5362 name = CODING_ID_NAME (coding->id);
5363 if (strncmp (SSDATA (SYMBOL_NAME (name)),
5364 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5365 || strncmp (SSDATA (SYMBOL_NAME (name)),
5366 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5367 check_latin_extra = 1;
5369 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5370 src += head_ascii;
5372 while (1)
5374 int c;
5375 Lisp_Object val;
5376 struct charset *charset;
5377 int dim, idx;
5379 src_base = src;
5380 ONE_MORE_BYTE (c);
5381 if (c < 0)
5382 continue;
5383 val = AREF (valids, c);
5384 if (NILP (val))
5385 break;
5386 if (c >= 0x80)
5388 if (c < 0xA0
5389 && check_latin_extra
5390 && (!VECTORP (Vlatin_extra_code_table)
5391 || NILP (AREF (Vlatin_extra_code_table, c))))
5392 break;
5393 found = CATEGORY_MASK_CHARSET;
5395 if (INTEGERP (val))
5397 charset = CHARSET_FROM_ID (XFASTINT (val));
5398 dim = CHARSET_DIMENSION (charset);
5399 for (idx = 1; idx < dim; idx++)
5401 if (src == src_end)
5402 goto too_short;
5403 ONE_MORE_BYTE (c);
5404 if (c < charset->code_space[(dim - 1 - idx) * 4]
5405 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5406 break;
5408 if (idx < dim)
5409 break;
5411 else
5413 idx = 1;
5414 for (; CONSP (val); val = XCDR (val))
5416 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5417 dim = CHARSET_DIMENSION (charset);
5418 while (idx < dim)
5420 if (src == src_end)
5421 goto too_short;
5422 ONE_MORE_BYTE (c);
5423 if (c < charset->code_space[(dim - 1 - idx) * 4]
5424 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5425 break;
5426 idx++;
5428 if (idx == dim)
5430 val = Qnil;
5431 break;
5434 if (CONSP (val))
5435 break;
5438 too_short:
5439 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5440 return 0;
5442 no_more_source:
5443 detect_info->found |= found;
5444 return 1;
5447 static void
5448 decode_coding_charset (struct coding_system *coding)
5450 const unsigned char *src = coding->source + coding->consumed;
5451 const unsigned char *src_end = coding->source + coding->src_bytes;
5452 const unsigned char *src_base;
5453 int *charbuf = coding->charbuf + coding->charbuf_used;
5454 /* We may produce one charset annotation in one loop and one more at
5455 the end. */
5456 int *charbuf_end
5457 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5458 ptrdiff_t consumed_chars = 0, consumed_chars_base;
5459 bool multibytep = coding->src_multibyte;
5460 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5461 Lisp_Object valids;
5462 ptrdiff_t char_offset = coding->produced_char;
5463 ptrdiff_t last_offset = char_offset;
5464 int last_id = charset_ascii;
5465 bool eol_dos
5466 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5467 int byte_after_cr = -1;
5469 valids = AREF (attrs, coding_attr_charset_valids);
5471 while (1)
5473 int c;
5474 Lisp_Object val;
5475 struct charset *charset;
5476 int dim;
5477 int len = 1;
5478 unsigned code;
5480 src_base = src;
5481 consumed_chars_base = consumed_chars;
5483 if (charbuf >= charbuf_end)
5485 if (byte_after_cr >= 0)
5486 src_base--;
5487 break;
5490 if (byte_after_cr >= 0)
5492 c = byte_after_cr;
5493 byte_after_cr = -1;
5495 else
5497 ONE_MORE_BYTE (c);
5498 if (eol_dos && c == '\r')
5499 ONE_MORE_BYTE (byte_after_cr);
5501 if (c < 0)
5502 goto invalid_code;
5503 code = c;
5505 val = AREF (valids, c);
5506 if (! INTEGERP (val) && ! CONSP (val))
5507 goto invalid_code;
5508 if (INTEGERP (val))
5510 charset = CHARSET_FROM_ID (XFASTINT (val));
5511 dim = CHARSET_DIMENSION (charset);
5512 while (len < dim)
5514 ONE_MORE_BYTE (c);
5515 code = (code << 8) | c;
5516 len++;
5518 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5519 charset, code, c);
5521 else
5523 /* VAL is a list of charset IDs. It is assured that the
5524 list is sorted by charset dimensions (smaller one
5525 comes first). */
5526 while (CONSP (val))
5528 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5529 dim = CHARSET_DIMENSION (charset);
5530 while (len < dim)
5532 ONE_MORE_BYTE (c);
5533 code = (code << 8) | c;
5534 len++;
5536 CODING_DECODE_CHAR (coding, src, src_base,
5537 src_end, charset, code, c);
5538 if (c >= 0)
5539 break;
5540 val = XCDR (val);
5543 if (c < 0)
5544 goto invalid_code;
5545 if (charset->id != charset_ascii
5546 && last_id != charset->id)
5548 if (last_id != charset_ascii)
5549 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5550 last_id = charset->id;
5551 last_offset = char_offset;
5554 *charbuf++ = c;
5555 char_offset++;
5556 continue;
5558 invalid_code:
5559 src = src_base;
5560 consumed_chars = consumed_chars_base;
5561 ONE_MORE_BYTE (c);
5562 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5563 char_offset++;
5564 coding->errors++;
5567 no_more_source:
5568 if (last_id != charset_ascii)
5569 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5570 coding->consumed_char += consumed_chars_base;
5571 coding->consumed = src_base - coding->source;
5572 coding->charbuf_used = charbuf - coding->charbuf;
5575 static bool
5576 encode_coding_charset (struct coding_system *coding)
5578 bool multibytep = coding->dst_multibyte;
5579 int *charbuf = coding->charbuf;
5580 int *charbuf_end = charbuf + coding->charbuf_used;
5581 unsigned char *dst = coding->destination + coding->produced;
5582 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5583 int safe_room = MAX_MULTIBYTE_LENGTH;
5584 ptrdiff_t produced_chars = 0;
5585 Lisp_Object attrs, charset_list;
5586 bool ascii_compatible;
5587 int c;
5589 CODING_GET_INFO (coding, attrs, charset_list);
5590 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5592 while (charbuf < charbuf_end)
5594 struct charset *charset;
5595 unsigned code;
5597 ASSURE_DESTINATION (safe_room);
5598 c = *charbuf++;
5599 if (ascii_compatible && ASCII_CHAR_P (c))
5600 EMIT_ONE_ASCII_BYTE (c);
5601 else if (CHAR_BYTE8_P (c))
5603 c = CHAR_TO_BYTE8 (c);
5604 EMIT_ONE_BYTE (c);
5606 else
5608 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5609 &code, charset);
5611 if (charset)
5613 if (CHARSET_DIMENSION (charset) == 1)
5614 EMIT_ONE_BYTE (code);
5615 else if (CHARSET_DIMENSION (charset) == 2)
5616 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5617 else if (CHARSET_DIMENSION (charset) == 3)
5618 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5619 else
5620 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5621 (code >> 8) & 0xFF, code & 0xFF);
5623 else
5625 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5626 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5627 else
5628 c = coding->default_char;
5629 EMIT_ONE_BYTE (c);
5634 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5635 coding->produced_char += produced_chars;
5636 coding->produced = dst - coding->destination;
5637 return 0;
5641 /*** 7. C library functions ***/
5643 /* Setup coding context CODING from information about CODING_SYSTEM.
5644 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5645 CODING_SYSTEM is invalid, signal an error. */
5647 void
5648 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5650 Lisp_Object attrs;
5651 Lisp_Object eol_type;
5652 Lisp_Object coding_type;
5653 Lisp_Object val;
5655 if (NILP (coding_system))
5656 coding_system = Qundecided;
5658 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5660 attrs = CODING_ID_ATTRS (coding->id);
5661 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5663 coding->mode = 0;
5664 if (VECTORP (eol_type))
5665 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5666 | CODING_REQUIRE_DETECTION_MASK);
5667 else if (! EQ (eol_type, Qunix))
5668 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5669 | CODING_REQUIRE_ENCODING_MASK);
5670 else
5671 coding->common_flags = 0;
5672 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5673 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5674 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5675 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5676 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5677 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5679 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5680 coding->max_charset_id = SCHARS (val) - 1;
5681 coding->safe_charsets = SDATA (val);
5682 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5683 coding->carryover_bytes = 0;
5685 coding_type = CODING_ATTR_TYPE (attrs);
5686 if (EQ (coding_type, Qundecided))
5688 coding->detector = NULL;
5689 coding->decoder = decode_coding_raw_text;
5690 coding->encoder = encode_coding_raw_text;
5691 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5693 else if (EQ (coding_type, Qiso_2022))
5695 int i;
5696 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5698 /* Invoke graphic register 0 to plane 0. */
5699 CODING_ISO_INVOCATION (coding, 0) = 0;
5700 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5701 CODING_ISO_INVOCATION (coding, 1)
5702 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5703 /* Setup the initial status of designation. */
5704 for (i = 0; i < 4; i++)
5705 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5706 /* Not single shifting initially. */
5707 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5708 /* Beginning of buffer should also be regarded as bol. */
5709 CODING_ISO_BOL (coding) = 1;
5710 coding->detector = detect_coding_iso_2022;
5711 coding->decoder = decode_coding_iso_2022;
5712 coding->encoder = encode_coding_iso_2022;
5713 if (flags & CODING_ISO_FLAG_SAFE)
5714 coding->mode |= CODING_MODE_SAFE_ENCODING;
5715 coding->common_flags
5716 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5717 | CODING_REQUIRE_FLUSHING_MASK);
5718 if (flags & CODING_ISO_FLAG_COMPOSITION)
5719 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5720 if (flags & CODING_ISO_FLAG_DESIGNATION)
5721 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5722 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5724 setup_iso_safe_charsets (attrs);
5725 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5726 coding->max_charset_id = SCHARS (val) - 1;
5727 coding->safe_charsets = SDATA (val);
5729 CODING_ISO_FLAGS (coding) = flags;
5730 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5731 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5732 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5733 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5735 else if (EQ (coding_type, Qcharset))
5737 coding->detector = detect_coding_charset;
5738 coding->decoder = decode_coding_charset;
5739 coding->encoder = encode_coding_charset;
5740 coding->common_flags
5741 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5743 else if (EQ (coding_type, Qutf_8))
5745 val = AREF (attrs, coding_attr_utf_bom);
5746 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5747 : EQ (val, Qt) ? utf_with_bom
5748 : utf_without_bom);
5749 coding->detector = detect_coding_utf_8;
5750 coding->decoder = decode_coding_utf_8;
5751 coding->encoder = encode_coding_utf_8;
5752 coding->common_flags
5753 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5754 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5755 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5757 else if (EQ (coding_type, Qutf_16))
5759 val = AREF (attrs, coding_attr_utf_bom);
5760 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5761 : EQ (val, Qt) ? utf_with_bom
5762 : utf_without_bom);
5763 val = AREF (attrs, coding_attr_utf_16_endian);
5764 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5765 : utf_16_little_endian);
5766 CODING_UTF_16_SURROGATE (coding) = 0;
5767 coding->detector = detect_coding_utf_16;
5768 coding->decoder = decode_coding_utf_16;
5769 coding->encoder = encode_coding_utf_16;
5770 coding->common_flags
5771 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5772 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5773 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5775 else if (EQ (coding_type, Qccl))
5777 coding->detector = detect_coding_ccl;
5778 coding->decoder = decode_coding_ccl;
5779 coding->encoder = encode_coding_ccl;
5780 coding->common_flags
5781 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5782 | CODING_REQUIRE_FLUSHING_MASK);
5784 else if (EQ (coding_type, Qemacs_mule))
5786 coding->detector = detect_coding_emacs_mule;
5787 coding->decoder = decode_coding_emacs_mule;
5788 coding->encoder = encode_coding_emacs_mule;
5789 coding->common_flags
5790 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5791 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5792 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5794 Lisp_Object tail, safe_charsets;
5795 int max_charset_id = 0;
5797 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5798 tail = XCDR (tail))
5799 if (max_charset_id < XFASTINT (XCAR (tail)))
5800 max_charset_id = XFASTINT (XCAR (tail));
5801 safe_charsets = make_uninit_string (max_charset_id + 1);
5802 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5803 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5804 tail = XCDR (tail))
5805 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5806 coding->max_charset_id = max_charset_id;
5807 coding->safe_charsets = SDATA (safe_charsets);
5809 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5810 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5812 else if (EQ (coding_type, Qshift_jis))
5814 coding->detector = detect_coding_sjis;
5815 coding->decoder = decode_coding_sjis;
5816 coding->encoder = encode_coding_sjis;
5817 coding->common_flags
5818 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5820 else if (EQ (coding_type, Qbig5))
5822 coding->detector = detect_coding_big5;
5823 coding->decoder = decode_coding_big5;
5824 coding->encoder = encode_coding_big5;
5825 coding->common_flags
5826 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5828 else /* EQ (coding_type, Qraw_text) */
5830 coding->detector = NULL;
5831 coding->decoder = decode_coding_raw_text;
5832 coding->encoder = encode_coding_raw_text;
5833 if (! EQ (eol_type, Qunix))
5835 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5836 if (! VECTORP (eol_type))
5837 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5842 return;
5845 /* Return a list of charsets supported by CODING. */
5847 Lisp_Object
5848 coding_charset_list (struct coding_system *coding)
5850 Lisp_Object attrs, charset_list;
5852 CODING_GET_INFO (coding, attrs, charset_list);
5853 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5855 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5857 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5858 charset_list = Viso_2022_charset_list;
5860 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5862 charset_list = Vemacs_mule_charset_list;
5864 return charset_list;
5868 /* Return a list of charsets supported by CODING-SYSTEM. */
5870 Lisp_Object
5871 coding_system_charset_list (Lisp_Object coding_system)
5873 ptrdiff_t id;
5874 Lisp_Object attrs, charset_list;
5876 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5877 attrs = CODING_ID_ATTRS (id);
5879 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5881 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5883 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5884 charset_list = Viso_2022_charset_list;
5885 else
5886 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5888 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5890 charset_list = Vemacs_mule_charset_list;
5892 else
5894 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5896 return charset_list;
5900 /* Return raw-text or one of its subsidiaries that has the same
5901 eol_type as CODING-SYSTEM. */
5903 Lisp_Object
5904 raw_text_coding_system (Lisp_Object coding_system)
5906 Lisp_Object spec, attrs;
5907 Lisp_Object eol_type, raw_text_eol_type;
5909 if (NILP (coding_system))
5910 return Qraw_text;
5911 spec = CODING_SYSTEM_SPEC (coding_system);
5912 attrs = AREF (spec, 0);
5914 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5915 return coding_system;
5917 eol_type = AREF (spec, 2);
5918 if (VECTORP (eol_type))
5919 return Qraw_text;
5920 spec = CODING_SYSTEM_SPEC (Qraw_text);
5921 raw_text_eol_type = AREF (spec, 2);
5922 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5923 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5924 : AREF (raw_text_eol_type, 2));
5928 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5929 the subsidiary that has the same eol-spec as PARENT (if it is not
5930 nil and specifies end-of-line format) or the system's setting
5931 (system_eol_type). */
5933 Lisp_Object
5934 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5936 Lisp_Object spec, eol_type;
5938 if (NILP (coding_system))
5939 coding_system = Qraw_text;
5940 spec = CODING_SYSTEM_SPEC (coding_system);
5941 eol_type = AREF (spec, 2);
5942 if (VECTORP (eol_type))
5944 Lisp_Object parent_eol_type;
5946 if (! NILP (parent))
5948 Lisp_Object parent_spec;
5950 parent_spec = CODING_SYSTEM_SPEC (parent);
5951 parent_eol_type = AREF (parent_spec, 2);
5952 if (VECTORP (parent_eol_type))
5953 parent_eol_type = system_eol_type;
5955 else
5956 parent_eol_type = system_eol_type;
5957 if (EQ (parent_eol_type, Qunix))
5958 coding_system = AREF (eol_type, 0);
5959 else if (EQ (parent_eol_type, Qdos))
5960 coding_system = AREF (eol_type, 1);
5961 else if (EQ (parent_eol_type, Qmac))
5962 coding_system = AREF (eol_type, 2);
5964 return coding_system;
5968 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5969 decided for writing to a process. If not, complement them, and
5970 return a new coding system. */
5972 Lisp_Object
5973 complement_process_encoding_system (Lisp_Object coding_system)
5975 Lisp_Object coding_base = Qnil, eol_base = Qnil;
5976 Lisp_Object spec, attrs;
5977 int i;
5979 for (i = 0; i < 3; i++)
5981 if (i == 1)
5982 coding_system = CDR_SAFE (Vdefault_process_coding_system);
5983 else if (i == 2)
5984 coding_system = preferred_coding_system ();
5985 spec = CODING_SYSTEM_SPEC (coding_system);
5986 if (NILP (spec))
5987 continue;
5988 attrs = AREF (spec, 0);
5989 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5990 coding_base = CODING_ATTR_BASE_NAME (attrs);
5991 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5992 eol_base = coding_system;
5993 if (! NILP (coding_base) && ! NILP (eol_base))
5994 break;
5997 if (i > 0)
5998 /* The original CODING_SYSTEM didn't specify text-conversion or
5999 eol-conversion. Be sure that we return a fully complemented
6000 coding system. */
6001 coding_system = coding_inherit_eol_type (coding_base, eol_base);
6002 return coding_system;
6006 /* Emacs has a mechanism to automatically detect a coding system if it
6007 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6008 it's impossible to distinguish some coding systems accurately
6009 because they use the same range of codes. So, at first, coding
6010 systems are categorized into 7, those are:
6012 o coding-category-emacs-mule
6014 The category for a coding system which has the same code range
6015 as Emacs' internal format. Assigned the coding-system (Lisp
6016 symbol) `emacs-mule' by default.
6018 o coding-category-sjis
6020 The category for a coding system which has the same code range
6021 as SJIS. Assigned the coding-system (Lisp
6022 symbol) `japanese-shift-jis' by default.
6024 o coding-category-iso-7
6026 The category for a coding system which has the same code range
6027 as ISO2022 of 7-bit environment. This doesn't use any locking
6028 shift and single shift functions. This can encode/decode all
6029 charsets. Assigned the coding-system (Lisp symbol)
6030 `iso-2022-7bit' by default.
6032 o coding-category-iso-7-tight
6034 Same as coding-category-iso-7 except that this can
6035 encode/decode only the specified charsets.
6037 o coding-category-iso-8-1
6039 The category for a coding system which has the same code range
6040 as ISO2022 of 8-bit environment and graphic plane 1 used only
6041 for DIMENSION1 charset. This doesn't use any locking shift
6042 and single shift functions. Assigned the coding-system (Lisp
6043 symbol) `iso-latin-1' by default.
6045 o coding-category-iso-8-2
6047 The category for a coding system which has the same code range
6048 as ISO2022 of 8-bit environment and graphic plane 1 used only
6049 for DIMENSION2 charset. This doesn't use any locking shift
6050 and single shift functions. Assigned the coding-system (Lisp
6051 symbol) `japanese-iso-8bit' by default.
6053 o coding-category-iso-7-else
6055 The category for a coding system which has the same code range
6056 as ISO2022 of 7-bit environment but uses locking shift or
6057 single shift functions. Assigned the coding-system (Lisp
6058 symbol) `iso-2022-7bit-lock' by default.
6060 o coding-category-iso-8-else
6062 The category for a coding system which has the same code range
6063 as ISO2022 of 8-bit environment but uses locking shift or
6064 single shift functions. Assigned the coding-system (Lisp
6065 symbol) `iso-2022-8bit-ss2' by default.
6067 o coding-category-big5
6069 The category for a coding system which has the same code range
6070 as BIG5. Assigned the coding-system (Lisp symbol)
6071 `cn-big5' by default.
6073 o coding-category-utf-8
6075 The category for a coding system which has the same code range
6076 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6077 symbol) `utf-8' by default.
6079 o coding-category-utf-16-be
6081 The category for a coding system in which a text has an
6082 Unicode signature (cf. Unicode Standard) in the order of BIG
6083 endian at the head. Assigned the coding-system (Lisp symbol)
6084 `utf-16-be' by default.
6086 o coding-category-utf-16-le
6088 The category for a coding system in which a text has an
6089 Unicode signature (cf. Unicode Standard) in the order of
6090 LITTLE endian at the head. Assigned the coding-system (Lisp
6091 symbol) `utf-16-le' by default.
6093 o coding-category-ccl
6095 The category for a coding system of which encoder/decoder is
6096 written in CCL programs. The default value is nil, i.e., no
6097 coding system is assigned.
6099 o coding-category-binary
6101 The category for a coding system not categorized in any of the
6102 above. Assigned the coding-system (Lisp symbol)
6103 `no-conversion' by default.
6105 Each of them is a Lisp symbol and the value is an actual
6106 `coding-system's (this is also a Lisp symbol) assigned by a user.
6107 What Emacs does actually is to detect a category of coding system.
6108 Then, it uses a `coding-system' assigned to it. If Emacs can't
6109 decide only one possible category, it selects a category of the
6110 highest priority. Priorities of categories are also specified by a
6111 user in a Lisp variable `coding-category-list'.
6115 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6116 int eol_seen);
6119 /* Return the number of ASCII characters at the head of the source.
6120 By side effects, set coding->head_ascii and update
6121 coding->eol_seen. The value of coding->eol_seen is "logical or" of
6122 EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6123 reliable only when all the source bytes are ASCII. */
6125 static int
6126 check_ascii (struct coding_system *coding)
6128 const unsigned char *src, *end;
6129 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6130 int eol_seen = coding->eol_seen;
6132 coding_set_source (coding);
6133 src = coding->source;
6134 end = src + coding->src_bytes;
6136 if (inhibit_eol_conversion
6137 || SYMBOLP (eol_type))
6139 /* We don't have to check EOL format. */
6140 while (src < end && !( *src & 0x80))
6142 if (*src++ == '\n')
6143 eol_seen |= EOL_SEEN_LF;
6146 else
6148 end--; /* We look ahead one byte for "CR LF". */
6149 while (src < end)
6151 int c = *src;
6153 if (c & 0x80)
6154 break;
6155 src++;
6156 if (c == '\r')
6158 if (*src == '\n')
6160 eol_seen |= EOL_SEEN_CRLF;
6161 src++;
6163 else
6164 eol_seen |= EOL_SEEN_CR;
6166 else if (c == '\n')
6167 eol_seen |= EOL_SEEN_LF;
6169 if (src == end)
6171 int c = *src;
6173 /* All bytes but the last one C are ASCII. */
6174 if (! (c & 0x80))
6176 if (c == '\r')
6177 eol_seen |= EOL_SEEN_CR;
6178 else if (c == '\n')
6179 eol_seen |= EOL_SEEN_LF;
6180 src++;
6184 coding->head_ascii = src - coding->source;
6185 coding->eol_seen = eol_seen;
6186 return (coding->head_ascii);
6190 /* Return the number of characters at the source if all the bytes are
6191 valid UTF-8 (of Unicode range). Otherwise, return -1. By side
6192 effects, update coding->eol_seen. The value of coding->eol_seen is
6193 "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6194 the value is reliable only when all the source bytes are valid
6195 UTF-8. */
6197 static int
6198 check_utf_8 (struct coding_system *coding)
6200 const unsigned char *src, *end;
6201 int eol_seen;
6202 int nchars = coding->head_ascii;
6204 if (coding->head_ascii < 0)
6205 check_ascii (coding);
6206 else
6207 coding_set_source (coding);
6208 src = coding->source + coding->head_ascii;
6209 /* We look ahead one byte for CR LF. */
6210 end = coding->source + coding->src_bytes - 1;
6211 eol_seen = coding->eol_seen;
6212 while (src < end)
6214 int c = *src;
6216 if (UTF_8_1_OCTET_P (*src))
6218 src++;
6219 if (c < 0x20)
6221 if (c == '\r')
6223 if (*src == '\n')
6225 eol_seen |= EOL_SEEN_CRLF;
6226 src++;
6227 nchars++;
6229 else
6230 eol_seen |= EOL_SEEN_CR;
6232 else if (c == '\n')
6233 eol_seen |= EOL_SEEN_LF;
6236 else if (UTF_8_2_OCTET_LEADING_P (c))
6238 if (c < 0xC2 /* overlong sequence */
6239 || src + 1 >= end
6240 || ! UTF_8_EXTRA_OCTET_P (src[1]))
6241 return -1;
6242 src += 2;
6244 else if (UTF_8_3_OCTET_LEADING_P (c))
6246 if (src + 2 >= end
6247 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6248 && UTF_8_EXTRA_OCTET_P (src[2])))
6249 return -1;
6250 c = (((c & 0xF) << 12)
6251 | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6252 if (c < 0x800 /* overlong sequence */
6253 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6254 return -1;
6255 src += 3;
6257 else if (UTF_8_4_OCTET_LEADING_P (c))
6259 if (src + 3 >= end
6260 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6261 && UTF_8_EXTRA_OCTET_P (src[2])
6262 && UTF_8_EXTRA_OCTET_P (src[3])))
6263 return -1;
6264 c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6265 | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6266 if (c < 0x10000 /* overlong sequence */
6267 || c >= 0x110000) /* non-Unicode character */
6268 return -1;
6269 src += 4;
6271 else
6272 return -1;
6273 nchars++;
6276 if (src == end)
6278 if (! UTF_8_1_OCTET_P (*src))
6279 return -1;
6280 nchars++;
6281 if (*src == '\r')
6282 eol_seen |= EOL_SEEN_CR;
6283 else if (*src == '\n')
6284 eol_seen |= EOL_SEEN_LF;
6286 coding->eol_seen = eol_seen;
6287 return nchars;
6291 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6292 SOURCE is encoded. If CATEGORY is one of
6293 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6294 two-byte, else they are encoded by one-byte.
6296 Return one of EOL_SEEN_XXX. */
6298 #define MAX_EOL_CHECK_COUNT 3
6300 static int
6301 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6302 enum coding_category category)
6304 const unsigned char *src = source, *src_end = src + src_bytes;
6305 unsigned char c;
6306 int total = 0;
6307 int eol_seen = EOL_SEEN_NONE;
6309 if ((1 << category) & CATEGORY_MASK_UTF_16)
6311 bool msb = category == (coding_category_utf_16_le
6312 | coding_category_utf_16_le_nosig);
6313 bool lsb = !msb;
6315 while (src + 1 < src_end)
6317 c = src[lsb];
6318 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6320 int this_eol;
6322 if (c == '\n')
6323 this_eol = EOL_SEEN_LF;
6324 else if (src + 3 >= src_end
6325 || src[msb + 2] != 0
6326 || src[lsb + 2] != '\n')
6327 this_eol = EOL_SEEN_CR;
6328 else
6330 this_eol = EOL_SEEN_CRLF;
6331 src += 2;
6334 if (eol_seen == EOL_SEEN_NONE)
6335 /* This is the first end-of-line. */
6336 eol_seen = this_eol;
6337 else if (eol_seen != this_eol)
6339 /* The found type is different from what found before.
6340 Allow for stray ^M characters in DOS EOL files. */
6341 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6342 || (eol_seen == EOL_SEEN_CRLF
6343 && this_eol == EOL_SEEN_CR))
6344 eol_seen = EOL_SEEN_CRLF;
6345 else
6347 eol_seen = EOL_SEEN_LF;
6348 break;
6351 if (++total == MAX_EOL_CHECK_COUNT)
6352 break;
6354 src += 2;
6357 else
6358 while (src < src_end)
6360 c = *src++;
6361 if (c == '\n' || c == '\r')
6363 int this_eol;
6365 if (c == '\n')
6366 this_eol = EOL_SEEN_LF;
6367 else if (src >= src_end || *src != '\n')
6368 this_eol = EOL_SEEN_CR;
6369 else
6370 this_eol = EOL_SEEN_CRLF, src++;
6372 if (eol_seen == EOL_SEEN_NONE)
6373 /* This is the first end-of-line. */
6374 eol_seen = this_eol;
6375 else if (eol_seen != this_eol)
6377 /* The found type is different from what found before.
6378 Allow for stray ^M characters in DOS EOL files. */
6379 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6380 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6381 eol_seen = EOL_SEEN_CRLF;
6382 else
6384 eol_seen = EOL_SEEN_LF;
6385 break;
6388 if (++total == MAX_EOL_CHECK_COUNT)
6389 break;
6392 return eol_seen;
6396 static Lisp_Object
6397 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6399 Lisp_Object eol_type;
6401 eol_type = CODING_ID_EOL_TYPE (coding->id);
6402 if (! VECTORP (eol_type))
6403 /* Already adjusted. */
6404 return eol_type;
6405 if (eol_seen & EOL_SEEN_LF)
6407 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6408 eol_type = Qunix;
6410 else if (eol_seen & EOL_SEEN_CRLF)
6412 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6413 eol_type = Qdos;
6415 else if (eol_seen & EOL_SEEN_CR)
6417 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6418 eol_type = Qmac;
6420 return eol_type;
6423 /* Detect how a text specified in CODING is encoded. If a coding
6424 system is detected, update fields of CODING by the detected coding
6425 system. */
6427 static void
6428 detect_coding (struct coding_system *coding)
6430 const unsigned char *src, *src_end;
6431 unsigned int saved_mode = coding->mode;
6432 Lisp_Object found = Qnil;
6433 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6435 coding->consumed = coding->consumed_char = 0;
6436 coding->produced = coding->produced_char = 0;
6437 coding_set_source (coding);
6439 src_end = coding->source + coding->src_bytes;
6441 coding->eol_seen = EOL_SEEN_NONE;
6442 /* If we have not yet decided the text encoding type, detect it
6443 now. */
6444 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6446 int c, i;
6447 struct coding_detection_info detect_info;
6448 bool null_byte_found = 0, eight_bit_found = 0;
6450 coding->head_ascii = 0;
6451 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6452 for (src = coding->source; src < src_end; src++)
6454 c = *src;
6455 if (c & 0x80)
6457 eight_bit_found = 1;
6458 if (null_byte_found)
6459 break;
6461 else if (c < 0x20)
6463 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6464 && ! inhibit_iso_escape_detection
6465 && ! detect_info.checked)
6467 if (detect_coding_iso_2022 (coding, &detect_info))
6469 /* We have scanned the whole data. */
6470 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6472 /* We didn't find an 8-bit code. We may
6473 have found a null-byte, but it's very
6474 rare that a binary file conforms to
6475 ISO-2022. */
6476 src = src_end;
6477 coding->head_ascii = src - coding->source;
6479 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6480 break;
6483 else if (! c && !inhibit_null_byte_detection)
6485 null_byte_found = 1;
6486 if (eight_bit_found)
6487 break;
6489 else if (! disable_ascii_optimization
6490 && ! inhibit_eol_conversion)
6492 if (c == '\r')
6494 if (src < src_end && src[1] == '\n')
6496 coding->eol_seen |= EOL_SEEN_CRLF;
6497 src++;
6498 if (! eight_bit_found)
6499 coding->head_ascii++;
6501 else
6502 coding->eol_seen |= EOL_SEEN_CR;
6504 else if (c == '\n')
6506 coding->eol_seen |= EOL_SEEN_LF;
6510 if (! eight_bit_found)
6511 coding->head_ascii++;
6513 else if (! eight_bit_found)
6514 coding->head_ascii++;
6517 if (null_byte_found || eight_bit_found
6518 || coding->head_ascii < coding->src_bytes
6519 || detect_info.found)
6521 enum coding_category category;
6522 struct coding_system *this;
6524 if (coding->head_ascii == coding->src_bytes)
6525 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6526 for (i = 0; i < coding_category_raw_text; i++)
6528 category = coding_priorities[i];
6529 this = coding_categories + category;
6530 if (detect_info.found & (1 << category))
6531 break;
6533 else
6535 if (null_byte_found)
6537 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6538 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6540 for (i = 0; i < coding_category_raw_text; i++)
6542 category = coding_priorities[i];
6543 this = coding_categories + category;
6544 /* Some of this->detector (e.g. detect_coding_sjis)
6545 require this information. */
6546 coding->id = this->id;
6547 if (this->id < 0)
6549 /* No coding system of this category is defined. */
6550 detect_info.rejected |= (1 << category);
6552 else if (category >= coding_category_raw_text)
6553 continue;
6554 else if (detect_info.checked & (1 << category))
6556 if (detect_info.found & (1 << category))
6557 break;
6559 else if ((*(this->detector)) (coding, &detect_info)
6560 && detect_info.found & (1 << category))
6561 break;
6565 if (i < coding_category_raw_text)
6567 if (category == coding_category_utf_8_auto)
6569 Lisp_Object coding_systems;
6571 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6572 coding_attr_utf_bom);
6573 if (CONSP (coding_systems))
6575 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6576 found = XCAR (coding_systems);
6577 else
6578 found = XCDR (coding_systems);
6580 else
6581 found = CODING_ID_NAME (this->id);
6583 else if (category == coding_category_utf_16_auto)
6585 Lisp_Object coding_systems;
6587 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6588 coding_attr_utf_bom);
6589 if (CONSP (coding_systems))
6591 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6592 found = XCAR (coding_systems);
6593 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6594 found = XCDR (coding_systems);
6596 else
6597 found = CODING_ID_NAME (this->id);
6599 else
6600 found = CODING_ID_NAME (this->id);
6602 else if (null_byte_found)
6603 found = Qno_conversion;
6604 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6605 == CATEGORY_MASK_ANY)
6606 found = Qraw_text;
6607 else if (detect_info.rejected)
6608 for (i = 0; i < coding_category_raw_text; i++)
6609 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6611 this = coding_categories + coding_priorities[i];
6612 found = CODING_ID_NAME (this->id);
6613 break;
6617 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6618 == coding_category_utf_8_auto)
6620 Lisp_Object coding_systems;
6621 struct coding_detection_info detect_info;
6623 coding_systems
6624 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6625 detect_info.found = detect_info.rejected = 0;
6626 if (check_ascii (coding) == coding->src_bytes)
6628 if (CONSP (coding_systems))
6629 found = XCDR (coding_systems);
6631 else
6633 if (CONSP (coding_systems)
6634 && detect_coding_utf_8 (coding, &detect_info))
6636 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6637 found = XCAR (coding_systems);
6638 else
6639 found = XCDR (coding_systems);
6643 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6644 == coding_category_utf_16_auto)
6646 Lisp_Object coding_systems;
6647 struct coding_detection_info detect_info;
6649 coding_systems
6650 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6651 detect_info.found = detect_info.rejected = 0;
6652 coding->head_ascii = 0;
6653 if (CONSP (coding_systems)
6654 && detect_coding_utf_16 (coding, &detect_info))
6656 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6657 found = XCAR (coding_systems);
6658 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6659 found = XCDR (coding_systems);
6663 if (! NILP (found))
6665 int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6666 : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6667 : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6668 : EOL_SEEN_LF);
6670 setup_coding_system (found, coding);
6671 if (specified_eol != EOL_SEEN_NONE)
6672 adjust_coding_eol_type (coding, specified_eol);
6675 coding->mode = saved_mode;
6679 static void
6680 decode_eol (struct coding_system *coding)
6682 Lisp_Object eol_type;
6683 unsigned char *p, *pbeg, *pend;
6685 eol_type = CODING_ID_EOL_TYPE (coding->id);
6686 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6687 return;
6689 if (NILP (coding->dst_object))
6690 pbeg = coding->destination;
6691 else
6692 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6693 pend = pbeg + coding->produced;
6695 if (VECTORP (eol_type))
6697 int eol_seen = EOL_SEEN_NONE;
6699 for (p = pbeg; p < pend; p++)
6701 if (*p == '\n')
6702 eol_seen |= EOL_SEEN_LF;
6703 else if (*p == '\r')
6705 if (p + 1 < pend && *(p + 1) == '\n')
6707 eol_seen |= EOL_SEEN_CRLF;
6708 p++;
6710 else
6711 eol_seen |= EOL_SEEN_CR;
6714 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6715 if ((eol_seen & EOL_SEEN_CRLF) != 0
6716 && (eol_seen & EOL_SEEN_CR) != 0
6717 && (eol_seen & EOL_SEEN_LF) == 0)
6718 eol_seen = EOL_SEEN_CRLF;
6719 else if (eol_seen != EOL_SEEN_NONE
6720 && eol_seen != EOL_SEEN_LF
6721 && eol_seen != EOL_SEEN_CRLF
6722 && eol_seen != EOL_SEEN_CR)
6723 eol_seen = EOL_SEEN_LF;
6724 if (eol_seen != EOL_SEEN_NONE)
6725 eol_type = adjust_coding_eol_type (coding, eol_seen);
6728 if (EQ (eol_type, Qmac))
6730 for (p = pbeg; p < pend; p++)
6731 if (*p == '\r')
6732 *p = '\n';
6734 else if (EQ (eol_type, Qdos))
6736 ptrdiff_t n = 0;
6738 if (NILP (coding->dst_object))
6740 /* Start deleting '\r' from the tail to minimize the memory
6741 movement. */
6742 for (p = pend - 2; p >= pbeg; p--)
6743 if (*p == '\r')
6745 memmove (p, p + 1, pend-- - p - 1);
6746 n++;
6749 else
6751 ptrdiff_t pos_byte = coding->dst_pos_byte;
6752 ptrdiff_t pos = coding->dst_pos;
6753 ptrdiff_t pos_end = pos + coding->produced_char - 1;
6755 while (pos < pos_end)
6757 p = BYTE_POS_ADDR (pos_byte);
6758 if (*p == '\r' && p[1] == '\n')
6760 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6761 n++;
6762 pos_end--;
6764 pos++;
6765 if (coding->dst_multibyte)
6766 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6767 else
6768 pos_byte++;
6771 coding->produced -= n;
6772 coding->produced_char -= n;
6777 /* Return a translation table (or list of them) from coding system
6778 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6779 not ENCODEP). */
6781 static Lisp_Object
6782 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6784 Lisp_Object standard, translation_table;
6785 Lisp_Object val;
6787 if (NILP (Venable_character_translation))
6789 if (max_lookup)
6790 *max_lookup = 0;
6791 return Qnil;
6793 if (encodep)
6794 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6795 standard = Vstandard_translation_table_for_encode;
6796 else
6797 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6798 standard = Vstandard_translation_table_for_decode;
6799 if (NILP (translation_table))
6800 translation_table = standard;
6801 else
6803 if (SYMBOLP (translation_table))
6804 translation_table = Fget (translation_table, Qtranslation_table);
6805 else if (CONSP (translation_table))
6807 translation_table = Fcopy_sequence (translation_table);
6808 for (val = translation_table; CONSP (val); val = XCDR (val))
6809 if (SYMBOLP (XCAR (val)))
6810 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6812 if (CHAR_TABLE_P (standard))
6814 if (CONSP (translation_table))
6815 translation_table = nconc2 (translation_table,
6816 Fcons (standard, Qnil));
6817 else
6818 translation_table = Fcons (translation_table,
6819 Fcons (standard, Qnil));
6823 if (max_lookup)
6825 *max_lookup = 1;
6826 if (CHAR_TABLE_P (translation_table)
6827 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6829 val = XCHAR_TABLE (translation_table)->extras[1];
6830 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6831 *max_lookup = XFASTINT (val);
6833 else if (CONSP (translation_table))
6835 Lisp_Object tail;
6837 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6838 if (CHAR_TABLE_P (XCAR (tail))
6839 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6841 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6842 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6843 *max_lookup = XFASTINT (tailval);
6847 return translation_table;
6850 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6851 do { \
6852 trans = Qnil; \
6853 if (CHAR_TABLE_P (table)) \
6855 trans = CHAR_TABLE_REF (table, c); \
6856 if (CHARACTERP (trans)) \
6857 c = XFASTINT (trans), trans = Qnil; \
6859 else if (CONSP (table)) \
6861 Lisp_Object tail; \
6863 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6864 if (CHAR_TABLE_P (XCAR (tail))) \
6866 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6867 if (CHARACTERP (trans)) \
6868 c = XFASTINT (trans), trans = Qnil; \
6869 else if (! NILP (trans)) \
6870 break; \
6873 } while (0)
6876 /* Return a translation of character(s) at BUF according to TRANS.
6877 TRANS is TO-CHAR or ((FROM . TO) ...) where
6878 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6879 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6880 translation is found, and Qnil if not found..
6881 If BUF is too short to lookup characters in FROM, return Qt. */
6883 static Lisp_Object
6884 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6887 if (INTEGERP (trans))
6888 return trans;
6889 for (; CONSP (trans); trans = XCDR (trans))
6891 Lisp_Object val = XCAR (trans);
6892 Lisp_Object from = XCAR (val);
6893 ptrdiff_t len = ASIZE (from);
6894 ptrdiff_t i;
6896 for (i = 0; i < len; i++)
6898 if (buf + i == buf_end)
6899 return Qt;
6900 if (XINT (AREF (from, i)) != buf[i])
6901 break;
6903 if (i == len)
6904 return val;
6906 return Qnil;
6910 static int
6911 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6912 bool last_block)
6914 unsigned char *dst = coding->destination + coding->produced;
6915 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6916 ptrdiff_t produced;
6917 ptrdiff_t produced_chars = 0;
6918 int carryover = 0;
6920 if (! coding->chars_at_source)
6922 /* Source characters are in coding->charbuf. */
6923 int *buf = coding->charbuf;
6924 int *buf_end = buf + coding->charbuf_used;
6926 if (EQ (coding->src_object, coding->dst_object))
6928 coding_set_source (coding);
6929 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6932 while (buf < buf_end)
6934 int c = *buf;
6935 ptrdiff_t i;
6937 if (c >= 0)
6939 ptrdiff_t from_nchars = 1, to_nchars = 1;
6940 Lisp_Object trans = Qnil;
6942 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6943 if (! NILP (trans))
6945 trans = get_translation (trans, buf, buf_end);
6946 if (INTEGERP (trans))
6947 c = XINT (trans);
6948 else if (CONSP (trans))
6950 from_nchars = ASIZE (XCAR (trans));
6951 trans = XCDR (trans);
6952 if (INTEGERP (trans))
6953 c = XINT (trans);
6954 else
6956 to_nchars = ASIZE (trans);
6957 c = XINT (AREF (trans, 0));
6960 else if (EQ (trans, Qt) && ! last_block)
6961 break;
6964 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6966 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6967 / MAX_MULTIBYTE_LENGTH)
6968 < to_nchars)
6969 memory_full (SIZE_MAX);
6970 dst = alloc_destination (coding,
6971 buf_end - buf
6972 + MAX_MULTIBYTE_LENGTH * to_nchars,
6973 dst);
6974 if (EQ (coding->src_object, coding->dst_object))
6976 coding_set_source (coding);
6977 dst_end = (((unsigned char *) coding->source)
6978 + coding->consumed);
6980 else
6981 dst_end = coding->destination + coding->dst_bytes;
6984 for (i = 0; i < to_nchars; i++)
6986 if (i > 0)
6987 c = XINT (AREF (trans, i));
6988 if (coding->dst_multibyte
6989 || ! CHAR_BYTE8_P (c))
6990 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6991 else
6992 *dst++ = CHAR_TO_BYTE8 (c);
6994 produced_chars += to_nchars;
6995 buf += from_nchars;
6997 else
6998 /* This is an annotation datum. (-C) is the length. */
6999 buf += -c;
7001 carryover = buf_end - buf;
7003 else
7005 /* Source characters are at coding->source. */
7006 const unsigned char *src = coding->source;
7007 const unsigned char *src_end = src + coding->consumed;
7009 if (EQ (coding->dst_object, coding->src_object))
7010 dst_end = (unsigned char *) src;
7011 if (coding->src_multibyte != coding->dst_multibyte)
7013 if (coding->src_multibyte)
7015 bool multibytep = 1;
7016 ptrdiff_t consumed_chars = 0;
7018 while (1)
7020 const unsigned char *src_base = src;
7021 int c;
7023 ONE_MORE_BYTE (c);
7024 if (dst == dst_end)
7026 if (EQ (coding->src_object, coding->dst_object))
7027 dst_end = (unsigned char *) src;
7028 if (dst == dst_end)
7030 ptrdiff_t offset = src - coding->source;
7032 dst = alloc_destination (coding, src_end - src + 1,
7033 dst);
7034 dst_end = coding->destination + coding->dst_bytes;
7035 coding_set_source (coding);
7036 src = coding->source + offset;
7037 src_end = coding->source + coding->consumed;
7038 if (EQ (coding->src_object, coding->dst_object))
7039 dst_end = (unsigned char *) src;
7042 *dst++ = c;
7043 produced_chars++;
7045 no_more_source:
7048 else
7049 while (src < src_end)
7051 bool multibytep = 1;
7052 int c = *src++;
7054 if (dst >= dst_end - 1)
7056 if (EQ (coding->src_object, coding->dst_object))
7057 dst_end = (unsigned char *) src;
7058 if (dst >= dst_end - 1)
7060 ptrdiff_t offset = src - coding->source;
7061 ptrdiff_t more_bytes;
7063 if (EQ (coding->src_object, coding->dst_object))
7064 more_bytes = ((src_end - src) / 2) + 2;
7065 else
7066 more_bytes = src_end - src + 2;
7067 dst = alloc_destination (coding, more_bytes, dst);
7068 dst_end = coding->destination + coding->dst_bytes;
7069 coding_set_source (coding);
7070 src = coding->source + offset;
7071 src_end = coding->source + coding->consumed;
7072 if (EQ (coding->src_object, coding->dst_object))
7073 dst_end = (unsigned char *) src;
7076 EMIT_ONE_BYTE (c);
7079 else
7081 if (!EQ (coding->src_object, coding->dst_object))
7083 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7085 if (require > 0)
7087 ptrdiff_t offset = src - coding->source;
7089 dst = alloc_destination (coding, require, dst);
7090 coding_set_source (coding);
7091 src = coding->source + offset;
7092 src_end = coding->source + coding->consumed;
7095 produced_chars = coding->consumed_char;
7096 while (src < src_end)
7097 *dst++ = *src++;
7101 produced = dst - (coding->destination + coding->produced);
7102 if (BUFFERP (coding->dst_object) && produced_chars > 0)
7103 insert_from_gap (produced_chars, produced, 0);
7104 coding->produced += produced;
7105 coding->produced_char += produced_chars;
7106 return carryover;
7109 /* Compose text in CODING->object according to the annotation data at
7110 CHARBUF. CHARBUF is an array:
7111 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7114 static void
7115 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7117 int len;
7118 ptrdiff_t to;
7119 enum composition_method method;
7120 Lisp_Object components;
7122 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7123 to = pos + charbuf[2];
7124 method = (enum composition_method) (charbuf[4]);
7126 if (method == COMPOSITION_RELATIVE)
7127 components = Qnil;
7128 else
7130 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7131 int i, j;
7133 if (method == COMPOSITION_WITH_RULE)
7134 len = charbuf[2] * 3 - 2;
7135 charbuf += MAX_ANNOTATION_LENGTH;
7136 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7137 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7139 if (charbuf[i] >= 0)
7140 args[j] = make_number (charbuf[i]);
7141 else
7143 i++;
7144 args[j] = make_number (charbuf[i] % 0x100);
7147 components = (i == j ? Fstring (j, args) : Fvector (j, args));
7149 compose_text (pos, to, components, Qnil, coding->dst_object);
7153 /* Put `charset' property on text in CODING->object according to
7154 the annotation data at CHARBUF. CHARBUF is an array:
7155 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7158 static void
7159 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7161 ptrdiff_t from = pos - charbuf[2];
7162 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7164 Fput_text_property (make_number (from), make_number (pos),
7165 Qcharset, CHARSET_NAME (charset),
7166 coding->dst_object);
7170 #define CHARBUF_SIZE 0x4000
7172 #define ALLOC_CONVERSION_WORK_AREA(coding) \
7173 do { \
7174 coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int)); \
7175 coding->charbuf_size = CHARBUF_SIZE; \
7176 } while (0)
7179 static void
7180 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7182 int *charbuf = coding->charbuf;
7183 int *charbuf_end = charbuf + coding->charbuf_used;
7185 if (NILP (coding->dst_object))
7186 return;
7188 while (charbuf < charbuf_end)
7190 if (*charbuf >= 0)
7191 pos++, charbuf++;
7192 else
7194 int len = -*charbuf;
7196 if (len > 2)
7197 switch (charbuf[1])
7199 case CODING_ANNOTATE_COMPOSITION_MASK:
7200 produce_composition (coding, charbuf, pos);
7201 break;
7202 case CODING_ANNOTATE_CHARSET_MASK:
7203 produce_charset (coding, charbuf, pos);
7204 break;
7206 charbuf += len;
7211 /* Decode the data at CODING->src_object into CODING->dst_object.
7212 CODING->src_object is a buffer, a string, or nil.
7213 CODING->dst_object is a buffer.
7215 If CODING->src_object is a buffer, it must be the current buffer.
7216 In this case, if CODING->src_pos is positive, it is a position of
7217 the source text in the buffer, otherwise, the source text is in the
7218 gap area of the buffer, and CODING->src_pos specifies the offset of
7219 the text from GPT (which must be the same as PT). If this is the
7220 same buffer as CODING->dst_object, CODING->src_pos must be
7221 negative.
7223 If CODING->src_object is a string, CODING->src_pos is an index to
7224 that string.
7226 If CODING->src_object is nil, CODING->source must already point to
7227 the non-relocatable memory area. In this case, CODING->src_pos is
7228 an offset from CODING->source.
7230 The decoded data is inserted at the current point of the buffer
7231 CODING->dst_object.
7234 static void
7235 decode_coding (struct coding_system *coding)
7237 Lisp_Object attrs;
7238 Lisp_Object undo_list;
7239 Lisp_Object translation_table;
7240 struct ccl_spec cclspec;
7241 int carryover;
7242 int i;
7244 USE_SAFE_ALLOCA;
7246 if (BUFFERP (coding->src_object)
7247 && coding->src_pos > 0
7248 && coding->src_pos < GPT
7249 && coding->src_pos + coding->src_chars > GPT)
7250 move_gap_both (coding->src_pos, coding->src_pos_byte);
7252 undo_list = Qt;
7253 if (BUFFERP (coding->dst_object))
7255 set_buffer_internal (XBUFFER (coding->dst_object));
7256 if (GPT != PT)
7257 move_gap_both (PT, PT_BYTE);
7259 /* We must disable undo_list in order to record the whole insert
7260 transaction via record_insert at the end. But doing so also
7261 disables the recording of the first change to the undo_list.
7262 Therefore we check for first change here and record it via
7263 record_first_change if needed. */
7264 if (MODIFF <= SAVE_MODIFF)
7265 record_first_change ();
7267 undo_list = BVAR (current_buffer, undo_list);
7268 bset_undo_list (current_buffer, Qt);
7271 coding->consumed = coding->consumed_char = 0;
7272 coding->produced = coding->produced_char = 0;
7273 coding->chars_at_source = 0;
7274 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7275 coding->errors = 0;
7277 ALLOC_CONVERSION_WORK_AREA (coding);
7279 attrs = CODING_ID_ATTRS (coding->id);
7280 translation_table = get_translation_table (attrs, 0, NULL);
7282 carryover = 0;
7283 if (coding->decoder == decode_coding_ccl)
7285 coding->spec.ccl = &cclspec;
7286 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7290 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7292 coding_set_source (coding);
7293 coding->annotated = 0;
7294 coding->charbuf_used = carryover;
7295 (*(coding->decoder)) (coding);
7296 coding_set_destination (coding);
7297 carryover = produce_chars (coding, translation_table, 0);
7298 if (coding->annotated)
7299 produce_annotation (coding, pos);
7300 for (i = 0; i < carryover; i++)
7301 coding->charbuf[i]
7302 = coding->charbuf[coding->charbuf_used - carryover + i];
7304 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7305 || (coding->consumed < coding->src_bytes
7306 && (coding->result == CODING_RESULT_SUCCESS
7307 || coding->result == CODING_RESULT_INVALID_SRC)));
7309 if (carryover > 0)
7311 coding_set_destination (coding);
7312 coding->charbuf_used = carryover;
7313 produce_chars (coding, translation_table, 1);
7316 coding->carryover_bytes = 0;
7317 if (coding->consumed < coding->src_bytes)
7319 int nbytes = coding->src_bytes - coding->consumed;
7320 const unsigned char *src;
7322 coding_set_source (coding);
7323 coding_set_destination (coding);
7324 src = coding->source + coding->consumed;
7326 if (coding->mode & CODING_MODE_LAST_BLOCK)
7328 /* Flush out unprocessed data as binary chars. We are sure
7329 that the number of data is less than the size of
7330 coding->charbuf. */
7331 coding->charbuf_used = 0;
7332 coding->chars_at_source = 0;
7334 while (nbytes-- > 0)
7336 int c = *src++;
7338 if (c & 0x80)
7339 c = BYTE8_TO_CHAR (c);
7340 coding->charbuf[coding->charbuf_used++] = c;
7342 produce_chars (coding, Qnil, 1);
7344 else
7346 /* Record unprocessed bytes in coding->carryover. We are
7347 sure that the number of data is less than the size of
7348 coding->carryover. */
7349 unsigned char *p = coding->carryover;
7351 if (nbytes > sizeof coding->carryover)
7352 nbytes = sizeof coding->carryover;
7353 coding->carryover_bytes = nbytes;
7354 while (nbytes-- > 0)
7355 *p++ = *src++;
7357 coding->consumed = coding->src_bytes;
7360 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7361 && !inhibit_eol_conversion)
7362 decode_eol (coding);
7363 if (BUFFERP (coding->dst_object))
7365 bset_undo_list (current_buffer, undo_list);
7366 record_insert (coding->dst_pos, coding->produced_char);
7369 SAFE_FREE ();
7373 /* Extract an annotation datum from a composition starting at POS and
7374 ending before LIMIT of CODING->src_object (buffer or string), store
7375 the data in BUF, set *STOP to a starting position of the next
7376 composition (if any) or to LIMIT, and return the address of the
7377 next element of BUF.
7379 If such an annotation is not found, set *STOP to a starting
7380 position of a composition after POS (if any) or to LIMIT, and
7381 return BUF. */
7383 static int *
7384 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7385 struct coding_system *coding, int *buf,
7386 ptrdiff_t *stop)
7388 ptrdiff_t start, end;
7389 Lisp_Object prop;
7391 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7392 || end > limit)
7393 *stop = limit;
7394 else if (start > pos)
7395 *stop = start;
7396 else
7398 if (start == pos)
7400 /* We found a composition. Store the corresponding
7401 annotation data in BUF. */
7402 int *head = buf;
7403 enum composition_method method = COMPOSITION_METHOD (prop);
7404 int nchars = COMPOSITION_LENGTH (prop);
7406 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7407 if (method != COMPOSITION_RELATIVE)
7409 Lisp_Object components;
7410 ptrdiff_t i, len, i_byte;
7412 components = COMPOSITION_COMPONENTS (prop);
7413 if (VECTORP (components))
7415 len = ASIZE (components);
7416 for (i = 0; i < len; i++)
7417 *buf++ = XINT (AREF (components, i));
7419 else if (STRINGP (components))
7421 len = SCHARS (components);
7422 i = i_byte = 0;
7423 while (i < len)
7425 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7426 buf++;
7429 else if (INTEGERP (components))
7431 len = 1;
7432 *buf++ = XINT (components);
7434 else if (CONSP (components))
7436 for (len = 0; CONSP (components);
7437 len++, components = XCDR (components))
7438 *buf++ = XINT (XCAR (components));
7440 else
7441 emacs_abort ();
7442 *head -= len;
7446 if (find_composition (end, limit, &start, &end, &prop,
7447 coding->src_object)
7448 && end <= limit)
7449 *stop = start;
7450 else
7451 *stop = limit;
7453 return buf;
7457 /* Extract an annotation datum from a text property `charset' at POS of
7458 CODING->src_object (buffer of string), store the data in BUF, set
7459 *STOP to the position where the value of `charset' property changes
7460 (limiting by LIMIT), and return the address of the next element of
7461 BUF.
7463 If the property value is nil, set *STOP to the position where the
7464 property value is non-nil (limiting by LIMIT), and return BUF. */
7466 static int *
7467 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7468 struct coding_system *coding, int *buf,
7469 ptrdiff_t *stop)
7471 Lisp_Object val, next;
7472 int id;
7474 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7475 if (! NILP (val) && CHARSETP (val))
7476 id = XINT (CHARSET_SYMBOL_ID (val));
7477 else
7478 id = -1;
7479 ADD_CHARSET_DATA (buf, 0, id);
7480 next = Fnext_single_property_change (make_number (pos), Qcharset,
7481 coding->src_object,
7482 make_number (limit));
7483 *stop = XINT (next);
7484 return buf;
7488 static void
7489 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7490 int max_lookup)
7492 int *buf = coding->charbuf;
7493 int *buf_end = coding->charbuf + coding->charbuf_size;
7494 const unsigned char *src = coding->source + coding->consumed;
7495 const unsigned char *src_end = coding->source + coding->src_bytes;
7496 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7497 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7498 bool multibytep = coding->src_multibyte;
7499 Lisp_Object eol_type;
7500 int c;
7501 ptrdiff_t stop, stop_composition, stop_charset;
7502 int *lookup_buf = NULL;
7504 if (! NILP (translation_table))
7505 lookup_buf = alloca (sizeof (int) * max_lookup);
7507 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7508 if (VECTORP (eol_type))
7509 eol_type = Qunix;
7511 /* Note: composition handling is not yet implemented. */
7512 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7514 if (NILP (coding->src_object))
7515 stop = stop_composition = stop_charset = end_pos;
7516 else
7518 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7519 stop = stop_composition = pos;
7520 else
7521 stop = stop_composition = end_pos;
7522 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7523 stop = stop_charset = pos;
7524 else
7525 stop_charset = end_pos;
7528 /* Compensate for CRLF and conversion. */
7529 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7530 while (buf < buf_end)
7532 Lisp_Object trans;
7534 if (pos == stop)
7536 if (pos == end_pos)
7537 break;
7538 if (pos == stop_composition)
7539 buf = handle_composition_annotation (pos, end_pos, coding,
7540 buf, &stop_composition);
7541 if (pos == stop_charset)
7542 buf = handle_charset_annotation (pos, end_pos, coding,
7543 buf, &stop_charset);
7544 stop = (stop_composition < stop_charset
7545 ? stop_composition : stop_charset);
7548 if (! multibytep)
7550 int bytes;
7552 if (coding->encoder == encode_coding_raw_text
7553 || coding->encoder == encode_coding_ccl)
7554 c = *src++, pos++;
7555 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7556 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7557 else
7558 c = BYTE8_TO_CHAR (*src), src++, pos++;
7560 else
7561 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7562 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7563 c = '\n';
7564 if (! EQ (eol_type, Qunix))
7566 if (c == '\n')
7568 if (EQ (eol_type, Qdos))
7569 *buf++ = '\r';
7570 else
7571 c = '\r';
7575 trans = Qnil;
7576 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7577 if (NILP (trans))
7578 *buf++ = c;
7579 else
7581 ptrdiff_t from_nchars = 1, to_nchars = 1;
7582 int *lookup_buf_end;
7583 const unsigned char *p = src;
7584 int i;
7586 lookup_buf[0] = c;
7587 for (i = 1; i < max_lookup && p < src_end; i++)
7588 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7589 lookup_buf_end = lookup_buf + i;
7590 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7591 if (INTEGERP (trans))
7592 c = XINT (trans);
7593 else if (CONSP (trans))
7595 from_nchars = ASIZE (XCAR (trans));
7596 trans = XCDR (trans);
7597 if (INTEGERP (trans))
7598 c = XINT (trans);
7599 else
7601 to_nchars = ASIZE (trans);
7602 if (buf_end - buf < to_nchars)
7603 break;
7604 c = XINT (AREF (trans, 0));
7607 else
7608 break;
7609 *buf++ = c;
7610 for (i = 1; i < to_nchars; i++)
7611 *buf++ = XINT (AREF (trans, i));
7612 for (i = 1; i < from_nchars; i++, pos++)
7613 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7617 coding->consumed = src - coding->source;
7618 coding->consumed_char = pos - coding->src_pos;
7619 coding->charbuf_used = buf - coding->charbuf;
7620 coding->chars_at_source = 0;
7624 /* Encode the text at CODING->src_object into CODING->dst_object.
7625 CODING->src_object is a buffer or a string.
7626 CODING->dst_object is a buffer or nil.
7628 If CODING->src_object is a buffer, it must be the current buffer.
7629 In this case, if CODING->src_pos is positive, it is a position of
7630 the source text in the buffer, otherwise. the source text is in the
7631 gap area of the buffer, and coding->src_pos specifies the offset of
7632 the text from GPT (which must be the same as PT). If this is the
7633 same buffer as CODING->dst_object, CODING->src_pos must be
7634 negative and CODING should not have `pre-write-conversion'.
7636 If CODING->src_object is a string, CODING should not have
7637 `pre-write-conversion'.
7639 If CODING->dst_object is a buffer, the encoded data is inserted at
7640 the current point of that buffer.
7642 If CODING->dst_object is nil, the encoded data is placed at the
7643 memory area specified by CODING->destination. */
7645 static void
7646 encode_coding (struct coding_system *coding)
7648 Lisp_Object attrs;
7649 Lisp_Object translation_table;
7650 int max_lookup;
7651 struct ccl_spec cclspec;
7653 USE_SAFE_ALLOCA;
7655 attrs = CODING_ID_ATTRS (coding->id);
7656 if (coding->encoder == encode_coding_raw_text)
7657 translation_table = Qnil, max_lookup = 0;
7658 else
7659 translation_table = get_translation_table (attrs, 1, &max_lookup);
7661 if (BUFFERP (coding->dst_object))
7663 set_buffer_internal (XBUFFER (coding->dst_object));
7664 coding->dst_multibyte
7665 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7668 coding->consumed = coding->consumed_char = 0;
7669 coding->produced = coding->produced_char = 0;
7670 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7671 coding->errors = 0;
7673 ALLOC_CONVERSION_WORK_AREA (coding);
7675 if (coding->encoder == encode_coding_ccl)
7677 coding->spec.ccl = &cclspec;
7678 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7680 do {
7681 coding_set_source (coding);
7682 consume_chars (coding, translation_table, max_lookup);
7683 coding_set_destination (coding);
7684 (*(coding->encoder)) (coding);
7685 } while (coding->consumed_char < coding->src_chars);
7687 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7688 insert_from_gap (coding->produced_char, coding->produced, 0);
7690 SAFE_FREE ();
7694 /* Name (or base name) of work buffer for code conversion. */
7695 static Lisp_Object Vcode_conversion_workbuf_name;
7697 /* A working buffer used by the top level conversion. Once it is
7698 created, it is never destroyed. It has the name
7699 Vcode_conversion_workbuf_name. The other working buffers are
7700 destroyed after the use is finished, and their names are modified
7701 versions of Vcode_conversion_workbuf_name. */
7702 static Lisp_Object Vcode_conversion_reused_workbuf;
7704 /* True iff Vcode_conversion_reused_workbuf is already in use. */
7705 static bool reused_workbuf_in_use;
7708 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7709 multibyteness of returning buffer. */
7711 static Lisp_Object
7712 make_conversion_work_buffer (bool multibyte)
7714 Lisp_Object name, workbuf;
7715 struct buffer *current;
7717 if (reused_workbuf_in_use)
7719 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7720 workbuf = Fget_buffer_create (name);
7722 else
7724 reused_workbuf_in_use = 1;
7725 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7726 Vcode_conversion_reused_workbuf
7727 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7728 workbuf = Vcode_conversion_reused_workbuf;
7730 current = current_buffer;
7731 set_buffer_internal (XBUFFER (workbuf));
7732 /* We can't allow modification hooks to run in the work buffer. For
7733 instance, directory_files_internal assumes that file decoding
7734 doesn't compile new regexps. */
7735 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7736 Ferase_buffer ();
7737 bset_undo_list (current_buffer, Qt);
7738 bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7739 set_buffer_internal (current);
7740 return workbuf;
7744 static Lisp_Object
7745 code_conversion_restore (Lisp_Object arg)
7747 Lisp_Object current, workbuf;
7748 struct gcpro gcpro1;
7750 GCPRO1 (arg);
7751 current = XCAR (arg);
7752 workbuf = XCDR (arg);
7753 if (! NILP (workbuf))
7755 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7756 reused_workbuf_in_use = 0;
7757 else
7758 Fkill_buffer (workbuf);
7760 set_buffer_internal (XBUFFER (current));
7761 UNGCPRO;
7762 return Qnil;
7765 Lisp_Object
7766 code_conversion_save (bool with_work_buf, bool multibyte)
7768 Lisp_Object workbuf = Qnil;
7770 if (with_work_buf)
7771 workbuf = make_conversion_work_buffer (multibyte);
7772 record_unwind_protect (code_conversion_restore,
7773 Fcons (Fcurrent_buffer (), workbuf));
7774 return workbuf;
7777 void
7778 decode_coding_gap (struct coding_system *coding,
7779 ptrdiff_t chars, ptrdiff_t bytes)
7781 ptrdiff_t count = SPECPDL_INDEX ();
7782 Lisp_Object attrs;
7784 coding->src_object = Fcurrent_buffer ();
7785 coding->src_chars = chars;
7786 coding->src_bytes = bytes;
7787 coding->src_pos = -chars;
7788 coding->src_pos_byte = -bytes;
7789 coding->src_multibyte = chars < bytes;
7790 coding->dst_object = coding->src_object;
7791 coding->dst_pos = PT;
7792 coding->dst_pos_byte = PT_BYTE;
7793 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7795 coding->head_ascii = -1;
7796 coding->detected_utf8_chars = -1;
7797 coding->eol_seen = EOL_SEEN_NONE;
7798 if (CODING_REQUIRE_DETECTION (coding))
7799 detect_coding (coding);
7800 attrs = CODING_ID_ATTRS (coding->id);
7801 if (! disable_ascii_optimization
7802 && ! coding->src_multibyte
7803 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7804 && NILP (CODING_ATTR_POST_READ (attrs))
7805 && NILP (get_translation_table (attrs, 0, NULL)))
7807 chars = coding->head_ascii;
7808 if (chars < 0)
7809 chars = check_ascii (coding);
7810 if (chars != bytes)
7812 /* There exists a non-ASCII byte. */
7813 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7815 if (coding->detected_utf8_chars >= 0)
7816 chars = coding->detected_utf8_chars;
7817 else
7818 chars = check_utf_8 (coding);
7819 if (CODING_UTF_8_BOM (coding) != utf_without_bom
7820 && coding->head_ascii == 0
7821 && coding->source[0] == UTF_8_BOM_1
7822 && coding->source[1] == UTF_8_BOM_2
7823 && coding->source[2] == UTF_8_BOM_3)
7825 chars--;
7826 bytes -= 3;
7827 coding->src_bytes -= 3;
7830 else
7831 chars = -1;
7833 if (chars >= 0)
7835 Lisp_Object eol_type;
7837 eol_type = CODING_ID_EOL_TYPE (coding->id);
7838 if (VECTORP (eol_type))
7840 if (coding->eol_seen != EOL_SEEN_NONE)
7841 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7843 if (EQ (eol_type, Qmac))
7845 unsigned char *src_end = GAP_END_ADDR;
7846 unsigned char *src = src_end - coding->src_bytes;
7848 while (src < src_end)
7850 if (*src++ == '\r')
7851 src[-1] = '\n';
7854 else if (EQ (eol_type, Qdos))
7856 unsigned char *src = GAP_END_ADDR;
7857 unsigned char *src_beg = src - coding->src_bytes;
7858 unsigned char *dst = src;
7859 ptrdiff_t diff;
7861 while (src_beg < src)
7863 *--dst = *--src;
7864 if (*src == '\n' && src > src_beg && src[-1] == '\r')
7865 src--;
7867 diff = dst - src;
7868 bytes -= diff;
7869 chars -= diff;
7871 coding->produced = bytes;
7872 coding->produced_char = chars;
7873 insert_from_gap (chars, bytes, 1);
7874 return;
7877 code_conversion_save (0, 0);
7879 coding->mode |= CODING_MODE_LAST_BLOCK;
7880 current_buffer->text->inhibit_shrinking = 1;
7881 decode_coding (coding);
7882 current_buffer->text->inhibit_shrinking = 0;
7884 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7886 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7887 Lisp_Object val;
7889 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7890 val = call1 (CODING_ATTR_POST_READ (attrs),
7891 make_number (coding->produced_char));
7892 CHECK_NATNUM (val);
7893 coding->produced_char += Z - prev_Z;
7894 coding->produced += Z_BYTE - prev_Z_BYTE;
7897 unbind_to (count, Qnil);
7901 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7902 SRC_OBJECT into DST_OBJECT by coding context CODING.
7904 SRC_OBJECT is a buffer, a string, or Qnil.
7906 If it is a buffer, the text is at point of the buffer. FROM and TO
7907 are positions in the buffer.
7909 If it is a string, the text is at the beginning of the string.
7910 FROM and TO are indices to the string.
7912 If it is nil, the text is at coding->source. FROM and TO are
7913 indices to coding->source.
7915 DST_OBJECT is a buffer, Qt, or Qnil.
7917 If it is a buffer, the decoded text is inserted at point of the
7918 buffer. If the buffer is the same as SRC_OBJECT, the source text
7919 is deleted.
7921 If it is Qt, a string is made from the decoded text, and
7922 set in CODING->dst_object.
7924 If it is Qnil, the decoded text is stored at CODING->destination.
7925 The caller must allocate CODING->dst_bytes bytes at
7926 CODING->destination by xmalloc. If the decoded text is longer than
7927 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7930 void
7931 decode_coding_object (struct coding_system *coding,
7932 Lisp_Object src_object,
7933 ptrdiff_t from, ptrdiff_t from_byte,
7934 ptrdiff_t to, ptrdiff_t to_byte,
7935 Lisp_Object dst_object)
7937 ptrdiff_t count = SPECPDL_INDEX ();
7938 unsigned char *destination IF_LINT (= NULL);
7939 ptrdiff_t dst_bytes IF_LINT (= 0);
7940 ptrdiff_t chars = to - from;
7941 ptrdiff_t bytes = to_byte - from_byte;
7942 Lisp_Object attrs;
7943 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7944 bool need_marker_adjustment = 0;
7945 Lisp_Object old_deactivate_mark;
7947 old_deactivate_mark = Vdeactivate_mark;
7949 if (NILP (dst_object))
7951 destination = coding->destination;
7952 dst_bytes = coding->dst_bytes;
7955 coding->src_object = src_object;
7956 coding->src_chars = chars;
7957 coding->src_bytes = bytes;
7958 coding->src_multibyte = chars < bytes;
7960 if (STRINGP (src_object))
7962 coding->src_pos = from;
7963 coding->src_pos_byte = from_byte;
7965 else if (BUFFERP (src_object))
7967 set_buffer_internal (XBUFFER (src_object));
7968 if (from != GPT)
7969 move_gap_both (from, from_byte);
7970 if (EQ (src_object, dst_object))
7972 struct Lisp_Marker *tail;
7974 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7976 tail->need_adjustment
7977 = tail->charpos == (tail->insertion_type ? from : to);
7978 need_marker_adjustment |= tail->need_adjustment;
7980 saved_pt = PT, saved_pt_byte = PT_BYTE;
7981 TEMP_SET_PT_BOTH (from, from_byte);
7982 current_buffer->text->inhibit_shrinking = 1;
7983 del_range_both (from, from_byte, to, to_byte, 1);
7984 coding->src_pos = -chars;
7985 coding->src_pos_byte = -bytes;
7987 else
7989 coding->src_pos = from;
7990 coding->src_pos_byte = from_byte;
7994 if (CODING_REQUIRE_DETECTION (coding))
7995 detect_coding (coding);
7996 attrs = CODING_ID_ATTRS (coding->id);
7998 if (EQ (dst_object, Qt)
7999 || (! NILP (CODING_ATTR_POST_READ (attrs))
8000 && NILP (dst_object)))
8002 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8003 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8004 coding->dst_pos = BEG;
8005 coding->dst_pos_byte = BEG_BYTE;
8007 else if (BUFFERP (dst_object))
8009 code_conversion_save (0, 0);
8010 coding->dst_object = dst_object;
8011 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8012 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8013 coding->dst_multibyte
8014 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8016 else
8018 code_conversion_save (0, 0);
8019 coding->dst_object = Qnil;
8020 /* Most callers presume this will return a multibyte result, and they
8021 won't use `binary' or `raw-text' anyway, so let's not worry about
8022 CODING_FOR_UNIBYTE. */
8023 coding->dst_multibyte = 1;
8026 decode_coding (coding);
8028 if (BUFFERP (coding->dst_object))
8029 set_buffer_internal (XBUFFER (coding->dst_object));
8031 if (! NILP (CODING_ATTR_POST_READ (attrs)))
8033 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8034 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8035 Lisp_Object val;
8037 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8038 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8039 old_deactivate_mark);
8040 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8041 make_number (coding->produced_char));
8042 UNGCPRO;
8043 CHECK_NATNUM (val);
8044 coding->produced_char += Z - prev_Z;
8045 coding->produced += Z_BYTE - prev_Z_BYTE;
8048 if (EQ (dst_object, Qt))
8050 coding->dst_object = Fbuffer_string ();
8052 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8054 set_buffer_internal (XBUFFER (coding->dst_object));
8055 if (dst_bytes < coding->produced)
8057 eassert (coding->produced > 0);
8058 destination = xrealloc (destination, coding->produced);
8059 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8060 move_gap_both (BEGV, BEGV_BYTE);
8061 memcpy (destination, BEGV_ADDR, coding->produced);
8062 coding->destination = destination;
8066 if (saved_pt >= 0)
8068 /* This is the case of:
8069 (BUFFERP (src_object) && EQ (src_object, dst_object))
8070 As we have moved PT while replacing the original buffer
8071 contents, we must recover it now. */
8072 set_buffer_internal (XBUFFER (src_object));
8073 current_buffer->text->inhibit_shrinking = 0;
8074 if (saved_pt < from)
8075 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8076 else if (saved_pt < from + chars)
8077 TEMP_SET_PT_BOTH (from, from_byte);
8078 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8079 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8080 saved_pt_byte + (coding->produced - bytes));
8081 else
8082 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8083 saved_pt_byte + (coding->produced - bytes));
8085 if (need_marker_adjustment)
8087 struct Lisp_Marker *tail;
8089 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8090 if (tail->need_adjustment)
8092 tail->need_adjustment = 0;
8093 if (tail->insertion_type)
8095 tail->bytepos = from_byte;
8096 tail->charpos = from;
8098 else
8100 tail->bytepos = from_byte + coding->produced;
8101 tail->charpos
8102 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8103 ? tail->bytepos : from + coding->produced_char);
8109 Vdeactivate_mark = old_deactivate_mark;
8110 unbind_to (count, coding->dst_object);
8114 void
8115 encode_coding_object (struct coding_system *coding,
8116 Lisp_Object src_object,
8117 ptrdiff_t from, ptrdiff_t from_byte,
8118 ptrdiff_t to, ptrdiff_t to_byte,
8119 Lisp_Object dst_object)
8121 ptrdiff_t count = SPECPDL_INDEX ();
8122 ptrdiff_t chars = to - from;
8123 ptrdiff_t bytes = to_byte - from_byte;
8124 Lisp_Object attrs;
8125 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8126 bool need_marker_adjustment = 0;
8127 bool kill_src_buffer = 0;
8128 Lisp_Object old_deactivate_mark;
8130 old_deactivate_mark = Vdeactivate_mark;
8132 coding->src_object = src_object;
8133 coding->src_chars = chars;
8134 coding->src_bytes = bytes;
8135 coding->src_multibyte = chars < bytes;
8137 attrs = CODING_ID_ATTRS (coding->id);
8139 if (EQ (src_object, dst_object))
8141 struct Lisp_Marker *tail;
8143 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8145 tail->need_adjustment
8146 = tail->charpos == (tail->insertion_type ? from : to);
8147 need_marker_adjustment |= tail->need_adjustment;
8151 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8153 coding->src_object = code_conversion_save (1, coding->src_multibyte);
8154 set_buffer_internal (XBUFFER (coding->src_object));
8155 if (STRINGP (src_object))
8156 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8157 else if (BUFFERP (src_object))
8158 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8159 else
8160 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8162 if (EQ (src_object, dst_object))
8164 set_buffer_internal (XBUFFER (src_object));
8165 saved_pt = PT, saved_pt_byte = PT_BYTE;
8166 del_range_both (from, from_byte, to, to_byte, 1);
8167 set_buffer_internal (XBUFFER (coding->src_object));
8171 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8173 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8174 old_deactivate_mark);
8175 safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8176 make_number (BEG), make_number (Z));
8177 UNGCPRO;
8179 if (XBUFFER (coding->src_object) != current_buffer)
8180 kill_src_buffer = 1;
8181 coding->src_object = Fcurrent_buffer ();
8182 if (BEG != GPT)
8183 move_gap_both (BEG, BEG_BYTE);
8184 coding->src_chars = Z - BEG;
8185 coding->src_bytes = Z_BYTE - BEG_BYTE;
8186 coding->src_pos = BEG;
8187 coding->src_pos_byte = BEG_BYTE;
8188 coding->src_multibyte = Z < Z_BYTE;
8190 else if (STRINGP (src_object))
8192 code_conversion_save (0, 0);
8193 coding->src_pos = from;
8194 coding->src_pos_byte = from_byte;
8196 else if (BUFFERP (src_object))
8198 code_conversion_save (0, 0);
8199 set_buffer_internal (XBUFFER (src_object));
8200 if (EQ (src_object, dst_object))
8202 saved_pt = PT, saved_pt_byte = PT_BYTE;
8203 coding->src_object = del_range_1 (from, to, 1, 1);
8204 coding->src_pos = 0;
8205 coding->src_pos_byte = 0;
8207 else
8209 if (from < GPT && to >= GPT)
8210 move_gap_both (from, from_byte);
8211 coding->src_pos = from;
8212 coding->src_pos_byte = from_byte;
8215 else
8216 code_conversion_save (0, 0);
8218 if (BUFFERP (dst_object))
8220 coding->dst_object = dst_object;
8221 if (EQ (src_object, dst_object))
8223 coding->dst_pos = from;
8224 coding->dst_pos_byte = from_byte;
8226 else
8228 struct buffer *current = current_buffer;
8230 set_buffer_temp (XBUFFER (dst_object));
8231 coding->dst_pos = PT;
8232 coding->dst_pos_byte = PT_BYTE;
8233 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8234 set_buffer_temp (current);
8236 coding->dst_multibyte
8237 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8239 else if (EQ (dst_object, Qt))
8241 ptrdiff_t dst_bytes = max (1, coding->src_chars);
8242 coding->dst_object = Qnil;
8243 coding->destination = xmalloc (dst_bytes);
8244 coding->dst_bytes = dst_bytes;
8245 coding->dst_multibyte = 0;
8247 else
8249 coding->dst_object = Qnil;
8250 coding->dst_multibyte = 0;
8253 encode_coding (coding);
8255 if (EQ (dst_object, Qt))
8257 if (BUFFERP (coding->dst_object))
8258 coding->dst_object = Fbuffer_string ();
8259 else
8261 coding->dst_object
8262 = make_unibyte_string ((char *) coding->destination,
8263 coding->produced);
8264 xfree (coding->destination);
8268 if (saved_pt >= 0)
8270 /* This is the case of:
8271 (BUFFERP (src_object) && EQ (src_object, dst_object))
8272 As we have moved PT while replacing the original buffer
8273 contents, we must recover it now. */
8274 set_buffer_internal (XBUFFER (src_object));
8275 if (saved_pt < from)
8276 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8277 else if (saved_pt < from + chars)
8278 TEMP_SET_PT_BOTH (from, from_byte);
8279 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8280 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8281 saved_pt_byte + (coding->produced - bytes));
8282 else
8283 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8284 saved_pt_byte + (coding->produced - bytes));
8286 if (need_marker_adjustment)
8288 struct Lisp_Marker *tail;
8290 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8291 if (tail->need_adjustment)
8293 tail->need_adjustment = 0;
8294 if (tail->insertion_type)
8296 tail->bytepos = from_byte;
8297 tail->charpos = from;
8299 else
8301 tail->bytepos = from_byte + coding->produced;
8302 tail->charpos
8303 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8304 ? tail->bytepos : from + coding->produced_char);
8310 if (kill_src_buffer)
8311 Fkill_buffer (coding->src_object);
8313 Vdeactivate_mark = old_deactivate_mark;
8314 unbind_to (count, Qnil);
8318 Lisp_Object
8319 preferred_coding_system (void)
8321 int id = coding_categories[coding_priorities[0]].id;
8323 return CODING_ID_NAME (id);
8326 #if defined (WINDOWSNT) || defined (CYGWIN)
8328 Lisp_Object
8329 from_unicode (Lisp_Object str)
8331 CHECK_STRING (str);
8332 if (!STRING_MULTIBYTE (str) &&
8333 SBYTES (str) & 1)
8335 str = Fsubstring (str, make_number (0), make_number (-1));
8338 return code_convert_string_norecord (str, Qutf_16le, 0);
8341 Lisp_Object
8342 from_unicode_buffer (const wchar_t* wstr)
8344 return from_unicode (
8345 make_unibyte_string (
8346 (char*) wstr,
8347 /* we get one of the two final 0 bytes for free. */
8348 1 + sizeof (wchar_t) * wcslen (wstr)));
8351 wchar_t *
8352 to_unicode (Lisp_Object str, Lisp_Object *buf)
8354 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8355 /* We need to make another copy (in addition to the one made by
8356 code_convert_string_norecord) to ensure that the final string is
8357 _doubly_ zero terminated --- that is, that the string is
8358 terminated by two zero bytes and one utf-16le null character.
8359 Because strings are already terminated with a single zero byte,
8360 we just add one additional zero. */
8361 str = make_uninit_string (SBYTES (*buf) + 1);
8362 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8363 SDATA (str) [SBYTES (*buf)] = '\0';
8364 *buf = str;
8365 return WCSDATA (*buf);
8368 #endif /* WINDOWSNT || CYGWIN */
8371 #ifdef emacs
8372 /*** 8. Emacs Lisp library functions ***/
8374 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8375 doc: /* Return t if OBJECT is nil or a coding-system.
8376 See the documentation of `define-coding-system' for information
8377 about coding-system objects. */)
8378 (Lisp_Object object)
8380 if (NILP (object)
8381 || CODING_SYSTEM_ID (object) >= 0)
8382 return Qt;
8383 if (! SYMBOLP (object)
8384 || NILP (Fget (object, Qcoding_system_define_form)))
8385 return Qnil;
8386 return Qt;
8389 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8390 Sread_non_nil_coding_system, 1, 1, 0,
8391 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8392 (Lisp_Object prompt)
8394 Lisp_Object val;
8397 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8398 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8400 while (SCHARS (val) == 0);
8401 return (Fintern (val, Qnil));
8404 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8405 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8406 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8407 Ignores case when completing coding systems (all Emacs coding systems
8408 are lower-case). */)
8409 (Lisp_Object prompt, Lisp_Object default_coding_system)
8411 Lisp_Object val;
8412 ptrdiff_t count = SPECPDL_INDEX ();
8414 if (SYMBOLP (default_coding_system))
8415 default_coding_system = SYMBOL_NAME (default_coding_system);
8416 specbind (Qcompletion_ignore_case, Qt);
8417 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8418 Qt, Qnil, Qcoding_system_history,
8419 default_coding_system, Qnil);
8420 unbind_to (count, Qnil);
8421 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8424 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8425 1, 1, 0,
8426 doc: /* Check validity of CODING-SYSTEM.
8427 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8428 It is valid if it is nil or a symbol defined as a coding system by the
8429 function `define-coding-system'. */)
8430 (Lisp_Object coding_system)
8432 Lisp_Object define_form;
8434 define_form = Fget (coding_system, Qcoding_system_define_form);
8435 if (! NILP (define_form))
8437 Fput (coding_system, Qcoding_system_define_form, Qnil);
8438 safe_eval (define_form);
8440 if (!NILP (Fcoding_system_p (coding_system)))
8441 return coding_system;
8442 xsignal1 (Qcoding_system_error, coding_system);
8446 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8447 HIGHEST, return the coding system of the highest
8448 priority among the detected coding systems. Otherwise return a
8449 list of detected coding systems sorted by their priorities. If
8450 MULTIBYTEP, it is assumed that the bytes are in correct
8451 multibyte form but contains only ASCII and eight-bit chars.
8452 Otherwise, the bytes are raw bytes.
8454 CODING-SYSTEM controls the detection as below:
8456 If it is nil, detect both text-format and eol-format. If the
8457 text-format part of CODING-SYSTEM is already specified
8458 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8459 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8460 detect only text-format. */
8462 Lisp_Object
8463 detect_coding_system (const unsigned char *src,
8464 ptrdiff_t src_chars, ptrdiff_t src_bytes,
8465 bool highest, bool multibytep,
8466 Lisp_Object coding_system)
8468 const unsigned char *src_end = src + src_bytes;
8469 Lisp_Object attrs, eol_type;
8470 Lisp_Object val = Qnil;
8471 struct coding_system coding;
8472 ptrdiff_t id;
8473 struct coding_detection_info detect_info;
8474 enum coding_category base_category;
8475 bool null_byte_found = 0, eight_bit_found = 0;
8477 if (NILP (coding_system))
8478 coding_system = Qundecided;
8479 setup_coding_system (coding_system, &coding);
8480 attrs = CODING_ID_ATTRS (coding.id);
8481 eol_type = CODING_ID_EOL_TYPE (coding.id);
8482 coding_system = CODING_ATTR_BASE_NAME (attrs);
8484 coding.source = src;
8485 coding.src_chars = src_chars;
8486 coding.src_bytes = src_bytes;
8487 coding.src_multibyte = multibytep;
8488 coding.consumed = 0;
8489 coding.mode |= CODING_MODE_LAST_BLOCK;
8490 coding.head_ascii = 0;
8492 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8494 /* At first, detect text-format if necessary. */
8495 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8496 if (base_category == coding_category_undecided)
8498 enum coding_category category IF_LINT (= 0);
8499 struct coding_system *this IF_LINT (= NULL);
8500 int c, i;
8502 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8503 for (; src < src_end; src++)
8505 c = *src;
8506 if (c & 0x80)
8508 eight_bit_found = 1;
8509 if (null_byte_found)
8510 break;
8512 else if (c < 0x20)
8514 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8515 && ! inhibit_iso_escape_detection
8516 && ! detect_info.checked)
8518 if (detect_coding_iso_2022 (&coding, &detect_info))
8520 /* We have scanned the whole data. */
8521 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8523 /* We didn't find an 8-bit code. We may
8524 have found a null-byte, but it's very
8525 rare that a binary file confirm to
8526 ISO-2022. */
8527 src = src_end;
8528 coding.head_ascii = src - coding.source;
8530 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8531 break;
8534 else if (! c && !inhibit_null_byte_detection)
8536 null_byte_found = 1;
8537 if (eight_bit_found)
8538 break;
8540 if (! eight_bit_found)
8541 coding.head_ascii++;
8543 else if (! eight_bit_found)
8544 coding.head_ascii++;
8547 if (null_byte_found || eight_bit_found
8548 || coding.head_ascii < coding.src_bytes
8549 || detect_info.found)
8551 if (coding.head_ascii == coding.src_bytes)
8552 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8553 for (i = 0; i < coding_category_raw_text; i++)
8555 category = coding_priorities[i];
8556 this = coding_categories + category;
8557 if (detect_info.found & (1 << category))
8558 break;
8560 else
8562 if (null_byte_found)
8564 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8565 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8567 for (i = 0; i < coding_category_raw_text; i++)
8569 category = coding_priorities[i];
8570 this = coding_categories + category;
8572 if (this->id < 0)
8574 /* No coding system of this category is defined. */
8575 detect_info.rejected |= (1 << category);
8577 else if (category >= coding_category_raw_text)
8578 continue;
8579 else if (detect_info.checked & (1 << category))
8581 if (highest
8582 && (detect_info.found & (1 << category)))
8583 break;
8585 else if ((*(this->detector)) (&coding, &detect_info)
8586 && highest
8587 && (detect_info.found & (1 << category)))
8589 if (category == coding_category_utf_16_auto)
8591 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8592 category = coding_category_utf_16_le;
8593 else
8594 category = coding_category_utf_16_be;
8596 break;
8602 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8603 || null_byte_found)
8605 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8606 id = CODING_SYSTEM_ID (Qno_conversion);
8607 val = Fcons (make_number (id), Qnil);
8609 else if (! detect_info.rejected && ! detect_info.found)
8611 detect_info.found = CATEGORY_MASK_ANY;
8612 id = coding_categories[coding_category_undecided].id;
8613 val = Fcons (make_number (id), Qnil);
8615 else if (highest)
8617 if (detect_info.found)
8619 detect_info.found = 1 << category;
8620 val = Fcons (make_number (this->id), Qnil);
8622 else
8623 for (i = 0; i < coding_category_raw_text; i++)
8624 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8626 detect_info.found = 1 << coding_priorities[i];
8627 id = coding_categories[coding_priorities[i]].id;
8628 val = Fcons (make_number (id), Qnil);
8629 break;
8632 else
8634 int mask = detect_info.rejected | detect_info.found;
8635 int found = 0;
8637 for (i = coding_category_raw_text - 1; i >= 0; i--)
8639 category = coding_priorities[i];
8640 if (! (mask & (1 << category)))
8642 found |= 1 << category;
8643 id = coding_categories[category].id;
8644 if (id >= 0)
8645 val = Fcons (make_number (id), val);
8648 for (i = coding_category_raw_text - 1; i >= 0; i--)
8650 category = coding_priorities[i];
8651 if (detect_info.found & (1 << category))
8653 id = coding_categories[category].id;
8654 val = Fcons (make_number (id), val);
8657 detect_info.found |= found;
8660 else if (base_category == coding_category_utf_8_auto)
8662 if (detect_coding_utf_8 (&coding, &detect_info))
8664 struct coding_system *this;
8666 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8667 this = coding_categories + coding_category_utf_8_sig;
8668 else
8669 this = coding_categories + coding_category_utf_8_nosig;
8670 val = Fcons (make_number (this->id), Qnil);
8673 else if (base_category == coding_category_utf_16_auto)
8675 if (detect_coding_utf_16 (&coding, &detect_info))
8677 struct coding_system *this;
8679 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8680 this = coding_categories + coding_category_utf_16_le;
8681 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8682 this = coding_categories + coding_category_utf_16_be;
8683 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8684 this = coding_categories + coding_category_utf_16_be_nosig;
8685 else
8686 this = coding_categories + coding_category_utf_16_le_nosig;
8687 val = Fcons (make_number (this->id), Qnil);
8690 else
8692 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8693 val = Fcons (make_number (coding.id), Qnil);
8696 /* Then, detect eol-format if necessary. */
8698 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8699 Lisp_Object tail;
8701 if (VECTORP (eol_type))
8703 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8705 if (null_byte_found)
8706 normal_eol = EOL_SEEN_LF;
8707 else
8708 normal_eol = detect_eol (coding.source, src_bytes,
8709 coding_category_raw_text);
8711 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8712 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8713 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8714 coding_category_utf_16_be);
8715 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8716 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8717 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8718 coding_category_utf_16_le);
8720 else
8722 if (EQ (eol_type, Qunix))
8723 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8724 else if (EQ (eol_type, Qdos))
8725 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8726 else
8727 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8730 for (tail = val; CONSP (tail); tail = XCDR (tail))
8732 enum coding_category category;
8733 int this_eol;
8735 id = XINT (XCAR (tail));
8736 attrs = CODING_ID_ATTRS (id);
8737 category = XINT (CODING_ATTR_CATEGORY (attrs));
8738 eol_type = CODING_ID_EOL_TYPE (id);
8739 if (VECTORP (eol_type))
8741 if (category == coding_category_utf_16_be
8742 || category == coding_category_utf_16_be_nosig)
8743 this_eol = utf_16_be_eol;
8744 else if (category == coding_category_utf_16_le
8745 || category == coding_category_utf_16_le_nosig)
8746 this_eol = utf_16_le_eol;
8747 else
8748 this_eol = normal_eol;
8750 if (this_eol == EOL_SEEN_LF)
8751 XSETCAR (tail, AREF (eol_type, 0));
8752 else if (this_eol == EOL_SEEN_CRLF)
8753 XSETCAR (tail, AREF (eol_type, 1));
8754 else if (this_eol == EOL_SEEN_CR)
8755 XSETCAR (tail, AREF (eol_type, 2));
8756 else
8757 XSETCAR (tail, CODING_ID_NAME (id));
8759 else
8760 XSETCAR (tail, CODING_ID_NAME (id));
8764 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8768 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8769 2, 3, 0,
8770 doc: /* Detect coding system of the text in the region between START and END.
8771 Return a list of possible coding systems ordered by priority.
8772 The coding systems to try and their priorities follows what
8773 the function `coding-system-priority-list' (which see) returns.
8775 If only ASCII characters are found (except for such ISO-2022 control
8776 characters as ESC), it returns a list of single element `undecided'
8777 or its subsidiary coding system according to a detected end-of-line
8778 format.
8780 If optional argument HIGHEST is non-nil, return the coding system of
8781 highest priority. */)
8782 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8784 ptrdiff_t from, to;
8785 ptrdiff_t from_byte, to_byte;
8787 validate_region (&start, &end);
8788 from = XINT (start), to = XINT (end);
8789 from_byte = CHAR_TO_BYTE (from);
8790 to_byte = CHAR_TO_BYTE (to);
8792 if (from < GPT && to >= GPT)
8793 move_gap_both (to, to_byte);
8795 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8796 to - from, to_byte - from_byte,
8797 !NILP (highest),
8798 !NILP (BVAR (current_buffer
8799 , enable_multibyte_characters)),
8800 Qnil);
8803 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8804 1, 2, 0,
8805 doc: /* Detect coding system of the text in STRING.
8806 Return a list of possible coding systems ordered by priority.
8807 The coding systems to try and their priorities follows what
8808 the function `coding-system-priority-list' (which see) returns.
8810 If only ASCII characters are found (except for such ISO-2022 control
8811 characters as ESC), it returns a list of single element `undecided'
8812 or its subsidiary coding system according to a detected end-of-line
8813 format.
8815 If optional argument HIGHEST is non-nil, return the coding system of
8816 highest priority. */)
8817 (Lisp_Object string, Lisp_Object highest)
8819 CHECK_STRING (string);
8821 return detect_coding_system (SDATA (string),
8822 SCHARS (string), SBYTES (string),
8823 !NILP (highest), STRING_MULTIBYTE (string),
8824 Qnil);
8828 static bool
8829 char_encodable_p (int c, Lisp_Object attrs)
8831 Lisp_Object tail;
8832 struct charset *charset;
8833 Lisp_Object translation_table;
8835 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8836 if (! NILP (translation_table))
8837 c = translate_char (translation_table, c);
8838 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8839 CONSP (tail); tail = XCDR (tail))
8841 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8842 if (CHAR_CHARSET_P (c, charset))
8843 break;
8845 return (! NILP (tail));
8849 /* Return a list of coding systems that safely encode the text between
8850 START and END. If EXCLUDE is non-nil, it is a list of coding
8851 systems not to check. The returned list doesn't contain any such
8852 coding systems. In any case, if the text contains only ASCII or is
8853 unibyte, return t. */
8855 DEFUN ("find-coding-systems-region-internal",
8856 Ffind_coding_systems_region_internal,
8857 Sfind_coding_systems_region_internal, 2, 3, 0,
8858 doc: /* Internal use only. */)
8859 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8861 Lisp_Object coding_attrs_list, safe_codings;
8862 ptrdiff_t start_byte, end_byte;
8863 const unsigned char *p, *pbeg, *pend;
8864 int c;
8865 Lisp_Object tail, elt, work_table;
8867 if (STRINGP (start))
8869 if (!STRING_MULTIBYTE (start)
8870 || SCHARS (start) == SBYTES (start))
8871 return Qt;
8872 start_byte = 0;
8873 end_byte = SBYTES (start);
8875 else
8877 CHECK_NUMBER_COERCE_MARKER (start);
8878 CHECK_NUMBER_COERCE_MARKER (end);
8879 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8880 args_out_of_range (start, end);
8881 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8882 return Qt;
8883 start_byte = CHAR_TO_BYTE (XINT (start));
8884 end_byte = CHAR_TO_BYTE (XINT (end));
8885 if (XINT (end) - XINT (start) == end_byte - start_byte)
8886 return Qt;
8888 if (XINT (start) < GPT && XINT (end) > GPT)
8890 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8891 move_gap_both (XINT (start), start_byte);
8892 else
8893 move_gap_both (XINT (end), end_byte);
8897 coding_attrs_list = Qnil;
8898 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8899 if (NILP (exclude)
8900 || NILP (Fmemq (XCAR (tail), exclude)))
8902 Lisp_Object attrs;
8904 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8905 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8906 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8908 ASET (attrs, coding_attr_trans_tbl,
8909 get_translation_table (attrs, 1, NULL));
8910 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8914 if (STRINGP (start))
8915 p = pbeg = SDATA (start);
8916 else
8917 p = pbeg = BYTE_POS_ADDR (start_byte);
8918 pend = p + (end_byte - start_byte);
8920 while (p < pend && ASCII_BYTE_P (*p)) p++;
8921 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8923 work_table = Fmake_char_table (Qnil, Qnil);
8924 while (p < pend)
8926 if (ASCII_BYTE_P (*p))
8927 p++;
8928 else
8930 c = STRING_CHAR_ADVANCE (p);
8931 if (!NILP (char_table_ref (work_table, c)))
8932 /* This character was already checked. Ignore it. */
8933 continue;
8935 charset_map_loaded = 0;
8936 for (tail = coding_attrs_list; CONSP (tail);)
8938 elt = XCAR (tail);
8939 if (NILP (elt))
8940 tail = XCDR (tail);
8941 else if (char_encodable_p (c, elt))
8942 tail = XCDR (tail);
8943 else if (CONSP (XCDR (tail)))
8945 XSETCAR (tail, XCAR (XCDR (tail)));
8946 XSETCDR (tail, XCDR (XCDR (tail)));
8948 else
8950 XSETCAR (tail, Qnil);
8951 tail = XCDR (tail);
8954 if (charset_map_loaded)
8956 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8958 if (STRINGP (start))
8959 pbeg = SDATA (start);
8960 else
8961 pbeg = BYTE_POS_ADDR (start_byte);
8962 p = pbeg + p_offset;
8963 pend = pbeg + pend_offset;
8965 char_table_set (work_table, c, Qt);
8969 safe_codings = list2 (Qraw_text, Qno_conversion);
8970 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8971 if (! NILP (XCAR (tail)))
8972 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8974 return safe_codings;
8978 DEFUN ("unencodable-char-position", Funencodable_char_position,
8979 Sunencodable_char_position, 3, 5, 0,
8980 doc: /*
8981 Return position of first un-encodable character in a region.
8982 START and END specify the region and CODING-SYSTEM specifies the
8983 encoding to check. Return nil if CODING-SYSTEM does encode the region.
8985 If optional 4th argument COUNT is non-nil, it specifies at most how
8986 many un-encodable characters to search. In this case, the value is a
8987 list of positions.
8989 If optional 5th argument STRING is non-nil, it is a string to search
8990 for un-encodable characters. In that case, START and END are indexes
8991 to the string. */)
8992 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8994 EMACS_INT n;
8995 struct coding_system coding;
8996 Lisp_Object attrs, charset_list, translation_table;
8997 Lisp_Object positions;
8998 ptrdiff_t from, to;
8999 const unsigned char *p, *stop, *pend;
9000 bool ascii_compatible;
9002 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9003 attrs = CODING_ID_ATTRS (coding.id);
9004 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9005 return Qnil;
9006 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9007 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9008 translation_table = get_translation_table (attrs, 1, NULL);
9010 if (NILP (string))
9012 validate_region (&start, &end);
9013 from = XINT (start);
9014 to = XINT (end);
9015 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9016 || (ascii_compatible
9017 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9018 return Qnil;
9019 p = CHAR_POS_ADDR (from);
9020 pend = CHAR_POS_ADDR (to);
9021 if (from < GPT && to >= GPT)
9022 stop = GPT_ADDR;
9023 else
9024 stop = pend;
9026 else
9028 CHECK_STRING (string);
9029 CHECK_NATNUM (start);
9030 CHECK_NATNUM (end);
9031 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9032 args_out_of_range_3 (string, start, end);
9033 from = XINT (start);
9034 to = XINT (end);
9035 if (! STRING_MULTIBYTE (string))
9036 return Qnil;
9037 p = SDATA (string) + string_char_to_byte (string, from);
9038 stop = pend = SDATA (string) + string_char_to_byte (string, to);
9039 if (ascii_compatible && (to - from) == (pend - p))
9040 return Qnil;
9043 if (NILP (count))
9044 n = 1;
9045 else
9047 CHECK_NATNUM (count);
9048 n = XINT (count);
9051 positions = Qnil;
9052 charset_map_loaded = 0;
9053 while (1)
9055 int c;
9057 if (ascii_compatible)
9058 while (p < stop && ASCII_BYTE_P (*p))
9059 p++, from++;
9060 if (p >= stop)
9062 if (p >= pend)
9063 break;
9064 stop = pend;
9065 p = GAP_END_ADDR;
9068 c = STRING_CHAR_ADVANCE (p);
9069 if (! (ASCII_CHAR_P (c) && ascii_compatible)
9070 && ! char_charset (translate_char (translation_table, c),
9071 charset_list, NULL))
9073 positions = Fcons (make_number (from), positions);
9074 n--;
9075 if (n == 0)
9076 break;
9079 from++;
9080 if (charset_map_loaded && NILP (string))
9082 p = CHAR_POS_ADDR (from);
9083 pend = CHAR_POS_ADDR (to);
9084 if (from < GPT && to >= GPT)
9085 stop = GPT_ADDR;
9086 else
9087 stop = pend;
9088 charset_map_loaded = 0;
9092 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9096 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9097 Scheck_coding_systems_region, 3, 3, 0,
9098 doc: /* Check if the region is encodable by coding systems.
9100 START and END are buffer positions specifying the region.
9101 CODING-SYSTEM-LIST is a list of coding systems to check.
9103 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9104 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9105 whole region, POS0, POS1, ... are buffer positions where non-encodable
9106 characters are found.
9108 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9109 value is nil.
9111 START may be a string. In that case, check if the string is
9112 encodable, and the value contains indices to the string instead of
9113 buffer positions. END is ignored.
9115 If the current buffer (or START if it is a string) is unibyte, the value
9116 is nil. */)
9117 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9119 Lisp_Object list;
9120 ptrdiff_t start_byte, end_byte;
9121 ptrdiff_t pos;
9122 const unsigned char *p, *pbeg, *pend;
9123 int c;
9124 Lisp_Object tail, elt, attrs;
9126 if (STRINGP (start))
9128 if (!STRING_MULTIBYTE (start)
9129 || SCHARS (start) == SBYTES (start))
9130 return Qnil;
9131 start_byte = 0;
9132 end_byte = SBYTES (start);
9133 pos = 0;
9135 else
9137 CHECK_NUMBER_COERCE_MARKER (start);
9138 CHECK_NUMBER_COERCE_MARKER (end);
9139 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9140 args_out_of_range (start, end);
9141 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9142 return Qnil;
9143 start_byte = CHAR_TO_BYTE (XINT (start));
9144 end_byte = CHAR_TO_BYTE (XINT (end));
9145 if (XINT (end) - XINT (start) == end_byte - start_byte)
9146 return Qnil;
9148 if (XINT (start) < GPT && XINT (end) > GPT)
9150 if ((GPT - XINT (start)) < (XINT (end) - GPT))
9151 move_gap_both (XINT (start), start_byte);
9152 else
9153 move_gap_both (XINT (end), end_byte);
9155 pos = XINT (start);
9158 list = Qnil;
9159 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9161 elt = XCAR (tail);
9162 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9163 ASET (attrs, coding_attr_trans_tbl,
9164 get_translation_table (attrs, 1, NULL));
9165 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
9168 if (STRINGP (start))
9169 p = pbeg = SDATA (start);
9170 else
9171 p = pbeg = BYTE_POS_ADDR (start_byte);
9172 pend = p + (end_byte - start_byte);
9174 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9175 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9177 while (p < pend)
9179 if (ASCII_BYTE_P (*p))
9180 p++;
9181 else
9183 c = STRING_CHAR_ADVANCE (p);
9185 charset_map_loaded = 0;
9186 for (tail = list; CONSP (tail); tail = XCDR (tail))
9188 elt = XCDR (XCAR (tail));
9189 if (! char_encodable_p (c, XCAR (elt)))
9190 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9192 if (charset_map_loaded)
9194 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9196 if (STRINGP (start))
9197 pbeg = SDATA (start);
9198 else
9199 pbeg = BYTE_POS_ADDR (start_byte);
9200 p = pbeg + p_offset;
9201 pend = pbeg + pend_offset;
9204 pos++;
9207 tail = list;
9208 list = Qnil;
9209 for (; CONSP (tail); tail = XCDR (tail))
9211 elt = XCAR (tail);
9212 if (CONSP (XCDR (XCDR (elt))))
9213 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9214 list);
9217 return list;
9221 static Lisp_Object
9222 code_convert_region (Lisp_Object start, Lisp_Object end,
9223 Lisp_Object coding_system, Lisp_Object dst_object,
9224 bool encodep, bool norecord)
9226 struct coding_system coding;
9227 ptrdiff_t from, from_byte, to, to_byte;
9228 Lisp_Object src_object;
9230 if (NILP (coding_system))
9231 coding_system = Qno_conversion;
9232 else
9233 CHECK_CODING_SYSTEM (coding_system);
9234 src_object = Fcurrent_buffer ();
9235 if (NILP (dst_object))
9236 dst_object = src_object;
9237 else if (! EQ (dst_object, Qt))
9238 CHECK_BUFFER (dst_object);
9240 validate_region (&start, &end);
9241 from = XFASTINT (start);
9242 from_byte = CHAR_TO_BYTE (from);
9243 to = XFASTINT (end);
9244 to_byte = CHAR_TO_BYTE (to);
9246 setup_coding_system (coding_system, &coding);
9247 coding.mode |= CODING_MODE_LAST_BLOCK;
9249 if (encodep)
9250 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9251 dst_object);
9252 else
9253 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9254 dst_object);
9255 if (! norecord)
9256 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9258 return (BUFFERP (dst_object)
9259 ? make_number (coding.produced_char)
9260 : coding.dst_object);
9264 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9265 3, 4, "r\nzCoding system: ",
9266 doc: /* Decode the current region from the specified coding system.
9267 When called from a program, takes four arguments:
9268 START, END, CODING-SYSTEM, and DESTINATION.
9269 START and END are buffer positions.
9271 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9272 If nil, the region between START and END is replaced by the decoded text.
9273 If buffer, the decoded text is inserted in that buffer after point (point
9274 does not move).
9275 In those cases, the length of the decoded text is returned.
9276 If DESTINATION is t, the decoded text is returned.
9278 This function sets `last-coding-system-used' to the precise coding system
9279 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9280 not fully specified.) */)
9281 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9283 return code_convert_region (start, end, coding_system, destination, 0, 0);
9286 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9287 3, 4, "r\nzCoding system: ",
9288 doc: /* Encode the current region by specified coding system.
9289 When called from a program, takes four arguments:
9290 START, END, CODING-SYSTEM and DESTINATION.
9291 START and END are buffer positions.
9293 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9294 If nil, the region between START and END is replace by the encoded text.
9295 If buffer, the encoded text is inserted in that buffer after point (point
9296 does not move).
9297 In those cases, the length of the encoded text is returned.
9298 If DESTINATION is t, the encoded text is returned.
9300 This function sets `last-coding-system-used' to the precise coding system
9301 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9302 not fully specified.) */)
9303 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9305 return code_convert_region (start, end, coding_system, destination, 1, 0);
9308 Lisp_Object
9309 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9310 Lisp_Object dst_object, bool encodep, bool nocopy,
9311 bool norecord)
9313 struct coding_system coding;
9314 ptrdiff_t chars, bytes;
9316 CHECK_STRING (string);
9317 if (NILP (coding_system))
9319 if (! norecord)
9320 Vlast_coding_system_used = Qno_conversion;
9321 if (NILP (dst_object))
9322 return (nocopy ? Fcopy_sequence (string) : string);
9325 if (NILP (coding_system))
9326 coding_system = Qno_conversion;
9327 else
9328 CHECK_CODING_SYSTEM (coding_system);
9329 if (NILP (dst_object))
9330 dst_object = Qt;
9331 else if (! EQ (dst_object, Qt))
9332 CHECK_BUFFER (dst_object);
9334 setup_coding_system (coding_system, &coding);
9335 coding.mode |= CODING_MODE_LAST_BLOCK;
9336 chars = SCHARS (string);
9337 bytes = SBYTES (string);
9338 if (encodep)
9339 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9340 else
9341 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9342 if (! norecord)
9343 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9345 return (BUFFERP (dst_object)
9346 ? make_number (coding.produced_char)
9347 : coding.dst_object);
9351 /* Encode or decode STRING according to CODING_SYSTEM.
9352 Do not set Vlast_coding_system_used.
9354 This function is called only from macros DECODE_FILE and
9355 ENCODE_FILE, thus we ignore character composition. */
9357 Lisp_Object
9358 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9359 bool encodep)
9361 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9365 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9366 2, 4, 0,
9367 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9369 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9370 if the decoding operation is trivial.
9372 Optional fourth arg BUFFER non-nil means that the decoded text is
9373 inserted in that buffer after point (point does not move). In this
9374 case, the return value is the length of the decoded text.
9376 This function sets `last-coding-system-used' to the precise coding system
9377 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9378 not fully specified.) */)
9379 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9381 return code_convert_string (string, coding_system, buffer,
9382 0, ! NILP (nocopy), 0);
9385 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9386 2, 4, 0,
9387 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9389 Optional third arg NOCOPY non-nil means it is OK to return STRING
9390 itself if the encoding operation is trivial.
9392 Optional fourth arg BUFFER non-nil means that the encoded text is
9393 inserted in that buffer after point (point does not move). In this
9394 case, the return value is the length of the encoded text.
9396 This function sets `last-coding-system-used' to the precise coding system
9397 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9398 not fully specified.) */)
9399 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9401 return code_convert_string (string, coding_system, buffer,
9402 1, ! NILP (nocopy), 0);
9406 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9407 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9408 Return the corresponding character. */)
9409 (Lisp_Object code)
9411 Lisp_Object spec, attrs, val;
9412 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9413 EMACS_INT ch;
9414 int c;
9416 CHECK_NATNUM (code);
9417 ch = XFASTINT (code);
9418 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9419 attrs = AREF (spec, 0);
9421 if (ASCII_BYTE_P (ch)
9422 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9423 return code;
9425 val = CODING_ATTR_CHARSET_LIST (attrs);
9426 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9427 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9428 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9430 if (ch <= 0x7F)
9432 c = ch;
9433 charset = charset_roman;
9435 else if (ch >= 0xA0 && ch < 0xDF)
9437 c = ch - 0x80;
9438 charset = charset_kana;
9440 else
9442 EMACS_INT c1 = ch >> 8;
9443 int c2 = ch & 0xFF;
9445 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9446 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9447 error ("Invalid code: %"pI"d", ch);
9448 c = ch;
9449 SJIS_TO_JIS (c);
9450 charset = charset_kanji;
9452 c = DECODE_CHAR (charset, c);
9453 if (c < 0)
9454 error ("Invalid code: %"pI"d", ch);
9455 return make_number (c);
9459 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9460 doc: /* Encode a Japanese character CH to shift_jis encoding.
9461 Return the corresponding code in SJIS. */)
9462 (Lisp_Object ch)
9464 Lisp_Object spec, attrs, charset_list;
9465 int c;
9466 struct charset *charset;
9467 unsigned code;
9469 CHECK_CHARACTER (ch);
9470 c = XFASTINT (ch);
9471 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9472 attrs = AREF (spec, 0);
9474 if (ASCII_CHAR_P (c)
9475 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9476 return ch;
9478 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9479 charset = char_charset (c, charset_list, &code);
9480 if (code == CHARSET_INVALID_CODE (charset))
9481 error ("Can't encode by shift_jis encoding: %c", c);
9482 JIS_TO_SJIS (code);
9484 return make_number (code);
9487 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9488 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9489 Return the corresponding character. */)
9490 (Lisp_Object code)
9492 Lisp_Object spec, attrs, val;
9493 struct charset *charset_roman, *charset_big5, *charset;
9494 EMACS_INT ch;
9495 int c;
9497 CHECK_NATNUM (code);
9498 ch = XFASTINT (code);
9499 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9500 attrs = AREF (spec, 0);
9502 if (ASCII_BYTE_P (ch)
9503 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9504 return code;
9506 val = CODING_ATTR_CHARSET_LIST (attrs);
9507 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9508 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9510 if (ch <= 0x7F)
9512 c = ch;
9513 charset = charset_roman;
9515 else
9517 EMACS_INT b1 = ch >> 8;
9518 int b2 = ch & 0x7F;
9519 if (b1 < 0xA1 || b1 > 0xFE
9520 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9521 error ("Invalid code: %"pI"d", ch);
9522 c = ch;
9523 charset = charset_big5;
9525 c = DECODE_CHAR (charset, c);
9526 if (c < 0)
9527 error ("Invalid code: %"pI"d", ch);
9528 return make_number (c);
9531 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9532 doc: /* Encode the Big5 character CH to BIG5 coding system.
9533 Return the corresponding character code in Big5. */)
9534 (Lisp_Object ch)
9536 Lisp_Object spec, attrs, charset_list;
9537 struct charset *charset;
9538 int c;
9539 unsigned code;
9541 CHECK_CHARACTER (ch);
9542 c = XFASTINT (ch);
9543 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9544 attrs = AREF (spec, 0);
9545 if (ASCII_CHAR_P (c)
9546 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9547 return ch;
9549 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9550 charset = char_charset (c, charset_list, &code);
9551 if (code == CHARSET_INVALID_CODE (charset))
9552 error ("Can't encode by Big5 encoding: %c", c);
9554 return make_number (code);
9558 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9559 Sset_terminal_coding_system_internal, 1, 2, 0,
9560 doc: /* Internal use only. */)
9561 (Lisp_Object coding_system, Lisp_Object terminal)
9563 struct terminal *term = get_terminal (terminal, 1);
9564 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9565 CHECK_SYMBOL (coding_system);
9566 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9567 /* We had better not send unsafe characters to terminal. */
9568 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9569 /* Character composition should be disabled. */
9570 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9571 terminal_coding->src_multibyte = 1;
9572 terminal_coding->dst_multibyte = 0;
9573 tset_charset_list
9574 (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9575 ? coding_charset_list (terminal_coding)
9576 : Fcons (make_number (charset_ascii), Qnil)));
9577 return Qnil;
9580 DEFUN ("set-safe-terminal-coding-system-internal",
9581 Fset_safe_terminal_coding_system_internal,
9582 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9583 doc: /* Internal use only. */)
9584 (Lisp_Object coding_system)
9586 CHECK_SYMBOL (coding_system);
9587 setup_coding_system (Fcheck_coding_system (coding_system),
9588 &safe_terminal_coding);
9589 /* Character composition should be disabled. */
9590 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9591 safe_terminal_coding.src_multibyte = 1;
9592 safe_terminal_coding.dst_multibyte = 0;
9593 return Qnil;
9596 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9597 Sterminal_coding_system, 0, 1, 0,
9598 doc: /* Return coding system specified for terminal output on the given terminal.
9599 TERMINAL may be a terminal object, a frame, or nil for the selected
9600 frame's terminal device. */)
9601 (Lisp_Object terminal)
9603 struct coding_system *terminal_coding
9604 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9605 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9607 /* For backward compatibility, return nil if it is `undecided'. */
9608 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9611 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9612 Sset_keyboard_coding_system_internal, 1, 2, 0,
9613 doc: /* Internal use only. */)
9614 (Lisp_Object coding_system, Lisp_Object terminal)
9616 struct terminal *t = get_terminal (terminal, 1);
9617 CHECK_SYMBOL (coding_system);
9618 if (NILP (coding_system))
9619 coding_system = Qno_conversion;
9620 else
9621 Fcheck_coding_system (coding_system);
9622 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9623 /* Character composition should be disabled. */
9624 TERMINAL_KEYBOARD_CODING (t)->common_flags
9625 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9626 return Qnil;
9629 DEFUN ("keyboard-coding-system",
9630 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9631 doc: /* Return coding system specified for decoding keyboard input. */)
9632 (Lisp_Object terminal)
9634 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9635 (get_terminal (terminal, 1))->id);
9639 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9640 Sfind_operation_coding_system, 1, MANY, 0,
9641 doc: /* Choose a coding system for an operation based on the target name.
9642 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9643 DECODING-SYSTEM is the coding system to use for decoding
9644 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9645 for encoding (in case OPERATION does encoding).
9647 The first argument OPERATION specifies an I/O primitive:
9648 For file I/O, `insert-file-contents' or `write-region'.
9649 For process I/O, `call-process', `call-process-region', or `start-process'.
9650 For network I/O, `open-network-stream'.
9652 The remaining arguments should be the same arguments that were passed
9653 to the primitive. Depending on which primitive, one of those arguments
9654 is selected as the TARGET. For example, if OPERATION does file I/O,
9655 whichever argument specifies the file name is TARGET.
9657 TARGET has a meaning which depends on OPERATION:
9658 For file I/O, TARGET is a file name (except for the special case below).
9659 For process I/O, TARGET is a process name.
9660 For network I/O, TARGET is a service name or a port number.
9662 This function looks up what is specified for TARGET in
9663 `file-coding-system-alist', `process-coding-system-alist',
9664 or `network-coding-system-alist' depending on OPERATION.
9665 They may specify a coding system, a cons of coding systems,
9666 or a function symbol to call.
9667 In the last case, we call the function with one argument,
9668 which is a list of all the arguments given to this function.
9669 If the function can't decide a coding system, it can return
9670 `undecided' so that the normal code-detection is performed.
9672 If OPERATION is `insert-file-contents', the argument corresponding to
9673 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9674 file name to look up, and BUFFER is a buffer that contains the file's
9675 contents (not yet decoded). If `file-coding-system-alist' specifies a
9676 function to call for FILENAME, that function should examine the
9677 contents of BUFFER instead of reading the file.
9679 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9680 (ptrdiff_t nargs, Lisp_Object *args)
9682 Lisp_Object operation, target_idx, target, val;
9683 register Lisp_Object chain;
9685 if (nargs < 2)
9686 error ("Too few arguments");
9687 operation = args[0];
9688 if (!SYMBOLP (operation)
9689 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9690 error ("Invalid first argument");
9691 if (nargs <= 1 + XFASTINT (target_idx))
9692 error ("Too few arguments for operation `%s'",
9693 SDATA (SYMBOL_NAME (operation)));
9694 target = args[XFASTINT (target_idx) + 1];
9695 if (!(STRINGP (target)
9696 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9697 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9698 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9699 error ("Invalid argument %"pI"d of operation `%s'",
9700 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9701 if (CONSP (target))
9702 target = XCAR (target);
9704 chain = ((EQ (operation, Qinsert_file_contents)
9705 || EQ (operation, Qwrite_region))
9706 ? Vfile_coding_system_alist
9707 : (EQ (operation, Qopen_network_stream)
9708 ? Vnetwork_coding_system_alist
9709 : Vprocess_coding_system_alist));
9710 if (NILP (chain))
9711 return Qnil;
9713 for (; CONSP (chain); chain = XCDR (chain))
9715 Lisp_Object elt;
9717 elt = XCAR (chain);
9718 if (CONSP (elt)
9719 && ((STRINGP (target)
9720 && STRINGP (XCAR (elt))
9721 && fast_string_match (XCAR (elt), target) >= 0)
9722 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9724 val = XCDR (elt);
9725 /* Here, if VAL is both a valid coding system and a valid
9726 function symbol, we return VAL as a coding system. */
9727 if (CONSP (val))
9728 return val;
9729 if (! SYMBOLP (val))
9730 return Qnil;
9731 if (! NILP (Fcoding_system_p (val)))
9732 return Fcons (val, val);
9733 if (! NILP (Ffboundp (val)))
9735 /* We use call1 rather than safe_call1
9736 so as to get bug reports about functions called here
9737 which don't handle the current interface. */
9738 val = call1 (val, Flist (nargs, args));
9739 if (CONSP (val))
9740 return val;
9741 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9742 return Fcons (val, val);
9744 return Qnil;
9747 return Qnil;
9750 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9751 Sset_coding_system_priority, 0, MANY, 0,
9752 doc: /* Assign higher priority to the coding systems given as arguments.
9753 If multiple coding systems belong to the same category,
9754 all but the first one are ignored.
9756 usage: (set-coding-system-priority &rest coding-systems) */)
9757 (ptrdiff_t nargs, Lisp_Object *args)
9759 ptrdiff_t i, j;
9760 bool changed[coding_category_max];
9761 enum coding_category priorities[coding_category_max];
9763 memset (changed, 0, sizeof changed);
9765 for (i = j = 0; i < nargs; i++)
9767 enum coding_category category;
9768 Lisp_Object spec, attrs;
9770 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9771 attrs = AREF (spec, 0);
9772 category = XINT (CODING_ATTR_CATEGORY (attrs));
9773 if (changed[category])
9774 /* Ignore this coding system because a coding system of the
9775 same category already had a higher priority. */
9776 continue;
9777 changed[category] = 1;
9778 priorities[j++] = category;
9779 if (coding_categories[category].id >= 0
9780 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9781 setup_coding_system (args[i], &coding_categories[category]);
9782 Fset (AREF (Vcoding_category_table, category), args[i]);
9785 /* Now we have decided top J priorities. Reflect the order of the
9786 original priorities to the remaining priorities. */
9788 for (i = j, j = 0; i < coding_category_max; i++, j++)
9790 while (j < coding_category_max
9791 && changed[coding_priorities[j]])
9792 j++;
9793 if (j == coding_category_max)
9794 emacs_abort ();
9795 priorities[i] = coding_priorities[j];
9798 memcpy (coding_priorities, priorities, sizeof priorities);
9800 /* Update `coding-category-list'. */
9801 Vcoding_category_list = Qnil;
9802 for (i = coding_category_max; i-- > 0; )
9803 Vcoding_category_list
9804 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9805 Vcoding_category_list);
9807 return Qnil;
9810 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9811 Scoding_system_priority_list, 0, 1, 0,
9812 doc: /* Return a list of coding systems ordered by their priorities.
9813 The list contains a subset of coding systems; i.e. coding systems
9814 assigned to each coding category (see `coding-category-list').
9816 HIGHESTP non-nil means just return the highest priority one. */)
9817 (Lisp_Object highestp)
9819 int i;
9820 Lisp_Object val;
9822 for (i = 0, val = Qnil; i < coding_category_max; i++)
9824 enum coding_category category = coding_priorities[i];
9825 int id = coding_categories[category].id;
9826 Lisp_Object attrs;
9828 if (id < 0)
9829 continue;
9830 attrs = CODING_ID_ATTRS (id);
9831 if (! NILP (highestp))
9832 return CODING_ATTR_BASE_NAME (attrs);
9833 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9835 return Fnreverse (val);
9838 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9840 static Lisp_Object
9841 make_subsidiaries (Lisp_Object base)
9843 Lisp_Object subsidiaries;
9844 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9845 char *buf = alloca (base_name_len + 6);
9846 int i;
9848 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9849 subsidiaries = make_uninit_vector (3);
9850 for (i = 0; i < 3; i++)
9852 strcpy (buf + base_name_len, suffixes[i]);
9853 ASET (subsidiaries, i, intern (buf));
9855 return subsidiaries;
9859 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9860 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9861 doc: /* For internal use only.
9862 usage: (define-coding-system-internal ...) */)
9863 (ptrdiff_t nargs, Lisp_Object *args)
9865 Lisp_Object name;
9866 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9867 Lisp_Object attrs; /* Vector of attributes. */
9868 Lisp_Object eol_type;
9869 Lisp_Object aliases;
9870 Lisp_Object coding_type, charset_list, safe_charsets;
9871 enum coding_category category;
9872 Lisp_Object tail, val;
9873 int max_charset_id = 0;
9874 int i;
9876 if (nargs < coding_arg_max)
9877 goto short_args;
9879 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9881 name = args[coding_arg_name];
9882 CHECK_SYMBOL (name);
9883 ASET (attrs, coding_attr_base_name, name);
9885 val = args[coding_arg_mnemonic];
9886 if (! STRINGP (val))
9887 CHECK_CHARACTER (val);
9888 ASET (attrs, coding_attr_mnemonic, val);
9890 coding_type = args[coding_arg_coding_type];
9891 CHECK_SYMBOL (coding_type);
9892 ASET (attrs, coding_attr_type, coding_type);
9894 charset_list = args[coding_arg_charset_list];
9895 if (SYMBOLP (charset_list))
9897 if (EQ (charset_list, Qiso_2022))
9899 if (! EQ (coding_type, Qiso_2022))
9900 error ("Invalid charset-list");
9901 charset_list = Viso_2022_charset_list;
9903 else if (EQ (charset_list, Qemacs_mule))
9905 if (! EQ (coding_type, Qemacs_mule))
9906 error ("Invalid charset-list");
9907 charset_list = Vemacs_mule_charset_list;
9909 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9911 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9912 error ("Invalid charset-list");
9913 if (max_charset_id < XFASTINT (XCAR (tail)))
9914 max_charset_id = XFASTINT (XCAR (tail));
9917 else
9919 charset_list = Fcopy_sequence (charset_list);
9920 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9922 struct charset *charset;
9924 val = XCAR (tail);
9925 CHECK_CHARSET_GET_CHARSET (val, charset);
9926 if (EQ (coding_type, Qiso_2022)
9927 ? CHARSET_ISO_FINAL (charset) < 0
9928 : EQ (coding_type, Qemacs_mule)
9929 ? CHARSET_EMACS_MULE_ID (charset) < 0
9930 : 0)
9931 error ("Can't handle charset `%s'",
9932 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9934 XSETCAR (tail, make_number (charset->id));
9935 if (max_charset_id < charset->id)
9936 max_charset_id = charset->id;
9939 ASET (attrs, coding_attr_charset_list, charset_list);
9941 safe_charsets = make_uninit_string (max_charset_id + 1);
9942 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9943 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9944 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9945 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9947 ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9949 val = args[coding_arg_decode_translation_table];
9950 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9951 CHECK_SYMBOL (val);
9952 ASET (attrs, coding_attr_decode_tbl, val);
9954 val = args[coding_arg_encode_translation_table];
9955 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9956 CHECK_SYMBOL (val);
9957 ASET (attrs, coding_attr_encode_tbl, val);
9959 val = args[coding_arg_post_read_conversion];
9960 CHECK_SYMBOL (val);
9961 ASET (attrs, coding_attr_post_read, val);
9963 val = args[coding_arg_pre_write_conversion];
9964 CHECK_SYMBOL (val);
9965 ASET (attrs, coding_attr_pre_write, val);
9967 val = args[coding_arg_default_char];
9968 if (NILP (val))
9969 ASET (attrs, coding_attr_default_char, make_number (' '));
9970 else
9972 CHECK_CHARACTER (val);
9973 ASET (attrs, coding_attr_default_char, val);
9976 val = args[coding_arg_for_unibyte];
9977 ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9979 val = args[coding_arg_plist];
9980 CHECK_LIST (val);
9981 ASET (attrs, coding_attr_plist, val);
9983 if (EQ (coding_type, Qcharset))
9985 /* Generate a lisp vector of 256 elements. Each element is nil,
9986 integer, or a list of charset IDs.
9988 If Nth element is nil, the byte code N is invalid in this
9989 coding system.
9991 If Nth element is a number NUM, N is the first byte of a
9992 charset whose ID is NUM.
9994 If Nth element is a list of charset IDs, N is the first byte
9995 of one of them. The list is sorted by dimensions of the
9996 charsets. A charset of smaller dimension comes first. */
9997 val = Fmake_vector (make_number (256), Qnil);
9999 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10001 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10002 int dim = CHARSET_DIMENSION (charset);
10003 int idx = (dim - 1) * 4;
10005 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10006 ASET (attrs, coding_attr_ascii_compat, Qt);
10008 for (i = charset->code_space[idx];
10009 i <= charset->code_space[idx + 1]; i++)
10011 Lisp_Object tmp, tmp2;
10012 int dim2;
10014 tmp = AREF (val, i);
10015 if (NILP (tmp))
10016 tmp = XCAR (tail);
10017 else if (NUMBERP (tmp))
10019 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10020 if (dim < dim2)
10021 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
10022 else
10023 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
10025 else
10027 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10029 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10030 if (dim < dim2)
10031 break;
10033 if (NILP (tmp2))
10034 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
10035 else
10037 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10038 XSETCAR (tmp2, XCAR (tail));
10041 ASET (val, i, tmp);
10044 ASET (attrs, coding_attr_charset_valids, val);
10045 category = coding_category_charset;
10047 else if (EQ (coding_type, Qccl))
10049 Lisp_Object valids;
10051 if (nargs < coding_arg_ccl_max)
10052 goto short_args;
10054 val = args[coding_arg_ccl_decoder];
10055 CHECK_CCL_PROGRAM (val);
10056 if (VECTORP (val))
10057 val = Fcopy_sequence (val);
10058 ASET (attrs, coding_attr_ccl_decoder, val);
10060 val = args[coding_arg_ccl_encoder];
10061 CHECK_CCL_PROGRAM (val);
10062 if (VECTORP (val))
10063 val = Fcopy_sequence (val);
10064 ASET (attrs, coding_attr_ccl_encoder, val);
10066 val = args[coding_arg_ccl_valids];
10067 valids = Fmake_string (make_number (256), make_number (0));
10068 for (tail = val; CONSP (tail); tail = XCDR (tail))
10070 int from, to;
10072 val = XCAR (tail);
10073 if (INTEGERP (val))
10075 if (! (0 <= XINT (val) && XINT (val) <= 255))
10076 args_out_of_range_3 (val, make_number (0), make_number (255));
10077 from = to = XINT (val);
10079 else
10081 CHECK_CONS (val);
10082 CHECK_NATNUM_CAR (val);
10083 CHECK_NUMBER_CDR (val);
10084 if (XINT (XCAR (val)) > 255)
10085 args_out_of_range_3 (XCAR (val),
10086 make_number (0), make_number (255));
10087 from = XINT (XCAR (val));
10088 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10089 args_out_of_range_3 (XCDR (val),
10090 XCAR (val), make_number (255));
10091 to = XINT (XCDR (val));
10093 for (i = from; i <= to; i++)
10094 SSET (valids, i, 1);
10096 ASET (attrs, coding_attr_ccl_valids, valids);
10098 category = coding_category_ccl;
10100 else if (EQ (coding_type, Qutf_16))
10102 Lisp_Object bom, endian;
10104 ASET (attrs, coding_attr_ascii_compat, Qnil);
10106 if (nargs < coding_arg_utf16_max)
10107 goto short_args;
10109 bom = args[coding_arg_utf16_bom];
10110 if (! NILP (bom) && ! EQ (bom, Qt))
10112 CHECK_CONS (bom);
10113 val = XCAR (bom);
10114 CHECK_CODING_SYSTEM (val);
10115 val = XCDR (bom);
10116 CHECK_CODING_SYSTEM (val);
10118 ASET (attrs, coding_attr_utf_bom, bom);
10120 endian = args[coding_arg_utf16_endian];
10121 CHECK_SYMBOL (endian);
10122 if (NILP (endian))
10123 endian = Qbig;
10124 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10125 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10126 ASET (attrs, coding_attr_utf_16_endian, endian);
10128 category = (CONSP (bom)
10129 ? coding_category_utf_16_auto
10130 : NILP (bom)
10131 ? (EQ (endian, Qbig)
10132 ? coding_category_utf_16_be_nosig
10133 : coding_category_utf_16_le_nosig)
10134 : (EQ (endian, Qbig)
10135 ? coding_category_utf_16_be
10136 : coding_category_utf_16_le));
10138 else if (EQ (coding_type, Qiso_2022))
10140 Lisp_Object initial, reg_usage, request, flags;
10142 if (nargs < coding_arg_iso2022_max)
10143 goto short_args;
10145 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10146 CHECK_VECTOR (initial);
10147 for (i = 0; i < 4; i++)
10149 val = AREF (initial, i);
10150 if (! NILP (val))
10152 struct charset *charset;
10154 CHECK_CHARSET_GET_CHARSET (val, charset);
10155 ASET (initial, i, make_number (CHARSET_ID (charset)));
10156 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10157 ASET (attrs, coding_attr_ascii_compat, Qt);
10159 else
10160 ASET (initial, i, make_number (-1));
10163 reg_usage = args[coding_arg_iso2022_reg_usage];
10164 CHECK_CONS (reg_usage);
10165 CHECK_NUMBER_CAR (reg_usage);
10166 CHECK_NUMBER_CDR (reg_usage);
10168 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10169 for (tail = request; CONSP (tail); tail = XCDR (tail))
10171 int id;
10172 Lisp_Object tmp1;
10174 val = XCAR (tail);
10175 CHECK_CONS (val);
10176 tmp1 = XCAR (val);
10177 CHECK_CHARSET_GET_ID (tmp1, id);
10178 CHECK_NATNUM_CDR (val);
10179 if (XINT (XCDR (val)) >= 4)
10180 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10181 XSETCAR (val, make_number (id));
10184 flags = args[coding_arg_iso2022_flags];
10185 CHECK_NATNUM (flags);
10186 i = XINT (flags) & INT_MAX;
10187 if (EQ (args[coding_arg_charset_list], Qiso_2022))
10188 i |= CODING_ISO_FLAG_FULL_SUPPORT;
10189 flags = make_number (i);
10191 ASET (attrs, coding_attr_iso_initial, initial);
10192 ASET (attrs, coding_attr_iso_usage, reg_usage);
10193 ASET (attrs, coding_attr_iso_request, request);
10194 ASET (attrs, coding_attr_iso_flags, flags);
10195 setup_iso_safe_charsets (attrs);
10197 if (i & CODING_ISO_FLAG_SEVEN_BITS)
10198 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10199 | CODING_ISO_FLAG_SINGLE_SHIFT))
10200 ? coding_category_iso_7_else
10201 : EQ (args[coding_arg_charset_list], Qiso_2022)
10202 ? coding_category_iso_7
10203 : coding_category_iso_7_tight);
10204 else
10206 int id = XINT (AREF (initial, 1));
10208 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10209 || EQ (args[coding_arg_charset_list], Qiso_2022)
10210 || id < 0)
10211 ? coding_category_iso_8_else
10212 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10213 ? coding_category_iso_8_1
10214 : coding_category_iso_8_2);
10216 if (category != coding_category_iso_8_1
10217 && category != coding_category_iso_8_2)
10218 ASET (attrs, coding_attr_ascii_compat, Qnil);
10220 else if (EQ (coding_type, Qemacs_mule))
10222 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10223 ASET (attrs, coding_attr_emacs_mule_full, Qt);
10224 ASET (attrs, coding_attr_ascii_compat, Qt);
10225 category = coding_category_emacs_mule;
10227 else if (EQ (coding_type, Qshift_jis))
10230 struct charset *charset;
10232 if (XINT (Flength (charset_list)) != 3
10233 && XINT (Flength (charset_list)) != 4)
10234 error ("There should be three or four charsets");
10236 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10237 if (CHARSET_DIMENSION (charset) != 1)
10238 error ("Dimension of charset %s is not one",
10239 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10240 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10241 ASET (attrs, coding_attr_ascii_compat, Qt);
10243 charset_list = XCDR (charset_list);
10244 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10245 if (CHARSET_DIMENSION (charset) != 1)
10246 error ("Dimension of charset %s is not one",
10247 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10249 charset_list = XCDR (charset_list);
10250 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10251 if (CHARSET_DIMENSION (charset) != 2)
10252 error ("Dimension of charset %s is not two",
10253 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10255 charset_list = XCDR (charset_list);
10256 if (! NILP (charset_list))
10258 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10259 if (CHARSET_DIMENSION (charset) != 2)
10260 error ("Dimension of charset %s is not two",
10261 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10264 category = coding_category_sjis;
10265 Vsjis_coding_system = name;
10267 else if (EQ (coding_type, Qbig5))
10269 struct charset *charset;
10271 if (XINT (Flength (charset_list)) != 2)
10272 error ("There should be just two charsets");
10274 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10275 if (CHARSET_DIMENSION (charset) != 1)
10276 error ("Dimension of charset %s is not one",
10277 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10278 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10279 ASET (attrs, coding_attr_ascii_compat, Qt);
10281 charset_list = XCDR (charset_list);
10282 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10283 if (CHARSET_DIMENSION (charset) != 2)
10284 error ("Dimension of charset %s is not two",
10285 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10287 category = coding_category_big5;
10288 Vbig5_coding_system = name;
10290 else if (EQ (coding_type, Qraw_text))
10292 category = coding_category_raw_text;
10293 ASET (attrs, coding_attr_ascii_compat, Qt);
10295 else if (EQ (coding_type, Qutf_8))
10297 Lisp_Object bom;
10299 if (nargs < coding_arg_utf8_max)
10300 goto short_args;
10302 bom = args[coding_arg_utf8_bom];
10303 if (! NILP (bom) && ! EQ (bom, Qt))
10305 CHECK_CONS (bom);
10306 val = XCAR (bom);
10307 CHECK_CODING_SYSTEM (val);
10308 val = XCDR (bom);
10309 CHECK_CODING_SYSTEM (val);
10311 ASET (attrs, coding_attr_utf_bom, bom);
10312 if (NILP (bom))
10313 ASET (attrs, coding_attr_ascii_compat, Qt);
10315 category = (CONSP (bom) ? coding_category_utf_8_auto
10316 : NILP (bom) ? coding_category_utf_8_nosig
10317 : coding_category_utf_8_sig);
10319 else if (EQ (coding_type, Qundecided))
10320 category = coding_category_undecided;
10321 else
10322 error ("Invalid coding system type: %s",
10323 SDATA (SYMBOL_NAME (coding_type)));
10325 ASET (attrs, coding_attr_category, make_number (category));
10326 ASET (attrs, coding_attr_plist,
10327 Fcons (QCcategory,
10328 Fcons (AREF (Vcoding_category_table, category),
10329 CODING_ATTR_PLIST (attrs))));
10330 ASET (attrs, coding_attr_plist,
10331 Fcons (QCascii_compatible_p,
10332 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10333 CODING_ATTR_PLIST (attrs))));
10335 eol_type = args[coding_arg_eol_type];
10336 if (! NILP (eol_type)
10337 && ! EQ (eol_type, Qunix)
10338 && ! EQ (eol_type, Qdos)
10339 && ! EQ (eol_type, Qmac))
10340 error ("Invalid eol-type");
10342 aliases = Fcons (name, Qnil);
10344 if (NILP (eol_type))
10346 eol_type = make_subsidiaries (name);
10347 for (i = 0; i < 3; i++)
10349 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10351 this_name = AREF (eol_type, i);
10352 this_aliases = Fcons (this_name, Qnil);
10353 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10354 this_spec = make_uninit_vector (3);
10355 ASET (this_spec, 0, attrs);
10356 ASET (this_spec, 1, this_aliases);
10357 ASET (this_spec, 2, this_eol_type);
10358 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10359 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10360 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10361 if (NILP (val))
10362 Vcoding_system_alist
10363 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10364 Vcoding_system_alist);
10368 spec_vec = make_uninit_vector (3);
10369 ASET (spec_vec, 0, attrs);
10370 ASET (spec_vec, 1, aliases);
10371 ASET (spec_vec, 2, eol_type);
10373 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10374 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10375 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10376 if (NILP (val))
10377 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10378 Vcoding_system_alist);
10381 int id = coding_categories[category].id;
10383 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10384 setup_coding_system (name, &coding_categories[category]);
10387 return Qnil;
10389 short_args:
10390 return Fsignal (Qwrong_number_of_arguments,
10391 Fcons (intern ("define-coding-system-internal"),
10392 make_number (nargs)));
10396 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10397 3, 3, 0,
10398 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10399 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10401 Lisp_Object spec, attrs;
10403 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10404 attrs = AREF (spec, 0);
10405 if (EQ (prop, QCmnemonic))
10407 if (! STRINGP (val))
10408 CHECK_CHARACTER (val);
10409 ASET (attrs, coding_attr_mnemonic, val);
10411 else if (EQ (prop, QCdefault_char))
10413 if (NILP (val))
10414 val = make_number (' ');
10415 else
10416 CHECK_CHARACTER (val);
10417 ASET (attrs, coding_attr_default_char, val);
10419 else if (EQ (prop, QCdecode_translation_table))
10421 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10422 CHECK_SYMBOL (val);
10423 ASET (attrs, coding_attr_decode_tbl, val);
10425 else if (EQ (prop, QCencode_translation_table))
10427 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10428 CHECK_SYMBOL (val);
10429 ASET (attrs, coding_attr_encode_tbl, val);
10431 else if (EQ (prop, QCpost_read_conversion))
10433 CHECK_SYMBOL (val);
10434 ASET (attrs, coding_attr_post_read, val);
10436 else if (EQ (prop, QCpre_write_conversion))
10438 CHECK_SYMBOL (val);
10439 ASET (attrs, coding_attr_pre_write, val);
10441 else if (EQ (prop, QCascii_compatible_p))
10443 ASET (attrs, coding_attr_ascii_compat, val);
10446 ASET (attrs, coding_attr_plist,
10447 Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10448 return val;
10452 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10453 Sdefine_coding_system_alias, 2, 2, 0,
10454 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10455 (Lisp_Object alias, Lisp_Object coding_system)
10457 Lisp_Object spec, aliases, eol_type, val;
10459 CHECK_SYMBOL (alias);
10460 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10461 aliases = AREF (spec, 1);
10462 /* ALIASES should be a list of length more than zero, and the first
10463 element is a base coding system. Append ALIAS at the tail of the
10464 list. */
10465 while (!NILP (XCDR (aliases)))
10466 aliases = XCDR (aliases);
10467 XSETCDR (aliases, Fcons (alias, Qnil));
10469 eol_type = AREF (spec, 2);
10470 if (VECTORP (eol_type))
10472 Lisp_Object subsidiaries;
10473 int i;
10475 subsidiaries = make_subsidiaries (alias);
10476 for (i = 0; i < 3; i++)
10477 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10478 AREF (eol_type, i));
10481 Fputhash (alias, spec, Vcoding_system_hash_table);
10482 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10483 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10484 if (NILP (val))
10485 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10486 Vcoding_system_alist);
10488 return Qnil;
10491 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10492 1, 1, 0,
10493 doc: /* Return the base of CODING-SYSTEM.
10494 Any alias or subsidiary coding system is not a base coding system. */)
10495 (Lisp_Object coding_system)
10497 Lisp_Object spec, attrs;
10499 if (NILP (coding_system))
10500 return (Qno_conversion);
10501 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10502 attrs = AREF (spec, 0);
10503 return CODING_ATTR_BASE_NAME (attrs);
10506 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10507 1, 1, 0,
10508 doc: "Return the property list of CODING-SYSTEM.")
10509 (Lisp_Object coding_system)
10511 Lisp_Object spec, attrs;
10513 if (NILP (coding_system))
10514 coding_system = Qno_conversion;
10515 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10516 attrs = AREF (spec, 0);
10517 return CODING_ATTR_PLIST (attrs);
10521 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10522 1, 1, 0,
10523 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10524 (Lisp_Object coding_system)
10526 Lisp_Object spec;
10528 if (NILP (coding_system))
10529 coding_system = Qno_conversion;
10530 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10531 return AREF (spec, 1);
10534 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10535 Scoding_system_eol_type, 1, 1, 0,
10536 doc: /* Return eol-type of CODING-SYSTEM.
10537 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10539 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10540 and CR respectively.
10542 A vector value indicates that a format of end-of-line should be
10543 detected automatically. Nth element of the vector is the subsidiary
10544 coding system whose eol-type is N. */)
10545 (Lisp_Object coding_system)
10547 Lisp_Object spec, eol_type;
10548 int n;
10550 if (NILP (coding_system))
10551 coding_system = Qno_conversion;
10552 if (! CODING_SYSTEM_P (coding_system))
10553 return Qnil;
10554 spec = CODING_SYSTEM_SPEC (coding_system);
10555 eol_type = AREF (spec, 2);
10556 if (VECTORP (eol_type))
10557 return Fcopy_sequence (eol_type);
10558 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10559 return make_number (n);
10562 #endif /* emacs */
10565 /*** 9. Post-amble ***/
10567 void
10568 init_coding_once (void)
10570 int i;
10572 for (i = 0; i < coding_category_max; i++)
10574 coding_categories[i].id = -1;
10575 coding_priorities[i] = i;
10578 /* ISO2022 specific initialize routine. */
10579 for (i = 0; i < 0x20; i++)
10580 iso_code_class[i] = ISO_control_0;
10581 for (i = 0x21; i < 0x7F; i++)
10582 iso_code_class[i] = ISO_graphic_plane_0;
10583 for (i = 0x80; i < 0xA0; i++)
10584 iso_code_class[i] = ISO_control_1;
10585 for (i = 0xA1; i < 0xFF; i++)
10586 iso_code_class[i] = ISO_graphic_plane_1;
10587 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10588 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10589 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10590 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10591 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10592 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10593 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10594 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10595 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10597 for (i = 0; i < 256; i++)
10599 emacs_mule_bytes[i] = 1;
10601 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10602 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10603 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10604 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10607 #ifdef emacs
10609 void
10610 syms_of_coding (void)
10612 staticpro (&Vcoding_system_hash_table);
10614 Lisp_Object args[2];
10615 args[0] = QCtest;
10616 args[1] = Qeq;
10617 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10620 staticpro (&Vsjis_coding_system);
10621 Vsjis_coding_system = Qnil;
10623 staticpro (&Vbig5_coding_system);
10624 Vbig5_coding_system = Qnil;
10626 staticpro (&Vcode_conversion_reused_workbuf);
10627 Vcode_conversion_reused_workbuf = Qnil;
10629 staticpro (&Vcode_conversion_workbuf_name);
10630 Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10632 reused_workbuf_in_use = 0;
10634 DEFSYM (Qcharset, "charset");
10635 DEFSYM (Qtarget_idx, "target-idx");
10636 DEFSYM (Qcoding_system_history, "coding-system-history");
10637 Fset (Qcoding_system_history, Qnil);
10639 /* Target FILENAME is the first argument. */
10640 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10641 /* Target FILENAME is the third argument. */
10642 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10644 DEFSYM (Qcall_process, "call-process");
10645 /* Target PROGRAM is the first argument. */
10646 Fput (Qcall_process, Qtarget_idx, make_number (0));
10648 DEFSYM (Qcall_process_region, "call-process-region");
10649 /* Target PROGRAM is the third argument. */
10650 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10652 DEFSYM (Qstart_process, "start-process");
10653 /* Target PROGRAM is the third argument. */
10654 Fput (Qstart_process, Qtarget_idx, make_number (2));
10656 DEFSYM (Qopen_network_stream, "open-network-stream");
10657 /* Target SERVICE is the fourth argument. */
10658 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10660 DEFSYM (Qcoding_system, "coding-system");
10661 DEFSYM (Qcoding_aliases, "coding-aliases");
10663 DEFSYM (Qeol_type, "eol-type");
10664 DEFSYM (Qunix, "unix");
10665 DEFSYM (Qdos, "dos");
10666 DEFSYM (Qmac, "mac");
10668 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10669 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10670 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10671 DEFSYM (Qdefault_char, "default-char");
10672 DEFSYM (Qundecided, "undecided");
10673 DEFSYM (Qno_conversion, "no-conversion");
10674 DEFSYM (Qraw_text, "raw-text");
10676 DEFSYM (Qiso_2022, "iso-2022");
10678 DEFSYM (Qutf_8, "utf-8");
10679 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10681 #if defined (WINDOWSNT) || defined (CYGWIN)
10682 /* No, not utf-16-le: that one has a BOM. */
10683 DEFSYM (Qutf_16le, "utf-16le");
10684 #endif
10686 DEFSYM (Qutf_16, "utf-16");
10687 DEFSYM (Qbig, "big");
10688 DEFSYM (Qlittle, "little");
10690 DEFSYM (Qshift_jis, "shift-jis");
10691 DEFSYM (Qbig5, "big5");
10693 DEFSYM (Qcoding_system_p, "coding-system-p");
10695 DEFSYM (Qcoding_system_error, "coding-system-error");
10696 Fput (Qcoding_system_error, Qerror_conditions,
10697 listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10698 Fput (Qcoding_system_error, Qerror_message,
10699 build_pure_c_string ("Invalid coding system"));
10701 /* Intern this now in case it isn't already done.
10702 Setting this variable twice is harmless.
10703 But don't staticpro it here--that is done in alloc.c. */
10704 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10706 DEFSYM (Qtranslation_table, "translation-table");
10707 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10708 DEFSYM (Qtranslation_table_id, "translation-table-id");
10709 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10710 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10712 DEFSYM (Qvalid_codes, "valid-codes");
10714 DEFSYM (Qemacs_mule, "emacs-mule");
10716 DEFSYM (QCcategory, ":category");
10717 DEFSYM (QCmnemonic, ":mnemonic");
10718 DEFSYM (QCdefault_char, ":default-char");
10719 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10720 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10721 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10722 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10723 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10725 Vcoding_category_table
10726 = Fmake_vector (make_number (coding_category_max), Qnil);
10727 staticpro (&Vcoding_category_table);
10728 /* Followings are target of code detection. */
10729 ASET (Vcoding_category_table, coding_category_iso_7,
10730 intern_c_string ("coding-category-iso-7"));
10731 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10732 intern_c_string ("coding-category-iso-7-tight"));
10733 ASET (Vcoding_category_table, coding_category_iso_8_1,
10734 intern_c_string ("coding-category-iso-8-1"));
10735 ASET (Vcoding_category_table, coding_category_iso_8_2,
10736 intern_c_string ("coding-category-iso-8-2"));
10737 ASET (Vcoding_category_table, coding_category_iso_7_else,
10738 intern_c_string ("coding-category-iso-7-else"));
10739 ASET (Vcoding_category_table, coding_category_iso_8_else,
10740 intern_c_string ("coding-category-iso-8-else"));
10741 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10742 intern_c_string ("coding-category-utf-8-auto"));
10743 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10744 intern_c_string ("coding-category-utf-8"));
10745 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10746 intern_c_string ("coding-category-utf-8-sig"));
10747 ASET (Vcoding_category_table, coding_category_utf_16_be,
10748 intern_c_string ("coding-category-utf-16-be"));
10749 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10750 intern_c_string ("coding-category-utf-16-auto"));
10751 ASET (Vcoding_category_table, coding_category_utf_16_le,
10752 intern_c_string ("coding-category-utf-16-le"));
10753 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10754 intern_c_string ("coding-category-utf-16-be-nosig"));
10755 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10756 intern_c_string ("coding-category-utf-16-le-nosig"));
10757 ASET (Vcoding_category_table, coding_category_charset,
10758 intern_c_string ("coding-category-charset"));
10759 ASET (Vcoding_category_table, coding_category_sjis,
10760 intern_c_string ("coding-category-sjis"));
10761 ASET (Vcoding_category_table, coding_category_big5,
10762 intern_c_string ("coding-category-big5"));
10763 ASET (Vcoding_category_table, coding_category_ccl,
10764 intern_c_string ("coding-category-ccl"));
10765 ASET (Vcoding_category_table, coding_category_emacs_mule,
10766 intern_c_string ("coding-category-emacs-mule"));
10767 /* Followings are NOT target of code detection. */
10768 ASET (Vcoding_category_table, coding_category_raw_text,
10769 intern_c_string ("coding-category-raw-text"));
10770 ASET (Vcoding_category_table, coding_category_undecided,
10771 intern_c_string ("coding-category-undecided"));
10773 DEFSYM (Qinsufficient_source, "insufficient-source");
10774 DEFSYM (Qinvalid_source, "invalid-source");
10775 DEFSYM (Qinterrupted, "interrupted");
10776 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10778 defsubr (&Scoding_system_p);
10779 defsubr (&Sread_coding_system);
10780 defsubr (&Sread_non_nil_coding_system);
10781 defsubr (&Scheck_coding_system);
10782 defsubr (&Sdetect_coding_region);
10783 defsubr (&Sdetect_coding_string);
10784 defsubr (&Sfind_coding_systems_region_internal);
10785 defsubr (&Sunencodable_char_position);
10786 defsubr (&Scheck_coding_systems_region);
10787 defsubr (&Sdecode_coding_region);
10788 defsubr (&Sencode_coding_region);
10789 defsubr (&Sdecode_coding_string);
10790 defsubr (&Sencode_coding_string);
10791 defsubr (&Sdecode_sjis_char);
10792 defsubr (&Sencode_sjis_char);
10793 defsubr (&Sdecode_big5_char);
10794 defsubr (&Sencode_big5_char);
10795 defsubr (&Sset_terminal_coding_system_internal);
10796 defsubr (&Sset_safe_terminal_coding_system_internal);
10797 defsubr (&Sterminal_coding_system);
10798 defsubr (&Sset_keyboard_coding_system_internal);
10799 defsubr (&Skeyboard_coding_system);
10800 defsubr (&Sfind_operation_coding_system);
10801 defsubr (&Sset_coding_system_priority);
10802 defsubr (&Sdefine_coding_system_internal);
10803 defsubr (&Sdefine_coding_system_alias);
10804 defsubr (&Scoding_system_put);
10805 defsubr (&Scoding_system_base);
10806 defsubr (&Scoding_system_plist);
10807 defsubr (&Scoding_system_aliases);
10808 defsubr (&Scoding_system_eol_type);
10809 defsubr (&Scoding_system_priority_list);
10811 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10812 doc: /* List of coding systems.
10814 Do not alter the value of this variable manually. This variable should be
10815 updated by the functions `define-coding-system' and
10816 `define-coding-system-alias'. */);
10817 Vcoding_system_list = Qnil;
10819 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10820 doc: /* Alist of coding system names.
10821 Each element is one element list of coding system name.
10822 This variable is given to `completing-read' as COLLECTION argument.
10824 Do not alter the value of this variable manually. This variable should be
10825 updated by the functions `make-coding-system' and
10826 `define-coding-system-alias'. */);
10827 Vcoding_system_alist = Qnil;
10829 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10830 doc: /* List of coding-categories (symbols) ordered by priority.
10832 On detecting a coding system, Emacs tries code detection algorithms
10833 associated with each coding-category one by one in this order. When
10834 one algorithm agrees with a byte sequence of source text, the coding
10835 system bound to the corresponding coding-category is selected.
10837 Don't modify this variable directly, but use `set-coding-system-priority'. */);
10839 int i;
10841 Vcoding_category_list = Qnil;
10842 for (i = coding_category_max - 1; i >= 0; i--)
10843 Vcoding_category_list
10844 = Fcons (AREF (Vcoding_category_table, i),
10845 Vcoding_category_list);
10848 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10849 doc: /* Specify the coding system for read operations.
10850 It is useful to bind this variable with `let', but do not set it globally.
10851 If the value is a coding system, it is used for decoding on read operation.
10852 If not, an appropriate element is used from one of the coding system alists.
10853 There are three such tables: `file-coding-system-alist',
10854 `process-coding-system-alist', and `network-coding-system-alist'. */);
10855 Vcoding_system_for_read = Qnil;
10857 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10858 doc: /* Specify the coding system for write operations.
10859 Programs bind this variable with `let', but you should not set it globally.
10860 If the value is a coding system, it is used for encoding of output,
10861 when writing it to a file and when sending it to a file or subprocess.
10863 If this does not specify a coding system, an appropriate element
10864 is used from one of the coding system alists.
10865 There are three such tables: `file-coding-system-alist',
10866 `process-coding-system-alist', and `network-coding-system-alist'.
10867 For output to files, if the above procedure does not specify a coding system,
10868 the value of `buffer-file-coding-system' is used. */);
10869 Vcoding_system_for_write = Qnil;
10871 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10872 doc: /*
10873 Coding system used in the latest file or process I/O. */);
10874 Vlast_coding_system_used = Qnil;
10876 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10877 doc: /*
10878 Error status of the last code conversion.
10880 When an error was detected in the last code conversion, this variable
10881 is set to one of the following symbols.
10882 `insufficient-source'
10883 `inconsistent-eol'
10884 `invalid-source'
10885 `interrupted'
10886 `insufficient-memory'
10887 When no error was detected, the value doesn't change. So, to check
10888 the error status of a code conversion by this variable, you must
10889 explicitly set this variable to nil before performing code
10890 conversion. */);
10891 Vlast_code_conversion_error = Qnil;
10893 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10894 doc: /*
10895 *Non-nil means always inhibit code conversion of end-of-line format.
10896 See info node `Coding Systems' and info node `Text and Binary' concerning
10897 such conversion. */);
10898 inhibit_eol_conversion = 0;
10900 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10901 doc: /*
10902 Non-nil means process buffer inherits coding system of process output.
10903 Bind it to t if the process output is to be treated as if it were a file
10904 read from some filesystem. */);
10905 inherit_process_coding_system = 0;
10907 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10908 doc: /*
10909 Alist to decide a coding system to use for a file I/O operation.
10910 The format is ((PATTERN . VAL) ...),
10911 where PATTERN is a regular expression matching a file name,
10912 VAL is a coding system, a cons of coding systems, or a function symbol.
10913 If VAL is a coding system, it is used for both decoding and encoding
10914 the file contents.
10915 If VAL is a cons of coding systems, the car part is used for decoding,
10916 and the cdr part is used for encoding.
10917 If VAL is a function symbol, the function must return a coding system
10918 or a cons of coding systems which are used as above. The function is
10919 called with an argument that is a list of the arguments with which
10920 `find-operation-coding-system' was called. If the function can't decide
10921 a coding system, it can return `undecided' so that the normal
10922 code-detection is performed.
10924 See also the function `find-operation-coding-system'
10925 and the variable `auto-coding-alist'. */);
10926 Vfile_coding_system_alist = Qnil;
10928 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10929 doc: /*
10930 Alist to decide a coding system to use for a process I/O operation.
10931 The format is ((PATTERN . VAL) ...),
10932 where PATTERN is a regular expression matching a program name,
10933 VAL is a coding system, a cons of coding systems, or a function symbol.
10934 If VAL is a coding system, it is used for both decoding what received
10935 from the program and encoding what sent to the program.
10936 If VAL is a cons of coding systems, the car part is used for decoding,
10937 and the cdr part is used for encoding.
10938 If VAL is a function symbol, the function must return a coding system
10939 or a cons of coding systems which are used as above.
10941 See also the function `find-operation-coding-system'. */);
10942 Vprocess_coding_system_alist = Qnil;
10944 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10945 doc: /*
10946 Alist to decide a coding system to use for a network I/O operation.
10947 The format is ((PATTERN . VAL) ...),
10948 where PATTERN is a regular expression matching a network service name
10949 or is a port number to connect to,
10950 VAL is a coding system, a cons of coding systems, or a function symbol.
10951 If VAL is a coding system, it is used for both decoding what received
10952 from the network stream and encoding what sent to the network stream.
10953 If VAL is a cons of coding systems, the car part is used for decoding,
10954 and the cdr part is used for encoding.
10955 If VAL is a function symbol, the function must return a coding system
10956 or a cons of coding systems which are used as above.
10958 See also the function `find-operation-coding-system'. */);
10959 Vnetwork_coding_system_alist = Qnil;
10961 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10962 doc: /* Coding system to use with system messages.
10963 Also used for decoding keyboard input on X Window system. */);
10964 Vlocale_coding_system = Qnil;
10966 /* The eol mnemonics are reset in startup.el system-dependently. */
10967 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10968 doc: /*
10969 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
10970 eol_mnemonic_unix = build_pure_c_string (":");
10972 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10973 doc: /*
10974 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
10975 eol_mnemonic_dos = build_pure_c_string ("\\");
10977 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10978 doc: /*
10979 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
10980 eol_mnemonic_mac = build_pure_c_string ("/");
10982 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10983 doc: /*
10984 *String displayed in mode line when end-of-line format is not yet determined. */);
10985 eol_mnemonic_undecided = build_pure_c_string (":");
10987 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10988 doc: /*
10989 *Non-nil enables character translation while encoding and decoding. */);
10990 Venable_character_translation = Qt;
10992 DEFVAR_LISP ("standard-translation-table-for-decode",
10993 Vstandard_translation_table_for_decode,
10994 doc: /* Table for translating characters while decoding. */);
10995 Vstandard_translation_table_for_decode = Qnil;
10997 DEFVAR_LISP ("standard-translation-table-for-encode",
10998 Vstandard_translation_table_for_encode,
10999 doc: /* Table for translating characters while encoding. */);
11000 Vstandard_translation_table_for_encode = Qnil;
11002 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11003 doc: /* Alist of charsets vs revision numbers.
11004 While encoding, if a charset (car part of an element) is found,
11005 designate it with the escape sequence identifying revision (cdr part
11006 of the element). */);
11007 Vcharset_revision_table = Qnil;
11009 DEFVAR_LISP ("default-process-coding-system",
11010 Vdefault_process_coding_system,
11011 doc: /* Cons of coding systems used for process I/O by default.
11012 The car part is used for decoding a process output,
11013 the cdr part is used for encoding a text to be sent to a process. */);
11014 Vdefault_process_coding_system = Qnil;
11016 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11017 doc: /*
11018 Table of extra Latin codes in the range 128..159 (inclusive).
11019 This is a vector of length 256.
11020 If Nth element is non-nil, the existence of code N in a file
11021 \(or output of subprocess) doesn't prevent it to be detected as
11022 a coding system of ISO 2022 variant which has a flag
11023 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11024 or reading output of a subprocess.
11025 Only 128th through 159th elements have a meaning. */);
11026 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11028 DEFVAR_LISP ("select-safe-coding-system-function",
11029 Vselect_safe_coding_system_function,
11030 doc: /*
11031 Function to call to select safe coding system for encoding a text.
11033 If set, this function is called to force a user to select a proper
11034 coding system which can encode the text in the case that a default
11035 coding system used in each operation can't encode the text. The
11036 function should take care that the buffer is not modified while
11037 the coding system is being selected.
11039 The default value is `select-safe-coding-system' (which see). */);
11040 Vselect_safe_coding_system_function = Qnil;
11042 DEFVAR_BOOL ("coding-system-require-warning",
11043 coding_system_require_warning,
11044 doc: /* Internal use only.
11045 If non-nil, on writing a file, `select-safe-coding-system-function' is
11046 called even if `coding-system-for-write' is non-nil. The command
11047 `universal-coding-system-argument' binds this variable to t temporarily. */);
11048 coding_system_require_warning = 0;
11051 DEFVAR_BOOL ("inhibit-iso-escape-detection",
11052 inhibit_iso_escape_detection,
11053 doc: /*
11054 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11056 When Emacs reads text, it tries to detect how the text is encoded.
11057 This code detection is sensitive to escape sequences. If Emacs sees
11058 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11059 of the ISO2022 encodings, and decodes text by the corresponding coding
11060 system (e.g. `iso-2022-7bit').
11062 However, there may be a case that you want to read escape sequences in
11063 a file as is. In such a case, you can set this variable to non-nil.
11064 Then the code detection will ignore any escape sequences, and no text is
11065 detected as encoded in some ISO-2022 encoding. The result is that all
11066 escape sequences become visible in a buffer.
11068 The default value is nil, and it is strongly recommended not to change
11069 it. That is because many Emacs Lisp source files that contain
11070 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11071 in Emacs's distribution, and they won't be decoded correctly on
11072 reading if you suppress escape sequence detection.
11074 The other way to read escape sequences in a file without decoding is
11075 to explicitly specify some coding system that doesn't use ISO-2022
11076 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument]. */);
11077 inhibit_iso_escape_detection = 0;
11079 DEFVAR_BOOL ("inhibit-null-byte-detection",
11080 inhibit_null_byte_detection,
11081 doc: /* If non-nil, Emacs ignores null bytes on code detection.
11082 By default, Emacs treats it as binary data, and does not attempt to
11083 decode it. The effect is as if you specified `no-conversion' for
11084 reading that text.
11086 Set this to non-nil when a regular text happens to include null bytes.
11087 Examples are Index nodes of Info files and null-byte delimited output
11088 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
11089 decode text as usual. */);
11090 inhibit_null_byte_detection = 0;
11092 DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11093 doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11094 Internal use only. Removed after the experimental optimizer gets stable. */);
11095 disable_ascii_optimization = 0;
11097 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11098 doc: /* Char table for translating self-inserting characters.
11099 This is applied to the result of input methods, not their input.
11100 See also `keyboard-translate-table'.
11102 Use of this variable for character code unification was rendered
11103 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11104 internal character representation. */);
11105 Vtranslation_table_for_input = Qnil;
11108 Lisp_Object args[coding_arg_max];
11109 Lisp_Object plist[16];
11110 int i;
11112 for (i = 0; i < coding_arg_max; i++)
11113 args[i] = Qnil;
11115 plist[0] = intern_c_string (":name");
11116 plist[1] = args[coding_arg_name] = Qno_conversion;
11117 plist[2] = intern_c_string (":mnemonic");
11118 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11119 plist[4] = intern_c_string (":coding-type");
11120 plist[5] = args[coding_arg_coding_type] = Qraw_text;
11121 plist[6] = intern_c_string (":ascii-compatible-p");
11122 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11123 plist[8] = intern_c_string (":default-char");
11124 plist[9] = args[coding_arg_default_char] = make_number (0);
11125 plist[10] = intern_c_string (":for-unibyte");
11126 plist[11] = args[coding_arg_for_unibyte] = Qt;
11127 plist[12] = intern_c_string (":docstring");
11128 plist[13] = build_pure_c_string ("Do no conversion.\n\
11130 When you visit a file with this coding, the file is read into a\n\
11131 unibyte buffer as is, thus each byte of a file is treated as a\n\
11132 character.");
11133 plist[14] = intern_c_string (":eol-type");
11134 plist[15] = args[coding_arg_eol_type] = Qunix;
11135 args[coding_arg_plist] = Flist (16, plist);
11136 Fdefine_coding_system_internal (coding_arg_max, args);
11138 plist[1] = args[coding_arg_name] = Qundecided;
11139 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11140 plist[5] = args[coding_arg_coding_type] = Qundecided;
11141 /* This is already set.
11142 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11143 plist[8] = intern_c_string (":charset-list");
11144 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11145 plist[11] = args[coding_arg_for_unibyte] = Qnil;
11146 plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11147 plist[15] = args[coding_arg_eol_type] = Qnil;
11148 args[coding_arg_plist] = Flist (16, plist);
11149 Fdefine_coding_system_internal (coding_arg_max, args);
11152 setup_coding_system (Qno_conversion, &safe_terminal_coding);
11155 int i;
11157 for (i = 0; i < coding_category_max; i++)
11158 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11160 #if defined (DOS_NT)
11161 system_eol_type = Qdos;
11162 #else
11163 system_eol_type = Qunix;
11164 #endif
11165 staticpro (&system_eol_type);
11168 char *
11169 emacs_strerror (int error_number)
11171 char *str;
11173 synchronize_system_messages_locale ();
11174 str = strerror (error_number);
11176 if (! NILP (Vlocale_coding_system))
11178 Lisp_Object dec = code_convert_string_norecord (build_string (str),
11179 Vlocale_coding_system,
11181 str = SSDATA (dec);
11184 return str;
11187 #endif /* emacs */