Replace obsolete generic-make-keywords calls
[emacs.git] / src / coding.c
blob497c26d48560512f724199b9fcaf3eeb6d837b53
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2013 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
11 This file is part of GNU Emacs.
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
26 /*** TABLE OF CONTENTS ***
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
44 /*** 0. General comments ***
47 CODING SYSTEM
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
67 o UTF-8
69 o UTF-16
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
94 o BIG5
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
102 o CCL
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
109 o Raw-text
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
115 o No-conversion
117 Like raw text, but don't do end-of-line conversion.
120 END-OF-LINE FORMAT
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
132 STRUCT CODING_SYSTEM
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
141 /* COMMON MACROS */
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
150 Return true if the byte sequence conforms to XXX.
152 Below is the template of these functions. */
154 #if 0
155 static bool
156 detect_coding_XXX (struct coding_system *coding,
157 struct coding_detection_info *detect_info)
159 const unsigned char *src = coding->source;
160 const unsigned char *src_end = coding->source + coding->src_bytes;
161 bool multibytep = coding->src_multibyte;
162 ptrdiff_t consumed_chars = 0;
163 int found = 0;
164 ...;
166 while (1)
168 /* Get one byte from the source. If the source is exhausted, jump
169 to no_more_source:. */
170 ONE_MORE_BYTE (c);
172 if (! __C_conforms_to_XXX___ (c))
173 break;
174 if (! __C_strongly_suggests_XXX__ (c))
175 found = CATEGORY_MASK_XXX;
177 /* The byte sequence is invalid for XXX. */
178 detect_info->rejected |= CATEGORY_MASK_XXX;
179 return 0;
181 no_more_source:
182 /* The source exhausted successfully. */
183 detect_info->found |= found;
184 return 1;
186 #endif
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
200 Below is the template of these functions. */
202 #if 0
203 static void
204 decode_coding_XXXX (struct coding_system *coding)
206 const unsigned char *src = coding->source + coding->consumed;
207 const unsigned char *src_end = coding->source + coding->src_bytes;
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
211 const unsigned char *src_base;
212 /* A buffer to produce decoded characters. */
213 int *charbuf = coding->charbuf + coding->charbuf_used;
214 int *charbuf_end = coding->charbuf + coding->charbuf_size;
215 bool multibytep = coding->src_multibyte;
217 while (1)
219 src_base = src;
220 if (charbuf < charbuf_end)
221 /* No more room to produce a decoded character. */
222 break;
223 ONE_MORE_BYTE (c);
224 /* Decode it. */
227 no_more_source:
228 if (src_base < src_end
229 && coding->mode & CODING_MODE_LAST_BLOCK)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base < src_end && charbuf < charbuf_end)
233 *charbuf++ = *src_base++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding->consumed = coding->consumed_char = src_base - coding->source;
237 /* Remember how many characters we produced. */
238 coding->charbuf_used = charbuf - coding->charbuf;
240 #endif
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce a encoded text until it
256 reaches at the head of not-yet-encoded source text.
258 Below is a template of these functions. */
259 #if 0
260 static void
261 encode_coding_XXX (struct coding_system *coding)
263 bool multibytep = coding->dst_multibyte;
264 int *charbuf = coding->charbuf;
265 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
266 unsigned char *dst = coding->destination + coding->produced;
267 unsigned char *dst_end = coding->destination + coding->dst_bytes;
268 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
269 ptrdiff_t produced_chars = 0;
271 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
273 int c = *charbuf;
274 /* Encode C into DST, and increment DST. */
276 label_no_more_destination:
277 /* How many chars and bytes we produced. */
278 coding->produced_char += produced_chars;
279 coding->produced = dst - coding->destination;
281 #endif
284 /*** 1. Preamble ***/
286 #include <config.h>
287 #include <stdio.h>
289 #ifdef HAVE_WCHAR_H
290 #include <wchar.h>
291 #endif /* HAVE_WCHAR_H */
293 #include "lisp.h"
294 #include "character.h"
295 #include "buffer.h"
296 #include "charset.h"
297 #include "ccl.h"
298 #include "composite.h"
299 #include "coding.h"
300 #include "window.h"
301 #include "frame.h"
302 #include "termhooks.h"
304 Lisp_Object Vcoding_system_hash_table;
306 static Lisp_Object Qcoding_system, Qeol_type;
307 static Lisp_Object Qcoding_aliases;
308 Lisp_Object Qunix, Qdos;
309 static Lisp_Object Qmac;
310 Lisp_Object Qbuffer_file_coding_system;
311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
312 static Lisp_Object Qdefault_char;
313 Lisp_Object Qno_conversion, Qundecided;
314 Lisp_Object Qcharset, Qutf_8;
315 static Lisp_Object Qiso_2022;
316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
317 static Lisp_Object Qbig, Qlittle;
318 static Lisp_Object Qcoding_system_history;
319 static Lisp_Object Qvalid_codes;
320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
323 static Lisp_Object QCascii_compatible_p;
325 Lisp_Object Qcall_process, Qcall_process_region;
326 Lisp_Object Qstart_process, Qopen_network_stream;
327 static Lisp_Object Qtarget_idx;
329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
331 /* If a symbol has this property, evaluate the value to define the
332 symbol as a coding system. */
333 static Lisp_Object Qcoding_system_define_form;
335 /* Format of end-of-line decided by system. This is Qunix on
336 Unix and Mac, Qdos on DOS/Windows.
337 This has an effect only for external encoding (i.e. for output to
338 file and process), not for in-buffer or Lisp string encoding. */
339 static Lisp_Object system_eol_type;
341 #ifdef emacs
343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
345 /* Coding system emacs-mule and raw-text are for converting only
346 end-of-line format. */
347 Lisp_Object Qemacs_mule, Qraw_text;
348 Lisp_Object Qutf_8_emacs;
350 #if defined (WINDOWSNT) || defined (CYGWIN)
351 static Lisp_Object Qutf_16le;
352 #endif
354 /* Coding-systems are handed between Emacs Lisp programs and C internal
355 routines by the following three variables. */
356 /* Coding system to be used to encode text for terminal display when
357 terminal coding system is nil. */
358 struct coding_system safe_terminal_coding;
360 #endif /* emacs */
362 Lisp_Object Qtranslation_table;
363 Lisp_Object Qtranslation_table_id;
364 static Lisp_Object Qtranslation_table_for_decode;
365 static Lisp_Object Qtranslation_table_for_encode;
367 /* Two special coding systems. */
368 static Lisp_Object Vsjis_coding_system;
369 static Lisp_Object Vbig5_coding_system;
371 /* ISO2022 section */
373 #define CODING_ISO_INITIAL(coding, reg) \
374 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
375 coding_attr_iso_initial), \
376 reg)))
379 #define CODING_ISO_REQUEST(coding, charset_id) \
380 (((charset_id) <= (coding)->max_charset_id \
381 ? ((coding)->safe_charsets[charset_id] != 255 \
382 ? (coding)->safe_charsets[charset_id] \
383 : -1) \
384 : -1))
387 #define CODING_ISO_FLAGS(coding) \
388 ((coding)->spec.iso_2022.flags)
389 #define CODING_ISO_DESIGNATION(coding, reg) \
390 ((coding)->spec.iso_2022.current_designation[reg])
391 #define CODING_ISO_INVOCATION(coding, plane) \
392 ((coding)->spec.iso_2022.current_invocation[plane])
393 #define CODING_ISO_SINGLE_SHIFTING(coding) \
394 ((coding)->spec.iso_2022.single_shifting)
395 #define CODING_ISO_BOL(coding) \
396 ((coding)->spec.iso_2022.bol)
397 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
398 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
399 #define CODING_ISO_CMP_STATUS(coding) \
400 (&(coding)->spec.iso_2022.cmp_status)
401 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
402 ((coding)->spec.iso_2022.ctext_extended_segment_len)
403 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
404 ((coding)->spec.iso_2022.embedded_utf_8)
406 /* Control characters of ISO2022. */
407 /* code */ /* function */
408 #define ISO_CODE_SO 0x0E /* shift-out */
409 #define ISO_CODE_SI 0x0F /* shift-in */
410 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
411 #define ISO_CODE_ESC 0x1B /* escape */
412 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
413 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
414 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
416 /* All code (1-byte) of ISO2022 is classified into one of the
417 followings. */
418 enum iso_code_class_type
420 ISO_control_0, /* Control codes in the range
421 0x00..0x1F and 0x7F, except for the
422 following 5 codes. */
423 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
424 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
425 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
426 ISO_escape, /* ISO_CODE_ESC (0x1B) */
427 ISO_control_1, /* Control codes in the range
428 0x80..0x9F, except for the
429 following 3 codes. */
430 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
431 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
432 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
433 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
434 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
435 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
436 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
440 `iso-flags' attribute of an iso2022 coding system. */
442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
443 instead of the correct short-form sequence (e.g. ESC $ A). */
444 #define CODING_ISO_FLAG_LONG_FORM 0x0001
446 /* If set, reset graphic planes and registers at end-of-line to the
447 initial state. */
448 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
450 /* If set, reset graphic planes and registers before any control
451 characters to the initial state. */
452 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
454 /* If set, encode by 7-bit environment. */
455 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
457 /* If set, use locking-shift function. */
458 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
460 /* If set, use single-shift function. Overwrite
461 CODING_ISO_FLAG_LOCKING_SHIFT. */
462 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
464 /* If set, use designation escape sequence. */
465 #define CODING_ISO_FLAG_DESIGNATION 0x0040
467 /* If set, produce revision number sequence. */
468 #define CODING_ISO_FLAG_REVISION 0x0080
470 /* If set, produce ISO6429's direction specifying sequence. */
471 #define CODING_ISO_FLAG_DIRECTION 0x0100
473 /* If set, assume designation states are reset at beginning of line on
474 output. */
475 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
477 /* If set, designation sequence should be placed at beginning of line
478 on output. */
479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
481 /* If set, do not encode unsafe characters on output. */
482 #define CODING_ISO_FLAG_SAFE 0x0800
484 /* If set, extra latin codes (128..159) are accepted as a valid code
485 on input. */
486 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
488 #define CODING_ISO_FLAG_COMPOSITION 0x2000
490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
492 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
494 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
496 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
498 /* A character to be produced on output if encoding of the original
499 character is prohibited by CODING_ISO_FLAG_SAFE. */
500 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
502 /* UTF-8 section */
503 #define CODING_UTF_8_BOM(coding) \
504 ((coding)->spec.utf_8_bom)
506 /* UTF-16 section */
507 #define CODING_UTF_16_BOM(coding) \
508 ((coding)->spec.utf_16.bom)
510 #define CODING_UTF_16_ENDIAN(coding) \
511 ((coding)->spec.utf_16.endian)
513 #define CODING_UTF_16_SURROGATE(coding) \
514 ((coding)->spec.utf_16.surrogate)
517 /* CCL section */
518 #define CODING_CCL_DECODER(coding) \
519 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
520 #define CODING_CCL_ENCODER(coding) \
521 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
522 #define CODING_CCL_VALIDS(coding) \
523 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
525 /* Index for each coding category in `coding_categories' */
527 enum coding_category
529 coding_category_iso_7,
530 coding_category_iso_7_tight,
531 coding_category_iso_8_1,
532 coding_category_iso_8_2,
533 coding_category_iso_7_else,
534 coding_category_iso_8_else,
535 coding_category_utf_8_auto,
536 coding_category_utf_8_nosig,
537 coding_category_utf_8_sig,
538 coding_category_utf_16_auto,
539 coding_category_utf_16_be,
540 coding_category_utf_16_le,
541 coding_category_utf_16_be_nosig,
542 coding_category_utf_16_le_nosig,
543 coding_category_charset,
544 coding_category_sjis,
545 coding_category_big5,
546 coding_category_ccl,
547 coding_category_emacs_mule,
548 /* All above are targets of code detection. */
549 coding_category_raw_text,
550 coding_category_undecided,
551 coding_category_max
554 /* Definitions of flag bits used in detect_coding_XXXX. */
555 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
556 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
557 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
558 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
559 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
560 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
561 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
562 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
563 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
564 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
565 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
566 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
567 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
568 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
569 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
570 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
571 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
572 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
573 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
574 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
576 /* This value is returned if detect_coding_mask () find nothing other
577 than ASCII characters. */
578 #define CATEGORY_MASK_ANY \
579 (CATEGORY_MASK_ISO_7 \
580 | CATEGORY_MASK_ISO_7_TIGHT \
581 | CATEGORY_MASK_ISO_8_1 \
582 | CATEGORY_MASK_ISO_8_2 \
583 | CATEGORY_MASK_ISO_7_ELSE \
584 | CATEGORY_MASK_ISO_8_ELSE \
585 | CATEGORY_MASK_UTF_8_AUTO \
586 | CATEGORY_MASK_UTF_8_NOSIG \
587 | CATEGORY_MASK_UTF_8_SIG \
588 | CATEGORY_MASK_UTF_16_AUTO \
589 | CATEGORY_MASK_UTF_16_BE \
590 | CATEGORY_MASK_UTF_16_LE \
591 | CATEGORY_MASK_UTF_16_BE_NOSIG \
592 | CATEGORY_MASK_UTF_16_LE_NOSIG \
593 | CATEGORY_MASK_CHARSET \
594 | CATEGORY_MASK_SJIS \
595 | CATEGORY_MASK_BIG5 \
596 | CATEGORY_MASK_CCL \
597 | CATEGORY_MASK_EMACS_MULE)
600 #define CATEGORY_MASK_ISO_7BIT \
601 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
603 #define CATEGORY_MASK_ISO_8BIT \
604 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
606 #define CATEGORY_MASK_ISO_ELSE \
607 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
609 #define CATEGORY_MASK_ISO_ESCAPE \
610 (CATEGORY_MASK_ISO_7 \
611 | CATEGORY_MASK_ISO_7_TIGHT \
612 | CATEGORY_MASK_ISO_7_ELSE \
613 | CATEGORY_MASK_ISO_8_ELSE)
615 #define CATEGORY_MASK_ISO \
616 ( CATEGORY_MASK_ISO_7BIT \
617 | CATEGORY_MASK_ISO_8BIT \
618 | CATEGORY_MASK_ISO_ELSE)
620 #define CATEGORY_MASK_UTF_16 \
621 (CATEGORY_MASK_UTF_16_AUTO \
622 | CATEGORY_MASK_UTF_16_BE \
623 | CATEGORY_MASK_UTF_16_LE \
624 | CATEGORY_MASK_UTF_16_BE_NOSIG \
625 | CATEGORY_MASK_UTF_16_LE_NOSIG)
627 #define CATEGORY_MASK_UTF_8 \
628 (CATEGORY_MASK_UTF_8_AUTO \
629 | CATEGORY_MASK_UTF_8_NOSIG \
630 | CATEGORY_MASK_UTF_8_SIG)
632 /* Table of coding categories (Lisp symbols). This variable is for
633 internal use only. */
634 static Lisp_Object Vcoding_category_table;
636 /* Table of coding-categories ordered by priority. */
637 static enum coding_category coding_priorities[coding_category_max];
639 /* Nth element is a coding context for the coding system bound to the
640 Nth coding category. */
641 static struct coding_system coding_categories[coding_category_max];
643 /*** Commonly used macros and functions ***/
645 #ifndef min
646 #define min(a, b) ((a) < (b) ? (a) : (b))
647 #endif
648 #ifndef max
649 #define max(a, b) ((a) > (b) ? (a) : (b))
650 #endif
652 #define CODING_GET_INFO(coding, attrs, charset_list) \
653 do { \
654 (attrs) = CODING_ID_ATTRS ((coding)->id); \
655 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
656 } while (0)
658 static void
659 CHECK_NATNUM_CAR (Lisp_Object x)
661 Lisp_Object tmp = XCAR (x);
662 CHECK_NATNUM (tmp);
663 XSETCAR (x, tmp);
666 static void
667 CHECK_NATNUM_CDR (Lisp_Object x)
669 Lisp_Object tmp = XCDR (x);
670 CHECK_NATNUM (tmp);
671 XSETCDR (x, tmp);
675 /* Safely get one byte from the source text pointed by SRC which ends
676 at SRC_END, and set C to that byte. If there are not enough bytes
677 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
678 and a multibyte character is found at SRC, set C to the
679 negative value of the character code. The caller should declare
680 and set these variables appropriately in advance:
681 src, src_end, multibytep */
683 #define ONE_MORE_BYTE(c) \
684 do { \
685 if (src == src_end) \
687 if (src_base < src) \
688 record_conversion_result \
689 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
690 goto no_more_source; \
692 c = *src++; \
693 if (multibytep && (c & 0x80)) \
695 if ((c & 0xFE) == 0xC0) \
696 c = ((c & 1) << 6) | *src++; \
697 else \
699 src--; \
700 c = - string_char (src, &src, NULL); \
701 record_conversion_result \
702 (coding, CODING_RESULT_INVALID_SRC); \
705 consumed_chars++; \
706 } while (0)
708 /* Safely get two bytes from the source text pointed by SRC which ends
709 at SRC_END, and set C1 and C2 to those bytes while skipping the
710 heading multibyte characters. If there are not enough bytes in the
711 source, it jumps to 'no_more_source'. If MULTIBYTEP and
712 a multibyte character is found for C2, set C2 to the negative value
713 of the character code. The caller should declare and set these
714 variables appropriately in advance:
715 src, src_end, multibytep
716 It is intended that this macro is used in detect_coding_utf_16. */
718 #define TWO_MORE_BYTES(c1, c2) \
719 do { \
720 do { \
721 if (src == src_end) \
722 goto no_more_source; \
723 c1 = *src++; \
724 if (multibytep && (c1 & 0x80)) \
726 if ((c1 & 0xFE) == 0xC0) \
727 c1 = ((c1 & 1) << 6) | *src++; \
728 else \
730 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
731 c1 = -1; \
734 } while (c1 < 0); \
735 if (src == src_end) \
736 goto no_more_source; \
737 c2 = *src++; \
738 if (multibytep && (c2 & 0x80)) \
740 if ((c2 & 0xFE) == 0xC0) \
741 c2 = ((c2 & 1) << 6) | *src++; \
742 else \
743 c2 = -1; \
745 } while (0)
748 /* Store a byte C in the place pointed by DST and increment DST to the
749 next free point, and increment PRODUCED_CHARS. The caller should
750 assure that C is 0..127, and declare and set the variable `dst'
751 appropriately in advance.
755 #define EMIT_ONE_ASCII_BYTE(c) \
756 do { \
757 produced_chars++; \
758 *dst++ = (c); \
759 } while (0)
762 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
764 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
765 do { \
766 produced_chars += 2; \
767 *dst++ = (c1), *dst++ = (c2); \
768 } while (0)
771 /* Store a byte C in the place pointed by DST and increment DST to the
772 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
773 store in an appropriate multibyte form. The caller should
774 declare and set the variables `dst' and `multibytep' appropriately
775 in advance. */
777 #define EMIT_ONE_BYTE(c) \
778 do { \
779 produced_chars++; \
780 if (multibytep) \
782 unsigned ch = (c); \
783 if (ch >= 0x80) \
784 ch = BYTE8_TO_CHAR (ch); \
785 CHAR_STRING_ADVANCE (ch, dst); \
787 else \
788 *dst++ = (c); \
789 } while (0)
792 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
794 #define EMIT_TWO_BYTES(c1, c2) \
795 do { \
796 produced_chars += 2; \
797 if (multibytep) \
799 unsigned ch; \
801 ch = (c1); \
802 if (ch >= 0x80) \
803 ch = BYTE8_TO_CHAR (ch); \
804 CHAR_STRING_ADVANCE (ch, dst); \
805 ch = (c2); \
806 if (ch >= 0x80) \
807 ch = BYTE8_TO_CHAR (ch); \
808 CHAR_STRING_ADVANCE (ch, dst); \
810 else \
812 *dst++ = (c1); \
813 *dst++ = (c2); \
815 } while (0)
818 #define EMIT_THREE_BYTES(c1, c2, c3) \
819 do { \
820 EMIT_ONE_BYTE (c1); \
821 EMIT_TWO_BYTES (c2, c3); \
822 } while (0)
825 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
826 do { \
827 EMIT_TWO_BYTES (c1, c2); \
828 EMIT_TWO_BYTES (c3, c4); \
829 } while (0)
832 static void
833 record_conversion_result (struct coding_system *coding,
834 enum coding_result_code result)
836 coding->result = result;
837 switch (result)
839 case CODING_RESULT_INSUFFICIENT_SRC:
840 Vlast_code_conversion_error = Qinsufficient_source;
841 break;
842 case CODING_RESULT_INVALID_SRC:
843 Vlast_code_conversion_error = Qinvalid_source;
844 break;
845 case CODING_RESULT_INTERRUPT:
846 Vlast_code_conversion_error = Qinterrupted;
847 break;
848 case CODING_RESULT_INSUFFICIENT_DST:
849 /* Don't record this error in Vlast_code_conversion_error
850 because it happens just temporarily and is resolved when the
851 whole conversion is finished. */
852 break;
853 case CODING_RESULT_SUCCESS:
854 break;
855 default:
856 Vlast_code_conversion_error = intern ("Unknown error");
860 /* These wrapper macros are used to preserve validity of pointers into
861 buffer text across calls to decode_char, encode_char, etc, which
862 could cause relocation of buffers if it loads a charset map,
863 because loading a charset map allocates large structures. */
865 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
866 do { \
867 ptrdiff_t offset; \
869 charset_map_loaded = 0; \
870 c = DECODE_CHAR (charset, code); \
871 if (charset_map_loaded \
872 && (offset = coding_change_source (coding))) \
874 src += offset; \
875 src_base += offset; \
876 src_end += offset; \
878 } while (0)
880 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
881 do { \
882 ptrdiff_t offset; \
884 charset_map_loaded = 0; \
885 code = ENCODE_CHAR (charset, c); \
886 if (charset_map_loaded \
887 && (offset = coding_change_destination (coding))) \
889 dst += offset; \
890 dst_end += offset; \
892 } while (0)
894 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
895 do { \
896 ptrdiff_t offset; \
898 charset_map_loaded = 0; \
899 charset = char_charset (c, charset_list, code_return); \
900 if (charset_map_loaded \
901 && (offset = coding_change_destination (coding))) \
903 dst += offset; \
904 dst_end += offset; \
906 } while (0)
908 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
909 do { \
910 ptrdiff_t offset; \
912 charset_map_loaded = 0; \
913 result = CHAR_CHARSET_P (c, charset); \
914 if (charset_map_loaded \
915 && (offset = coding_change_destination (coding))) \
917 dst += offset; \
918 dst_end += offset; \
920 } while (0)
923 /* If there are at least BYTES length of room at dst, allocate memory
924 for coding->destination and update dst and dst_end. We don't have
925 to take care of coding->source which will be relocated. It is
926 handled by calling coding_set_source in encode_coding. */
928 #define ASSURE_DESTINATION(bytes) \
929 do { \
930 if (dst + (bytes) >= dst_end) \
932 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
934 dst = alloc_destination (coding, more_bytes, dst); \
935 dst_end = coding->destination + coding->dst_bytes; \
937 } while (0)
940 /* Store multibyte form of the character C in P, and advance P to the
941 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
942 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
943 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
945 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
947 /* Return the character code of character whose multibyte form is at
948 P, and advance P to the end of the multibyte form. This used to be
949 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
950 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
952 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
954 /* Set coding->source from coding->src_object. */
956 static void
957 coding_set_source (struct coding_system *coding)
959 if (BUFFERP (coding->src_object))
961 struct buffer *buf = XBUFFER (coding->src_object);
963 if (coding->src_pos < 0)
964 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
965 else
966 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
968 else if (STRINGP (coding->src_object))
970 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
972 else
974 /* Otherwise, the source is C string and is never relocated
975 automatically. Thus we don't have to update anything. */
980 /* Set coding->source from coding->src_object, and return how many
981 bytes coding->source was changed. */
983 static ptrdiff_t
984 coding_change_source (struct coding_system *coding)
986 const unsigned char *orig = coding->source;
987 coding_set_source (coding);
988 return coding->source - orig;
992 /* Set coding->destination from coding->dst_object. */
994 static void
995 coding_set_destination (struct coding_system *coding)
997 if (BUFFERP (coding->dst_object))
999 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1001 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1002 coding->dst_bytes = (GAP_END_ADDR
1003 - (coding->src_bytes - coding->consumed)
1004 - coding->destination);
1006 else
1008 /* We are sure that coding->dst_pos_byte is before the gap
1009 of the buffer. */
1010 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1011 + coding->dst_pos_byte - BEG_BYTE);
1012 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1013 - coding->destination);
1016 else
1018 /* Otherwise, the destination is C string and is never relocated
1019 automatically. Thus we don't have to update anything. */
1024 /* Set coding->destination from coding->dst_object, and return how
1025 many bytes coding->destination was changed. */
1027 static ptrdiff_t
1028 coding_change_destination (struct coding_system *coding)
1030 const unsigned char *orig = coding->destination;
1031 coding_set_destination (coding);
1032 return coding->destination - orig;
1036 static void
1037 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1039 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1040 string_overflow ();
1041 coding->destination = xrealloc (coding->destination,
1042 coding->dst_bytes + bytes);
1043 coding->dst_bytes += bytes;
1046 static void
1047 coding_alloc_by_making_gap (struct coding_system *coding,
1048 ptrdiff_t gap_head_used, ptrdiff_t bytes)
1050 if (EQ (coding->src_object, coding->dst_object))
1052 /* The gap may contain the produced data at the head and not-yet
1053 consumed data at the tail. To preserve those data, we at
1054 first make the gap size to zero, then increase the gap
1055 size. */
1056 ptrdiff_t add = GAP_SIZE;
1058 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1059 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1060 make_gap (bytes);
1061 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1062 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1064 else
1065 make_gap_1 (XBUFFER (coding->dst_object), bytes);
1069 static unsigned char *
1070 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1071 unsigned char *dst)
1073 ptrdiff_t offset = dst - coding->destination;
1075 if (BUFFERP (coding->dst_object))
1077 struct buffer *buf = XBUFFER (coding->dst_object);
1079 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1081 else
1082 coding_alloc_by_realloc (coding, nbytes);
1083 coding_set_destination (coding);
1084 dst = coding->destination + offset;
1085 return dst;
1088 /** Macros for annotations. */
1090 /* An annotation data is stored in the array coding->charbuf in this
1091 format:
1092 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1093 LENGTH is the number of elements in the annotation.
1094 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1095 NCHARS is the number of characters in the text annotated.
1097 The format of the following elements depend on ANNOTATION_MASK.
1099 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1100 follows:
1101 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1103 NBYTES is the number of bytes specified in the header part of
1104 old-style emacs-mule encoding, or 0 for the other kind of
1105 composition.
1107 METHOD is one of enum composition_method.
1109 Optional COMPOSITION-COMPONENTS are characters and composition
1110 rules.
1112 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1113 follows.
1115 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1116 recover from an invalid annotation, and should be skipped by
1117 produce_annotation. */
1119 /* Maximum length of the header of annotation data. */
1120 #define MAX_ANNOTATION_LENGTH 5
1122 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1123 do { \
1124 *(buf)++ = -(len); \
1125 *(buf)++ = (mask); \
1126 *(buf)++ = (nchars); \
1127 coding->annotated = 1; \
1128 } while (0);
1130 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1131 do { \
1132 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1133 *buf++ = nbytes; \
1134 *buf++ = method; \
1135 } while (0)
1138 #define ADD_CHARSET_DATA(buf, nchars, id) \
1139 do { \
1140 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1141 *buf++ = id; \
1142 } while (0)
1145 /* Bitmasks for coding->eol_seen. */
1147 #define EOL_SEEN_NONE 0
1148 #define EOL_SEEN_LF 1
1149 #define EOL_SEEN_CR 2
1150 #define EOL_SEEN_CRLF 4
1153 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1158 /*** 3. UTF-8 ***/
1160 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1161 Return true if a text is encoded in UTF-8. */
1163 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1164 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1165 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1166 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1167 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1168 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1170 #define UTF_8_BOM_1 0xEF
1171 #define UTF_8_BOM_2 0xBB
1172 #define UTF_8_BOM_3 0xBF
1174 /* Unlike the other detect_coding_XXX, this function counts number of
1175 characters and check EOL format. */
1177 static bool
1178 detect_coding_utf_8 (struct coding_system *coding,
1179 struct coding_detection_info *detect_info)
1181 const unsigned char *src = coding->source, *src_base;
1182 const unsigned char *src_end = coding->source + coding->src_bytes;
1183 bool multibytep = coding->src_multibyte;
1184 ptrdiff_t consumed_chars = 0;
1185 bool bom_found = 0;
1186 int nchars = coding->head_ascii;
1187 int eol_seen = coding->eol_seen;
1189 detect_info->checked |= CATEGORY_MASK_UTF_8;
1190 /* A coding system of this category is always ASCII compatible. */
1191 src += nchars;
1193 if (src == coding->source /* BOM should be at the head. */
1194 && src + 3 < src_end /* BOM is 3-byte long. */
1195 && src[0] == UTF_8_BOM_1
1196 && src[1] == UTF_8_BOM_2
1197 && src[2] == UTF_8_BOM_3)
1199 bom_found = 1;
1200 src += 3;
1201 nchars++;
1204 while (1)
1206 int c, c1, c2, c3, c4;
1208 src_base = src;
1209 ONE_MORE_BYTE (c);
1210 if (c < 0 || UTF_8_1_OCTET_P (c))
1212 nchars++;
1213 if (c == '\r')
1215 if (src < src_end && *src == '\n')
1217 eol_seen |= EOL_SEEN_CRLF;
1218 src++;
1219 nchars++;
1221 else
1222 eol_seen |= EOL_SEEN_CR;
1224 else if (c == '\n')
1225 eol_seen |= EOL_SEEN_LF;
1226 continue;
1228 ONE_MORE_BYTE (c1);
1229 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1230 break;
1231 if (UTF_8_2_OCTET_LEADING_P (c))
1233 nchars++;
1234 continue;
1236 ONE_MORE_BYTE (c2);
1237 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1238 break;
1239 if (UTF_8_3_OCTET_LEADING_P (c))
1241 nchars++;
1242 continue;
1244 ONE_MORE_BYTE (c3);
1245 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1246 break;
1247 if (UTF_8_4_OCTET_LEADING_P (c))
1249 nchars++;
1250 continue;
1252 ONE_MORE_BYTE (c4);
1253 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1254 break;
1255 if (UTF_8_5_OCTET_LEADING_P (c))
1257 nchars++;
1258 continue;
1260 break;
1262 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1263 return 0;
1265 no_more_source:
1266 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1268 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1269 return 0;
1271 if (bom_found)
1273 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1274 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1276 else
1278 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1279 if (nchars < src_end - coding->source)
1280 /* The found characters are less than source bytes, which
1281 means that we found a valid non-ASCII characters. */
1282 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1284 coding->detected_utf8_chars = nchars;
1285 return 1;
1289 static void
1290 decode_coding_utf_8 (struct coding_system *coding)
1292 const unsigned char *src = coding->source + coding->consumed;
1293 const unsigned char *src_end = coding->source + coding->src_bytes;
1294 const unsigned char *src_base;
1295 int *charbuf = coding->charbuf + coding->charbuf_used;
1296 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1297 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1298 bool multibytep = coding->src_multibyte;
1299 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1300 bool eol_dos
1301 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1302 int byte_after_cr = -1;
1304 if (bom != utf_without_bom)
1306 int c1, c2, c3;
1308 src_base = src;
1309 ONE_MORE_BYTE (c1);
1310 if (! UTF_8_3_OCTET_LEADING_P (c1))
1311 src = src_base;
1312 else
1314 ONE_MORE_BYTE (c2);
1315 if (! UTF_8_EXTRA_OCTET_P (c2))
1316 src = src_base;
1317 else
1319 ONE_MORE_BYTE (c3);
1320 if (! UTF_8_EXTRA_OCTET_P (c3))
1321 src = src_base;
1322 else
1324 if ((c1 != UTF_8_BOM_1)
1325 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1326 src = src_base;
1327 else
1328 CODING_UTF_8_BOM (coding) = utf_without_bom;
1333 CODING_UTF_8_BOM (coding) = utf_without_bom;
1335 while (1)
1337 int c, c1, c2, c3, c4, c5;
1339 src_base = src;
1340 consumed_chars_base = consumed_chars;
1342 if (charbuf >= charbuf_end)
1344 if (byte_after_cr >= 0)
1345 src_base--;
1346 break;
1349 if (byte_after_cr >= 0)
1350 c1 = byte_after_cr, byte_after_cr = -1;
1351 else
1352 ONE_MORE_BYTE (c1);
1353 if (c1 < 0)
1355 c = - c1;
1357 else if (UTF_8_1_OCTET_P (c1))
1359 if (eol_dos && c1 == '\r')
1360 ONE_MORE_BYTE (byte_after_cr);
1361 c = c1;
1363 else
1365 ONE_MORE_BYTE (c2);
1366 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1367 goto invalid_code;
1368 if (UTF_8_2_OCTET_LEADING_P (c1))
1370 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1371 /* Reject overlong sequences here and below. Encoders
1372 producing them are incorrect, they can be misleading,
1373 and they mess up read/write invariance. */
1374 if (c < 128)
1375 goto invalid_code;
1377 else
1379 ONE_MORE_BYTE (c3);
1380 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1381 goto invalid_code;
1382 if (UTF_8_3_OCTET_LEADING_P (c1))
1384 c = (((c1 & 0xF) << 12)
1385 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1386 if (c < 0x800
1387 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1388 goto invalid_code;
1390 else
1392 ONE_MORE_BYTE (c4);
1393 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1394 goto invalid_code;
1395 if (UTF_8_4_OCTET_LEADING_P (c1))
1397 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1398 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1399 if (c < 0x10000)
1400 goto invalid_code;
1402 else
1404 ONE_MORE_BYTE (c5);
1405 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1406 goto invalid_code;
1407 if (UTF_8_5_OCTET_LEADING_P (c1))
1409 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1410 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1411 | (c5 & 0x3F));
1412 if ((c > MAX_CHAR) || (c < 0x200000))
1413 goto invalid_code;
1415 else
1416 goto invalid_code;
1422 *charbuf++ = c;
1423 continue;
1425 invalid_code:
1426 src = src_base;
1427 consumed_chars = consumed_chars_base;
1428 ONE_MORE_BYTE (c);
1429 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1430 coding->errors++;
1433 no_more_source:
1434 coding->consumed_char += consumed_chars_base;
1435 coding->consumed = src_base - coding->source;
1436 coding->charbuf_used = charbuf - coding->charbuf;
1440 static bool
1441 encode_coding_utf_8 (struct coding_system *coding)
1443 bool multibytep = coding->dst_multibyte;
1444 int *charbuf = coding->charbuf;
1445 int *charbuf_end = charbuf + coding->charbuf_used;
1446 unsigned char *dst = coding->destination + coding->produced;
1447 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1448 ptrdiff_t produced_chars = 0;
1449 int c;
1451 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1453 ASSURE_DESTINATION (3);
1454 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1455 CODING_UTF_8_BOM (coding) = utf_without_bom;
1458 if (multibytep)
1460 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1462 while (charbuf < charbuf_end)
1464 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1466 ASSURE_DESTINATION (safe_room);
1467 c = *charbuf++;
1468 if (CHAR_BYTE8_P (c))
1470 c = CHAR_TO_BYTE8 (c);
1471 EMIT_ONE_BYTE (c);
1473 else
1475 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1476 for (p = str; p < pend; p++)
1477 EMIT_ONE_BYTE (*p);
1481 else
1483 int safe_room = MAX_MULTIBYTE_LENGTH;
1485 while (charbuf < charbuf_end)
1487 ASSURE_DESTINATION (safe_room);
1488 c = *charbuf++;
1489 if (CHAR_BYTE8_P (c))
1490 *dst++ = CHAR_TO_BYTE8 (c);
1491 else
1492 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1493 produced_chars++;
1496 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1497 coding->produced_char += produced_chars;
1498 coding->produced = dst - coding->destination;
1499 return 0;
1503 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1504 Return true if a text is encoded in one of UTF-16 based coding systems. */
1506 #define UTF_16_HIGH_SURROGATE_P(val) \
1507 (((val) & 0xFC00) == 0xD800)
1509 #define UTF_16_LOW_SURROGATE_P(val) \
1510 (((val) & 0xFC00) == 0xDC00)
1513 static bool
1514 detect_coding_utf_16 (struct coding_system *coding,
1515 struct coding_detection_info *detect_info)
1517 const unsigned char *src = coding->source;
1518 const unsigned char *src_end = coding->source + coding->src_bytes;
1519 bool multibytep = coding->src_multibyte;
1520 int c1, c2;
1522 detect_info->checked |= CATEGORY_MASK_UTF_16;
1523 if (coding->mode & CODING_MODE_LAST_BLOCK
1524 && (coding->src_chars & 1))
1526 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1527 return 0;
1530 TWO_MORE_BYTES (c1, c2);
1531 if ((c1 == 0xFF) && (c2 == 0xFE))
1533 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1534 | CATEGORY_MASK_UTF_16_AUTO);
1535 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1536 | CATEGORY_MASK_UTF_16_BE_NOSIG
1537 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1539 else if ((c1 == 0xFE) && (c2 == 0xFF))
1541 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1542 | CATEGORY_MASK_UTF_16_AUTO);
1543 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1544 | CATEGORY_MASK_UTF_16_BE_NOSIG
1545 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1547 else if (c2 < 0)
1549 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1550 return 0;
1552 else
1554 /* We check the dispersion of Eth and Oth bytes where E is even and
1555 O is odd. If both are high, we assume binary data.*/
1556 unsigned char e[256], o[256];
1557 unsigned e_num = 1, o_num = 1;
1559 memset (e, 0, 256);
1560 memset (o, 0, 256);
1561 e[c1] = 1;
1562 o[c2] = 1;
1564 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1565 |CATEGORY_MASK_UTF_16_BE
1566 | CATEGORY_MASK_UTF_16_LE);
1568 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1569 != CATEGORY_MASK_UTF_16)
1571 TWO_MORE_BYTES (c1, c2);
1572 if (c2 < 0)
1573 break;
1574 if (! e[c1])
1576 e[c1] = 1;
1577 e_num++;
1578 if (e_num >= 128)
1579 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1581 if (! o[c2])
1583 o[c2] = 1;
1584 o_num++;
1585 if (o_num >= 128)
1586 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1589 return 0;
1592 no_more_source:
1593 return 1;
1596 static void
1597 decode_coding_utf_16 (struct coding_system *coding)
1599 const unsigned char *src = coding->source + coding->consumed;
1600 const unsigned char *src_end = coding->source + coding->src_bytes;
1601 const unsigned char *src_base;
1602 int *charbuf = coding->charbuf + coding->charbuf_used;
1603 /* We may produces at most 3 chars in one loop. */
1604 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1605 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1606 bool multibytep = coding->src_multibyte;
1607 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1608 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1609 int surrogate = CODING_UTF_16_SURROGATE (coding);
1610 bool eol_dos
1611 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1612 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1614 if (bom == utf_with_bom)
1616 int c, c1, c2;
1618 src_base = src;
1619 ONE_MORE_BYTE (c1);
1620 ONE_MORE_BYTE (c2);
1621 c = (c1 << 8) | c2;
1623 if (endian == utf_16_big_endian
1624 ? c != 0xFEFF : c != 0xFFFE)
1626 /* The first two bytes are not BOM. Treat them as bytes
1627 for a normal character. */
1628 src = src_base;
1629 coding->errors++;
1631 CODING_UTF_16_BOM (coding) = utf_without_bom;
1633 else if (bom == utf_detect_bom)
1635 /* We have already tried to detect BOM and failed in
1636 detect_coding. */
1637 CODING_UTF_16_BOM (coding) = utf_without_bom;
1640 while (1)
1642 int c, c1, c2;
1644 src_base = src;
1645 consumed_chars_base = consumed_chars;
1647 if (charbuf >= charbuf_end)
1649 if (byte_after_cr1 >= 0)
1650 src_base -= 2;
1651 break;
1654 if (byte_after_cr1 >= 0)
1655 c1 = byte_after_cr1, byte_after_cr1 = -1;
1656 else
1657 ONE_MORE_BYTE (c1);
1658 if (c1 < 0)
1660 *charbuf++ = -c1;
1661 continue;
1663 if (byte_after_cr2 >= 0)
1664 c2 = byte_after_cr2, byte_after_cr2 = -1;
1665 else
1666 ONE_MORE_BYTE (c2);
1667 if (c2 < 0)
1669 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1670 *charbuf++ = -c2;
1671 continue;
1673 c = (endian == utf_16_big_endian
1674 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1676 if (surrogate)
1678 if (! UTF_16_LOW_SURROGATE_P (c))
1680 if (endian == utf_16_big_endian)
1681 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1682 else
1683 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1684 *charbuf++ = c1;
1685 *charbuf++ = c2;
1686 coding->errors++;
1687 if (UTF_16_HIGH_SURROGATE_P (c))
1688 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1689 else
1690 *charbuf++ = c;
1692 else
1694 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1695 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1696 *charbuf++ = 0x10000 + c;
1699 else
1701 if (UTF_16_HIGH_SURROGATE_P (c))
1702 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1703 else
1705 if (eol_dos && c == '\r')
1707 ONE_MORE_BYTE (byte_after_cr1);
1708 ONE_MORE_BYTE (byte_after_cr2);
1710 *charbuf++ = c;
1715 no_more_source:
1716 coding->consumed_char += consumed_chars_base;
1717 coding->consumed = src_base - coding->source;
1718 coding->charbuf_used = charbuf - coding->charbuf;
1721 static bool
1722 encode_coding_utf_16 (struct coding_system *coding)
1724 bool multibytep = coding->dst_multibyte;
1725 int *charbuf = coding->charbuf;
1726 int *charbuf_end = charbuf + coding->charbuf_used;
1727 unsigned char *dst = coding->destination + coding->produced;
1728 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1729 int safe_room = 8;
1730 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1731 bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1732 ptrdiff_t produced_chars = 0;
1733 int c;
1735 if (bom != utf_without_bom)
1737 ASSURE_DESTINATION (safe_room);
1738 if (big_endian)
1739 EMIT_TWO_BYTES (0xFE, 0xFF);
1740 else
1741 EMIT_TWO_BYTES (0xFF, 0xFE);
1742 CODING_UTF_16_BOM (coding) = utf_without_bom;
1745 while (charbuf < charbuf_end)
1747 ASSURE_DESTINATION (safe_room);
1748 c = *charbuf++;
1749 if (c > MAX_UNICODE_CHAR)
1750 c = coding->default_char;
1752 if (c < 0x10000)
1754 if (big_endian)
1755 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1756 else
1757 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1759 else
1761 int c1, c2;
1763 c -= 0x10000;
1764 c1 = (c >> 10) + 0xD800;
1765 c2 = (c & 0x3FF) + 0xDC00;
1766 if (big_endian)
1767 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1768 else
1769 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1772 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1773 coding->produced = dst - coding->destination;
1774 coding->produced_char += produced_chars;
1775 return 0;
1779 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1781 /* Emacs' internal format for representation of multiple character
1782 sets is a kind of multi-byte encoding, i.e. characters are
1783 represented by variable-length sequences of one-byte codes.
1785 ASCII characters and control characters (e.g. `tab', `newline') are
1786 represented by one-byte sequences which are their ASCII codes, in
1787 the range 0x00 through 0x7F.
1789 8-bit characters of the range 0x80..0x9F are represented by
1790 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1791 code + 0x20).
1793 8-bit characters of the range 0xA0..0xFF are represented by
1794 one-byte sequences which are their 8-bit code.
1796 The other characters are represented by a sequence of `base
1797 leading-code', optional `extended leading-code', and one or two
1798 `position-code's. The length of the sequence is determined by the
1799 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1800 whereas extended leading-code and position-code take the range 0xA0
1801 through 0xFF. See `charset.h' for more details about leading-code
1802 and position-code.
1804 --- CODE RANGE of Emacs' internal format ---
1805 character set range
1806 ------------- -----
1807 ascii 0x00..0x7F
1808 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1809 eight-bit-graphic 0xA0..0xBF
1810 ELSE 0x81..0x9D + [0xA0..0xFF]+
1811 ---------------------------------------------
1813 As this is the internal character representation, the format is
1814 usually not used externally (i.e. in a file or in a data sent to a
1815 process). But, it is possible to have a text externally in this
1816 format (i.e. by encoding by the coding system `emacs-mule').
1818 In that case, a sequence of one-byte codes has a slightly different
1819 form.
1821 At first, all characters in eight-bit-control are represented by
1822 one-byte sequences which are their 8-bit code.
1824 Next, character composition data are represented by the byte
1825 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1826 where,
1827 METHOD is 0xF2 plus one of composition method (enum
1828 composition_method),
1830 BYTES is 0xA0 plus a byte length of this composition data,
1832 CHARS is 0xA0 plus a number of characters composed by this
1833 data,
1835 COMPONENTs are characters of multibyte form or composition
1836 rules encoded by two-byte of ASCII codes.
1838 In addition, for backward compatibility, the following formats are
1839 also recognized as composition data on decoding.
1841 0x80 MSEQ ...
1842 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1844 Here,
1845 MSEQ is a multibyte form but in these special format:
1846 ASCII: 0xA0 ASCII_CODE+0x80,
1847 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1848 RULE is a one byte code of the range 0xA0..0xF0 that
1849 represents a composition rule.
1852 char emacs_mule_bytes[256];
1855 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1856 Return true if a text is encoded in 'emacs-mule'. */
1858 static bool
1859 detect_coding_emacs_mule (struct coding_system *coding,
1860 struct coding_detection_info *detect_info)
1862 const unsigned char *src = coding->source, *src_base;
1863 const unsigned char *src_end = coding->source + coding->src_bytes;
1864 bool multibytep = coding->src_multibyte;
1865 ptrdiff_t consumed_chars = 0;
1866 int c;
1867 int found = 0;
1869 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1870 /* A coding system of this category is always ASCII compatible. */
1871 src += coding->head_ascii;
1873 while (1)
1875 src_base = src;
1876 ONE_MORE_BYTE (c);
1877 if (c < 0)
1878 continue;
1879 if (c == 0x80)
1881 /* Perhaps the start of composite character. We simply skip
1882 it because analyzing it is too heavy for detecting. But,
1883 at least, we check that the composite character
1884 constitutes of more than 4 bytes. */
1885 const unsigned char *src_start;
1887 repeat:
1888 src_start = src;
1891 ONE_MORE_BYTE (c);
1893 while (c >= 0xA0);
1895 if (src - src_start <= 4)
1896 break;
1897 found = CATEGORY_MASK_EMACS_MULE;
1898 if (c == 0x80)
1899 goto repeat;
1902 if (c < 0x80)
1904 if (c < 0x20
1905 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1906 break;
1908 else
1910 int more_bytes = emacs_mule_bytes[c] - 1;
1912 while (more_bytes > 0)
1914 ONE_MORE_BYTE (c);
1915 if (c < 0xA0)
1917 src--; /* Unread the last byte. */
1918 break;
1920 more_bytes--;
1922 if (more_bytes != 0)
1923 break;
1924 found = CATEGORY_MASK_EMACS_MULE;
1927 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1928 return 0;
1930 no_more_source:
1931 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1933 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1934 return 0;
1936 detect_info->found |= found;
1937 return 1;
1941 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1942 character. If CMP_STATUS indicates that we must expect MSEQ or
1943 RULE described above, decode it and return the negative value of
1944 the decoded character or rule. If an invalid byte is found, return
1945 -1. If SRC is too short, return -2. */
1947 static int
1948 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1949 int *nbytes, int *nchars, int *id,
1950 struct composition_status *cmp_status)
1952 const unsigned char *src_end = coding->source + coding->src_bytes;
1953 const unsigned char *src_base = src;
1954 bool multibytep = coding->src_multibyte;
1955 int charset_ID;
1956 unsigned code;
1957 int c;
1958 int consumed_chars = 0;
1959 bool mseq_found = 0;
1961 ONE_MORE_BYTE (c);
1962 if (c < 0)
1964 c = -c;
1965 charset_ID = emacs_mule_charset[0];
1967 else
1969 if (c >= 0xA0)
1971 if (cmp_status->state != COMPOSING_NO
1972 && cmp_status->old_form)
1974 if (cmp_status->state == COMPOSING_CHAR)
1976 if (c == 0xA0)
1978 ONE_MORE_BYTE (c);
1979 c -= 0x80;
1980 if (c < 0)
1981 goto invalid_code;
1983 else
1984 c -= 0x20;
1985 mseq_found = 1;
1987 else
1989 *nbytes = src - src_base;
1990 *nchars = consumed_chars;
1991 return -c;
1994 else
1995 goto invalid_code;
1998 switch (emacs_mule_bytes[c])
2000 case 2:
2001 if ((charset_ID = emacs_mule_charset[c]) < 0)
2002 goto invalid_code;
2003 ONE_MORE_BYTE (c);
2004 if (c < 0xA0)
2005 goto invalid_code;
2006 code = c & 0x7F;
2007 break;
2009 case 3:
2010 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2011 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2013 ONE_MORE_BYTE (c);
2014 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2015 goto invalid_code;
2016 ONE_MORE_BYTE (c);
2017 if (c < 0xA0)
2018 goto invalid_code;
2019 code = c & 0x7F;
2021 else
2023 if ((charset_ID = emacs_mule_charset[c]) < 0)
2024 goto invalid_code;
2025 ONE_MORE_BYTE (c);
2026 if (c < 0xA0)
2027 goto invalid_code;
2028 code = (c & 0x7F) << 8;
2029 ONE_MORE_BYTE (c);
2030 if (c < 0xA0)
2031 goto invalid_code;
2032 code |= c & 0x7F;
2034 break;
2036 case 4:
2037 ONE_MORE_BYTE (c);
2038 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2039 goto invalid_code;
2040 ONE_MORE_BYTE (c);
2041 if (c < 0xA0)
2042 goto invalid_code;
2043 code = (c & 0x7F) << 8;
2044 ONE_MORE_BYTE (c);
2045 if (c < 0xA0)
2046 goto invalid_code;
2047 code |= c & 0x7F;
2048 break;
2050 case 1:
2051 code = c;
2052 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2053 break;
2055 default:
2056 emacs_abort ();
2058 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2059 CHARSET_FROM_ID (charset_ID), code, c);
2060 if (c < 0)
2061 goto invalid_code;
2063 *nbytes = src - src_base;
2064 *nchars = consumed_chars;
2065 if (id)
2066 *id = charset_ID;
2067 return (mseq_found ? -c : c);
2069 no_more_source:
2070 return -2;
2072 invalid_code:
2073 return -1;
2077 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2079 /* Handle these composition sequence ('|': the end of header elements,
2080 BYTES and CHARS >= 0xA0):
2082 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2083 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2084 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2086 and these old form:
2088 (4) relative composition: 0x80 | MSEQ ... MSEQ
2089 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2091 When the starter 0x80 and the following header elements are found,
2092 this annotation header is produced.
2094 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2096 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2097 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2099 Then, upon reading the following elements, these codes are produced
2100 until the composition end is found:
2102 (1) CHAR ... CHAR
2103 (2) ALT ... ALT CHAR ... CHAR
2104 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2105 (4) CHAR ... CHAR
2106 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2108 When the composition end is found, LENGTH and NCHARS in the
2109 annotation header is updated as below:
2111 (1) LENGTH: unchanged, NCHARS: unchanged
2112 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2113 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2114 (4) LENGTH: unchanged, NCHARS: number of CHARs
2115 (5) LENGTH: unchanged, NCHARS: number of CHARs
2117 If an error is found while composing, the annotation header is
2118 changed to the original composition header (plus filler -1s) as
2119 below:
2121 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2122 (5) [ 0x80 0xFF -1 -1- -1 ]
2124 and the sequence [ -2 DECODED-RULE ] is changed to the original
2125 byte sequence as below:
2126 o the original byte sequence is B: [ B -1 ]
2127 o the original byte sequence is B1 B2: [ B1 B2 ]
2129 Most of the routines are implemented by macros because many
2130 variables and labels in the caller decode_coding_emacs_mule must be
2131 accessible, and they are usually called just once (thus doesn't
2132 increase the size of compiled object). */
2134 /* Decode a composition rule represented by C as a component of
2135 composition sequence of Emacs 20 style. Set RULE to the decoded
2136 rule. */
2138 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2139 do { \
2140 int gref, nref; \
2142 c -= 0xA0; \
2143 if (c < 0 || c >= 81) \
2144 goto invalid_code; \
2145 gref = c / 9, nref = c % 9; \
2146 if (gref == 4) gref = 10; \
2147 if (nref == 4) nref = 10; \
2148 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2149 } while (0)
2152 /* Decode a composition rule represented by C and the following byte
2153 at SRC as a component of composition sequence of Emacs 21 style.
2154 Set RULE to the decoded rule. */
2156 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2157 do { \
2158 int gref, nref; \
2160 gref = c - 0x20; \
2161 if (gref < 0 || gref >= 81) \
2162 goto invalid_code; \
2163 ONE_MORE_BYTE (c); \
2164 nref = c - 0x20; \
2165 if (nref < 0 || nref >= 81) \
2166 goto invalid_code; \
2167 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2168 } while (0)
2171 /* Start of Emacs 21 style format. The first three bytes at SRC are
2172 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2173 byte length of this composition information, CHARS is the number of
2174 characters composed by this composition. */
2176 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2177 do { \
2178 enum composition_method method = c - 0xF2; \
2179 int nbytes, nchars; \
2181 ONE_MORE_BYTE (c); \
2182 if (c < 0) \
2183 goto invalid_code; \
2184 nbytes = c - 0xA0; \
2185 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2186 goto invalid_code; \
2187 ONE_MORE_BYTE (c); \
2188 nchars = c - 0xA0; \
2189 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2190 goto invalid_code; \
2191 cmp_status->old_form = 0; \
2192 cmp_status->method = method; \
2193 if (method == COMPOSITION_RELATIVE) \
2194 cmp_status->state = COMPOSING_CHAR; \
2195 else \
2196 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2197 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2198 cmp_status->nchars = nchars; \
2199 cmp_status->ncomps = nbytes - 4; \
2200 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2201 } while (0)
2204 /* Start of Emacs 20 style format for relative composition. */
2206 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2207 do { \
2208 cmp_status->old_form = 1; \
2209 cmp_status->method = COMPOSITION_RELATIVE; \
2210 cmp_status->state = COMPOSING_CHAR; \
2211 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2212 cmp_status->nchars = cmp_status->ncomps = 0; \
2213 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2214 } while (0)
2217 /* Start of Emacs 20 style format for rule-base composition. */
2219 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2220 do { \
2221 cmp_status->old_form = 1; \
2222 cmp_status->method = COMPOSITION_WITH_RULE; \
2223 cmp_status->state = COMPOSING_CHAR; \
2224 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2225 cmp_status->nchars = cmp_status->ncomps = 0; \
2226 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2227 } while (0)
2230 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2231 do { \
2232 const unsigned char *current_src = src; \
2234 ONE_MORE_BYTE (c); \
2235 if (c < 0) \
2236 goto invalid_code; \
2237 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2238 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2239 DECODE_EMACS_MULE_21_COMPOSITION (); \
2240 else if (c < 0xA0) \
2241 goto invalid_code; \
2242 else if (c < 0xC0) \
2244 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2245 /* Re-read C as a composition component. */ \
2246 src = current_src; \
2248 else if (c == 0xFF) \
2249 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2250 else \
2251 goto invalid_code; \
2252 } while (0)
2254 #define EMACS_MULE_COMPOSITION_END() \
2255 do { \
2256 int idx = - cmp_status->length; \
2258 if (cmp_status->old_form) \
2259 charbuf[idx + 2] = cmp_status->nchars; \
2260 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2261 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2262 cmp_status->state = COMPOSING_NO; \
2263 } while (0)
2266 static int
2267 emacs_mule_finish_composition (int *charbuf,
2268 struct composition_status *cmp_status)
2270 int idx = - cmp_status->length;
2271 int new_chars;
2273 if (cmp_status->old_form && cmp_status->nchars > 0)
2275 charbuf[idx + 2] = cmp_status->nchars;
2276 new_chars = 0;
2277 if (cmp_status->method == COMPOSITION_WITH_RULE
2278 && cmp_status->state == COMPOSING_CHAR)
2280 /* The last rule was invalid. */
2281 int rule = charbuf[-1] + 0xA0;
2283 charbuf[-2] = BYTE8_TO_CHAR (rule);
2284 charbuf[-1] = -1;
2285 new_chars = 1;
2288 else
2290 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2292 if (cmp_status->method == COMPOSITION_WITH_RULE)
2294 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2295 charbuf[idx++] = -3;
2296 charbuf[idx++] = 0;
2297 new_chars = 1;
2299 else
2301 int nchars = charbuf[idx + 1] + 0xA0;
2302 int nbytes = charbuf[idx + 2] + 0xA0;
2304 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2305 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2306 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2307 charbuf[idx++] = -1;
2308 new_chars = 4;
2311 cmp_status->state = COMPOSING_NO;
2312 return new_chars;
2315 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2316 do { \
2317 if (cmp_status->state != COMPOSING_NO) \
2318 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2319 } while (0)
2322 static void
2323 decode_coding_emacs_mule (struct coding_system *coding)
2325 const unsigned char *src = coding->source + coding->consumed;
2326 const unsigned char *src_end = coding->source + coding->src_bytes;
2327 const unsigned char *src_base;
2328 int *charbuf = coding->charbuf + coding->charbuf_used;
2329 /* We may produce two annotations (charset and composition) in one
2330 loop and one more charset annotation at the end. */
2331 int *charbuf_end
2332 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2333 /* We can produce up to 2 characters in a loop. */
2334 - 1;
2335 ptrdiff_t consumed_chars = 0, consumed_chars_base;
2336 bool multibytep = coding->src_multibyte;
2337 ptrdiff_t char_offset = coding->produced_char;
2338 ptrdiff_t last_offset = char_offset;
2339 int last_id = charset_ascii;
2340 bool eol_dos
2341 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2342 int byte_after_cr = -1;
2343 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2345 if (cmp_status->state != COMPOSING_NO)
2347 int i;
2349 if (charbuf_end - charbuf < cmp_status->length)
2350 emacs_abort ();
2351 for (i = 0; i < cmp_status->length; i++)
2352 *charbuf++ = cmp_status->carryover[i];
2353 coding->annotated = 1;
2356 while (1)
2358 int c, id IF_LINT (= 0);
2360 src_base = src;
2361 consumed_chars_base = consumed_chars;
2363 if (charbuf >= charbuf_end)
2365 if (byte_after_cr >= 0)
2366 src_base--;
2367 break;
2370 if (byte_after_cr >= 0)
2371 c = byte_after_cr, byte_after_cr = -1;
2372 else
2373 ONE_MORE_BYTE (c);
2375 if (c < 0 || c == 0x80)
2377 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2378 if (c < 0)
2380 *charbuf++ = -c;
2381 char_offset++;
2383 else
2384 DECODE_EMACS_MULE_COMPOSITION_START ();
2385 continue;
2388 if (c < 0x80)
2390 if (eol_dos && c == '\r')
2391 ONE_MORE_BYTE (byte_after_cr);
2392 id = charset_ascii;
2393 if (cmp_status->state != COMPOSING_NO)
2395 if (cmp_status->old_form)
2396 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2397 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2398 cmp_status->ncomps--;
2401 else
2403 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2404 /* emacs_mule_char can load a charset map from a file, which
2405 allocates a large structure and might cause buffer text
2406 to be relocated as result. Thus, we need to remember the
2407 original pointer to buffer text, and fix up all related
2408 pointers after the call. */
2409 const unsigned char *orig = coding->source;
2410 ptrdiff_t offset;
2412 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2413 cmp_status);
2414 offset = coding->source - orig;
2415 if (offset)
2417 src += offset;
2418 src_base += offset;
2419 src_end += offset;
2421 if (c < 0)
2423 if (c == -1)
2424 goto invalid_code;
2425 if (c == -2)
2426 break;
2428 src = src_base + nbytes;
2429 consumed_chars = consumed_chars_base + nchars;
2430 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2431 cmp_status->ncomps -= nchars;
2434 /* Now if C >= 0, we found a normally encoded character, if C <
2435 0, we found an old-style composition component character or
2436 rule. */
2438 if (cmp_status->state == COMPOSING_NO)
2440 if (last_id != id)
2442 if (last_id != charset_ascii)
2443 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2444 last_id);
2445 last_id = id;
2446 last_offset = char_offset;
2448 *charbuf++ = c;
2449 char_offset++;
2451 else if (cmp_status->state == COMPOSING_CHAR)
2453 if (cmp_status->old_form)
2455 if (c >= 0)
2457 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2458 *charbuf++ = c;
2459 char_offset++;
2461 else
2463 *charbuf++ = -c;
2464 cmp_status->nchars++;
2465 cmp_status->length++;
2466 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2467 EMACS_MULE_COMPOSITION_END ();
2468 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2469 cmp_status->state = COMPOSING_RULE;
2472 else
2474 *charbuf++ = c;
2475 cmp_status->length++;
2476 cmp_status->nchars--;
2477 if (cmp_status->nchars == 0)
2478 EMACS_MULE_COMPOSITION_END ();
2481 else if (cmp_status->state == COMPOSING_RULE)
2483 int rule;
2485 if (c >= 0)
2487 EMACS_MULE_COMPOSITION_END ();
2488 *charbuf++ = c;
2489 char_offset++;
2491 else
2493 c = -c;
2494 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2495 if (rule < 0)
2496 goto invalid_code;
2497 *charbuf++ = -2;
2498 *charbuf++ = rule;
2499 cmp_status->length += 2;
2500 cmp_status->state = COMPOSING_CHAR;
2503 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2505 *charbuf++ = c;
2506 cmp_status->length++;
2507 if (cmp_status->ncomps == 0)
2508 cmp_status->state = COMPOSING_CHAR;
2509 else if (cmp_status->ncomps > 0)
2511 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2512 cmp_status->state = COMPOSING_COMPONENT_RULE;
2514 else
2515 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2517 else /* COMPOSING_COMPONENT_RULE */
2519 int rule;
2521 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2522 if (rule < 0)
2523 goto invalid_code;
2524 *charbuf++ = -2;
2525 *charbuf++ = rule;
2526 cmp_status->length += 2;
2527 cmp_status->ncomps--;
2528 if (cmp_status->ncomps > 0)
2529 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2530 else
2531 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2533 continue;
2535 invalid_code:
2536 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2537 src = src_base;
2538 consumed_chars = consumed_chars_base;
2539 ONE_MORE_BYTE (c);
2540 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2541 char_offset++;
2542 coding->errors++;
2545 no_more_source:
2546 if (cmp_status->state != COMPOSING_NO)
2548 if (coding->mode & CODING_MODE_LAST_BLOCK)
2549 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2550 else
2552 int i;
2554 charbuf -= cmp_status->length;
2555 for (i = 0; i < cmp_status->length; i++)
2556 cmp_status->carryover[i] = charbuf[i];
2559 if (last_id != charset_ascii)
2560 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2561 coding->consumed_char += consumed_chars_base;
2562 coding->consumed = src_base - coding->source;
2563 coding->charbuf_used = charbuf - coding->charbuf;
2567 #define EMACS_MULE_LEADING_CODES(id, codes) \
2568 do { \
2569 if (id < 0xA0) \
2570 codes[0] = id, codes[1] = 0; \
2571 else if (id < 0xE0) \
2572 codes[0] = 0x9A, codes[1] = id; \
2573 else if (id < 0xF0) \
2574 codes[0] = 0x9B, codes[1] = id; \
2575 else if (id < 0xF5) \
2576 codes[0] = 0x9C, codes[1] = id; \
2577 else \
2578 codes[0] = 0x9D, codes[1] = id; \
2579 } while (0);
2582 static bool
2583 encode_coding_emacs_mule (struct coding_system *coding)
2585 bool multibytep = coding->dst_multibyte;
2586 int *charbuf = coding->charbuf;
2587 int *charbuf_end = charbuf + coding->charbuf_used;
2588 unsigned char *dst = coding->destination + coding->produced;
2589 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2590 int safe_room = 8;
2591 ptrdiff_t produced_chars = 0;
2592 Lisp_Object attrs, charset_list;
2593 int c;
2594 int preferred_charset_id = -1;
2596 CODING_GET_INFO (coding, attrs, charset_list);
2597 if (! EQ (charset_list, Vemacs_mule_charset_list))
2599 charset_list = Vemacs_mule_charset_list;
2600 ASET (attrs, coding_attr_charset_list, charset_list);
2603 while (charbuf < charbuf_end)
2605 ASSURE_DESTINATION (safe_room);
2606 c = *charbuf++;
2608 if (c < 0)
2610 /* Handle an annotation. */
2611 switch (*charbuf)
2613 case CODING_ANNOTATE_COMPOSITION_MASK:
2614 /* Not yet implemented. */
2615 break;
2616 case CODING_ANNOTATE_CHARSET_MASK:
2617 preferred_charset_id = charbuf[3];
2618 if (preferred_charset_id >= 0
2619 && NILP (Fmemq (make_number (preferred_charset_id),
2620 charset_list)))
2621 preferred_charset_id = -1;
2622 break;
2623 default:
2624 emacs_abort ();
2626 charbuf += -c - 1;
2627 continue;
2630 if (ASCII_CHAR_P (c))
2631 EMIT_ONE_ASCII_BYTE (c);
2632 else if (CHAR_BYTE8_P (c))
2634 c = CHAR_TO_BYTE8 (c);
2635 EMIT_ONE_BYTE (c);
2637 else
2639 struct charset *charset;
2640 unsigned code;
2641 int dimension;
2642 int emacs_mule_id;
2643 unsigned char leading_codes[2];
2645 if (preferred_charset_id >= 0)
2647 bool result;
2649 charset = CHARSET_FROM_ID (preferred_charset_id);
2650 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2651 if (result)
2652 code = ENCODE_CHAR (charset, c);
2653 else
2654 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2655 &code, charset);
2657 else
2658 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2659 &code, charset);
2660 if (! charset)
2662 c = coding->default_char;
2663 if (ASCII_CHAR_P (c))
2665 EMIT_ONE_ASCII_BYTE (c);
2666 continue;
2668 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2669 &code, charset);
2671 dimension = CHARSET_DIMENSION (charset);
2672 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2673 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2674 EMIT_ONE_BYTE (leading_codes[0]);
2675 if (leading_codes[1])
2676 EMIT_ONE_BYTE (leading_codes[1]);
2677 if (dimension == 1)
2678 EMIT_ONE_BYTE (code | 0x80);
2679 else
2681 code |= 0x8080;
2682 EMIT_ONE_BYTE (code >> 8);
2683 EMIT_ONE_BYTE (code & 0xFF);
2687 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2688 coding->produced_char += produced_chars;
2689 coding->produced = dst - coding->destination;
2690 return 0;
2694 /*** 7. ISO2022 handlers ***/
2696 /* The following note describes the coding system ISO2022 briefly.
2697 Since the intention of this note is to help understand the
2698 functions in this file, some parts are NOT ACCURATE or are OVERLY
2699 SIMPLIFIED. For thorough understanding, please refer to the
2700 original document of ISO2022. This is equivalent to the standard
2701 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2703 ISO2022 provides many mechanisms to encode several character sets
2704 in 7-bit and 8-bit environments. For 7-bit environments, all text
2705 is encoded using bytes less than 128. This may make the encoded
2706 text a little bit longer, but the text passes more easily through
2707 several types of gateway, some of which strip off the MSB (Most
2708 Significant Bit).
2710 There are two kinds of character sets: control character sets and
2711 graphic character sets. The former contain control characters such
2712 as `newline' and `escape' to provide control functions (control
2713 functions are also provided by escape sequences). The latter
2714 contain graphic characters such as 'A' and '-'. Emacs recognizes
2715 two control character sets and many graphic character sets.
2717 Graphic character sets are classified into one of the following
2718 four classes, according to the number of bytes (DIMENSION) and
2719 number of characters in one dimension (CHARS) of the set:
2720 - DIMENSION1_CHARS94
2721 - DIMENSION1_CHARS96
2722 - DIMENSION2_CHARS94
2723 - DIMENSION2_CHARS96
2725 In addition, each character set is assigned an identification tag,
2726 unique for each set, called the "final character" (denoted as <F>
2727 hereafter). The <F> of each character set is decided by ECMA(*)
2728 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2729 (0x30..0x3F are for private use only).
2731 Note (*): ECMA = European Computer Manufacturers Association
2733 Here are examples of graphic character sets [NAME(<F>)]:
2734 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2735 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2736 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2737 o DIMENSION2_CHARS96 -- none for the moment
2739 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2740 C0 [0x00..0x1F] -- control character plane 0
2741 GL [0x20..0x7F] -- graphic character plane 0
2742 C1 [0x80..0x9F] -- control character plane 1
2743 GR [0xA0..0xFF] -- graphic character plane 1
2745 A control character set is directly designated and invoked to C0 or
2746 C1 by an escape sequence. The most common case is that:
2747 - ISO646's control character set is designated/invoked to C0, and
2748 - ISO6429's control character set is designated/invoked to C1,
2749 and usually these designations/invocations are omitted in encoded
2750 text. In a 7-bit environment, only C0 can be used, and a control
2751 character for C1 is encoded by an appropriate escape sequence to
2752 fit into the environment. All control characters for C1 are
2753 defined to have corresponding escape sequences.
2755 A graphic character set is at first designated to one of four
2756 graphic registers (G0 through G3), then these graphic registers are
2757 invoked to GL or GR. These designations and invocations can be
2758 done independently. The most common case is that G0 is invoked to
2759 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2760 these invocations and designations are omitted in encoded text.
2761 In a 7-bit environment, only GL can be used.
2763 When a graphic character set of CHARS94 is invoked to GL, codes
2764 0x20 and 0x7F of the GL area work as control characters SPACE and
2765 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2766 be used.
2768 There are two ways of invocation: locking-shift and single-shift.
2769 With locking-shift, the invocation lasts until the next different
2770 invocation, whereas with single-shift, the invocation affects the
2771 following character only and doesn't affect the locking-shift
2772 state. Invocations are done by the following control characters or
2773 escape sequences:
2775 ----------------------------------------------------------------------
2776 abbrev function cntrl escape seq description
2777 ----------------------------------------------------------------------
2778 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2779 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2780 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2781 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2782 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2783 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2784 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2785 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2786 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2787 ----------------------------------------------------------------------
2788 (*) These are not used by any known coding system.
2790 Control characters for these functions are defined by macros
2791 ISO_CODE_XXX in `coding.h'.
2793 Designations are done by the following escape sequences:
2794 ----------------------------------------------------------------------
2795 escape sequence description
2796 ----------------------------------------------------------------------
2797 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2798 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2799 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2800 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2801 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2802 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2803 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2804 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2805 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2806 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2807 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2808 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2809 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2810 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2811 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2812 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2813 ----------------------------------------------------------------------
2815 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2816 of dimension 1, chars 94, and final character <F>, etc...
2818 Note (*): Although these designations are not allowed in ISO2022,
2819 Emacs accepts them on decoding, and produces them on encoding
2820 CHARS96 character sets in a coding system which is characterized as
2821 7-bit environment, non-locking-shift, and non-single-shift.
2823 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2824 '(' must be omitted. We refer to this as "short-form" hereafter.
2826 Now you may notice that there are a lot of ways of encoding the
2827 same multilingual text in ISO2022. Actually, there exist many
2828 coding systems such as Compound Text (used in X11's inter client
2829 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2830 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2831 localized platforms), and all of these are variants of ISO2022.
2833 In addition to the above, Emacs handles two more kinds of escape
2834 sequences: ISO6429's direction specification and Emacs' private
2835 sequence for specifying character composition.
2837 ISO6429's direction specification takes the following form:
2838 o CSI ']' -- end of the current direction
2839 o CSI '0' ']' -- end of the current direction
2840 o CSI '1' ']' -- start of left-to-right text
2841 o CSI '2' ']' -- start of right-to-left text
2842 The control character CSI (0x9B: control sequence introducer) is
2843 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2845 Character composition specification takes the following form:
2846 o ESC '0' -- start relative composition
2847 o ESC '1' -- end composition
2848 o ESC '2' -- start rule-base composition (*)
2849 o ESC '3' -- start relative composition with alternate chars (**)
2850 o ESC '4' -- start rule-base composition with alternate chars (**)
2851 Since these are not standard escape sequences of any ISO standard,
2852 the use of them with these meanings is restricted to Emacs only.
2854 (*) This form is used only in Emacs 20.7 and older versions,
2855 but newer versions can safely decode it.
2856 (**) This form is used only in Emacs 21.1 and newer versions,
2857 and older versions can't decode it.
2859 Here's a list of example usages of these composition escape
2860 sequences (categorized by `enum composition_method').
2862 COMPOSITION_RELATIVE:
2863 ESC 0 CHAR [ CHAR ] ESC 1
2864 COMPOSITION_WITH_RULE:
2865 ESC 2 CHAR [ RULE CHAR ] ESC 1
2866 COMPOSITION_WITH_ALTCHARS:
2867 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2868 COMPOSITION_WITH_RULE_ALTCHARS:
2869 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2871 static enum iso_code_class_type iso_code_class[256];
2873 #define SAFE_CHARSET_P(coding, id) \
2874 ((id) <= (coding)->max_charset_id \
2875 && (coding)->safe_charsets[id] != 255)
2877 static void
2878 setup_iso_safe_charsets (Lisp_Object attrs)
2880 Lisp_Object charset_list, safe_charsets;
2881 Lisp_Object request;
2882 Lisp_Object reg_usage;
2883 Lisp_Object tail;
2884 EMACS_INT reg94, reg96;
2885 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2886 int max_charset_id;
2888 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2889 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2890 && ! EQ (charset_list, Viso_2022_charset_list))
2892 charset_list = Viso_2022_charset_list;
2893 ASET (attrs, coding_attr_charset_list, charset_list);
2894 ASET (attrs, coding_attr_safe_charsets, Qnil);
2897 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2898 return;
2900 max_charset_id = 0;
2901 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2903 int id = XINT (XCAR (tail));
2904 if (max_charset_id < id)
2905 max_charset_id = id;
2908 safe_charsets = make_uninit_string (max_charset_id + 1);
2909 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2910 request = AREF (attrs, coding_attr_iso_request);
2911 reg_usage = AREF (attrs, coding_attr_iso_usage);
2912 reg94 = XINT (XCAR (reg_usage));
2913 reg96 = XINT (XCDR (reg_usage));
2915 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2917 Lisp_Object id;
2918 Lisp_Object reg;
2919 struct charset *charset;
2921 id = XCAR (tail);
2922 charset = CHARSET_FROM_ID (XINT (id));
2923 reg = Fcdr (Fassq (id, request));
2924 if (! NILP (reg))
2925 SSET (safe_charsets, XINT (id), XINT (reg));
2926 else if (charset->iso_chars_96)
2928 if (reg96 < 4)
2929 SSET (safe_charsets, XINT (id), reg96);
2931 else
2933 if (reg94 < 4)
2934 SSET (safe_charsets, XINT (id), reg94);
2937 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2941 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2942 Return true if a text is encoded in one of ISO-2022 based coding
2943 systems. */
2945 static bool
2946 detect_coding_iso_2022 (struct coding_system *coding,
2947 struct coding_detection_info *detect_info)
2949 const unsigned char *src = coding->source, *src_base = src;
2950 const unsigned char *src_end = coding->source + coding->src_bytes;
2951 bool multibytep = coding->src_multibyte;
2952 bool single_shifting = 0;
2953 int id;
2954 int c, c1;
2955 ptrdiff_t consumed_chars = 0;
2956 int i;
2957 int rejected = 0;
2958 int found = 0;
2959 int composition_count = -1;
2961 detect_info->checked |= CATEGORY_MASK_ISO;
2963 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2965 struct coding_system *this = &(coding_categories[i]);
2966 Lisp_Object attrs, val;
2968 if (this->id < 0)
2969 continue;
2970 attrs = CODING_ID_ATTRS (this->id);
2971 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2972 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2973 setup_iso_safe_charsets (attrs);
2974 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2975 this->max_charset_id = SCHARS (val) - 1;
2976 this->safe_charsets = SDATA (val);
2979 /* A coding system of this category is always ASCII compatible. */
2980 src += coding->head_ascii;
2982 while (rejected != CATEGORY_MASK_ISO)
2984 src_base = src;
2985 ONE_MORE_BYTE (c);
2986 switch (c)
2988 case ISO_CODE_ESC:
2989 if (inhibit_iso_escape_detection)
2990 break;
2991 single_shifting = 0;
2992 ONE_MORE_BYTE (c);
2993 if (c == 'N' || c == 'O')
2995 /* ESC <Fe> for SS2 or SS3. */
2996 single_shifting = 1;
2997 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2999 else if (c == '1')
3001 /* End of composition. */
3002 if (composition_count < 0
3003 || composition_count > MAX_COMPOSITION_COMPONENTS)
3004 /* Invalid */
3005 break;
3006 composition_count = -1;
3007 found |= CATEGORY_MASK_ISO;
3009 else if (c >= '0' && c <= '4')
3011 /* ESC <Fp> for start/end composition. */
3012 composition_count = 0;
3014 else
3016 if (c >= '(' && c <= '/')
3018 /* Designation sequence for a charset of dimension 1. */
3019 ONE_MORE_BYTE (c1);
3020 if (c1 < ' ' || c1 >= 0x80
3021 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3022 /* Invalid designation sequence. Just ignore. */
3023 break;
3025 else if (c == '$')
3027 /* Designation sequence for a charset of dimension 2. */
3028 ONE_MORE_BYTE (c);
3029 if (c >= '@' && c <= 'B')
3030 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3031 id = iso_charset_table[1][0][c];
3032 else if (c >= '(' && c <= '/')
3034 ONE_MORE_BYTE (c1);
3035 if (c1 < ' ' || c1 >= 0x80
3036 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3037 /* Invalid designation sequence. Just ignore. */
3038 break;
3040 else
3041 /* Invalid designation sequence. Just ignore it. */
3042 break;
3044 else
3046 /* Invalid escape sequence. Just ignore it. */
3047 break;
3050 /* We found a valid designation sequence for CHARSET. */
3051 rejected |= CATEGORY_MASK_ISO_8BIT;
3052 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3053 id))
3054 found |= CATEGORY_MASK_ISO_7;
3055 else
3056 rejected |= CATEGORY_MASK_ISO_7;
3057 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3058 id))
3059 found |= CATEGORY_MASK_ISO_7_TIGHT;
3060 else
3061 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3062 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3063 id))
3064 found |= CATEGORY_MASK_ISO_7_ELSE;
3065 else
3066 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3067 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3068 id))
3069 found |= CATEGORY_MASK_ISO_8_ELSE;
3070 else
3071 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3073 break;
3075 case ISO_CODE_SO:
3076 case ISO_CODE_SI:
3077 /* Locking shift out/in. */
3078 if (inhibit_iso_escape_detection)
3079 break;
3080 single_shifting = 0;
3081 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3082 break;
3084 case ISO_CODE_CSI:
3085 /* Control sequence introducer. */
3086 single_shifting = 0;
3087 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3088 found |= CATEGORY_MASK_ISO_8_ELSE;
3089 goto check_extra_latin;
3091 case ISO_CODE_SS2:
3092 case ISO_CODE_SS3:
3093 /* Single shift. */
3094 if (inhibit_iso_escape_detection)
3095 break;
3096 single_shifting = 0;
3097 rejected |= CATEGORY_MASK_ISO_7BIT;
3098 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3099 & CODING_ISO_FLAG_SINGLE_SHIFT)
3101 found |= CATEGORY_MASK_ISO_8_1;
3102 single_shifting = 1;
3104 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3105 & CODING_ISO_FLAG_SINGLE_SHIFT)
3107 found |= CATEGORY_MASK_ISO_8_2;
3108 single_shifting = 1;
3110 if (single_shifting)
3111 break;
3112 goto check_extra_latin;
3114 default:
3115 if (c < 0)
3116 continue;
3117 if (c < 0x80)
3119 if (composition_count >= 0)
3120 composition_count++;
3121 single_shifting = 0;
3122 break;
3124 if (c >= 0xA0)
3126 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3127 found |= CATEGORY_MASK_ISO_8_1;
3128 /* Check the length of succeeding codes of the range
3129 0xA0..0FF. If the byte length is even, we include
3130 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3131 only when we are not single shifting. */
3132 if (! single_shifting
3133 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3135 int len = 1;
3136 while (src < src_end)
3138 src_base = src;
3139 ONE_MORE_BYTE (c);
3140 if (c < 0xA0)
3142 src = src_base;
3143 break;
3145 len++;
3148 if (len & 1 && src < src_end)
3150 rejected |= CATEGORY_MASK_ISO_8_2;
3151 if (composition_count >= 0)
3152 composition_count += len;
3154 else
3156 found |= CATEGORY_MASK_ISO_8_2;
3157 if (composition_count >= 0)
3158 composition_count += len / 2;
3161 break;
3163 check_extra_latin:
3164 if (! VECTORP (Vlatin_extra_code_table)
3165 || NILP (AREF (Vlatin_extra_code_table, c)))
3167 rejected = CATEGORY_MASK_ISO;
3168 break;
3170 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3171 & CODING_ISO_FLAG_LATIN_EXTRA)
3172 found |= CATEGORY_MASK_ISO_8_1;
3173 else
3174 rejected |= CATEGORY_MASK_ISO_8_1;
3175 rejected |= CATEGORY_MASK_ISO_8_2;
3176 break;
3179 detect_info->rejected |= CATEGORY_MASK_ISO;
3180 return 0;
3182 no_more_source:
3183 detect_info->rejected |= rejected;
3184 detect_info->found |= (found & ~rejected);
3185 return 1;
3189 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3190 escape sequence should be kept. */
3191 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3192 do { \
3193 int id, prev; \
3195 if (final < '0' || final >= 128 \
3196 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3197 || !SAFE_CHARSET_P (coding, id)) \
3199 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3200 chars_96 = -1; \
3201 break; \
3203 prev = CODING_ISO_DESIGNATION (coding, reg); \
3204 if (id == charset_jisx0201_roman) \
3206 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3207 id = charset_ascii; \
3209 else if (id == charset_jisx0208_1978) \
3211 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3212 id = charset_jisx0208; \
3214 CODING_ISO_DESIGNATION (coding, reg) = id; \
3215 /* If there was an invalid designation to REG previously, and this \
3216 designation is ASCII to REG, we should keep this designation \
3217 sequence. */ \
3218 if (prev == -2 && id == charset_ascii) \
3219 chars_96 = -1; \
3220 } while (0)
3223 /* Handle these composition sequence (ALT: alternate char):
3225 (1) relative composition: ESC 0 CHAR ... ESC 1
3226 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3227 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3228 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3230 When the start sequence (ESC 0/2/3/4) is found, this annotation
3231 header is produced.
3233 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3235 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3236 produced until the end sequence (ESC 1) is found:
3238 (1) CHAR ... CHAR
3239 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3240 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3241 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3243 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3244 annotation header is updated as below:
3246 (1) LENGTH: unchanged, NCHARS: number of CHARs
3247 (2) LENGTH: unchanged, NCHARS: number of CHARs
3248 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3249 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3251 If an error is found while composing, the annotation header is
3252 changed to:
3254 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3256 and the sequence [ -2 DECODED-RULE ] is changed to the original
3257 byte sequence as below:
3258 o the original byte sequence is B: [ B -1 ]
3259 o the original byte sequence is B1 B2: [ B1 B2 ]
3260 and the sequence [ -1 -1 ] is changed to the original byte
3261 sequence:
3262 [ ESC '0' ]
3265 /* Decode a composition rule C1 and maybe one more byte from the
3266 source, and set RULE to the encoded composition rule. If the rule
3267 is invalid, goto invalid_code. */
3269 #define DECODE_COMPOSITION_RULE(rule) \
3270 do { \
3271 rule = c1 - 32; \
3272 if (rule < 0) \
3273 goto invalid_code; \
3274 if (rule < 81) /* old format (before ver.21) */ \
3276 int gref = (rule) / 9; \
3277 int nref = (rule) % 9; \
3278 if (gref == 4) gref = 10; \
3279 if (nref == 4) nref = 10; \
3280 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3282 else /* new format (after ver.21) */ \
3284 int b; \
3286 ONE_MORE_BYTE (b); \
3287 if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32)) \
3288 goto invalid_code; \
3289 rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32); \
3290 rule += 0x100; /* Distinguish it from the old format. */ \
3292 } while (0)
3294 #define ENCODE_COMPOSITION_RULE(rule) \
3295 do { \
3296 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3298 if (rule < 0x100) /* old format */ \
3300 if (gref == 10) gref = 4; \
3301 if (nref == 10) nref = 4; \
3302 charbuf[idx] = 32 + gref * 9 + nref; \
3303 charbuf[idx + 1] = -1; \
3304 new_chars++; \
3306 else /* new format */ \
3308 charbuf[idx] = 32 + 81 + gref; \
3309 charbuf[idx + 1] = 32 + nref; \
3310 new_chars += 2; \
3312 } while (0)
3314 /* Finish the current composition as invalid. */
3316 static int
3317 finish_composition (int *charbuf, struct composition_status *cmp_status)
3319 int idx = - cmp_status->length;
3320 int new_chars;
3322 /* Recover the original ESC sequence */
3323 charbuf[idx++] = ISO_CODE_ESC;
3324 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3325 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3326 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3327 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3328 : '4');
3329 charbuf[idx++] = -2;
3330 charbuf[idx++] = 0;
3331 charbuf[idx++] = -1;
3332 new_chars = cmp_status->nchars;
3333 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3334 for (; idx < 0; idx++)
3336 int elt = charbuf[idx];
3338 if (elt == -2)
3340 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3341 idx++;
3343 else if (elt == -1)
3345 charbuf[idx++] = ISO_CODE_ESC;
3346 charbuf[idx] = '0';
3347 new_chars += 2;
3350 cmp_status->state = COMPOSING_NO;
3351 return new_chars;
3354 /* If characters are under composition, finish the composition. */
3355 #define MAYBE_FINISH_COMPOSITION() \
3356 do { \
3357 if (cmp_status->state != COMPOSING_NO) \
3358 char_offset += finish_composition (charbuf, cmp_status); \
3359 } while (0)
3361 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3363 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3364 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3365 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3366 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3368 Produce this annotation sequence now:
3370 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3373 #define DECODE_COMPOSITION_START(c1) \
3374 do { \
3375 if (c1 == '0' \
3376 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3377 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3378 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3379 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3381 *charbuf++ = -1; \
3382 *charbuf++= -1; \
3383 cmp_status->state = COMPOSING_CHAR; \
3384 cmp_status->length += 2; \
3386 else \
3388 MAYBE_FINISH_COMPOSITION (); \
3389 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3390 : c1 == '2' ? COMPOSITION_WITH_RULE \
3391 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3392 : COMPOSITION_WITH_RULE_ALTCHARS); \
3393 cmp_status->state \
3394 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3395 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3396 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3397 cmp_status->nchars = cmp_status->ncomps = 0; \
3398 coding->annotated = 1; \
3400 } while (0)
3403 /* Handle composition end sequence ESC 1. */
3405 #define DECODE_COMPOSITION_END() \
3406 do { \
3407 if (cmp_status->nchars == 0 \
3408 || ((cmp_status->state == COMPOSING_CHAR) \
3409 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3411 MAYBE_FINISH_COMPOSITION (); \
3412 goto invalid_code; \
3414 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3415 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3416 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3417 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3418 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3419 char_offset += cmp_status->nchars; \
3420 cmp_status->state = COMPOSING_NO; \
3421 } while (0)
3423 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3425 #define STORE_COMPOSITION_RULE(rule) \
3426 do { \
3427 *charbuf++ = -2; \
3428 *charbuf++ = rule; \
3429 cmp_status->length += 2; \
3430 cmp_status->state--; \
3431 } while (0)
3433 /* Store a composed char or a component char C in charbuf, and update
3434 cmp_status. */
3436 #define STORE_COMPOSITION_CHAR(c) \
3437 do { \
3438 *charbuf++ = (c); \
3439 cmp_status->length++; \
3440 if (cmp_status->state == COMPOSING_CHAR) \
3441 cmp_status->nchars++; \
3442 else \
3443 cmp_status->ncomps++; \
3444 if (cmp_status->method == COMPOSITION_WITH_RULE \
3445 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3446 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3447 cmp_status->state++; \
3448 } while (0)
3451 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3453 static void
3454 decode_coding_iso_2022 (struct coding_system *coding)
3456 const unsigned char *src = coding->source + coding->consumed;
3457 const unsigned char *src_end = coding->source + coding->src_bytes;
3458 const unsigned char *src_base;
3459 int *charbuf = coding->charbuf + coding->charbuf_used;
3460 /* We may produce two annotations (charset and composition) in one
3461 loop and one more charset annotation at the end. */
3462 int *charbuf_end
3463 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3464 ptrdiff_t consumed_chars = 0, consumed_chars_base;
3465 bool multibytep = coding->src_multibyte;
3466 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3467 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3468 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3469 int charset_id_2, charset_id_3;
3470 struct charset *charset;
3471 int c;
3472 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3473 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3474 ptrdiff_t char_offset = coding->produced_char;
3475 ptrdiff_t last_offset = char_offset;
3476 int last_id = charset_ascii;
3477 bool eol_dos
3478 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3479 int byte_after_cr = -1;
3480 int i;
3482 setup_iso_safe_charsets (attrs);
3483 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3485 if (cmp_status->state != COMPOSING_NO)
3487 if (charbuf_end - charbuf < cmp_status->length)
3488 emacs_abort ();
3489 for (i = 0; i < cmp_status->length; i++)
3490 *charbuf++ = cmp_status->carryover[i];
3491 coding->annotated = 1;
3494 while (1)
3496 int c1, c2, c3;
3498 src_base = src;
3499 consumed_chars_base = consumed_chars;
3501 if (charbuf >= charbuf_end)
3503 if (byte_after_cr >= 0)
3504 src_base--;
3505 break;
3508 if (byte_after_cr >= 0)
3509 c1 = byte_after_cr, byte_after_cr = -1;
3510 else
3511 ONE_MORE_BYTE (c1);
3512 if (c1 < 0)
3513 goto invalid_code;
3515 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3517 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3518 char_offset++;
3519 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3520 continue;
3523 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3525 if (c1 == ISO_CODE_ESC)
3527 if (src + 1 >= src_end)
3528 goto no_more_source;
3529 *charbuf++ = ISO_CODE_ESC;
3530 char_offset++;
3531 if (src[0] == '%' && src[1] == '@')
3533 src += 2;
3534 consumed_chars += 2;
3535 char_offset += 2;
3536 /* We are sure charbuf can contain two more chars. */
3537 *charbuf++ = '%';
3538 *charbuf++ = '@';
3539 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3542 else
3544 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3545 char_offset++;
3547 continue;
3550 if ((cmp_status->state == COMPOSING_RULE
3551 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3552 && c1 != ISO_CODE_ESC)
3554 int rule;
3556 DECODE_COMPOSITION_RULE (rule);
3557 STORE_COMPOSITION_RULE (rule);
3558 continue;
3561 /* We produce at most one character. */
3562 switch (iso_code_class [c1])
3564 case ISO_0x20_or_0x7F:
3565 if (charset_id_0 < 0
3566 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3567 /* This is SPACE or DEL. */
3568 charset = CHARSET_FROM_ID (charset_ascii);
3569 else
3570 charset = CHARSET_FROM_ID (charset_id_0);
3571 break;
3573 case ISO_graphic_plane_0:
3574 if (charset_id_0 < 0)
3575 charset = CHARSET_FROM_ID (charset_ascii);
3576 else
3577 charset = CHARSET_FROM_ID (charset_id_0);
3578 break;
3580 case ISO_0xA0_or_0xFF:
3581 if (charset_id_1 < 0
3582 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3583 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3584 goto invalid_code;
3585 /* This is a graphic character, we fall down ... */
3587 case ISO_graphic_plane_1:
3588 if (charset_id_1 < 0)
3589 goto invalid_code;
3590 charset = CHARSET_FROM_ID (charset_id_1);
3591 break;
3593 case ISO_control_0:
3594 if (eol_dos && c1 == '\r')
3595 ONE_MORE_BYTE (byte_after_cr);
3596 MAYBE_FINISH_COMPOSITION ();
3597 charset = CHARSET_FROM_ID (charset_ascii);
3598 break;
3600 case ISO_control_1:
3601 goto invalid_code;
3603 case ISO_shift_out:
3604 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3605 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3606 goto invalid_code;
3607 CODING_ISO_INVOCATION (coding, 0) = 1;
3608 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3609 continue;
3611 case ISO_shift_in:
3612 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3613 goto invalid_code;
3614 CODING_ISO_INVOCATION (coding, 0) = 0;
3615 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3616 continue;
3618 case ISO_single_shift_2_7:
3619 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3620 goto invalid_code;
3621 case ISO_single_shift_2:
3622 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3623 goto invalid_code;
3624 /* SS2 is handled as an escape sequence of ESC 'N' */
3625 c1 = 'N';
3626 goto label_escape_sequence;
3628 case ISO_single_shift_3:
3629 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3630 goto invalid_code;
3631 /* SS2 is handled as an escape sequence of ESC 'O' */
3632 c1 = 'O';
3633 goto label_escape_sequence;
3635 case ISO_control_sequence_introducer:
3636 /* CSI is handled as an escape sequence of ESC '[' ... */
3637 c1 = '[';
3638 goto label_escape_sequence;
3640 case ISO_escape:
3641 ONE_MORE_BYTE (c1);
3642 label_escape_sequence:
3643 /* Escape sequences handled here are invocation,
3644 designation, direction specification, and character
3645 composition specification. */
3646 switch (c1)
3648 case '&': /* revision of following character set */
3649 ONE_MORE_BYTE (c1);
3650 if (!(c1 >= '@' && c1 <= '~'))
3651 goto invalid_code;
3652 ONE_MORE_BYTE (c1);
3653 if (c1 != ISO_CODE_ESC)
3654 goto invalid_code;
3655 ONE_MORE_BYTE (c1);
3656 goto label_escape_sequence;
3658 case '$': /* designation of 2-byte character set */
3659 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3660 goto invalid_code;
3662 int reg, chars96;
3664 ONE_MORE_BYTE (c1);
3665 if (c1 >= '@' && c1 <= 'B')
3666 { /* designation of JISX0208.1978, GB2312.1980,
3667 or JISX0208.1980 */
3668 reg = 0, chars96 = 0;
3670 else if (c1 >= 0x28 && c1 <= 0x2B)
3671 { /* designation of DIMENSION2_CHARS94 character set */
3672 reg = c1 - 0x28, chars96 = 0;
3673 ONE_MORE_BYTE (c1);
3675 else if (c1 >= 0x2C && c1 <= 0x2F)
3676 { /* designation of DIMENSION2_CHARS96 character set */
3677 reg = c1 - 0x2C, chars96 = 1;
3678 ONE_MORE_BYTE (c1);
3680 else
3681 goto invalid_code;
3682 DECODE_DESIGNATION (reg, 2, chars96, c1);
3683 /* We must update these variables now. */
3684 if (reg == 0)
3685 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3686 else if (reg == 1)
3687 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3688 if (chars96 < 0)
3689 goto invalid_code;
3691 continue;
3693 case 'n': /* invocation of locking-shift-2 */
3694 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3695 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3696 goto invalid_code;
3697 CODING_ISO_INVOCATION (coding, 0) = 2;
3698 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3699 continue;
3701 case 'o': /* invocation of locking-shift-3 */
3702 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3703 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3704 goto invalid_code;
3705 CODING_ISO_INVOCATION (coding, 0) = 3;
3706 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3707 continue;
3709 case 'N': /* invocation of single-shift-2 */
3710 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3711 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3712 goto invalid_code;
3713 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3714 if (charset_id_2 < 0)
3715 charset = CHARSET_FROM_ID (charset_ascii);
3716 else
3717 charset = CHARSET_FROM_ID (charset_id_2);
3718 ONE_MORE_BYTE (c1);
3719 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3720 goto invalid_code;
3721 break;
3723 case 'O': /* invocation of single-shift-3 */
3724 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3725 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3726 goto invalid_code;
3727 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3728 if (charset_id_3 < 0)
3729 charset = CHARSET_FROM_ID (charset_ascii);
3730 else
3731 charset = CHARSET_FROM_ID (charset_id_3);
3732 ONE_MORE_BYTE (c1);
3733 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3734 goto invalid_code;
3735 break;
3737 case '0': case '2': case '3': case '4': /* start composition */
3738 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3739 goto invalid_code;
3740 if (last_id != charset_ascii)
3742 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3743 last_id = charset_ascii;
3744 last_offset = char_offset;
3746 DECODE_COMPOSITION_START (c1);
3747 continue;
3749 case '1': /* end composition */
3750 if (cmp_status->state == COMPOSING_NO)
3751 goto invalid_code;
3752 DECODE_COMPOSITION_END ();
3753 continue;
3755 case '[': /* specification of direction */
3756 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3757 goto invalid_code;
3758 /* For the moment, nested direction is not supported.
3759 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3760 left-to-right, and nonzero means right-to-left. */
3761 ONE_MORE_BYTE (c1);
3762 switch (c1)
3764 case ']': /* end of the current direction */
3765 coding->mode &= ~CODING_MODE_DIRECTION;
3767 case '0': /* end of the current direction */
3768 case '1': /* start of left-to-right direction */
3769 ONE_MORE_BYTE (c1);
3770 if (c1 == ']')
3771 coding->mode &= ~CODING_MODE_DIRECTION;
3772 else
3773 goto invalid_code;
3774 break;
3776 case '2': /* start of right-to-left direction */
3777 ONE_MORE_BYTE (c1);
3778 if (c1 == ']')
3779 coding->mode |= CODING_MODE_DIRECTION;
3780 else
3781 goto invalid_code;
3782 break;
3784 default:
3785 goto invalid_code;
3787 continue;
3789 case '%':
3790 ONE_MORE_BYTE (c1);
3791 if (c1 == '/')
3793 /* CTEXT extended segment:
3794 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3795 We keep these bytes as is for the moment.
3796 They may be decoded by post-read-conversion. */
3797 int dim, M, L;
3798 int size;
3800 ONE_MORE_BYTE (dim);
3801 if (dim < '0' || dim > '4')
3802 goto invalid_code;
3803 ONE_MORE_BYTE (M);
3804 if (M < 128)
3805 goto invalid_code;
3806 ONE_MORE_BYTE (L);
3807 if (L < 128)
3808 goto invalid_code;
3809 size = ((M - 128) * 128) + (L - 128);
3810 if (charbuf + 6 > charbuf_end)
3811 goto break_loop;
3812 *charbuf++ = ISO_CODE_ESC;
3813 *charbuf++ = '%';
3814 *charbuf++ = '/';
3815 *charbuf++ = dim;
3816 *charbuf++ = BYTE8_TO_CHAR (M);
3817 *charbuf++ = BYTE8_TO_CHAR (L);
3818 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3820 else if (c1 == 'G')
3822 /* XFree86 extension for embedding UTF-8 in CTEXT:
3823 ESC % G --UTF-8-BYTES-- ESC % @
3824 We keep these bytes as is for the moment.
3825 They may be decoded by post-read-conversion. */
3826 if (charbuf + 3 > charbuf_end)
3827 goto break_loop;
3828 *charbuf++ = ISO_CODE_ESC;
3829 *charbuf++ = '%';
3830 *charbuf++ = 'G';
3831 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3833 else
3834 goto invalid_code;
3835 continue;
3836 break;
3838 default:
3839 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3840 goto invalid_code;
3842 int reg, chars96;
3844 if (c1 >= 0x28 && c1 <= 0x2B)
3845 { /* designation of DIMENSION1_CHARS94 character set */
3846 reg = c1 - 0x28, chars96 = 0;
3847 ONE_MORE_BYTE (c1);
3849 else if (c1 >= 0x2C && c1 <= 0x2F)
3850 { /* designation of DIMENSION1_CHARS96 character set */
3851 reg = c1 - 0x2C, chars96 = 1;
3852 ONE_MORE_BYTE (c1);
3854 else
3855 goto invalid_code;
3856 DECODE_DESIGNATION (reg, 1, chars96, c1);
3857 /* We must update these variables now. */
3858 if (reg == 0)
3859 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3860 else if (reg == 1)
3861 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3862 if (chars96 < 0)
3863 goto invalid_code;
3865 continue;
3867 break;
3869 default:
3870 emacs_abort ();
3873 if (cmp_status->state == COMPOSING_NO
3874 && charset->id != charset_ascii
3875 && last_id != charset->id)
3877 if (last_id != charset_ascii)
3878 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3879 last_id = charset->id;
3880 last_offset = char_offset;
3883 /* Now we know CHARSET and 1st position code C1 of a character.
3884 Produce a decoded character while getting 2nd and 3rd
3885 position codes C2, C3 if necessary. */
3886 if (CHARSET_DIMENSION (charset) > 1)
3888 ONE_MORE_BYTE (c2);
3889 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3890 || ((c1 & 0x80) != (c2 & 0x80)))
3891 /* C2 is not in a valid range. */
3892 goto invalid_code;
3893 if (CHARSET_DIMENSION (charset) == 2)
3894 c1 = (c1 << 8) | c2;
3895 else
3897 ONE_MORE_BYTE (c3);
3898 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3899 || ((c1 & 0x80) != (c3 & 0x80)))
3900 /* C3 is not in a valid range. */
3901 goto invalid_code;
3902 c1 = (c1 << 16) | (c2 << 8) | c2;
3905 c1 &= 0x7F7F7F;
3906 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3907 if (c < 0)
3909 MAYBE_FINISH_COMPOSITION ();
3910 for (; src_base < src; src_base++, char_offset++)
3912 if (ASCII_BYTE_P (*src_base))
3913 *charbuf++ = *src_base;
3914 else
3915 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3918 else if (cmp_status->state == COMPOSING_NO)
3920 *charbuf++ = c;
3921 char_offset++;
3923 else if ((cmp_status->state == COMPOSING_CHAR
3924 ? cmp_status->nchars
3925 : cmp_status->ncomps)
3926 >= MAX_COMPOSITION_COMPONENTS)
3928 /* Too long composition. */
3929 MAYBE_FINISH_COMPOSITION ();
3930 *charbuf++ = c;
3931 char_offset++;
3933 else
3934 STORE_COMPOSITION_CHAR (c);
3935 continue;
3937 invalid_code:
3938 MAYBE_FINISH_COMPOSITION ();
3939 src = src_base;
3940 consumed_chars = consumed_chars_base;
3941 ONE_MORE_BYTE (c);
3942 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3943 char_offset++;
3944 coding->errors++;
3945 /* Reset the invocation and designation status to the safest
3946 one; i.e. designate ASCII to the graphic register 0, and
3947 invoke that register to the graphic plane 0. This typically
3948 helps the case that an designation sequence for ASCII "ESC (
3949 B" is somehow broken (e.g. broken by a newline). */
3950 CODING_ISO_INVOCATION (coding, 0) = 0;
3951 CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3952 charset_id_0 = charset_ascii;
3953 continue;
3955 break_loop:
3956 break;
3959 no_more_source:
3960 if (cmp_status->state != COMPOSING_NO)
3962 if (coding->mode & CODING_MODE_LAST_BLOCK)
3963 MAYBE_FINISH_COMPOSITION ();
3964 else
3966 charbuf -= cmp_status->length;
3967 for (i = 0; i < cmp_status->length; i++)
3968 cmp_status->carryover[i] = charbuf[i];
3971 else if (last_id != charset_ascii)
3972 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3973 coding->consumed_char += consumed_chars_base;
3974 coding->consumed = src_base - coding->source;
3975 coding->charbuf_used = charbuf - coding->charbuf;
3979 /* ISO2022 encoding stuff. */
3982 It is not enough to say just "ISO2022" on encoding, we have to
3983 specify more details. In Emacs, each coding system of ISO2022
3984 variant has the following specifications:
3985 1. Initial designation to G0 thru G3.
3986 2. Allows short-form designation?
3987 3. ASCII should be designated to G0 before control characters?
3988 4. ASCII should be designated to G0 at end of line?
3989 5. 7-bit environment or 8-bit environment?
3990 6. Use locking-shift?
3991 7. Use Single-shift?
3992 And the following two are only for Japanese:
3993 8. Use ASCII in place of JIS0201-1976-Roman?
3994 9. Use JISX0208-1983 in place of JISX0208-1978?
3995 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3996 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3997 details.
4000 /* Produce codes (escape sequence) for designating CHARSET to graphic
4001 register REG at DST, and increment DST. If <final-char> of CHARSET is
4002 '@', 'A', or 'B' and the coding system CODING allows, produce
4003 designation sequence of short-form. */
4005 #define ENCODE_DESIGNATION(charset, reg, coding) \
4006 do { \
4007 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
4008 const char *intermediate_char_94 = "()*+"; \
4009 const char *intermediate_char_96 = ",-./"; \
4010 int revision = -1; \
4012 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
4013 revision = CHARSET_ISO_REVISION (charset); \
4015 if (revision >= 0) \
4017 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
4018 EMIT_ONE_BYTE ('@' + revision); \
4020 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
4021 if (CHARSET_DIMENSION (charset) == 1) \
4023 int b; \
4024 if (! CHARSET_ISO_CHARS_96 (charset)) \
4025 b = intermediate_char_94[reg]; \
4026 else \
4027 b = intermediate_char_96[reg]; \
4028 EMIT_ONE_ASCII_BYTE (b); \
4030 else \
4032 EMIT_ONE_ASCII_BYTE ('$'); \
4033 if (! CHARSET_ISO_CHARS_96 (charset)) \
4035 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
4036 || reg != 0 \
4037 || final_char < '@' || final_char > 'B') \
4038 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
4040 else \
4041 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
4043 EMIT_ONE_ASCII_BYTE (final_char); \
4045 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
4046 } while (0)
4049 /* The following two macros produce codes (control character or escape
4050 sequence) for ISO2022 single-shift functions (single-shift-2 and
4051 single-shift-3). */
4053 #define ENCODE_SINGLE_SHIFT_2 \
4054 do { \
4055 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4056 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4057 else \
4058 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4059 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4060 } while (0)
4063 #define ENCODE_SINGLE_SHIFT_3 \
4064 do { \
4065 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4066 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4067 else \
4068 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4069 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4070 } while (0)
4073 /* The following four macros produce codes (control character or
4074 escape sequence) for ISO2022 locking-shift functions (shift-in,
4075 shift-out, locking-shift-2, and locking-shift-3). */
4077 #define ENCODE_SHIFT_IN \
4078 do { \
4079 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4080 CODING_ISO_INVOCATION (coding, 0) = 0; \
4081 } while (0)
4084 #define ENCODE_SHIFT_OUT \
4085 do { \
4086 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4087 CODING_ISO_INVOCATION (coding, 0) = 1; \
4088 } while (0)
4091 #define ENCODE_LOCKING_SHIFT_2 \
4092 do { \
4093 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4094 CODING_ISO_INVOCATION (coding, 0) = 2; \
4095 } while (0)
4098 #define ENCODE_LOCKING_SHIFT_3 \
4099 do { \
4100 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4101 CODING_ISO_INVOCATION (coding, 0) = 3; \
4102 } while (0)
4105 /* Produce codes for a DIMENSION1 character whose character set is
4106 CHARSET and whose position-code is C1. Designation and invocation
4107 sequences are also produced in advance if necessary. */
4109 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4110 do { \
4111 int id = CHARSET_ID (charset); \
4113 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4114 && id == charset_ascii) \
4116 id = charset_jisx0201_roman; \
4117 charset = CHARSET_FROM_ID (id); \
4120 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4122 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4123 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4124 else \
4125 EMIT_ONE_BYTE (c1 | 0x80); \
4126 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4127 break; \
4129 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4131 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4132 break; \
4134 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4136 EMIT_ONE_BYTE (c1 | 0x80); \
4137 break; \
4139 else \
4140 /* Since CHARSET is not yet invoked to any graphic planes, we \
4141 must invoke it, or, at first, designate it to some graphic \
4142 register. Then repeat the loop to actually produce the \
4143 character. */ \
4144 dst = encode_invocation_designation (charset, coding, dst, \
4145 &produced_chars); \
4146 } while (1)
4149 /* Produce codes for a DIMENSION2 character whose character set is
4150 CHARSET and whose position-codes are C1 and C2. Designation and
4151 invocation codes are also produced in advance if necessary. */
4153 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4154 do { \
4155 int id = CHARSET_ID (charset); \
4157 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4158 && id == charset_jisx0208) \
4160 id = charset_jisx0208_1978; \
4161 charset = CHARSET_FROM_ID (id); \
4164 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4166 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4167 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4168 else \
4169 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4170 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4171 break; \
4173 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4175 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4176 break; \
4178 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4180 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4181 break; \
4183 else \
4184 /* Since CHARSET is not yet invoked to any graphic planes, we \
4185 must invoke it, or, at first, designate it to some graphic \
4186 register. Then repeat the loop to actually produce the \
4187 character. */ \
4188 dst = encode_invocation_designation (charset, coding, dst, \
4189 &produced_chars); \
4190 } while (1)
4193 #define ENCODE_ISO_CHARACTER(charset, c) \
4194 do { \
4195 unsigned code; \
4196 CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code); \
4198 if (CHARSET_DIMENSION (charset) == 1) \
4199 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4200 else \
4201 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4202 } while (0)
4205 /* Produce designation and invocation codes at a place pointed by DST
4206 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4207 Return new DST. */
4209 static unsigned char *
4210 encode_invocation_designation (struct charset *charset,
4211 struct coding_system *coding,
4212 unsigned char *dst, ptrdiff_t *p_nchars)
4214 bool multibytep = coding->dst_multibyte;
4215 ptrdiff_t produced_chars = *p_nchars;
4216 int reg; /* graphic register number */
4217 int id = CHARSET_ID (charset);
4219 /* At first, check designations. */
4220 for (reg = 0; reg < 4; reg++)
4221 if (id == CODING_ISO_DESIGNATION (coding, reg))
4222 break;
4224 if (reg >= 4)
4226 /* CHARSET is not yet designated to any graphic registers. */
4227 /* At first check the requested designation. */
4228 reg = CODING_ISO_REQUEST (coding, id);
4229 if (reg < 0)
4230 /* Since CHARSET requests no special designation, designate it
4231 to graphic register 0. */
4232 reg = 0;
4234 ENCODE_DESIGNATION (charset, reg, coding);
4237 if (CODING_ISO_INVOCATION (coding, 0) != reg
4238 && CODING_ISO_INVOCATION (coding, 1) != reg)
4240 /* Since the graphic register REG is not invoked to any graphic
4241 planes, invoke it to graphic plane 0. */
4242 switch (reg)
4244 case 0: /* graphic register 0 */
4245 ENCODE_SHIFT_IN;
4246 break;
4248 case 1: /* graphic register 1 */
4249 ENCODE_SHIFT_OUT;
4250 break;
4252 case 2: /* graphic register 2 */
4253 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4254 ENCODE_SINGLE_SHIFT_2;
4255 else
4256 ENCODE_LOCKING_SHIFT_2;
4257 break;
4259 case 3: /* graphic register 3 */
4260 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4261 ENCODE_SINGLE_SHIFT_3;
4262 else
4263 ENCODE_LOCKING_SHIFT_3;
4264 break;
4268 *p_nchars = produced_chars;
4269 return dst;
4273 /* Produce codes for designation and invocation to reset the graphic
4274 planes and registers to initial state. */
4275 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4276 do { \
4277 int reg; \
4278 struct charset *charset; \
4280 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4281 ENCODE_SHIFT_IN; \
4282 for (reg = 0; reg < 4; reg++) \
4283 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4284 && (CODING_ISO_DESIGNATION (coding, reg) \
4285 != CODING_ISO_INITIAL (coding, reg))) \
4287 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4288 ENCODE_DESIGNATION (charset, reg, coding); \
4290 } while (0)
4293 /* Produce designation sequences of charsets in the line started from
4294 CHARBUF to a place pointed by DST, and return the number of
4295 produced bytes. DST should not directly point a buffer text area
4296 which may be relocated by char_charset call.
4298 If the current block ends before any end-of-line, we may fail to
4299 find all the necessary designations. */
4301 static ptrdiff_t
4302 encode_designation_at_bol (struct coding_system *coding,
4303 int *charbuf, int *charbuf_end,
4304 unsigned char *dst)
4306 unsigned char *orig = dst;
4307 struct charset *charset;
4308 /* Table of charsets to be designated to each graphic register. */
4309 int r[4];
4310 int c, found = 0, reg;
4311 ptrdiff_t produced_chars = 0;
4312 bool multibytep = coding->dst_multibyte;
4313 Lisp_Object attrs;
4314 Lisp_Object charset_list;
4316 attrs = CODING_ID_ATTRS (coding->id);
4317 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4318 if (EQ (charset_list, Qiso_2022))
4319 charset_list = Viso_2022_charset_list;
4321 for (reg = 0; reg < 4; reg++)
4322 r[reg] = -1;
4324 while (charbuf < charbuf_end && found < 4)
4326 int id;
4328 c = *charbuf++;
4329 if (c == '\n')
4330 break;
4331 charset = char_charset (c, charset_list, NULL);
4332 id = CHARSET_ID (charset);
4333 reg = CODING_ISO_REQUEST (coding, id);
4334 if (reg >= 0 && r[reg] < 0)
4336 found++;
4337 r[reg] = id;
4341 if (found)
4343 for (reg = 0; reg < 4; reg++)
4344 if (r[reg] >= 0
4345 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4346 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4349 return dst - orig;
4352 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4354 static bool
4355 encode_coding_iso_2022 (struct coding_system *coding)
4357 bool multibytep = coding->dst_multibyte;
4358 int *charbuf = coding->charbuf;
4359 int *charbuf_end = charbuf + coding->charbuf_used;
4360 unsigned char *dst = coding->destination + coding->produced;
4361 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4362 int safe_room = 16;
4363 bool bol_designation
4364 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4365 && CODING_ISO_BOL (coding));
4366 ptrdiff_t produced_chars = 0;
4367 Lisp_Object attrs, eol_type, charset_list;
4368 bool ascii_compatible;
4369 int c;
4370 int preferred_charset_id = -1;
4372 CODING_GET_INFO (coding, attrs, charset_list);
4373 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4374 if (VECTORP (eol_type))
4375 eol_type = Qunix;
4377 setup_iso_safe_charsets (attrs);
4378 /* Charset list may have been changed. */
4379 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4380 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4382 ascii_compatible
4383 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4384 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4385 | CODING_ISO_FLAG_LOCKING_SHIFT)));
4387 while (charbuf < charbuf_end)
4389 ASSURE_DESTINATION (safe_room);
4391 if (bol_designation)
4393 /* We have to produce designation sequences if any now. */
4394 unsigned char desig_buf[16];
4395 int nbytes;
4396 ptrdiff_t offset;
4398 charset_map_loaded = 0;
4399 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4400 desig_buf);
4401 if (charset_map_loaded
4402 && (offset = coding_change_destination (coding)))
4404 dst += offset;
4405 dst_end += offset;
4407 memcpy (dst, desig_buf, nbytes);
4408 dst += nbytes;
4409 /* We are sure that designation sequences are all ASCII bytes. */
4410 produced_chars += nbytes;
4411 bol_designation = 0;
4412 ASSURE_DESTINATION (safe_room);
4415 c = *charbuf++;
4417 if (c < 0)
4419 /* Handle an annotation. */
4420 switch (*charbuf)
4422 case CODING_ANNOTATE_COMPOSITION_MASK:
4423 /* Not yet implemented. */
4424 break;
4425 case CODING_ANNOTATE_CHARSET_MASK:
4426 preferred_charset_id = charbuf[2];
4427 if (preferred_charset_id >= 0
4428 && NILP (Fmemq (make_number (preferred_charset_id),
4429 charset_list)))
4430 preferred_charset_id = -1;
4431 break;
4432 default:
4433 emacs_abort ();
4435 charbuf += -c - 1;
4436 continue;
4439 /* Now encode the character C. */
4440 if (c < 0x20 || c == 0x7F)
4442 if (c == '\n'
4443 || (c == '\r' && EQ (eol_type, Qmac)))
4445 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4446 ENCODE_RESET_PLANE_AND_REGISTER ();
4447 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4449 int i;
4451 for (i = 0; i < 4; i++)
4452 CODING_ISO_DESIGNATION (coding, i)
4453 = CODING_ISO_INITIAL (coding, i);
4455 bol_designation = ((CODING_ISO_FLAGS (coding)
4456 & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4457 != 0);
4459 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4460 ENCODE_RESET_PLANE_AND_REGISTER ();
4461 EMIT_ONE_ASCII_BYTE (c);
4463 else if (ASCII_CHAR_P (c))
4465 if (ascii_compatible)
4466 EMIT_ONE_ASCII_BYTE (c);
4467 else
4469 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4470 ENCODE_ISO_CHARACTER (charset, c);
4473 else if (CHAR_BYTE8_P (c))
4475 c = CHAR_TO_BYTE8 (c);
4476 EMIT_ONE_BYTE (c);
4478 else
4480 struct charset *charset;
4482 if (preferred_charset_id >= 0)
4484 bool result;
4486 charset = CHARSET_FROM_ID (preferred_charset_id);
4487 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4488 if (! result)
4489 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4490 NULL, charset);
4492 else
4493 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4494 NULL, charset);
4495 if (!charset)
4497 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4499 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4500 charset = CHARSET_FROM_ID (charset_ascii);
4502 else
4504 c = coding->default_char;
4505 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4506 charset_list, NULL, charset);
4509 ENCODE_ISO_CHARACTER (charset, c);
4513 if (coding->mode & CODING_MODE_LAST_BLOCK
4514 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4516 ASSURE_DESTINATION (safe_room);
4517 ENCODE_RESET_PLANE_AND_REGISTER ();
4519 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4520 CODING_ISO_BOL (coding) = bol_designation;
4521 coding->produced_char += produced_chars;
4522 coding->produced = dst - coding->destination;
4523 return 0;
4527 /*** 8,9. SJIS and BIG5 handlers ***/
4529 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4530 quite widely. So, for the moment, Emacs supports them in the bare
4531 C code. But, in the future, they may be supported only by CCL. */
4533 /* SJIS is a coding system encoding three character sets: ASCII, right
4534 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4535 as is. A character of charset katakana-jisx0201 is encoded by
4536 "position-code + 0x80". A character of charset japanese-jisx0208
4537 is encoded in 2-byte but two position-codes are divided and shifted
4538 so that it fit in the range below.
4540 --- CODE RANGE of SJIS ---
4541 (character set) (range)
4542 ASCII 0x00 .. 0x7F
4543 KATAKANA-JISX0201 0xA0 .. 0xDF
4544 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4545 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4546 -------------------------------
4550 /* BIG5 is a coding system encoding two character sets: ASCII and
4551 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4552 character set and is encoded in two-byte.
4554 --- CODE RANGE of BIG5 ---
4555 (character set) (range)
4556 ASCII 0x00 .. 0x7F
4557 Big5 (1st byte) 0xA1 .. 0xFE
4558 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4559 --------------------------
4563 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4564 Return true if a text is encoded in SJIS. */
4566 static bool
4567 detect_coding_sjis (struct coding_system *coding,
4568 struct coding_detection_info *detect_info)
4570 const unsigned char *src = coding->source, *src_base;
4571 const unsigned char *src_end = coding->source + coding->src_bytes;
4572 bool multibytep = coding->src_multibyte;
4573 ptrdiff_t consumed_chars = 0;
4574 int found = 0;
4575 int c;
4576 Lisp_Object attrs, charset_list;
4577 int max_first_byte_of_2_byte_code;
4579 CODING_GET_INFO (coding, attrs, charset_list);
4580 max_first_byte_of_2_byte_code
4581 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4583 detect_info->checked |= CATEGORY_MASK_SJIS;
4584 /* A coding system of this category is always ASCII compatible. */
4585 src += coding->head_ascii;
4587 while (1)
4589 src_base = src;
4590 ONE_MORE_BYTE (c);
4591 if (c < 0x80)
4592 continue;
4593 if ((c >= 0x81 && c <= 0x9F)
4594 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4596 ONE_MORE_BYTE (c);
4597 if (c < 0x40 || c == 0x7F || c > 0xFC)
4598 break;
4599 found = CATEGORY_MASK_SJIS;
4601 else if (c >= 0xA0 && c < 0xE0)
4602 found = CATEGORY_MASK_SJIS;
4603 else
4604 break;
4606 detect_info->rejected |= CATEGORY_MASK_SJIS;
4607 return 0;
4609 no_more_source:
4610 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4612 detect_info->rejected |= CATEGORY_MASK_SJIS;
4613 return 0;
4615 detect_info->found |= found;
4616 return 1;
4619 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4620 Return true if a text is encoded in BIG5. */
4622 static bool
4623 detect_coding_big5 (struct coding_system *coding,
4624 struct coding_detection_info *detect_info)
4626 const unsigned char *src = coding->source, *src_base;
4627 const unsigned char *src_end = coding->source + coding->src_bytes;
4628 bool multibytep = coding->src_multibyte;
4629 ptrdiff_t consumed_chars = 0;
4630 int found = 0;
4631 int c;
4633 detect_info->checked |= CATEGORY_MASK_BIG5;
4634 /* A coding system of this category is always ASCII compatible. */
4635 src += coding->head_ascii;
4637 while (1)
4639 src_base = src;
4640 ONE_MORE_BYTE (c);
4641 if (c < 0x80)
4642 continue;
4643 if (c >= 0xA1)
4645 ONE_MORE_BYTE (c);
4646 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4647 return 0;
4648 found = CATEGORY_MASK_BIG5;
4650 else
4651 break;
4653 detect_info->rejected |= CATEGORY_MASK_BIG5;
4654 return 0;
4656 no_more_source:
4657 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4659 detect_info->rejected |= CATEGORY_MASK_BIG5;
4660 return 0;
4662 detect_info->found |= found;
4663 return 1;
4666 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4668 static void
4669 decode_coding_sjis (struct coding_system *coding)
4671 const unsigned char *src = coding->source + coding->consumed;
4672 const unsigned char *src_end = coding->source + coding->src_bytes;
4673 const unsigned char *src_base;
4674 int *charbuf = coding->charbuf + coding->charbuf_used;
4675 /* We may produce one charset annotation in one loop and one more at
4676 the end. */
4677 int *charbuf_end
4678 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4679 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4680 bool multibytep = coding->src_multibyte;
4681 struct charset *charset_roman, *charset_kanji, *charset_kana;
4682 struct charset *charset_kanji2;
4683 Lisp_Object attrs, charset_list, val;
4684 ptrdiff_t char_offset = coding->produced_char;
4685 ptrdiff_t last_offset = char_offset;
4686 int last_id = charset_ascii;
4687 bool eol_dos
4688 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4689 int byte_after_cr = -1;
4691 CODING_GET_INFO (coding, attrs, charset_list);
4693 val = charset_list;
4694 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4695 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4696 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4697 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4699 while (1)
4701 int c, c1;
4702 struct charset *charset;
4704 src_base = src;
4705 consumed_chars_base = consumed_chars;
4707 if (charbuf >= charbuf_end)
4709 if (byte_after_cr >= 0)
4710 src_base--;
4711 break;
4714 if (byte_after_cr >= 0)
4715 c = byte_after_cr, byte_after_cr = -1;
4716 else
4717 ONE_MORE_BYTE (c);
4718 if (c < 0)
4719 goto invalid_code;
4720 if (c < 0x80)
4722 if (eol_dos && c == '\r')
4723 ONE_MORE_BYTE (byte_after_cr);
4724 charset = charset_roman;
4726 else if (c == 0x80 || c == 0xA0)
4727 goto invalid_code;
4728 else if (c >= 0xA1 && c <= 0xDF)
4730 /* SJIS -> JISX0201-Kana */
4731 c &= 0x7F;
4732 charset = charset_kana;
4734 else if (c <= 0xEF)
4736 /* SJIS -> JISX0208 */
4737 ONE_MORE_BYTE (c1);
4738 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4739 goto invalid_code;
4740 c = (c << 8) | c1;
4741 SJIS_TO_JIS (c);
4742 charset = charset_kanji;
4744 else if (c <= 0xFC && charset_kanji2)
4746 /* SJIS -> JISX0213-2 */
4747 ONE_MORE_BYTE (c1);
4748 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4749 goto invalid_code;
4750 c = (c << 8) | c1;
4751 SJIS_TO_JIS2 (c);
4752 charset = charset_kanji2;
4754 else
4755 goto invalid_code;
4756 if (charset->id != charset_ascii
4757 && last_id != charset->id)
4759 if (last_id != charset_ascii)
4760 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4761 last_id = charset->id;
4762 last_offset = char_offset;
4764 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4765 *charbuf++ = c;
4766 char_offset++;
4767 continue;
4769 invalid_code:
4770 src = src_base;
4771 consumed_chars = consumed_chars_base;
4772 ONE_MORE_BYTE (c);
4773 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4774 char_offset++;
4775 coding->errors++;
4778 no_more_source:
4779 if (last_id != charset_ascii)
4780 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4781 coding->consumed_char += consumed_chars_base;
4782 coding->consumed = src_base - coding->source;
4783 coding->charbuf_used = charbuf - coding->charbuf;
4786 static void
4787 decode_coding_big5 (struct coding_system *coding)
4789 const unsigned char *src = coding->source + coding->consumed;
4790 const unsigned char *src_end = coding->source + coding->src_bytes;
4791 const unsigned char *src_base;
4792 int *charbuf = coding->charbuf + coding->charbuf_used;
4793 /* We may produce one charset annotation in one loop and one more at
4794 the end. */
4795 int *charbuf_end
4796 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4797 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4798 bool multibytep = coding->src_multibyte;
4799 struct charset *charset_roman, *charset_big5;
4800 Lisp_Object attrs, charset_list, val;
4801 ptrdiff_t char_offset = coding->produced_char;
4802 ptrdiff_t last_offset = char_offset;
4803 int last_id = charset_ascii;
4804 bool eol_dos
4805 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4806 int byte_after_cr = -1;
4808 CODING_GET_INFO (coding, attrs, charset_list);
4809 val = charset_list;
4810 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4811 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4813 while (1)
4815 int c, c1;
4816 struct charset *charset;
4818 src_base = src;
4819 consumed_chars_base = consumed_chars;
4821 if (charbuf >= charbuf_end)
4823 if (byte_after_cr >= 0)
4824 src_base--;
4825 break;
4828 if (byte_after_cr >= 0)
4829 c = byte_after_cr, byte_after_cr = -1;
4830 else
4831 ONE_MORE_BYTE (c);
4833 if (c < 0)
4834 goto invalid_code;
4835 if (c < 0x80)
4837 if (eol_dos && c == '\r')
4838 ONE_MORE_BYTE (byte_after_cr);
4839 charset = charset_roman;
4841 else
4843 /* BIG5 -> Big5 */
4844 if (c < 0xA1 || c > 0xFE)
4845 goto invalid_code;
4846 ONE_MORE_BYTE (c1);
4847 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4848 goto invalid_code;
4849 c = c << 8 | c1;
4850 charset = charset_big5;
4852 if (charset->id != charset_ascii
4853 && last_id != charset->id)
4855 if (last_id != charset_ascii)
4856 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4857 last_id = charset->id;
4858 last_offset = char_offset;
4860 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4861 *charbuf++ = c;
4862 char_offset++;
4863 continue;
4865 invalid_code:
4866 src = src_base;
4867 consumed_chars = consumed_chars_base;
4868 ONE_MORE_BYTE (c);
4869 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4870 char_offset++;
4871 coding->errors++;
4874 no_more_source:
4875 if (last_id != charset_ascii)
4876 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4877 coding->consumed_char += consumed_chars_base;
4878 coding->consumed = src_base - coding->source;
4879 coding->charbuf_used = charbuf - coding->charbuf;
4882 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4883 This function can encode charsets `ascii', `katakana-jisx0201',
4884 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4885 are sure that all these charsets are registered as official charset
4886 (i.e. do not have extended leading-codes). Characters of other
4887 charsets are produced without any encoding. */
4889 static bool
4890 encode_coding_sjis (struct coding_system *coding)
4892 bool multibytep = coding->dst_multibyte;
4893 int *charbuf = coding->charbuf;
4894 int *charbuf_end = charbuf + coding->charbuf_used;
4895 unsigned char *dst = coding->destination + coding->produced;
4896 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4897 int safe_room = 4;
4898 ptrdiff_t produced_chars = 0;
4899 Lisp_Object attrs, charset_list, val;
4900 bool ascii_compatible;
4901 struct charset *charset_kanji, *charset_kana;
4902 struct charset *charset_kanji2;
4903 int c;
4905 CODING_GET_INFO (coding, attrs, charset_list);
4906 val = XCDR (charset_list);
4907 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4908 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4909 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4911 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4913 while (charbuf < charbuf_end)
4915 ASSURE_DESTINATION (safe_room);
4916 c = *charbuf++;
4917 /* Now encode the character C. */
4918 if (ASCII_CHAR_P (c) && ascii_compatible)
4919 EMIT_ONE_ASCII_BYTE (c);
4920 else if (CHAR_BYTE8_P (c))
4922 c = CHAR_TO_BYTE8 (c);
4923 EMIT_ONE_BYTE (c);
4925 else
4927 unsigned code;
4928 struct charset *charset;
4929 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4930 &code, charset);
4932 if (!charset)
4934 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4936 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4937 charset = CHARSET_FROM_ID (charset_ascii);
4939 else
4941 c = coding->default_char;
4942 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4943 charset_list, &code, charset);
4946 if (code == CHARSET_INVALID_CODE (charset))
4947 emacs_abort ();
4948 if (charset == charset_kanji)
4950 int c1, c2;
4951 JIS_TO_SJIS (code);
4952 c1 = code >> 8, c2 = code & 0xFF;
4953 EMIT_TWO_BYTES (c1, c2);
4955 else if (charset == charset_kana)
4956 EMIT_ONE_BYTE (code | 0x80);
4957 else if (charset_kanji2 && charset == charset_kanji2)
4959 int c1, c2;
4961 c1 = code >> 8;
4962 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4963 || c1 == 0x28
4964 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4966 JIS_TO_SJIS2 (code);
4967 c1 = code >> 8, c2 = code & 0xFF;
4968 EMIT_TWO_BYTES (c1, c2);
4970 else
4971 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4973 else
4974 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4977 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4978 coding->produced_char += produced_chars;
4979 coding->produced = dst - coding->destination;
4980 return 0;
4983 static bool
4984 encode_coding_big5 (struct coding_system *coding)
4986 bool multibytep = coding->dst_multibyte;
4987 int *charbuf = coding->charbuf;
4988 int *charbuf_end = charbuf + coding->charbuf_used;
4989 unsigned char *dst = coding->destination + coding->produced;
4990 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4991 int safe_room = 4;
4992 ptrdiff_t produced_chars = 0;
4993 Lisp_Object attrs, charset_list, val;
4994 bool ascii_compatible;
4995 struct charset *charset_big5;
4996 int c;
4998 CODING_GET_INFO (coding, attrs, charset_list);
4999 val = XCDR (charset_list);
5000 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5001 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5003 while (charbuf < charbuf_end)
5005 ASSURE_DESTINATION (safe_room);
5006 c = *charbuf++;
5007 /* Now encode the character C. */
5008 if (ASCII_CHAR_P (c) && ascii_compatible)
5009 EMIT_ONE_ASCII_BYTE (c);
5010 else if (CHAR_BYTE8_P (c))
5012 c = CHAR_TO_BYTE8 (c);
5013 EMIT_ONE_BYTE (c);
5015 else
5017 unsigned code;
5018 struct charset *charset;
5019 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5020 &code, charset);
5022 if (! charset)
5024 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5026 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5027 charset = CHARSET_FROM_ID (charset_ascii);
5029 else
5031 c = coding->default_char;
5032 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5033 charset_list, &code, charset);
5036 if (code == CHARSET_INVALID_CODE (charset))
5037 emacs_abort ();
5038 if (charset == charset_big5)
5040 int c1, c2;
5042 c1 = code >> 8, c2 = code & 0xFF;
5043 EMIT_TWO_BYTES (c1, c2);
5045 else
5046 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5049 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5050 coding->produced_char += produced_chars;
5051 coding->produced = dst - coding->destination;
5052 return 0;
5056 /*** 10. CCL handlers ***/
5058 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5059 Return true if a text is encoded in a coding system of which
5060 encoder/decoder are written in CCL program. */
5062 static bool
5063 detect_coding_ccl (struct coding_system *coding,
5064 struct coding_detection_info *detect_info)
5066 const unsigned char *src = coding->source, *src_base;
5067 const unsigned char *src_end = coding->source + coding->src_bytes;
5068 bool multibytep = coding->src_multibyte;
5069 ptrdiff_t consumed_chars = 0;
5070 int found = 0;
5071 unsigned char *valids;
5072 ptrdiff_t head_ascii = coding->head_ascii;
5073 Lisp_Object attrs;
5075 detect_info->checked |= CATEGORY_MASK_CCL;
5077 coding = &coding_categories[coding_category_ccl];
5078 valids = CODING_CCL_VALIDS (coding);
5079 attrs = CODING_ID_ATTRS (coding->id);
5080 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5081 src += head_ascii;
5083 while (1)
5085 int c;
5087 src_base = src;
5088 ONE_MORE_BYTE (c);
5089 if (c < 0 || ! valids[c])
5090 break;
5091 if ((valids[c] > 1))
5092 found = CATEGORY_MASK_CCL;
5094 detect_info->rejected |= CATEGORY_MASK_CCL;
5095 return 0;
5097 no_more_source:
5098 detect_info->found |= found;
5099 return 1;
5102 static void
5103 decode_coding_ccl (struct coding_system *coding)
5105 const unsigned char *src = coding->source + coding->consumed;
5106 const unsigned char *src_end = coding->source + coding->src_bytes;
5107 int *charbuf = coding->charbuf + coding->charbuf_used;
5108 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5109 ptrdiff_t consumed_chars = 0;
5110 bool multibytep = coding->src_multibyte;
5111 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5112 int source_charbuf[1024];
5113 int source_byteidx[1025];
5114 Lisp_Object attrs, charset_list;
5116 CODING_GET_INFO (coding, attrs, charset_list);
5118 while (1)
5120 const unsigned char *p = src;
5121 ptrdiff_t offset;
5122 int i = 0;
5124 if (multibytep)
5126 while (i < 1024 && p < src_end)
5128 source_byteidx[i] = p - src;
5129 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5131 source_byteidx[i] = p - src;
5133 else
5134 while (i < 1024 && p < src_end)
5135 source_charbuf[i++] = *p++;
5137 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5138 ccl->last_block = 1;
5139 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5140 charset_map_loaded = 0;
5141 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5142 charset_list);
5143 if (charset_map_loaded
5144 && (offset = coding_change_source (coding)))
5146 p += offset;
5147 src += offset;
5148 src_end += offset;
5150 charbuf += ccl->produced;
5151 if (multibytep)
5152 src += source_byteidx[ccl->consumed];
5153 else
5154 src += ccl->consumed;
5155 consumed_chars += ccl->consumed;
5156 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5157 break;
5160 switch (ccl->status)
5162 case CCL_STAT_SUSPEND_BY_SRC:
5163 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5164 break;
5165 case CCL_STAT_SUSPEND_BY_DST:
5166 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5167 break;
5168 case CCL_STAT_QUIT:
5169 case CCL_STAT_INVALID_CMD:
5170 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5171 break;
5172 default:
5173 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5174 break;
5176 coding->consumed_char += consumed_chars;
5177 coding->consumed = src - coding->source;
5178 coding->charbuf_used = charbuf - coding->charbuf;
5181 static bool
5182 encode_coding_ccl (struct coding_system *coding)
5184 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5185 bool multibytep = coding->dst_multibyte;
5186 int *charbuf = coding->charbuf;
5187 int *charbuf_end = charbuf + coding->charbuf_used;
5188 unsigned char *dst = coding->destination + coding->produced;
5189 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5190 int destination_charbuf[1024];
5191 ptrdiff_t produced_chars = 0;
5192 int i;
5193 Lisp_Object attrs, charset_list;
5195 CODING_GET_INFO (coding, attrs, charset_list);
5196 if (coding->consumed_char == coding->src_chars
5197 && coding->mode & CODING_MODE_LAST_BLOCK)
5198 ccl->last_block = 1;
5202 ptrdiff_t offset;
5204 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5205 charset_map_loaded = 0;
5206 ccl_driver (ccl, charbuf, destination_charbuf,
5207 charbuf_end - charbuf, 1024, charset_list);
5208 if (charset_map_loaded
5209 && (offset = coding_change_destination (coding)))
5210 dst += offset;
5211 if (multibytep)
5213 ASSURE_DESTINATION (ccl->produced * 2);
5214 for (i = 0; i < ccl->produced; i++)
5215 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5217 else
5219 ASSURE_DESTINATION (ccl->produced);
5220 for (i = 0; i < ccl->produced; i++)
5221 *dst++ = destination_charbuf[i] & 0xFF;
5222 produced_chars += ccl->produced;
5224 charbuf += ccl->consumed;
5225 if (ccl->status == CCL_STAT_QUIT
5226 || ccl->status == CCL_STAT_INVALID_CMD)
5227 break;
5229 while (charbuf < charbuf_end);
5231 switch (ccl->status)
5233 case CCL_STAT_SUSPEND_BY_SRC:
5234 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5235 break;
5236 case CCL_STAT_SUSPEND_BY_DST:
5237 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5238 break;
5239 case CCL_STAT_QUIT:
5240 case CCL_STAT_INVALID_CMD:
5241 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5242 break;
5243 default:
5244 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5245 break;
5248 coding->produced_char += produced_chars;
5249 coding->produced = dst - coding->destination;
5250 return 0;
5254 /*** 10, 11. no-conversion handlers ***/
5256 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5258 static void
5259 decode_coding_raw_text (struct coding_system *coding)
5261 bool eol_dos
5262 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5264 coding->chars_at_source = 1;
5265 coding->consumed_char = coding->src_chars;
5266 coding->consumed = coding->src_bytes;
5267 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5269 coding->consumed_char--;
5270 coding->consumed--;
5271 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5273 else
5274 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5277 static bool
5278 encode_coding_raw_text (struct coding_system *coding)
5280 bool multibytep = coding->dst_multibyte;
5281 int *charbuf = coding->charbuf;
5282 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5283 unsigned char *dst = coding->destination + coding->produced;
5284 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5285 ptrdiff_t produced_chars = 0;
5286 int c;
5288 if (multibytep)
5290 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5292 if (coding->src_multibyte)
5293 while (charbuf < charbuf_end)
5295 ASSURE_DESTINATION (safe_room);
5296 c = *charbuf++;
5297 if (ASCII_CHAR_P (c))
5298 EMIT_ONE_ASCII_BYTE (c);
5299 else if (CHAR_BYTE8_P (c))
5301 c = CHAR_TO_BYTE8 (c);
5302 EMIT_ONE_BYTE (c);
5304 else
5306 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5308 CHAR_STRING_ADVANCE (c, p1);
5311 EMIT_ONE_BYTE (*p0);
5312 p0++;
5314 while (p0 < p1);
5317 else
5318 while (charbuf < charbuf_end)
5320 ASSURE_DESTINATION (safe_room);
5321 c = *charbuf++;
5322 EMIT_ONE_BYTE (c);
5325 else
5327 if (coding->src_multibyte)
5329 int safe_room = MAX_MULTIBYTE_LENGTH;
5331 while (charbuf < charbuf_end)
5333 ASSURE_DESTINATION (safe_room);
5334 c = *charbuf++;
5335 if (ASCII_CHAR_P (c))
5336 *dst++ = c;
5337 else if (CHAR_BYTE8_P (c))
5338 *dst++ = CHAR_TO_BYTE8 (c);
5339 else
5340 CHAR_STRING_ADVANCE (c, dst);
5343 else
5345 ASSURE_DESTINATION (charbuf_end - charbuf);
5346 while (charbuf < charbuf_end && dst < dst_end)
5347 *dst++ = *charbuf++;
5349 produced_chars = dst - (coding->destination + coding->produced);
5351 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5352 coding->produced_char += produced_chars;
5353 coding->produced = dst - coding->destination;
5354 return 0;
5357 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5358 Return true if a text is encoded in a charset-based coding system. */
5360 static bool
5361 detect_coding_charset (struct coding_system *coding,
5362 struct coding_detection_info *detect_info)
5364 const unsigned char *src = coding->source, *src_base;
5365 const unsigned char *src_end = coding->source + coding->src_bytes;
5366 bool multibytep = coding->src_multibyte;
5367 ptrdiff_t consumed_chars = 0;
5368 Lisp_Object attrs, valids, name;
5369 int found = 0;
5370 ptrdiff_t head_ascii = coding->head_ascii;
5371 bool check_latin_extra = 0;
5373 detect_info->checked |= CATEGORY_MASK_CHARSET;
5375 coding = &coding_categories[coding_category_charset];
5376 attrs = CODING_ID_ATTRS (coding->id);
5377 valids = AREF (attrs, coding_attr_charset_valids);
5378 name = CODING_ID_NAME (coding->id);
5379 if (strncmp (SSDATA (SYMBOL_NAME (name)),
5380 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5381 || strncmp (SSDATA (SYMBOL_NAME (name)),
5382 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5383 check_latin_extra = 1;
5385 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5386 src += head_ascii;
5388 while (1)
5390 int c;
5391 Lisp_Object val;
5392 struct charset *charset;
5393 int dim, idx;
5395 src_base = src;
5396 ONE_MORE_BYTE (c);
5397 if (c < 0)
5398 continue;
5399 val = AREF (valids, c);
5400 if (NILP (val))
5401 break;
5402 if (c >= 0x80)
5404 if (c < 0xA0
5405 && check_latin_extra
5406 && (!VECTORP (Vlatin_extra_code_table)
5407 || NILP (AREF (Vlatin_extra_code_table, c))))
5408 break;
5409 found = CATEGORY_MASK_CHARSET;
5411 if (INTEGERP (val))
5413 charset = CHARSET_FROM_ID (XFASTINT (val));
5414 dim = CHARSET_DIMENSION (charset);
5415 for (idx = 1; idx < dim; idx++)
5417 if (src == src_end)
5418 goto too_short;
5419 ONE_MORE_BYTE (c);
5420 if (c < charset->code_space[(dim - 1 - idx) * 4]
5421 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5422 break;
5424 if (idx < dim)
5425 break;
5427 else
5429 idx = 1;
5430 for (; CONSP (val); val = XCDR (val))
5432 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5433 dim = CHARSET_DIMENSION (charset);
5434 while (idx < dim)
5436 if (src == src_end)
5437 goto too_short;
5438 ONE_MORE_BYTE (c);
5439 if (c < charset->code_space[(dim - 1 - idx) * 4]
5440 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5441 break;
5442 idx++;
5444 if (idx == dim)
5446 val = Qnil;
5447 break;
5450 if (CONSP (val))
5451 break;
5454 too_short:
5455 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5456 return 0;
5458 no_more_source:
5459 detect_info->found |= found;
5460 return 1;
5463 static void
5464 decode_coding_charset (struct coding_system *coding)
5466 const unsigned char *src = coding->source + coding->consumed;
5467 const unsigned char *src_end = coding->source + coding->src_bytes;
5468 const unsigned char *src_base;
5469 int *charbuf = coding->charbuf + coding->charbuf_used;
5470 /* We may produce one charset annotation in one loop and one more at
5471 the end. */
5472 int *charbuf_end
5473 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5474 ptrdiff_t consumed_chars = 0, consumed_chars_base;
5475 bool multibytep = coding->src_multibyte;
5476 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5477 Lisp_Object valids;
5478 ptrdiff_t char_offset = coding->produced_char;
5479 ptrdiff_t last_offset = char_offset;
5480 int last_id = charset_ascii;
5481 bool eol_dos
5482 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5483 int byte_after_cr = -1;
5485 valids = AREF (attrs, coding_attr_charset_valids);
5487 while (1)
5489 int c;
5490 Lisp_Object val;
5491 struct charset *charset;
5492 int dim;
5493 int len = 1;
5494 unsigned code;
5496 src_base = src;
5497 consumed_chars_base = consumed_chars;
5499 if (charbuf >= charbuf_end)
5501 if (byte_after_cr >= 0)
5502 src_base--;
5503 break;
5506 if (byte_after_cr >= 0)
5508 c = byte_after_cr;
5509 byte_after_cr = -1;
5511 else
5513 ONE_MORE_BYTE (c);
5514 if (eol_dos && c == '\r')
5515 ONE_MORE_BYTE (byte_after_cr);
5517 if (c < 0)
5518 goto invalid_code;
5519 code = c;
5521 val = AREF (valids, c);
5522 if (! INTEGERP (val) && ! CONSP (val))
5523 goto invalid_code;
5524 if (INTEGERP (val))
5526 charset = CHARSET_FROM_ID (XFASTINT (val));
5527 dim = CHARSET_DIMENSION (charset);
5528 while (len < dim)
5530 ONE_MORE_BYTE (c);
5531 code = (code << 8) | c;
5532 len++;
5534 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5535 charset, code, c);
5537 else
5539 /* VAL is a list of charset IDs. It is assured that the
5540 list is sorted by charset dimensions (smaller one
5541 comes first). */
5542 while (CONSP (val))
5544 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5545 dim = CHARSET_DIMENSION (charset);
5546 while (len < dim)
5548 ONE_MORE_BYTE (c);
5549 code = (code << 8) | c;
5550 len++;
5552 CODING_DECODE_CHAR (coding, src, src_base,
5553 src_end, charset, code, c);
5554 if (c >= 0)
5555 break;
5556 val = XCDR (val);
5559 if (c < 0)
5560 goto invalid_code;
5561 if (charset->id != charset_ascii
5562 && last_id != charset->id)
5564 if (last_id != charset_ascii)
5565 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5566 last_id = charset->id;
5567 last_offset = char_offset;
5570 *charbuf++ = c;
5571 char_offset++;
5572 continue;
5574 invalid_code:
5575 src = src_base;
5576 consumed_chars = consumed_chars_base;
5577 ONE_MORE_BYTE (c);
5578 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5579 char_offset++;
5580 coding->errors++;
5583 no_more_source:
5584 if (last_id != charset_ascii)
5585 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5586 coding->consumed_char += consumed_chars_base;
5587 coding->consumed = src_base - coding->source;
5588 coding->charbuf_used = charbuf - coding->charbuf;
5591 static bool
5592 encode_coding_charset (struct coding_system *coding)
5594 bool multibytep = coding->dst_multibyte;
5595 int *charbuf = coding->charbuf;
5596 int *charbuf_end = charbuf + coding->charbuf_used;
5597 unsigned char *dst = coding->destination + coding->produced;
5598 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5599 int safe_room = MAX_MULTIBYTE_LENGTH;
5600 ptrdiff_t produced_chars = 0;
5601 Lisp_Object attrs, charset_list;
5602 bool ascii_compatible;
5603 int c;
5605 CODING_GET_INFO (coding, attrs, charset_list);
5606 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5608 while (charbuf < charbuf_end)
5610 struct charset *charset;
5611 unsigned code;
5613 ASSURE_DESTINATION (safe_room);
5614 c = *charbuf++;
5615 if (ascii_compatible && ASCII_CHAR_P (c))
5616 EMIT_ONE_ASCII_BYTE (c);
5617 else if (CHAR_BYTE8_P (c))
5619 c = CHAR_TO_BYTE8 (c);
5620 EMIT_ONE_BYTE (c);
5622 else
5624 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5625 &code, charset);
5627 if (charset)
5629 if (CHARSET_DIMENSION (charset) == 1)
5630 EMIT_ONE_BYTE (code);
5631 else if (CHARSET_DIMENSION (charset) == 2)
5632 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5633 else if (CHARSET_DIMENSION (charset) == 3)
5634 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5635 else
5636 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5637 (code >> 8) & 0xFF, code & 0xFF);
5639 else
5641 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5642 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5643 else
5644 c = coding->default_char;
5645 EMIT_ONE_BYTE (c);
5650 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5651 coding->produced_char += produced_chars;
5652 coding->produced = dst - coding->destination;
5653 return 0;
5657 /*** 7. C library functions ***/
5659 /* Setup coding context CODING from information about CODING_SYSTEM.
5660 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5661 CODING_SYSTEM is invalid, signal an error. */
5663 void
5664 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5666 Lisp_Object attrs;
5667 Lisp_Object eol_type;
5668 Lisp_Object coding_type;
5669 Lisp_Object val;
5671 if (NILP (coding_system))
5672 coding_system = Qundecided;
5674 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5676 attrs = CODING_ID_ATTRS (coding->id);
5677 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5679 coding->mode = 0;
5680 if (VECTORP (eol_type))
5681 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5682 | CODING_REQUIRE_DETECTION_MASK);
5683 else if (! EQ (eol_type, Qunix))
5684 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5685 | CODING_REQUIRE_ENCODING_MASK);
5686 else
5687 coding->common_flags = 0;
5688 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5689 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5690 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5691 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5692 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5693 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5695 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5696 coding->max_charset_id = SCHARS (val) - 1;
5697 coding->safe_charsets = SDATA (val);
5698 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5699 coding->carryover_bytes = 0;
5701 coding_type = CODING_ATTR_TYPE (attrs);
5702 if (EQ (coding_type, Qundecided))
5704 coding->detector = NULL;
5705 coding->decoder = decode_coding_raw_text;
5706 coding->encoder = encode_coding_raw_text;
5707 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5709 else if (EQ (coding_type, Qiso_2022))
5711 int i;
5712 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5714 /* Invoke graphic register 0 to plane 0. */
5715 CODING_ISO_INVOCATION (coding, 0) = 0;
5716 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5717 CODING_ISO_INVOCATION (coding, 1)
5718 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5719 /* Setup the initial status of designation. */
5720 for (i = 0; i < 4; i++)
5721 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5722 /* Not single shifting initially. */
5723 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5724 /* Beginning of buffer should also be regarded as bol. */
5725 CODING_ISO_BOL (coding) = 1;
5726 coding->detector = detect_coding_iso_2022;
5727 coding->decoder = decode_coding_iso_2022;
5728 coding->encoder = encode_coding_iso_2022;
5729 if (flags & CODING_ISO_FLAG_SAFE)
5730 coding->mode |= CODING_MODE_SAFE_ENCODING;
5731 coding->common_flags
5732 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5733 | CODING_REQUIRE_FLUSHING_MASK);
5734 if (flags & CODING_ISO_FLAG_COMPOSITION)
5735 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5736 if (flags & CODING_ISO_FLAG_DESIGNATION)
5737 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5738 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5740 setup_iso_safe_charsets (attrs);
5741 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5742 coding->max_charset_id = SCHARS (val) - 1;
5743 coding->safe_charsets = SDATA (val);
5745 CODING_ISO_FLAGS (coding) = flags;
5746 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5747 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5748 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5749 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5751 else if (EQ (coding_type, Qcharset))
5753 coding->detector = detect_coding_charset;
5754 coding->decoder = decode_coding_charset;
5755 coding->encoder = encode_coding_charset;
5756 coding->common_flags
5757 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5759 else if (EQ (coding_type, Qutf_8))
5761 val = AREF (attrs, coding_attr_utf_bom);
5762 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5763 : EQ (val, Qt) ? utf_with_bom
5764 : utf_without_bom);
5765 coding->detector = detect_coding_utf_8;
5766 coding->decoder = decode_coding_utf_8;
5767 coding->encoder = encode_coding_utf_8;
5768 coding->common_flags
5769 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5770 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5771 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5773 else if (EQ (coding_type, Qutf_16))
5775 val = AREF (attrs, coding_attr_utf_bom);
5776 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5777 : EQ (val, Qt) ? utf_with_bom
5778 : utf_without_bom);
5779 val = AREF (attrs, coding_attr_utf_16_endian);
5780 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5781 : utf_16_little_endian);
5782 CODING_UTF_16_SURROGATE (coding) = 0;
5783 coding->detector = detect_coding_utf_16;
5784 coding->decoder = decode_coding_utf_16;
5785 coding->encoder = encode_coding_utf_16;
5786 coding->common_flags
5787 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5788 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5789 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5791 else if (EQ (coding_type, Qccl))
5793 coding->detector = detect_coding_ccl;
5794 coding->decoder = decode_coding_ccl;
5795 coding->encoder = encode_coding_ccl;
5796 coding->common_flags
5797 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5798 | CODING_REQUIRE_FLUSHING_MASK);
5800 else if (EQ (coding_type, Qemacs_mule))
5802 coding->detector = detect_coding_emacs_mule;
5803 coding->decoder = decode_coding_emacs_mule;
5804 coding->encoder = encode_coding_emacs_mule;
5805 coding->common_flags
5806 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5807 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5808 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5810 Lisp_Object tail, safe_charsets;
5811 int max_charset_id = 0;
5813 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5814 tail = XCDR (tail))
5815 if (max_charset_id < XFASTINT (XCAR (tail)))
5816 max_charset_id = XFASTINT (XCAR (tail));
5817 safe_charsets = make_uninit_string (max_charset_id + 1);
5818 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5819 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5820 tail = XCDR (tail))
5821 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5822 coding->max_charset_id = max_charset_id;
5823 coding->safe_charsets = SDATA (safe_charsets);
5825 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5826 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5828 else if (EQ (coding_type, Qshift_jis))
5830 coding->detector = detect_coding_sjis;
5831 coding->decoder = decode_coding_sjis;
5832 coding->encoder = encode_coding_sjis;
5833 coding->common_flags
5834 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5836 else if (EQ (coding_type, Qbig5))
5838 coding->detector = detect_coding_big5;
5839 coding->decoder = decode_coding_big5;
5840 coding->encoder = encode_coding_big5;
5841 coding->common_flags
5842 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5844 else /* EQ (coding_type, Qraw_text) */
5846 coding->detector = NULL;
5847 coding->decoder = decode_coding_raw_text;
5848 coding->encoder = encode_coding_raw_text;
5849 if (! EQ (eol_type, Qunix))
5851 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5852 if (! VECTORP (eol_type))
5853 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5858 return;
5861 /* Return a list of charsets supported by CODING. */
5863 Lisp_Object
5864 coding_charset_list (struct coding_system *coding)
5866 Lisp_Object attrs, charset_list;
5868 CODING_GET_INFO (coding, attrs, charset_list);
5869 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5871 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5873 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5874 charset_list = Viso_2022_charset_list;
5876 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5878 charset_list = Vemacs_mule_charset_list;
5880 return charset_list;
5884 /* Return a list of charsets supported by CODING-SYSTEM. */
5886 Lisp_Object
5887 coding_system_charset_list (Lisp_Object coding_system)
5889 ptrdiff_t id;
5890 Lisp_Object attrs, charset_list;
5892 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5893 attrs = CODING_ID_ATTRS (id);
5895 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5897 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5899 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5900 charset_list = Viso_2022_charset_list;
5901 else
5902 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5904 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5906 charset_list = Vemacs_mule_charset_list;
5908 else
5910 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5912 return charset_list;
5916 /* Return raw-text or one of its subsidiaries that has the same
5917 eol_type as CODING-SYSTEM. */
5919 Lisp_Object
5920 raw_text_coding_system (Lisp_Object coding_system)
5922 Lisp_Object spec, attrs;
5923 Lisp_Object eol_type, raw_text_eol_type;
5925 if (NILP (coding_system))
5926 return Qraw_text;
5927 spec = CODING_SYSTEM_SPEC (coding_system);
5928 attrs = AREF (spec, 0);
5930 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5931 return coding_system;
5933 eol_type = AREF (spec, 2);
5934 if (VECTORP (eol_type))
5935 return Qraw_text;
5936 spec = CODING_SYSTEM_SPEC (Qraw_text);
5937 raw_text_eol_type = AREF (spec, 2);
5938 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5939 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5940 : AREF (raw_text_eol_type, 2));
5944 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5945 the subsidiary that has the same eol-spec as PARENT (if it is not
5946 nil and specifies end-of-line format) or the system's setting
5947 (system_eol_type). */
5949 Lisp_Object
5950 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5952 Lisp_Object spec, eol_type;
5954 if (NILP (coding_system))
5955 coding_system = Qraw_text;
5956 spec = CODING_SYSTEM_SPEC (coding_system);
5957 eol_type = AREF (spec, 2);
5958 if (VECTORP (eol_type))
5960 Lisp_Object parent_eol_type;
5962 if (! NILP (parent))
5964 Lisp_Object parent_spec;
5966 parent_spec = CODING_SYSTEM_SPEC (parent);
5967 parent_eol_type = AREF (parent_spec, 2);
5968 if (VECTORP (parent_eol_type))
5969 parent_eol_type = system_eol_type;
5971 else
5972 parent_eol_type = system_eol_type;
5973 if (EQ (parent_eol_type, Qunix))
5974 coding_system = AREF (eol_type, 0);
5975 else if (EQ (parent_eol_type, Qdos))
5976 coding_system = AREF (eol_type, 1);
5977 else if (EQ (parent_eol_type, Qmac))
5978 coding_system = AREF (eol_type, 2);
5980 return coding_system;
5984 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5985 decided for writing to a process. If not, complement them, and
5986 return a new coding system. */
5988 Lisp_Object
5989 complement_process_encoding_system (Lisp_Object coding_system)
5991 Lisp_Object coding_base = Qnil, eol_base = Qnil;
5992 Lisp_Object spec, attrs;
5993 int i;
5995 for (i = 0; i < 3; i++)
5997 if (i == 1)
5998 coding_system = CDR_SAFE (Vdefault_process_coding_system);
5999 else if (i == 2)
6000 coding_system = preferred_coding_system ();
6001 spec = CODING_SYSTEM_SPEC (coding_system);
6002 if (NILP (spec))
6003 continue;
6004 attrs = AREF (spec, 0);
6005 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6006 coding_base = CODING_ATTR_BASE_NAME (attrs);
6007 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6008 eol_base = coding_system;
6009 if (! NILP (coding_base) && ! NILP (eol_base))
6010 break;
6013 if (i > 0)
6014 /* The original CODING_SYSTEM didn't specify text-conversion or
6015 eol-conversion. Be sure that we return a fully complemented
6016 coding system. */
6017 coding_system = coding_inherit_eol_type (coding_base, eol_base);
6018 return coding_system;
6022 /* Emacs has a mechanism to automatically detect a coding system if it
6023 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6024 it's impossible to distinguish some coding systems accurately
6025 because they use the same range of codes. So, at first, coding
6026 systems are categorized into 7, those are:
6028 o coding-category-emacs-mule
6030 The category for a coding system which has the same code range
6031 as Emacs' internal format. Assigned the coding-system (Lisp
6032 symbol) `emacs-mule' by default.
6034 o coding-category-sjis
6036 The category for a coding system which has the same code range
6037 as SJIS. Assigned the coding-system (Lisp
6038 symbol) `japanese-shift-jis' by default.
6040 o coding-category-iso-7
6042 The category for a coding system which has the same code range
6043 as ISO2022 of 7-bit environment. This doesn't use any locking
6044 shift and single shift functions. This can encode/decode all
6045 charsets. Assigned the coding-system (Lisp symbol)
6046 `iso-2022-7bit' by default.
6048 o coding-category-iso-7-tight
6050 Same as coding-category-iso-7 except that this can
6051 encode/decode only the specified charsets.
6053 o coding-category-iso-8-1
6055 The category for a coding system which has the same code range
6056 as ISO2022 of 8-bit environment and graphic plane 1 used only
6057 for DIMENSION1 charset. This doesn't use any locking shift
6058 and single shift functions. Assigned the coding-system (Lisp
6059 symbol) `iso-latin-1' by default.
6061 o coding-category-iso-8-2
6063 The category for a coding system which has the same code range
6064 as ISO2022 of 8-bit environment and graphic plane 1 used only
6065 for DIMENSION2 charset. This doesn't use any locking shift
6066 and single shift functions. Assigned the coding-system (Lisp
6067 symbol) `japanese-iso-8bit' by default.
6069 o coding-category-iso-7-else
6071 The category for a coding system which has the same code range
6072 as ISO2022 of 7-bit environment but uses locking shift or
6073 single shift functions. Assigned the coding-system (Lisp
6074 symbol) `iso-2022-7bit-lock' by default.
6076 o coding-category-iso-8-else
6078 The category for a coding system which has the same code range
6079 as ISO2022 of 8-bit environment but uses locking shift or
6080 single shift functions. Assigned the coding-system (Lisp
6081 symbol) `iso-2022-8bit-ss2' by default.
6083 o coding-category-big5
6085 The category for a coding system which has the same code range
6086 as BIG5. Assigned the coding-system (Lisp symbol)
6087 `cn-big5' by default.
6089 o coding-category-utf-8
6091 The category for a coding system which has the same code range
6092 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6093 symbol) `utf-8' by default.
6095 o coding-category-utf-16-be
6097 The category for a coding system in which a text has an
6098 Unicode signature (cf. Unicode Standard) in the order of BIG
6099 endian at the head. Assigned the coding-system (Lisp symbol)
6100 `utf-16-be' by default.
6102 o coding-category-utf-16-le
6104 The category for a coding system in which a text has an
6105 Unicode signature (cf. Unicode Standard) in the order of
6106 LITTLE endian at the head. Assigned the coding-system (Lisp
6107 symbol) `utf-16-le' by default.
6109 o coding-category-ccl
6111 The category for a coding system of which encoder/decoder is
6112 written in CCL programs. The default value is nil, i.e., no
6113 coding system is assigned.
6115 o coding-category-binary
6117 The category for a coding system not categorized in any of the
6118 above. Assigned the coding-system (Lisp symbol)
6119 `no-conversion' by default.
6121 Each of them is a Lisp symbol and the value is an actual
6122 `coding-system's (this is also a Lisp symbol) assigned by a user.
6123 What Emacs does actually is to detect a category of coding system.
6124 Then, it uses a `coding-system' assigned to it. If Emacs can't
6125 decide only one possible category, it selects a category of the
6126 highest priority. Priorities of categories are also specified by a
6127 user in a Lisp variable `coding-category-list'.
6131 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6132 int eol_seen);
6135 /* Return the number of ASCII characters at the head of the source.
6136 By side effects, set coding->head_ascii and update
6137 coding->eol_seen. The value of coding->eol_seen is "logical or" of
6138 EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6139 reliable only when all the source bytes are ASCII. */
6141 static int
6142 check_ascii (struct coding_system *coding)
6144 const unsigned char *src, *end;
6145 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6146 int eol_seen = coding->eol_seen;
6148 coding_set_source (coding);
6149 src = coding->source;
6150 end = src + coding->src_bytes;
6152 if (inhibit_eol_conversion
6153 || SYMBOLP (eol_type))
6155 /* We don't have to check EOL format. */
6156 while (src < end && !( *src & 0x80))
6158 if (*src++ == '\n')
6159 eol_seen |= EOL_SEEN_LF;
6162 else
6164 end--; /* We look ahead one byte for "CR LF". */
6165 while (src < end)
6167 int c = *src;
6169 if (c & 0x80)
6170 break;
6171 src++;
6172 if (c == '\r')
6174 if (*src == '\n')
6176 eol_seen |= EOL_SEEN_CRLF;
6177 src++;
6179 else
6180 eol_seen |= EOL_SEEN_CR;
6182 else if (c == '\n')
6183 eol_seen |= EOL_SEEN_LF;
6185 if (src == end)
6187 int c = *src;
6189 /* All bytes but the last one C are ASCII. */
6190 if (! (c & 0x80))
6192 if (c == '\r')
6193 eol_seen |= EOL_SEEN_CR;
6194 else if (c == '\n')
6195 eol_seen |= EOL_SEEN_LF;
6196 src++;
6200 coding->head_ascii = src - coding->source;
6201 coding->eol_seen = eol_seen;
6202 return (coding->head_ascii);
6206 /* Return the number of characters at the source if all the bytes are
6207 valid UTF-8 (of Unicode range). Otherwise, return -1. By side
6208 effects, update coding->eol_seen. The value of coding->eol_seen is
6209 "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6210 the value is reliable only when all the source bytes are valid
6211 UTF-8. */
6213 static int
6214 check_utf_8 (struct coding_system *coding)
6216 const unsigned char *src, *end;
6217 int eol_seen;
6218 int nchars = coding->head_ascii;
6220 if (coding->head_ascii < 0)
6221 check_ascii (coding);
6222 else
6223 coding_set_source (coding);
6224 src = coding->source + coding->head_ascii;
6225 /* We look ahead one byte for CR LF. */
6226 end = coding->source + coding->src_bytes - 1;
6227 eol_seen = coding->eol_seen;
6228 while (src < end)
6230 int c = *src;
6232 if (UTF_8_1_OCTET_P (*src))
6234 src++;
6235 if (c < 0x20)
6237 if (c == '\r')
6239 if (*src == '\n')
6241 eol_seen |= EOL_SEEN_CRLF;
6242 src++;
6243 nchars++;
6245 else
6246 eol_seen |= EOL_SEEN_CR;
6248 else if (c == '\n')
6249 eol_seen |= EOL_SEEN_LF;
6252 else if (UTF_8_2_OCTET_LEADING_P (c))
6254 if (c < 0xC2 /* overlong sequence */
6255 || src + 1 >= end
6256 || ! UTF_8_EXTRA_OCTET_P (src[1]))
6257 return -1;
6258 src += 2;
6260 else if (UTF_8_3_OCTET_LEADING_P (c))
6262 if (src + 2 >= end
6263 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6264 && UTF_8_EXTRA_OCTET_P (src[2])))
6265 return -1;
6266 c = (((c & 0xF) << 12)
6267 | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6268 if (c < 0x800 /* overlong sequence */
6269 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6270 return -1;
6271 src += 3;
6273 else if (UTF_8_4_OCTET_LEADING_P (c))
6275 if (src + 3 >= end
6276 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6277 && UTF_8_EXTRA_OCTET_P (src[2])
6278 && UTF_8_EXTRA_OCTET_P (src[3])))
6279 return -1;
6280 c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6281 | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6282 if (c < 0x10000 /* overlong sequence */
6283 || c >= 0x110000) /* non-Unicode character */
6284 return -1;
6285 src += 4;
6287 else
6288 return -1;
6289 nchars++;
6292 if (src == end)
6294 if (! UTF_8_1_OCTET_P (*src))
6295 return -1;
6296 nchars++;
6297 if (*src == '\r')
6298 eol_seen |= EOL_SEEN_CR;
6299 else if (*src == '\n')
6300 eol_seen |= EOL_SEEN_LF;
6302 coding->eol_seen = eol_seen;
6303 return nchars;
6307 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6308 SOURCE is encoded. If CATEGORY is one of
6309 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6310 two-byte, else they are encoded by one-byte.
6312 Return one of EOL_SEEN_XXX. */
6314 #define MAX_EOL_CHECK_COUNT 3
6316 static int
6317 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6318 enum coding_category category)
6320 const unsigned char *src = source, *src_end = src + src_bytes;
6321 unsigned char c;
6322 int total = 0;
6323 int eol_seen = EOL_SEEN_NONE;
6325 if ((1 << category) & CATEGORY_MASK_UTF_16)
6327 bool msb = category == (coding_category_utf_16_le
6328 | coding_category_utf_16_le_nosig);
6329 bool lsb = !msb;
6331 while (src + 1 < src_end)
6333 c = src[lsb];
6334 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6336 int this_eol;
6338 if (c == '\n')
6339 this_eol = EOL_SEEN_LF;
6340 else if (src + 3 >= src_end
6341 || src[msb + 2] != 0
6342 || src[lsb + 2] != '\n')
6343 this_eol = EOL_SEEN_CR;
6344 else
6346 this_eol = EOL_SEEN_CRLF;
6347 src += 2;
6350 if (eol_seen == EOL_SEEN_NONE)
6351 /* This is the first end-of-line. */
6352 eol_seen = this_eol;
6353 else if (eol_seen != this_eol)
6355 /* The found type is different from what found before.
6356 Allow for stray ^M characters in DOS EOL files. */
6357 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6358 || (eol_seen == EOL_SEEN_CRLF
6359 && this_eol == EOL_SEEN_CR))
6360 eol_seen = EOL_SEEN_CRLF;
6361 else
6363 eol_seen = EOL_SEEN_LF;
6364 break;
6367 if (++total == MAX_EOL_CHECK_COUNT)
6368 break;
6370 src += 2;
6373 else
6374 while (src < src_end)
6376 c = *src++;
6377 if (c == '\n' || c == '\r')
6379 int this_eol;
6381 if (c == '\n')
6382 this_eol = EOL_SEEN_LF;
6383 else if (src >= src_end || *src != '\n')
6384 this_eol = EOL_SEEN_CR;
6385 else
6386 this_eol = EOL_SEEN_CRLF, src++;
6388 if (eol_seen == EOL_SEEN_NONE)
6389 /* This is the first end-of-line. */
6390 eol_seen = this_eol;
6391 else if (eol_seen != this_eol)
6393 /* The found type is different from what found before.
6394 Allow for stray ^M characters in DOS EOL files. */
6395 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6396 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6397 eol_seen = EOL_SEEN_CRLF;
6398 else
6400 eol_seen = EOL_SEEN_LF;
6401 break;
6404 if (++total == MAX_EOL_CHECK_COUNT)
6405 break;
6408 return eol_seen;
6412 static Lisp_Object
6413 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6415 Lisp_Object eol_type;
6417 eol_type = CODING_ID_EOL_TYPE (coding->id);
6418 if (! VECTORP (eol_type))
6419 /* Already adjusted. */
6420 return eol_type;
6421 if (eol_seen & EOL_SEEN_LF)
6423 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6424 eol_type = Qunix;
6426 else if (eol_seen & EOL_SEEN_CRLF)
6428 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6429 eol_type = Qdos;
6431 else if (eol_seen & EOL_SEEN_CR)
6433 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6434 eol_type = Qmac;
6436 return eol_type;
6439 /* Detect how a text specified in CODING is encoded. If a coding
6440 system is detected, update fields of CODING by the detected coding
6441 system. */
6443 static void
6444 detect_coding (struct coding_system *coding)
6446 const unsigned char *src, *src_end;
6447 unsigned int saved_mode = coding->mode;
6448 Lisp_Object found = Qnil;
6449 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6451 coding->consumed = coding->consumed_char = 0;
6452 coding->produced = coding->produced_char = 0;
6453 coding_set_source (coding);
6455 src_end = coding->source + coding->src_bytes;
6457 coding->eol_seen = EOL_SEEN_NONE;
6458 /* If we have not yet decided the text encoding type, detect it
6459 now. */
6460 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6462 int c, i;
6463 struct coding_detection_info detect_info;
6464 bool null_byte_found = 0, eight_bit_found = 0;
6466 coding->head_ascii = 0;
6467 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6468 for (src = coding->source; src < src_end; src++)
6470 c = *src;
6471 if (c & 0x80)
6473 eight_bit_found = 1;
6474 if (null_byte_found)
6475 break;
6477 else if (c < 0x20)
6479 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6480 && ! inhibit_iso_escape_detection
6481 && ! detect_info.checked)
6483 if (detect_coding_iso_2022 (coding, &detect_info))
6485 /* We have scanned the whole data. */
6486 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6488 /* We didn't find an 8-bit code. We may
6489 have found a null-byte, but it's very
6490 rare that a binary file conforms to
6491 ISO-2022. */
6492 src = src_end;
6493 coding->head_ascii = src - coding->source;
6495 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6496 break;
6499 else if (! c && !inhibit_null_byte_detection)
6501 null_byte_found = 1;
6502 if (eight_bit_found)
6503 break;
6505 else if (! disable_ascii_optimization
6506 && ! inhibit_eol_conversion)
6508 if (c == '\r')
6510 if (src < src_end && src[1] == '\n')
6512 coding->eol_seen |= EOL_SEEN_CRLF;
6513 src++;
6514 if (! eight_bit_found)
6515 coding->head_ascii++;
6517 else
6518 coding->eol_seen |= EOL_SEEN_CR;
6520 else if (c == '\n')
6522 coding->eol_seen |= EOL_SEEN_LF;
6526 if (! eight_bit_found)
6527 coding->head_ascii++;
6529 else if (! eight_bit_found)
6530 coding->head_ascii++;
6533 if (null_byte_found || eight_bit_found
6534 || coding->head_ascii < coding->src_bytes
6535 || detect_info.found)
6537 enum coding_category category;
6538 struct coding_system *this;
6540 if (coding->head_ascii == coding->src_bytes)
6541 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6542 for (i = 0; i < coding_category_raw_text; i++)
6544 category = coding_priorities[i];
6545 this = coding_categories + category;
6546 if (detect_info.found & (1 << category))
6547 break;
6549 else
6551 if (null_byte_found)
6553 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6554 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6556 for (i = 0; i < coding_category_raw_text; i++)
6558 category = coding_priorities[i];
6559 this = coding_categories + category;
6560 /* Some of this->detector (e.g. detect_coding_sjis)
6561 require this information. */
6562 coding->id = this->id;
6563 if (this->id < 0)
6565 /* No coding system of this category is defined. */
6566 detect_info.rejected |= (1 << category);
6568 else if (category >= coding_category_raw_text)
6569 continue;
6570 else if (detect_info.checked & (1 << category))
6572 if (detect_info.found & (1 << category))
6573 break;
6575 else if ((*(this->detector)) (coding, &detect_info)
6576 && detect_info.found & (1 << category))
6577 break;
6581 if (i < coding_category_raw_text)
6583 if (category == coding_category_utf_8_auto)
6585 Lisp_Object coding_systems;
6587 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6588 coding_attr_utf_bom);
6589 if (CONSP (coding_systems))
6591 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6592 found = XCAR (coding_systems);
6593 else
6594 found = XCDR (coding_systems);
6596 else
6597 found = CODING_ID_NAME (this->id);
6599 else if (category == coding_category_utf_16_auto)
6601 Lisp_Object coding_systems;
6603 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6604 coding_attr_utf_bom);
6605 if (CONSP (coding_systems))
6607 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6608 found = XCAR (coding_systems);
6609 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6610 found = XCDR (coding_systems);
6612 else
6613 found = CODING_ID_NAME (this->id);
6615 else
6616 found = CODING_ID_NAME (this->id);
6618 else if (null_byte_found)
6619 found = Qno_conversion;
6620 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6621 == CATEGORY_MASK_ANY)
6622 found = Qraw_text;
6623 else if (detect_info.rejected)
6624 for (i = 0; i < coding_category_raw_text; i++)
6625 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6627 this = coding_categories + coding_priorities[i];
6628 found = CODING_ID_NAME (this->id);
6629 break;
6633 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6634 == coding_category_utf_8_auto)
6636 Lisp_Object coding_systems;
6637 struct coding_detection_info detect_info;
6639 coding_systems
6640 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6641 detect_info.found = detect_info.rejected = 0;
6642 if (check_ascii (coding) == coding->src_bytes)
6644 if (CONSP (coding_systems))
6645 found = XCDR (coding_systems);
6647 else
6649 if (CONSP (coding_systems)
6650 && detect_coding_utf_8 (coding, &detect_info))
6652 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6653 found = XCAR (coding_systems);
6654 else
6655 found = XCDR (coding_systems);
6659 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6660 == coding_category_utf_16_auto)
6662 Lisp_Object coding_systems;
6663 struct coding_detection_info detect_info;
6665 coding_systems
6666 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6667 detect_info.found = detect_info.rejected = 0;
6668 coding->head_ascii = 0;
6669 if (CONSP (coding_systems)
6670 && detect_coding_utf_16 (coding, &detect_info))
6672 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6673 found = XCAR (coding_systems);
6674 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6675 found = XCDR (coding_systems);
6679 if (! NILP (found))
6681 int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6682 : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6683 : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6684 : EOL_SEEN_LF);
6686 setup_coding_system (found, coding);
6687 if (specified_eol != EOL_SEEN_NONE)
6688 adjust_coding_eol_type (coding, specified_eol);
6691 coding->mode = saved_mode;
6695 static void
6696 decode_eol (struct coding_system *coding)
6698 Lisp_Object eol_type;
6699 unsigned char *p, *pbeg, *pend;
6701 eol_type = CODING_ID_EOL_TYPE (coding->id);
6702 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6703 return;
6705 if (NILP (coding->dst_object))
6706 pbeg = coding->destination;
6707 else
6708 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6709 pend = pbeg + coding->produced;
6711 if (VECTORP (eol_type))
6713 int eol_seen = EOL_SEEN_NONE;
6715 for (p = pbeg; p < pend; p++)
6717 if (*p == '\n')
6718 eol_seen |= EOL_SEEN_LF;
6719 else if (*p == '\r')
6721 if (p + 1 < pend && *(p + 1) == '\n')
6723 eol_seen |= EOL_SEEN_CRLF;
6724 p++;
6726 else
6727 eol_seen |= EOL_SEEN_CR;
6730 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6731 if ((eol_seen & EOL_SEEN_CRLF) != 0
6732 && (eol_seen & EOL_SEEN_CR) != 0
6733 && (eol_seen & EOL_SEEN_LF) == 0)
6734 eol_seen = EOL_SEEN_CRLF;
6735 else if (eol_seen != EOL_SEEN_NONE
6736 && eol_seen != EOL_SEEN_LF
6737 && eol_seen != EOL_SEEN_CRLF
6738 && eol_seen != EOL_SEEN_CR)
6739 eol_seen = EOL_SEEN_LF;
6740 if (eol_seen != EOL_SEEN_NONE)
6741 eol_type = adjust_coding_eol_type (coding, eol_seen);
6744 if (EQ (eol_type, Qmac))
6746 for (p = pbeg; p < pend; p++)
6747 if (*p == '\r')
6748 *p = '\n';
6750 else if (EQ (eol_type, Qdos))
6752 ptrdiff_t n = 0;
6754 if (NILP (coding->dst_object))
6756 /* Start deleting '\r' from the tail to minimize the memory
6757 movement. */
6758 for (p = pend - 2; p >= pbeg; p--)
6759 if (*p == '\r')
6761 memmove (p, p + 1, pend-- - p - 1);
6762 n++;
6765 else
6767 ptrdiff_t pos_byte = coding->dst_pos_byte;
6768 ptrdiff_t pos = coding->dst_pos;
6769 ptrdiff_t pos_end = pos + coding->produced_char - 1;
6771 while (pos < pos_end)
6773 p = BYTE_POS_ADDR (pos_byte);
6774 if (*p == '\r' && p[1] == '\n')
6776 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6777 n++;
6778 pos_end--;
6780 pos++;
6781 if (coding->dst_multibyte)
6782 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6783 else
6784 pos_byte++;
6787 coding->produced -= n;
6788 coding->produced_char -= n;
6793 /* Return a translation table (or list of them) from coding system
6794 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6795 not ENCODEP). */
6797 static Lisp_Object
6798 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6800 Lisp_Object standard, translation_table;
6801 Lisp_Object val;
6803 if (NILP (Venable_character_translation))
6805 if (max_lookup)
6806 *max_lookup = 0;
6807 return Qnil;
6809 if (encodep)
6810 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6811 standard = Vstandard_translation_table_for_encode;
6812 else
6813 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6814 standard = Vstandard_translation_table_for_decode;
6815 if (NILP (translation_table))
6816 translation_table = standard;
6817 else
6819 if (SYMBOLP (translation_table))
6820 translation_table = Fget (translation_table, Qtranslation_table);
6821 else if (CONSP (translation_table))
6823 translation_table = Fcopy_sequence (translation_table);
6824 for (val = translation_table; CONSP (val); val = XCDR (val))
6825 if (SYMBOLP (XCAR (val)))
6826 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6828 if (CHAR_TABLE_P (standard))
6830 if (CONSP (translation_table))
6831 translation_table = nconc2 (translation_table,
6832 Fcons (standard, Qnil));
6833 else
6834 translation_table = Fcons (translation_table,
6835 Fcons (standard, Qnil));
6839 if (max_lookup)
6841 *max_lookup = 1;
6842 if (CHAR_TABLE_P (translation_table)
6843 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6845 val = XCHAR_TABLE (translation_table)->extras[1];
6846 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6847 *max_lookup = XFASTINT (val);
6849 else if (CONSP (translation_table))
6851 Lisp_Object tail;
6853 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6854 if (CHAR_TABLE_P (XCAR (tail))
6855 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6857 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6858 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6859 *max_lookup = XFASTINT (tailval);
6863 return translation_table;
6866 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6867 do { \
6868 trans = Qnil; \
6869 if (CHAR_TABLE_P (table)) \
6871 trans = CHAR_TABLE_REF (table, c); \
6872 if (CHARACTERP (trans)) \
6873 c = XFASTINT (trans), trans = Qnil; \
6875 else if (CONSP (table)) \
6877 Lisp_Object tail; \
6879 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6880 if (CHAR_TABLE_P (XCAR (tail))) \
6882 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6883 if (CHARACTERP (trans)) \
6884 c = XFASTINT (trans), trans = Qnil; \
6885 else if (! NILP (trans)) \
6886 break; \
6889 } while (0)
6892 /* Return a translation of character(s) at BUF according to TRANS.
6893 TRANS is TO-CHAR or ((FROM . TO) ...) where
6894 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6895 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6896 translation is found, and Qnil if not found..
6897 If BUF is too short to lookup characters in FROM, return Qt. */
6899 static Lisp_Object
6900 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6903 if (INTEGERP (trans))
6904 return trans;
6905 for (; CONSP (trans); trans = XCDR (trans))
6907 Lisp_Object val = XCAR (trans);
6908 Lisp_Object from = XCAR (val);
6909 ptrdiff_t len = ASIZE (from);
6910 ptrdiff_t i;
6912 for (i = 0; i < len; i++)
6914 if (buf + i == buf_end)
6915 return Qt;
6916 if (XINT (AREF (from, i)) != buf[i])
6917 break;
6919 if (i == len)
6920 return val;
6922 return Qnil;
6926 static int
6927 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6928 bool last_block)
6930 unsigned char *dst = coding->destination + coding->produced;
6931 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6932 ptrdiff_t produced;
6933 ptrdiff_t produced_chars = 0;
6934 int carryover = 0;
6936 if (! coding->chars_at_source)
6938 /* Source characters are in coding->charbuf. */
6939 int *buf = coding->charbuf;
6940 int *buf_end = buf + coding->charbuf_used;
6942 if (EQ (coding->src_object, coding->dst_object))
6944 coding_set_source (coding);
6945 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6948 while (buf < buf_end)
6950 int c = *buf;
6951 ptrdiff_t i;
6953 if (c >= 0)
6955 ptrdiff_t from_nchars = 1, to_nchars = 1;
6956 Lisp_Object trans = Qnil;
6958 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6959 if (! NILP (trans))
6961 trans = get_translation (trans, buf, buf_end);
6962 if (INTEGERP (trans))
6963 c = XINT (trans);
6964 else if (CONSP (trans))
6966 from_nchars = ASIZE (XCAR (trans));
6967 trans = XCDR (trans);
6968 if (INTEGERP (trans))
6969 c = XINT (trans);
6970 else
6972 to_nchars = ASIZE (trans);
6973 c = XINT (AREF (trans, 0));
6976 else if (EQ (trans, Qt) && ! last_block)
6977 break;
6980 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6982 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6983 / MAX_MULTIBYTE_LENGTH)
6984 < to_nchars)
6985 memory_full (SIZE_MAX);
6986 dst = alloc_destination (coding,
6987 buf_end - buf
6988 + MAX_MULTIBYTE_LENGTH * to_nchars,
6989 dst);
6990 if (EQ (coding->src_object, coding->dst_object))
6992 coding_set_source (coding);
6993 dst_end = (((unsigned char *) coding->source)
6994 + coding->consumed);
6996 else
6997 dst_end = coding->destination + coding->dst_bytes;
7000 for (i = 0; i < to_nchars; i++)
7002 if (i > 0)
7003 c = XINT (AREF (trans, i));
7004 if (coding->dst_multibyte
7005 || ! CHAR_BYTE8_P (c))
7006 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7007 else
7008 *dst++ = CHAR_TO_BYTE8 (c);
7010 produced_chars += to_nchars;
7011 buf += from_nchars;
7013 else
7014 /* This is an annotation datum. (-C) is the length. */
7015 buf += -c;
7017 carryover = buf_end - buf;
7019 else
7021 /* Source characters are at coding->source. */
7022 const unsigned char *src = coding->source;
7023 const unsigned char *src_end = src + coding->consumed;
7025 if (EQ (coding->dst_object, coding->src_object))
7026 dst_end = (unsigned char *) src;
7027 if (coding->src_multibyte != coding->dst_multibyte)
7029 if (coding->src_multibyte)
7031 bool multibytep = 1;
7032 ptrdiff_t consumed_chars = 0;
7034 while (1)
7036 const unsigned char *src_base = src;
7037 int c;
7039 ONE_MORE_BYTE (c);
7040 if (dst == dst_end)
7042 if (EQ (coding->src_object, coding->dst_object))
7043 dst_end = (unsigned char *) src;
7044 if (dst == dst_end)
7046 ptrdiff_t offset = src - coding->source;
7048 dst = alloc_destination (coding, src_end - src + 1,
7049 dst);
7050 dst_end = coding->destination + coding->dst_bytes;
7051 coding_set_source (coding);
7052 src = coding->source + offset;
7053 src_end = coding->source + coding->consumed;
7054 if (EQ (coding->src_object, coding->dst_object))
7055 dst_end = (unsigned char *) src;
7058 *dst++ = c;
7059 produced_chars++;
7061 no_more_source:
7064 else
7065 while (src < src_end)
7067 bool multibytep = 1;
7068 int c = *src++;
7070 if (dst >= dst_end - 1)
7072 if (EQ (coding->src_object, coding->dst_object))
7073 dst_end = (unsigned char *) src;
7074 if (dst >= dst_end - 1)
7076 ptrdiff_t offset = src - coding->source;
7077 ptrdiff_t more_bytes;
7079 if (EQ (coding->src_object, coding->dst_object))
7080 more_bytes = ((src_end - src) / 2) + 2;
7081 else
7082 more_bytes = src_end - src + 2;
7083 dst = alloc_destination (coding, more_bytes, dst);
7084 dst_end = coding->destination + coding->dst_bytes;
7085 coding_set_source (coding);
7086 src = coding->source + offset;
7087 src_end = coding->source + coding->consumed;
7088 if (EQ (coding->src_object, coding->dst_object))
7089 dst_end = (unsigned char *) src;
7092 EMIT_ONE_BYTE (c);
7095 else
7097 if (!EQ (coding->src_object, coding->dst_object))
7099 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7101 if (require > 0)
7103 ptrdiff_t offset = src - coding->source;
7105 dst = alloc_destination (coding, require, dst);
7106 coding_set_source (coding);
7107 src = coding->source + offset;
7108 src_end = coding->source + coding->consumed;
7111 produced_chars = coding->consumed_char;
7112 while (src < src_end)
7113 *dst++ = *src++;
7117 produced = dst - (coding->destination + coding->produced);
7118 if (BUFFERP (coding->dst_object) && produced_chars > 0)
7119 insert_from_gap (produced_chars, produced, 0);
7120 coding->produced += produced;
7121 coding->produced_char += produced_chars;
7122 return carryover;
7125 /* Compose text in CODING->object according to the annotation data at
7126 CHARBUF. CHARBUF is an array:
7127 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7130 static void
7131 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7133 int len;
7134 ptrdiff_t to;
7135 enum composition_method method;
7136 Lisp_Object components;
7138 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7139 to = pos + charbuf[2];
7140 method = (enum composition_method) (charbuf[4]);
7142 if (method == COMPOSITION_RELATIVE)
7143 components = Qnil;
7144 else
7146 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7147 int i, j;
7149 if (method == COMPOSITION_WITH_RULE)
7150 len = charbuf[2] * 3 - 2;
7151 charbuf += MAX_ANNOTATION_LENGTH;
7152 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7153 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7155 if (charbuf[i] >= 0)
7156 args[j] = make_number (charbuf[i]);
7157 else
7159 i++;
7160 args[j] = make_number (charbuf[i] % 0x100);
7163 components = (i == j ? Fstring (j, args) : Fvector (j, args));
7165 compose_text (pos, to, components, Qnil, coding->dst_object);
7169 /* Put `charset' property on text in CODING->object according to
7170 the annotation data at CHARBUF. CHARBUF is an array:
7171 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7174 static void
7175 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7177 ptrdiff_t from = pos - charbuf[2];
7178 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7180 Fput_text_property (make_number (from), make_number (pos),
7181 Qcharset, CHARSET_NAME (charset),
7182 coding->dst_object);
7186 #define CHARBUF_SIZE 0x4000
7188 #define ALLOC_CONVERSION_WORK_AREA(coding) \
7189 do { \
7190 coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int)); \
7191 coding->charbuf_size = CHARBUF_SIZE; \
7192 } while (0)
7195 static void
7196 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7198 int *charbuf = coding->charbuf;
7199 int *charbuf_end = charbuf + coding->charbuf_used;
7201 if (NILP (coding->dst_object))
7202 return;
7204 while (charbuf < charbuf_end)
7206 if (*charbuf >= 0)
7207 pos++, charbuf++;
7208 else
7210 int len = -*charbuf;
7212 if (len > 2)
7213 switch (charbuf[1])
7215 case CODING_ANNOTATE_COMPOSITION_MASK:
7216 produce_composition (coding, charbuf, pos);
7217 break;
7218 case CODING_ANNOTATE_CHARSET_MASK:
7219 produce_charset (coding, charbuf, pos);
7220 break;
7222 charbuf += len;
7227 /* Decode the data at CODING->src_object into CODING->dst_object.
7228 CODING->src_object is a buffer, a string, or nil.
7229 CODING->dst_object is a buffer.
7231 If CODING->src_object is a buffer, it must be the current buffer.
7232 In this case, if CODING->src_pos is positive, it is a position of
7233 the source text in the buffer, otherwise, the source text is in the
7234 gap area of the buffer, and CODING->src_pos specifies the offset of
7235 the text from GPT (which must be the same as PT). If this is the
7236 same buffer as CODING->dst_object, CODING->src_pos must be
7237 negative.
7239 If CODING->src_object is a string, CODING->src_pos is an index to
7240 that string.
7242 If CODING->src_object is nil, CODING->source must already point to
7243 the non-relocatable memory area. In this case, CODING->src_pos is
7244 an offset from CODING->source.
7246 The decoded data is inserted at the current point of the buffer
7247 CODING->dst_object.
7250 static void
7251 decode_coding (struct coding_system *coding)
7253 Lisp_Object attrs;
7254 Lisp_Object undo_list;
7255 Lisp_Object translation_table;
7256 struct ccl_spec cclspec;
7257 int carryover;
7258 int i;
7260 USE_SAFE_ALLOCA;
7262 if (BUFFERP (coding->src_object)
7263 && coding->src_pos > 0
7264 && coding->src_pos < GPT
7265 && coding->src_pos + coding->src_chars > GPT)
7266 move_gap_both (coding->src_pos, coding->src_pos_byte);
7268 undo_list = Qt;
7269 if (BUFFERP (coding->dst_object))
7271 set_buffer_internal (XBUFFER (coding->dst_object));
7272 if (GPT != PT)
7273 move_gap_both (PT, PT_BYTE);
7275 /* We must disable undo_list in order to record the whole insert
7276 transaction via record_insert at the end. But doing so also
7277 disables the recording of the first change to the undo_list.
7278 Therefore we check for first change here and record it via
7279 record_first_change if needed. */
7280 if (MODIFF <= SAVE_MODIFF)
7281 record_first_change ();
7283 undo_list = BVAR (current_buffer, undo_list);
7284 bset_undo_list (current_buffer, Qt);
7287 coding->consumed = coding->consumed_char = 0;
7288 coding->produced = coding->produced_char = 0;
7289 coding->chars_at_source = 0;
7290 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7291 coding->errors = 0;
7293 ALLOC_CONVERSION_WORK_AREA (coding);
7295 attrs = CODING_ID_ATTRS (coding->id);
7296 translation_table = get_translation_table (attrs, 0, NULL);
7298 carryover = 0;
7299 if (coding->decoder == decode_coding_ccl)
7301 coding->spec.ccl = &cclspec;
7302 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7306 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7308 coding_set_source (coding);
7309 coding->annotated = 0;
7310 coding->charbuf_used = carryover;
7311 (*(coding->decoder)) (coding);
7312 coding_set_destination (coding);
7313 carryover = produce_chars (coding, translation_table, 0);
7314 if (coding->annotated)
7315 produce_annotation (coding, pos);
7316 for (i = 0; i < carryover; i++)
7317 coding->charbuf[i]
7318 = coding->charbuf[coding->charbuf_used - carryover + i];
7320 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7321 || (coding->consumed < coding->src_bytes
7322 && (coding->result == CODING_RESULT_SUCCESS
7323 || coding->result == CODING_RESULT_INVALID_SRC)));
7325 if (carryover > 0)
7327 coding_set_destination (coding);
7328 coding->charbuf_used = carryover;
7329 produce_chars (coding, translation_table, 1);
7332 coding->carryover_bytes = 0;
7333 if (coding->consumed < coding->src_bytes)
7335 int nbytes = coding->src_bytes - coding->consumed;
7336 const unsigned char *src;
7338 coding_set_source (coding);
7339 coding_set_destination (coding);
7340 src = coding->source + coding->consumed;
7342 if (coding->mode & CODING_MODE_LAST_BLOCK)
7344 /* Flush out unprocessed data as binary chars. We are sure
7345 that the number of data is less than the size of
7346 coding->charbuf. */
7347 coding->charbuf_used = 0;
7348 coding->chars_at_source = 0;
7350 while (nbytes-- > 0)
7352 int c = *src++;
7354 if (c & 0x80)
7355 c = BYTE8_TO_CHAR (c);
7356 coding->charbuf[coding->charbuf_used++] = c;
7358 produce_chars (coding, Qnil, 1);
7360 else
7362 /* Record unprocessed bytes in coding->carryover. We are
7363 sure that the number of data is less than the size of
7364 coding->carryover. */
7365 unsigned char *p = coding->carryover;
7367 if (nbytes > sizeof coding->carryover)
7368 nbytes = sizeof coding->carryover;
7369 coding->carryover_bytes = nbytes;
7370 while (nbytes-- > 0)
7371 *p++ = *src++;
7373 coding->consumed = coding->src_bytes;
7376 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7377 && !inhibit_eol_conversion)
7378 decode_eol (coding);
7379 if (BUFFERP (coding->dst_object))
7381 bset_undo_list (current_buffer, undo_list);
7382 record_insert (coding->dst_pos, coding->produced_char);
7385 SAFE_FREE ();
7389 /* Extract an annotation datum from a composition starting at POS and
7390 ending before LIMIT of CODING->src_object (buffer or string), store
7391 the data in BUF, set *STOP to a starting position of the next
7392 composition (if any) or to LIMIT, and return the address of the
7393 next element of BUF.
7395 If such an annotation is not found, set *STOP to a starting
7396 position of a composition after POS (if any) or to LIMIT, and
7397 return BUF. */
7399 static int *
7400 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7401 struct coding_system *coding, int *buf,
7402 ptrdiff_t *stop)
7404 ptrdiff_t start, end;
7405 Lisp_Object prop;
7407 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7408 || end > limit)
7409 *stop = limit;
7410 else if (start > pos)
7411 *stop = start;
7412 else
7414 if (start == pos)
7416 /* We found a composition. Store the corresponding
7417 annotation data in BUF. */
7418 int *head = buf;
7419 enum composition_method method = COMPOSITION_METHOD (prop);
7420 int nchars = COMPOSITION_LENGTH (prop);
7422 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7423 if (method != COMPOSITION_RELATIVE)
7425 Lisp_Object components;
7426 ptrdiff_t i, len, i_byte;
7428 components = COMPOSITION_COMPONENTS (prop);
7429 if (VECTORP (components))
7431 len = ASIZE (components);
7432 for (i = 0; i < len; i++)
7433 *buf++ = XINT (AREF (components, i));
7435 else if (STRINGP (components))
7437 len = SCHARS (components);
7438 i = i_byte = 0;
7439 while (i < len)
7441 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7442 buf++;
7445 else if (INTEGERP (components))
7447 len = 1;
7448 *buf++ = XINT (components);
7450 else if (CONSP (components))
7452 for (len = 0; CONSP (components);
7453 len++, components = XCDR (components))
7454 *buf++ = XINT (XCAR (components));
7456 else
7457 emacs_abort ();
7458 *head -= len;
7462 if (find_composition (end, limit, &start, &end, &prop,
7463 coding->src_object)
7464 && end <= limit)
7465 *stop = start;
7466 else
7467 *stop = limit;
7469 return buf;
7473 /* Extract an annotation datum from a text property `charset' at POS of
7474 CODING->src_object (buffer of string), store the data in BUF, set
7475 *STOP to the position where the value of `charset' property changes
7476 (limiting by LIMIT), and return the address of the next element of
7477 BUF.
7479 If the property value is nil, set *STOP to the position where the
7480 property value is non-nil (limiting by LIMIT), and return BUF. */
7482 static int *
7483 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7484 struct coding_system *coding, int *buf,
7485 ptrdiff_t *stop)
7487 Lisp_Object val, next;
7488 int id;
7490 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7491 if (! NILP (val) && CHARSETP (val))
7492 id = XINT (CHARSET_SYMBOL_ID (val));
7493 else
7494 id = -1;
7495 ADD_CHARSET_DATA (buf, 0, id);
7496 next = Fnext_single_property_change (make_number (pos), Qcharset,
7497 coding->src_object,
7498 make_number (limit));
7499 *stop = XINT (next);
7500 return buf;
7504 static void
7505 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7506 int max_lookup)
7508 int *buf = coding->charbuf;
7509 int *buf_end = coding->charbuf + coding->charbuf_size;
7510 const unsigned char *src = coding->source + coding->consumed;
7511 const unsigned char *src_end = coding->source + coding->src_bytes;
7512 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7513 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7514 bool multibytep = coding->src_multibyte;
7515 Lisp_Object eol_type;
7516 int c;
7517 ptrdiff_t stop, stop_composition, stop_charset;
7518 int *lookup_buf = NULL;
7520 if (! NILP (translation_table))
7521 lookup_buf = alloca (sizeof (int) * max_lookup);
7523 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7524 if (VECTORP (eol_type))
7525 eol_type = Qunix;
7527 /* Note: composition handling is not yet implemented. */
7528 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7530 if (NILP (coding->src_object))
7531 stop = stop_composition = stop_charset = end_pos;
7532 else
7534 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7535 stop = stop_composition = pos;
7536 else
7537 stop = stop_composition = end_pos;
7538 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7539 stop = stop_charset = pos;
7540 else
7541 stop_charset = end_pos;
7544 /* Compensate for CRLF and conversion. */
7545 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7546 while (buf < buf_end)
7548 Lisp_Object trans;
7550 if (pos == stop)
7552 if (pos == end_pos)
7553 break;
7554 if (pos == stop_composition)
7555 buf = handle_composition_annotation (pos, end_pos, coding,
7556 buf, &stop_composition);
7557 if (pos == stop_charset)
7558 buf = handle_charset_annotation (pos, end_pos, coding,
7559 buf, &stop_charset);
7560 stop = (stop_composition < stop_charset
7561 ? stop_composition : stop_charset);
7564 if (! multibytep)
7566 int bytes;
7568 if (coding->encoder == encode_coding_raw_text
7569 || coding->encoder == encode_coding_ccl)
7570 c = *src++, pos++;
7571 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7572 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7573 else
7574 c = BYTE8_TO_CHAR (*src), src++, pos++;
7576 else
7577 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7578 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7579 c = '\n';
7580 if (! EQ (eol_type, Qunix))
7582 if (c == '\n')
7584 if (EQ (eol_type, Qdos))
7585 *buf++ = '\r';
7586 else
7587 c = '\r';
7591 trans = Qnil;
7592 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7593 if (NILP (trans))
7594 *buf++ = c;
7595 else
7597 ptrdiff_t from_nchars = 1, to_nchars = 1;
7598 int *lookup_buf_end;
7599 const unsigned char *p = src;
7600 int i;
7602 lookup_buf[0] = c;
7603 for (i = 1; i < max_lookup && p < src_end; i++)
7604 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7605 lookup_buf_end = lookup_buf + i;
7606 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7607 if (INTEGERP (trans))
7608 c = XINT (trans);
7609 else if (CONSP (trans))
7611 from_nchars = ASIZE (XCAR (trans));
7612 trans = XCDR (trans);
7613 if (INTEGERP (trans))
7614 c = XINT (trans);
7615 else
7617 to_nchars = ASIZE (trans);
7618 if (buf_end - buf < to_nchars)
7619 break;
7620 c = XINT (AREF (trans, 0));
7623 else
7624 break;
7625 *buf++ = c;
7626 for (i = 1; i < to_nchars; i++)
7627 *buf++ = XINT (AREF (trans, i));
7628 for (i = 1; i < from_nchars; i++, pos++)
7629 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7633 coding->consumed = src - coding->source;
7634 coding->consumed_char = pos - coding->src_pos;
7635 coding->charbuf_used = buf - coding->charbuf;
7636 coding->chars_at_source = 0;
7640 /* Encode the text at CODING->src_object into CODING->dst_object.
7641 CODING->src_object is a buffer or a string.
7642 CODING->dst_object is a buffer or nil.
7644 If CODING->src_object is a buffer, it must be the current buffer.
7645 In this case, if CODING->src_pos is positive, it is a position of
7646 the source text in the buffer, otherwise. the source text is in the
7647 gap area of the buffer, and coding->src_pos specifies the offset of
7648 the text from GPT (which must be the same as PT). If this is the
7649 same buffer as CODING->dst_object, CODING->src_pos must be
7650 negative and CODING should not have `pre-write-conversion'.
7652 If CODING->src_object is a string, CODING should not have
7653 `pre-write-conversion'.
7655 If CODING->dst_object is a buffer, the encoded data is inserted at
7656 the current point of that buffer.
7658 If CODING->dst_object is nil, the encoded data is placed at the
7659 memory area specified by CODING->destination. */
7661 static void
7662 encode_coding (struct coding_system *coding)
7664 Lisp_Object attrs;
7665 Lisp_Object translation_table;
7666 int max_lookup;
7667 struct ccl_spec cclspec;
7669 USE_SAFE_ALLOCA;
7671 attrs = CODING_ID_ATTRS (coding->id);
7672 if (coding->encoder == encode_coding_raw_text)
7673 translation_table = Qnil, max_lookup = 0;
7674 else
7675 translation_table = get_translation_table (attrs, 1, &max_lookup);
7677 if (BUFFERP (coding->dst_object))
7679 set_buffer_internal (XBUFFER (coding->dst_object));
7680 coding->dst_multibyte
7681 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7684 coding->consumed = coding->consumed_char = 0;
7685 coding->produced = coding->produced_char = 0;
7686 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7687 coding->errors = 0;
7689 ALLOC_CONVERSION_WORK_AREA (coding);
7691 if (coding->encoder == encode_coding_ccl)
7693 coding->spec.ccl = &cclspec;
7694 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7696 do {
7697 coding_set_source (coding);
7698 consume_chars (coding, translation_table, max_lookup);
7699 coding_set_destination (coding);
7700 (*(coding->encoder)) (coding);
7701 } while (coding->consumed_char < coding->src_chars);
7703 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7704 insert_from_gap (coding->produced_char, coding->produced, 0);
7706 SAFE_FREE ();
7710 /* Name (or base name) of work buffer for code conversion. */
7711 static Lisp_Object Vcode_conversion_workbuf_name;
7713 /* A working buffer used by the top level conversion. Once it is
7714 created, it is never destroyed. It has the name
7715 Vcode_conversion_workbuf_name. The other working buffers are
7716 destroyed after the use is finished, and their names are modified
7717 versions of Vcode_conversion_workbuf_name. */
7718 static Lisp_Object Vcode_conversion_reused_workbuf;
7720 /* True iff Vcode_conversion_reused_workbuf is already in use. */
7721 static bool reused_workbuf_in_use;
7724 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7725 multibyteness of returning buffer. */
7727 static Lisp_Object
7728 make_conversion_work_buffer (bool multibyte)
7730 Lisp_Object name, workbuf;
7731 struct buffer *current;
7733 if (reused_workbuf_in_use)
7735 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7736 workbuf = Fget_buffer_create (name);
7738 else
7740 reused_workbuf_in_use = 1;
7741 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7742 Vcode_conversion_reused_workbuf
7743 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7744 workbuf = Vcode_conversion_reused_workbuf;
7746 current = current_buffer;
7747 set_buffer_internal (XBUFFER (workbuf));
7748 /* We can't allow modification hooks to run in the work buffer. For
7749 instance, directory_files_internal assumes that file decoding
7750 doesn't compile new regexps. */
7751 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7752 Ferase_buffer ();
7753 bset_undo_list (current_buffer, Qt);
7754 bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7755 set_buffer_internal (current);
7756 return workbuf;
7760 static Lisp_Object
7761 code_conversion_restore (Lisp_Object arg)
7763 Lisp_Object current, workbuf;
7764 struct gcpro gcpro1;
7766 GCPRO1 (arg);
7767 current = XCAR (arg);
7768 workbuf = XCDR (arg);
7769 if (! NILP (workbuf))
7771 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7772 reused_workbuf_in_use = 0;
7773 else
7774 Fkill_buffer (workbuf);
7776 set_buffer_internal (XBUFFER (current));
7777 UNGCPRO;
7778 return Qnil;
7781 Lisp_Object
7782 code_conversion_save (bool with_work_buf, bool multibyte)
7784 Lisp_Object workbuf = Qnil;
7786 if (with_work_buf)
7787 workbuf = make_conversion_work_buffer (multibyte);
7788 record_unwind_protect (code_conversion_restore,
7789 Fcons (Fcurrent_buffer (), workbuf));
7790 return workbuf;
7793 void
7794 decode_coding_gap (struct coding_system *coding,
7795 ptrdiff_t chars, ptrdiff_t bytes)
7797 ptrdiff_t count = SPECPDL_INDEX ();
7798 Lisp_Object attrs;
7800 coding->src_object = Fcurrent_buffer ();
7801 coding->src_chars = chars;
7802 coding->src_bytes = bytes;
7803 coding->src_pos = -chars;
7804 coding->src_pos_byte = -bytes;
7805 coding->src_multibyte = chars < bytes;
7806 coding->dst_object = coding->src_object;
7807 coding->dst_pos = PT;
7808 coding->dst_pos_byte = PT_BYTE;
7809 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7811 coding->head_ascii = -1;
7812 coding->detected_utf8_chars = -1;
7813 coding->eol_seen = EOL_SEEN_NONE;
7814 if (CODING_REQUIRE_DETECTION (coding))
7815 detect_coding (coding);
7816 attrs = CODING_ID_ATTRS (coding->id);
7817 if (! disable_ascii_optimization
7818 && ! coding->src_multibyte
7819 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7820 && NILP (CODING_ATTR_POST_READ (attrs))
7821 && NILP (get_translation_table (attrs, 0, NULL)))
7823 chars = coding->head_ascii;
7824 if (chars < 0)
7825 chars = check_ascii (coding);
7826 if (chars != bytes)
7828 /* There exists a non-ASCII byte. */
7829 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7831 if (coding->detected_utf8_chars >= 0)
7832 chars = coding->detected_utf8_chars;
7833 else
7834 chars = check_utf_8 (coding);
7835 if (CODING_UTF_8_BOM (coding) != utf_without_bom
7836 && coding->head_ascii == 0
7837 && coding->source[0] == UTF_8_BOM_1
7838 && coding->source[1] == UTF_8_BOM_2
7839 && coding->source[2] == UTF_8_BOM_3)
7841 chars--;
7842 bytes -= 3;
7843 coding->src_bytes -= 3;
7846 else
7847 chars = -1;
7849 if (chars >= 0)
7851 Lisp_Object eol_type;
7853 eol_type = CODING_ID_EOL_TYPE (coding->id);
7854 if (VECTORP (eol_type))
7856 if (coding->eol_seen != EOL_SEEN_NONE)
7857 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7859 if (EQ (eol_type, Qmac))
7861 unsigned char *src_end = GAP_END_ADDR;
7862 unsigned char *src = src_end - coding->src_bytes;
7864 while (src < src_end)
7866 if (*src++ == '\r')
7867 src[-1] = '\n';
7870 else if (EQ (eol_type, Qdos))
7872 unsigned char *src = GAP_END_ADDR;
7873 unsigned char *src_beg = src - coding->src_bytes;
7874 unsigned char *dst = src;
7875 ptrdiff_t diff;
7877 while (src_beg < src)
7879 *--dst = *--src;
7880 if (*src == '\n' && src > src_beg && src[-1] == '\r')
7881 src--;
7883 diff = dst - src;
7884 bytes -= diff;
7885 chars -= diff;
7887 coding->produced = bytes;
7888 coding->produced_char = chars;
7889 insert_from_gap (chars, bytes, 1);
7890 return;
7893 code_conversion_save (0, 0);
7895 coding->mode |= CODING_MODE_LAST_BLOCK;
7896 current_buffer->text->inhibit_shrinking = 1;
7897 decode_coding (coding);
7898 current_buffer->text->inhibit_shrinking = 0;
7900 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7902 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7903 Lisp_Object val;
7905 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7906 val = call1 (CODING_ATTR_POST_READ (attrs),
7907 make_number (coding->produced_char));
7908 CHECK_NATNUM (val);
7909 coding->produced_char += Z - prev_Z;
7910 coding->produced += Z_BYTE - prev_Z_BYTE;
7913 unbind_to (count, Qnil);
7917 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7918 SRC_OBJECT into DST_OBJECT by coding context CODING.
7920 SRC_OBJECT is a buffer, a string, or Qnil.
7922 If it is a buffer, the text is at point of the buffer. FROM and TO
7923 are positions in the buffer.
7925 If it is a string, the text is at the beginning of the string.
7926 FROM and TO are indices to the string.
7928 If it is nil, the text is at coding->source. FROM and TO are
7929 indices to coding->source.
7931 DST_OBJECT is a buffer, Qt, or Qnil.
7933 If it is a buffer, the decoded text is inserted at point of the
7934 buffer. If the buffer is the same as SRC_OBJECT, the source text
7935 is deleted.
7937 If it is Qt, a string is made from the decoded text, and
7938 set in CODING->dst_object.
7940 If it is Qnil, the decoded text is stored at CODING->destination.
7941 The caller must allocate CODING->dst_bytes bytes at
7942 CODING->destination by xmalloc. If the decoded text is longer than
7943 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7946 void
7947 decode_coding_object (struct coding_system *coding,
7948 Lisp_Object src_object,
7949 ptrdiff_t from, ptrdiff_t from_byte,
7950 ptrdiff_t to, ptrdiff_t to_byte,
7951 Lisp_Object dst_object)
7953 ptrdiff_t count = SPECPDL_INDEX ();
7954 unsigned char *destination IF_LINT (= NULL);
7955 ptrdiff_t dst_bytes IF_LINT (= 0);
7956 ptrdiff_t chars = to - from;
7957 ptrdiff_t bytes = to_byte - from_byte;
7958 Lisp_Object attrs;
7959 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7960 bool need_marker_adjustment = 0;
7961 Lisp_Object old_deactivate_mark;
7963 old_deactivate_mark = Vdeactivate_mark;
7965 if (NILP (dst_object))
7967 destination = coding->destination;
7968 dst_bytes = coding->dst_bytes;
7971 coding->src_object = src_object;
7972 coding->src_chars = chars;
7973 coding->src_bytes = bytes;
7974 coding->src_multibyte = chars < bytes;
7976 if (STRINGP (src_object))
7978 coding->src_pos = from;
7979 coding->src_pos_byte = from_byte;
7981 else if (BUFFERP (src_object))
7983 set_buffer_internal (XBUFFER (src_object));
7984 if (from != GPT)
7985 move_gap_both (from, from_byte);
7986 if (EQ (src_object, dst_object))
7988 struct Lisp_Marker *tail;
7990 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7992 tail->need_adjustment
7993 = tail->charpos == (tail->insertion_type ? from : to);
7994 need_marker_adjustment |= tail->need_adjustment;
7996 saved_pt = PT, saved_pt_byte = PT_BYTE;
7997 TEMP_SET_PT_BOTH (from, from_byte);
7998 current_buffer->text->inhibit_shrinking = 1;
7999 del_range_both (from, from_byte, to, to_byte, 1);
8000 coding->src_pos = -chars;
8001 coding->src_pos_byte = -bytes;
8003 else
8005 coding->src_pos = from;
8006 coding->src_pos_byte = from_byte;
8010 if (CODING_REQUIRE_DETECTION (coding))
8011 detect_coding (coding);
8012 attrs = CODING_ID_ATTRS (coding->id);
8014 if (EQ (dst_object, Qt)
8015 || (! NILP (CODING_ATTR_POST_READ (attrs))
8016 && NILP (dst_object)))
8018 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8019 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8020 coding->dst_pos = BEG;
8021 coding->dst_pos_byte = BEG_BYTE;
8023 else if (BUFFERP (dst_object))
8025 code_conversion_save (0, 0);
8026 coding->dst_object = dst_object;
8027 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8028 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8029 coding->dst_multibyte
8030 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8032 else
8034 code_conversion_save (0, 0);
8035 coding->dst_object = Qnil;
8036 /* Most callers presume this will return a multibyte result, and they
8037 won't use `binary' or `raw-text' anyway, so let's not worry about
8038 CODING_FOR_UNIBYTE. */
8039 coding->dst_multibyte = 1;
8042 decode_coding (coding);
8044 if (BUFFERP (coding->dst_object))
8045 set_buffer_internal (XBUFFER (coding->dst_object));
8047 if (! NILP (CODING_ATTR_POST_READ (attrs)))
8049 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8050 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8051 Lisp_Object val;
8053 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8054 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8055 old_deactivate_mark);
8056 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8057 make_number (coding->produced_char));
8058 UNGCPRO;
8059 CHECK_NATNUM (val);
8060 coding->produced_char += Z - prev_Z;
8061 coding->produced += Z_BYTE - prev_Z_BYTE;
8064 if (EQ (dst_object, Qt))
8066 coding->dst_object = Fbuffer_string ();
8068 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8070 set_buffer_internal (XBUFFER (coding->dst_object));
8071 if (dst_bytes < coding->produced)
8073 eassert (coding->produced > 0);
8074 destination = xrealloc (destination, coding->produced);
8075 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8076 move_gap_both (BEGV, BEGV_BYTE);
8077 memcpy (destination, BEGV_ADDR, coding->produced);
8078 coding->destination = destination;
8082 if (saved_pt >= 0)
8084 /* This is the case of:
8085 (BUFFERP (src_object) && EQ (src_object, dst_object))
8086 As we have moved PT while replacing the original buffer
8087 contents, we must recover it now. */
8088 set_buffer_internal (XBUFFER (src_object));
8089 current_buffer->text->inhibit_shrinking = 0;
8090 if (saved_pt < from)
8091 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8092 else if (saved_pt < from + chars)
8093 TEMP_SET_PT_BOTH (from, from_byte);
8094 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8095 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8096 saved_pt_byte + (coding->produced - bytes));
8097 else
8098 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8099 saved_pt_byte + (coding->produced - bytes));
8101 if (need_marker_adjustment)
8103 struct Lisp_Marker *tail;
8105 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8106 if (tail->need_adjustment)
8108 tail->need_adjustment = 0;
8109 if (tail->insertion_type)
8111 tail->bytepos = from_byte;
8112 tail->charpos = from;
8114 else
8116 tail->bytepos = from_byte + coding->produced;
8117 tail->charpos
8118 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8119 ? tail->bytepos : from + coding->produced_char);
8125 Vdeactivate_mark = old_deactivate_mark;
8126 unbind_to (count, coding->dst_object);
8130 void
8131 encode_coding_object (struct coding_system *coding,
8132 Lisp_Object src_object,
8133 ptrdiff_t from, ptrdiff_t from_byte,
8134 ptrdiff_t to, ptrdiff_t to_byte,
8135 Lisp_Object dst_object)
8137 ptrdiff_t count = SPECPDL_INDEX ();
8138 ptrdiff_t chars = to - from;
8139 ptrdiff_t bytes = to_byte - from_byte;
8140 Lisp_Object attrs;
8141 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8142 bool need_marker_adjustment = 0;
8143 bool kill_src_buffer = 0;
8144 Lisp_Object old_deactivate_mark;
8146 old_deactivate_mark = Vdeactivate_mark;
8148 coding->src_object = src_object;
8149 coding->src_chars = chars;
8150 coding->src_bytes = bytes;
8151 coding->src_multibyte = chars < bytes;
8153 attrs = CODING_ID_ATTRS (coding->id);
8155 if (EQ (src_object, dst_object))
8157 struct Lisp_Marker *tail;
8159 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8161 tail->need_adjustment
8162 = tail->charpos == (tail->insertion_type ? from : to);
8163 need_marker_adjustment |= tail->need_adjustment;
8167 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8169 coding->src_object = code_conversion_save (1, coding->src_multibyte);
8170 set_buffer_internal (XBUFFER (coding->src_object));
8171 if (STRINGP (src_object))
8172 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8173 else if (BUFFERP (src_object))
8174 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8175 else
8176 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8178 if (EQ (src_object, dst_object))
8180 set_buffer_internal (XBUFFER (src_object));
8181 saved_pt = PT, saved_pt_byte = PT_BYTE;
8182 del_range_both (from, from_byte, to, to_byte, 1);
8183 set_buffer_internal (XBUFFER (coding->src_object));
8187 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8189 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8190 old_deactivate_mark);
8191 safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8192 make_number (BEG), make_number (Z));
8193 UNGCPRO;
8195 if (XBUFFER (coding->src_object) != current_buffer)
8196 kill_src_buffer = 1;
8197 coding->src_object = Fcurrent_buffer ();
8198 if (BEG != GPT)
8199 move_gap_both (BEG, BEG_BYTE);
8200 coding->src_chars = Z - BEG;
8201 coding->src_bytes = Z_BYTE - BEG_BYTE;
8202 coding->src_pos = BEG;
8203 coding->src_pos_byte = BEG_BYTE;
8204 coding->src_multibyte = Z < Z_BYTE;
8206 else if (STRINGP (src_object))
8208 code_conversion_save (0, 0);
8209 coding->src_pos = from;
8210 coding->src_pos_byte = from_byte;
8212 else if (BUFFERP (src_object))
8214 code_conversion_save (0, 0);
8215 set_buffer_internal (XBUFFER (src_object));
8216 if (EQ (src_object, dst_object))
8218 saved_pt = PT, saved_pt_byte = PT_BYTE;
8219 coding->src_object = del_range_1 (from, to, 1, 1);
8220 coding->src_pos = 0;
8221 coding->src_pos_byte = 0;
8223 else
8225 if (from < GPT && to >= GPT)
8226 move_gap_both (from, from_byte);
8227 coding->src_pos = from;
8228 coding->src_pos_byte = from_byte;
8231 else
8232 code_conversion_save (0, 0);
8234 if (BUFFERP (dst_object))
8236 coding->dst_object = dst_object;
8237 if (EQ (src_object, dst_object))
8239 coding->dst_pos = from;
8240 coding->dst_pos_byte = from_byte;
8242 else
8244 struct buffer *current = current_buffer;
8246 set_buffer_temp (XBUFFER (dst_object));
8247 coding->dst_pos = PT;
8248 coding->dst_pos_byte = PT_BYTE;
8249 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8250 set_buffer_temp (current);
8252 coding->dst_multibyte
8253 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8255 else if (EQ (dst_object, Qt))
8257 ptrdiff_t dst_bytes = max (1, coding->src_chars);
8258 coding->dst_object = Qnil;
8259 coding->destination = xmalloc (dst_bytes);
8260 coding->dst_bytes = dst_bytes;
8261 coding->dst_multibyte = 0;
8263 else
8265 coding->dst_object = Qnil;
8266 coding->dst_multibyte = 0;
8269 encode_coding (coding);
8271 if (EQ (dst_object, Qt))
8273 if (BUFFERP (coding->dst_object))
8274 coding->dst_object = Fbuffer_string ();
8275 else
8277 coding->dst_object
8278 = make_unibyte_string ((char *) coding->destination,
8279 coding->produced);
8280 xfree (coding->destination);
8284 if (saved_pt >= 0)
8286 /* This is the case of:
8287 (BUFFERP (src_object) && EQ (src_object, dst_object))
8288 As we have moved PT while replacing the original buffer
8289 contents, we must recover it now. */
8290 set_buffer_internal (XBUFFER (src_object));
8291 if (saved_pt < from)
8292 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8293 else if (saved_pt < from + chars)
8294 TEMP_SET_PT_BOTH (from, from_byte);
8295 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8296 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8297 saved_pt_byte + (coding->produced - bytes));
8298 else
8299 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8300 saved_pt_byte + (coding->produced - bytes));
8302 if (need_marker_adjustment)
8304 struct Lisp_Marker *tail;
8306 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8307 if (tail->need_adjustment)
8309 tail->need_adjustment = 0;
8310 if (tail->insertion_type)
8312 tail->bytepos = from_byte;
8313 tail->charpos = from;
8315 else
8317 tail->bytepos = from_byte + coding->produced;
8318 tail->charpos
8319 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8320 ? tail->bytepos : from + coding->produced_char);
8326 if (kill_src_buffer)
8327 Fkill_buffer (coding->src_object);
8329 Vdeactivate_mark = old_deactivate_mark;
8330 unbind_to (count, Qnil);
8334 Lisp_Object
8335 preferred_coding_system (void)
8337 int id = coding_categories[coding_priorities[0]].id;
8339 return CODING_ID_NAME (id);
8342 #if defined (WINDOWSNT) || defined (CYGWIN)
8344 Lisp_Object
8345 from_unicode (Lisp_Object str)
8347 CHECK_STRING (str);
8348 if (!STRING_MULTIBYTE (str) &&
8349 SBYTES (str) & 1)
8351 str = Fsubstring (str, make_number (0), make_number (-1));
8354 return code_convert_string_norecord (str, Qutf_16le, 0);
8357 Lisp_Object
8358 from_unicode_buffer (const wchar_t* wstr)
8360 return from_unicode (
8361 make_unibyte_string (
8362 (char*) wstr,
8363 /* we get one of the two final 0 bytes for free. */
8364 1 + sizeof (wchar_t) * wcslen (wstr)));
8367 wchar_t *
8368 to_unicode (Lisp_Object str, Lisp_Object *buf)
8370 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8371 /* We need to make another copy (in addition to the one made by
8372 code_convert_string_norecord) to ensure that the final string is
8373 _doubly_ zero terminated --- that is, that the string is
8374 terminated by two zero bytes and one utf-16le null character.
8375 Because strings are already terminated with a single zero byte,
8376 we just add one additional zero. */
8377 str = make_uninit_string (SBYTES (*buf) + 1);
8378 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8379 SDATA (str) [SBYTES (*buf)] = '\0';
8380 *buf = str;
8381 return WCSDATA (*buf);
8384 #endif /* WINDOWSNT || CYGWIN */
8387 #ifdef emacs
8388 /*** 8. Emacs Lisp library functions ***/
8390 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8391 doc: /* Return t if OBJECT is nil or a coding-system.
8392 See the documentation of `define-coding-system' for information
8393 about coding-system objects. */)
8394 (Lisp_Object object)
8396 if (NILP (object)
8397 || CODING_SYSTEM_ID (object) >= 0)
8398 return Qt;
8399 if (! SYMBOLP (object)
8400 || NILP (Fget (object, Qcoding_system_define_form)))
8401 return Qnil;
8402 return Qt;
8405 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8406 Sread_non_nil_coding_system, 1, 1, 0,
8407 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8408 (Lisp_Object prompt)
8410 Lisp_Object val;
8413 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8414 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8416 while (SCHARS (val) == 0);
8417 return (Fintern (val, Qnil));
8420 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8421 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8422 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8423 Ignores case when completing coding systems (all Emacs coding systems
8424 are lower-case). */)
8425 (Lisp_Object prompt, Lisp_Object default_coding_system)
8427 Lisp_Object val;
8428 ptrdiff_t count = SPECPDL_INDEX ();
8430 if (SYMBOLP (default_coding_system))
8431 default_coding_system = SYMBOL_NAME (default_coding_system);
8432 specbind (Qcompletion_ignore_case, Qt);
8433 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8434 Qt, Qnil, Qcoding_system_history,
8435 default_coding_system, Qnil);
8436 unbind_to (count, Qnil);
8437 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8440 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8441 1, 1, 0,
8442 doc: /* Check validity of CODING-SYSTEM.
8443 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8444 It is valid if it is nil or a symbol defined as a coding system by the
8445 function `define-coding-system'. */)
8446 (Lisp_Object coding_system)
8448 Lisp_Object define_form;
8450 define_form = Fget (coding_system, Qcoding_system_define_form);
8451 if (! NILP (define_form))
8453 Fput (coding_system, Qcoding_system_define_form, Qnil);
8454 safe_eval (define_form);
8456 if (!NILP (Fcoding_system_p (coding_system)))
8457 return coding_system;
8458 xsignal1 (Qcoding_system_error, coding_system);
8462 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8463 HIGHEST, return the coding system of the highest
8464 priority among the detected coding systems. Otherwise return a
8465 list of detected coding systems sorted by their priorities. If
8466 MULTIBYTEP, it is assumed that the bytes are in correct
8467 multibyte form but contains only ASCII and eight-bit chars.
8468 Otherwise, the bytes are raw bytes.
8470 CODING-SYSTEM controls the detection as below:
8472 If it is nil, detect both text-format and eol-format. If the
8473 text-format part of CODING-SYSTEM is already specified
8474 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8475 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8476 detect only text-format. */
8478 Lisp_Object
8479 detect_coding_system (const unsigned char *src,
8480 ptrdiff_t src_chars, ptrdiff_t src_bytes,
8481 bool highest, bool multibytep,
8482 Lisp_Object coding_system)
8484 const unsigned char *src_end = src + src_bytes;
8485 Lisp_Object attrs, eol_type;
8486 Lisp_Object val = Qnil;
8487 struct coding_system coding;
8488 ptrdiff_t id;
8489 struct coding_detection_info detect_info;
8490 enum coding_category base_category;
8491 bool null_byte_found = 0, eight_bit_found = 0;
8493 if (NILP (coding_system))
8494 coding_system = Qundecided;
8495 setup_coding_system (coding_system, &coding);
8496 attrs = CODING_ID_ATTRS (coding.id);
8497 eol_type = CODING_ID_EOL_TYPE (coding.id);
8498 coding_system = CODING_ATTR_BASE_NAME (attrs);
8500 coding.source = src;
8501 coding.src_chars = src_chars;
8502 coding.src_bytes = src_bytes;
8503 coding.src_multibyte = multibytep;
8504 coding.consumed = 0;
8505 coding.mode |= CODING_MODE_LAST_BLOCK;
8506 coding.head_ascii = 0;
8508 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8510 /* At first, detect text-format if necessary. */
8511 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8512 if (base_category == coding_category_undecided)
8514 enum coding_category category IF_LINT (= 0);
8515 struct coding_system *this IF_LINT (= NULL);
8516 int c, i;
8518 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8519 for (; src < src_end; src++)
8521 c = *src;
8522 if (c & 0x80)
8524 eight_bit_found = 1;
8525 if (null_byte_found)
8526 break;
8528 else if (c < 0x20)
8530 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8531 && ! inhibit_iso_escape_detection
8532 && ! detect_info.checked)
8534 if (detect_coding_iso_2022 (&coding, &detect_info))
8536 /* We have scanned the whole data. */
8537 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8539 /* We didn't find an 8-bit code. We may
8540 have found a null-byte, but it's very
8541 rare that a binary file confirm to
8542 ISO-2022. */
8543 src = src_end;
8544 coding.head_ascii = src - coding.source;
8546 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8547 break;
8550 else if (! c && !inhibit_null_byte_detection)
8552 null_byte_found = 1;
8553 if (eight_bit_found)
8554 break;
8556 if (! eight_bit_found)
8557 coding.head_ascii++;
8559 else if (! eight_bit_found)
8560 coding.head_ascii++;
8563 if (null_byte_found || eight_bit_found
8564 || coding.head_ascii < coding.src_bytes
8565 || detect_info.found)
8567 if (coding.head_ascii == coding.src_bytes)
8568 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8569 for (i = 0; i < coding_category_raw_text; i++)
8571 category = coding_priorities[i];
8572 this = coding_categories + category;
8573 if (detect_info.found & (1 << category))
8574 break;
8576 else
8578 if (null_byte_found)
8580 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8581 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8583 for (i = 0; i < coding_category_raw_text; i++)
8585 category = coding_priorities[i];
8586 this = coding_categories + category;
8588 if (this->id < 0)
8590 /* No coding system of this category is defined. */
8591 detect_info.rejected |= (1 << category);
8593 else if (category >= coding_category_raw_text)
8594 continue;
8595 else if (detect_info.checked & (1 << category))
8597 if (highest
8598 && (detect_info.found & (1 << category)))
8599 break;
8601 else if ((*(this->detector)) (&coding, &detect_info)
8602 && highest
8603 && (detect_info.found & (1 << category)))
8605 if (category == coding_category_utf_16_auto)
8607 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8608 category = coding_category_utf_16_le;
8609 else
8610 category = coding_category_utf_16_be;
8612 break;
8618 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8619 || null_byte_found)
8621 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8622 id = CODING_SYSTEM_ID (Qno_conversion);
8623 val = Fcons (make_number (id), Qnil);
8625 else if (! detect_info.rejected && ! detect_info.found)
8627 detect_info.found = CATEGORY_MASK_ANY;
8628 id = coding_categories[coding_category_undecided].id;
8629 val = Fcons (make_number (id), Qnil);
8631 else if (highest)
8633 if (detect_info.found)
8635 detect_info.found = 1 << category;
8636 val = Fcons (make_number (this->id), Qnil);
8638 else
8639 for (i = 0; i < coding_category_raw_text; i++)
8640 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8642 detect_info.found = 1 << coding_priorities[i];
8643 id = coding_categories[coding_priorities[i]].id;
8644 val = Fcons (make_number (id), Qnil);
8645 break;
8648 else
8650 int mask = detect_info.rejected | detect_info.found;
8651 int found = 0;
8653 for (i = coding_category_raw_text - 1; i >= 0; i--)
8655 category = coding_priorities[i];
8656 if (! (mask & (1 << category)))
8658 found |= 1 << category;
8659 id = coding_categories[category].id;
8660 if (id >= 0)
8661 val = Fcons (make_number (id), val);
8664 for (i = coding_category_raw_text - 1; i >= 0; i--)
8666 category = coding_priorities[i];
8667 if (detect_info.found & (1 << category))
8669 id = coding_categories[category].id;
8670 val = Fcons (make_number (id), val);
8673 detect_info.found |= found;
8676 else if (base_category == coding_category_utf_8_auto)
8678 if (detect_coding_utf_8 (&coding, &detect_info))
8680 struct coding_system *this;
8682 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8683 this = coding_categories + coding_category_utf_8_sig;
8684 else
8685 this = coding_categories + coding_category_utf_8_nosig;
8686 val = Fcons (make_number (this->id), Qnil);
8689 else if (base_category == coding_category_utf_16_auto)
8691 if (detect_coding_utf_16 (&coding, &detect_info))
8693 struct coding_system *this;
8695 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8696 this = coding_categories + coding_category_utf_16_le;
8697 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8698 this = coding_categories + coding_category_utf_16_be;
8699 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8700 this = coding_categories + coding_category_utf_16_be_nosig;
8701 else
8702 this = coding_categories + coding_category_utf_16_le_nosig;
8703 val = Fcons (make_number (this->id), Qnil);
8706 else
8708 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8709 val = Fcons (make_number (coding.id), Qnil);
8712 /* Then, detect eol-format if necessary. */
8714 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8715 Lisp_Object tail;
8717 if (VECTORP (eol_type))
8719 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8721 if (null_byte_found)
8722 normal_eol = EOL_SEEN_LF;
8723 else
8724 normal_eol = detect_eol (coding.source, src_bytes,
8725 coding_category_raw_text);
8727 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8728 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8729 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8730 coding_category_utf_16_be);
8731 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8732 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8733 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8734 coding_category_utf_16_le);
8736 else
8738 if (EQ (eol_type, Qunix))
8739 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8740 else if (EQ (eol_type, Qdos))
8741 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8742 else
8743 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8746 for (tail = val; CONSP (tail); tail = XCDR (tail))
8748 enum coding_category category;
8749 int this_eol;
8751 id = XINT (XCAR (tail));
8752 attrs = CODING_ID_ATTRS (id);
8753 category = XINT (CODING_ATTR_CATEGORY (attrs));
8754 eol_type = CODING_ID_EOL_TYPE (id);
8755 if (VECTORP (eol_type))
8757 if (category == coding_category_utf_16_be
8758 || category == coding_category_utf_16_be_nosig)
8759 this_eol = utf_16_be_eol;
8760 else if (category == coding_category_utf_16_le
8761 || category == coding_category_utf_16_le_nosig)
8762 this_eol = utf_16_le_eol;
8763 else
8764 this_eol = normal_eol;
8766 if (this_eol == EOL_SEEN_LF)
8767 XSETCAR (tail, AREF (eol_type, 0));
8768 else if (this_eol == EOL_SEEN_CRLF)
8769 XSETCAR (tail, AREF (eol_type, 1));
8770 else if (this_eol == EOL_SEEN_CR)
8771 XSETCAR (tail, AREF (eol_type, 2));
8772 else
8773 XSETCAR (tail, CODING_ID_NAME (id));
8775 else
8776 XSETCAR (tail, CODING_ID_NAME (id));
8780 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8784 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8785 2, 3, 0,
8786 doc: /* Detect coding system of the text in the region between START and END.
8787 Return a list of possible coding systems ordered by priority.
8788 The coding systems to try and their priorities follows what
8789 the function `coding-system-priority-list' (which see) returns.
8791 If only ASCII characters are found (except for such ISO-2022 control
8792 characters as ESC), it returns a list of single element `undecided'
8793 or its subsidiary coding system according to a detected end-of-line
8794 format.
8796 If optional argument HIGHEST is non-nil, return the coding system of
8797 highest priority. */)
8798 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8800 ptrdiff_t from, to;
8801 ptrdiff_t from_byte, to_byte;
8803 validate_region (&start, &end);
8804 from = XINT (start), to = XINT (end);
8805 from_byte = CHAR_TO_BYTE (from);
8806 to_byte = CHAR_TO_BYTE (to);
8808 if (from < GPT && to >= GPT)
8809 move_gap_both (to, to_byte);
8811 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8812 to - from, to_byte - from_byte,
8813 !NILP (highest),
8814 !NILP (BVAR (current_buffer
8815 , enable_multibyte_characters)),
8816 Qnil);
8819 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8820 1, 2, 0,
8821 doc: /* Detect coding system of the text in STRING.
8822 Return a list of possible coding systems ordered by priority.
8823 The coding systems to try and their priorities follows what
8824 the function `coding-system-priority-list' (which see) returns.
8826 If only ASCII characters are found (except for such ISO-2022 control
8827 characters as ESC), it returns a list of single element `undecided'
8828 or its subsidiary coding system according to a detected end-of-line
8829 format.
8831 If optional argument HIGHEST is non-nil, return the coding system of
8832 highest priority. */)
8833 (Lisp_Object string, Lisp_Object highest)
8835 CHECK_STRING (string);
8837 return detect_coding_system (SDATA (string),
8838 SCHARS (string), SBYTES (string),
8839 !NILP (highest), STRING_MULTIBYTE (string),
8840 Qnil);
8844 static bool
8845 char_encodable_p (int c, Lisp_Object attrs)
8847 Lisp_Object tail;
8848 struct charset *charset;
8849 Lisp_Object translation_table;
8851 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8852 if (! NILP (translation_table))
8853 c = translate_char (translation_table, c);
8854 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8855 CONSP (tail); tail = XCDR (tail))
8857 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8858 if (CHAR_CHARSET_P (c, charset))
8859 break;
8861 return (! NILP (tail));
8865 /* Return a list of coding systems that safely encode the text between
8866 START and END. If EXCLUDE is non-nil, it is a list of coding
8867 systems not to check. The returned list doesn't contain any such
8868 coding systems. In any case, if the text contains only ASCII or is
8869 unibyte, return t. */
8871 DEFUN ("find-coding-systems-region-internal",
8872 Ffind_coding_systems_region_internal,
8873 Sfind_coding_systems_region_internal, 2, 3, 0,
8874 doc: /* Internal use only. */)
8875 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8877 Lisp_Object coding_attrs_list, safe_codings;
8878 ptrdiff_t start_byte, end_byte;
8879 const unsigned char *p, *pbeg, *pend;
8880 int c;
8881 Lisp_Object tail, elt, work_table;
8883 if (STRINGP (start))
8885 if (!STRING_MULTIBYTE (start)
8886 || SCHARS (start) == SBYTES (start))
8887 return Qt;
8888 start_byte = 0;
8889 end_byte = SBYTES (start);
8891 else
8893 CHECK_NUMBER_COERCE_MARKER (start);
8894 CHECK_NUMBER_COERCE_MARKER (end);
8895 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8896 args_out_of_range (start, end);
8897 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8898 return Qt;
8899 start_byte = CHAR_TO_BYTE (XINT (start));
8900 end_byte = CHAR_TO_BYTE (XINT (end));
8901 if (XINT (end) - XINT (start) == end_byte - start_byte)
8902 return Qt;
8904 if (XINT (start) < GPT && XINT (end) > GPT)
8906 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8907 move_gap_both (XINT (start), start_byte);
8908 else
8909 move_gap_both (XINT (end), end_byte);
8913 coding_attrs_list = Qnil;
8914 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8915 if (NILP (exclude)
8916 || NILP (Fmemq (XCAR (tail), exclude)))
8918 Lisp_Object attrs;
8920 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8921 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8922 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8924 ASET (attrs, coding_attr_trans_tbl,
8925 get_translation_table (attrs, 1, NULL));
8926 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8930 if (STRINGP (start))
8931 p = pbeg = SDATA (start);
8932 else
8933 p = pbeg = BYTE_POS_ADDR (start_byte);
8934 pend = p + (end_byte - start_byte);
8936 while (p < pend && ASCII_BYTE_P (*p)) p++;
8937 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8939 work_table = Fmake_char_table (Qnil, Qnil);
8940 while (p < pend)
8942 if (ASCII_BYTE_P (*p))
8943 p++;
8944 else
8946 c = STRING_CHAR_ADVANCE (p);
8947 if (!NILP (char_table_ref (work_table, c)))
8948 /* This character was already checked. Ignore it. */
8949 continue;
8951 charset_map_loaded = 0;
8952 for (tail = coding_attrs_list; CONSP (tail);)
8954 elt = XCAR (tail);
8955 if (NILP (elt))
8956 tail = XCDR (tail);
8957 else if (char_encodable_p (c, elt))
8958 tail = XCDR (tail);
8959 else if (CONSP (XCDR (tail)))
8961 XSETCAR (tail, XCAR (XCDR (tail)));
8962 XSETCDR (tail, XCDR (XCDR (tail)));
8964 else
8966 XSETCAR (tail, Qnil);
8967 tail = XCDR (tail);
8970 if (charset_map_loaded)
8972 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8974 if (STRINGP (start))
8975 pbeg = SDATA (start);
8976 else
8977 pbeg = BYTE_POS_ADDR (start_byte);
8978 p = pbeg + p_offset;
8979 pend = pbeg + pend_offset;
8981 char_table_set (work_table, c, Qt);
8985 safe_codings = list2 (Qraw_text, Qno_conversion);
8986 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8987 if (! NILP (XCAR (tail)))
8988 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8990 return safe_codings;
8994 DEFUN ("unencodable-char-position", Funencodable_char_position,
8995 Sunencodable_char_position, 3, 5, 0,
8996 doc: /*
8997 Return position of first un-encodable character in a region.
8998 START and END specify the region and CODING-SYSTEM specifies the
8999 encoding to check. Return nil if CODING-SYSTEM does encode the region.
9001 If optional 4th argument COUNT is non-nil, it specifies at most how
9002 many un-encodable characters to search. In this case, the value is a
9003 list of positions.
9005 If optional 5th argument STRING is non-nil, it is a string to search
9006 for un-encodable characters. In that case, START and END are indexes
9007 to the string. */)
9008 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
9010 EMACS_INT n;
9011 struct coding_system coding;
9012 Lisp_Object attrs, charset_list, translation_table;
9013 Lisp_Object positions;
9014 ptrdiff_t from, to;
9015 const unsigned char *p, *stop, *pend;
9016 bool ascii_compatible;
9018 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9019 attrs = CODING_ID_ATTRS (coding.id);
9020 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9021 return Qnil;
9022 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9023 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9024 translation_table = get_translation_table (attrs, 1, NULL);
9026 if (NILP (string))
9028 validate_region (&start, &end);
9029 from = XINT (start);
9030 to = XINT (end);
9031 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9032 || (ascii_compatible
9033 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9034 return Qnil;
9035 p = CHAR_POS_ADDR (from);
9036 pend = CHAR_POS_ADDR (to);
9037 if (from < GPT && to >= GPT)
9038 stop = GPT_ADDR;
9039 else
9040 stop = pend;
9042 else
9044 CHECK_STRING (string);
9045 CHECK_NATNUM (start);
9046 CHECK_NATNUM (end);
9047 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9048 args_out_of_range_3 (string, start, end);
9049 from = XINT (start);
9050 to = XINT (end);
9051 if (! STRING_MULTIBYTE (string))
9052 return Qnil;
9053 p = SDATA (string) + string_char_to_byte (string, from);
9054 stop = pend = SDATA (string) + string_char_to_byte (string, to);
9055 if (ascii_compatible && (to - from) == (pend - p))
9056 return Qnil;
9059 if (NILP (count))
9060 n = 1;
9061 else
9063 CHECK_NATNUM (count);
9064 n = XINT (count);
9067 positions = Qnil;
9068 charset_map_loaded = 0;
9069 while (1)
9071 int c;
9073 if (ascii_compatible)
9074 while (p < stop && ASCII_BYTE_P (*p))
9075 p++, from++;
9076 if (p >= stop)
9078 if (p >= pend)
9079 break;
9080 stop = pend;
9081 p = GAP_END_ADDR;
9084 c = STRING_CHAR_ADVANCE (p);
9085 if (! (ASCII_CHAR_P (c) && ascii_compatible)
9086 && ! char_charset (translate_char (translation_table, c),
9087 charset_list, NULL))
9089 positions = Fcons (make_number (from), positions);
9090 n--;
9091 if (n == 0)
9092 break;
9095 from++;
9096 if (charset_map_loaded && NILP (string))
9098 p = CHAR_POS_ADDR (from);
9099 pend = CHAR_POS_ADDR (to);
9100 if (from < GPT && to >= GPT)
9101 stop = GPT_ADDR;
9102 else
9103 stop = pend;
9104 charset_map_loaded = 0;
9108 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9112 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9113 Scheck_coding_systems_region, 3, 3, 0,
9114 doc: /* Check if the region is encodable by coding systems.
9116 START and END are buffer positions specifying the region.
9117 CODING-SYSTEM-LIST is a list of coding systems to check.
9119 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9120 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9121 whole region, POS0, POS1, ... are buffer positions where non-encodable
9122 characters are found.
9124 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9125 value is nil.
9127 START may be a string. In that case, check if the string is
9128 encodable, and the value contains indices to the string instead of
9129 buffer positions. END is ignored.
9131 If the current buffer (or START if it is a string) is unibyte, the value
9132 is nil. */)
9133 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9135 Lisp_Object list;
9136 ptrdiff_t start_byte, end_byte;
9137 ptrdiff_t pos;
9138 const unsigned char *p, *pbeg, *pend;
9139 int c;
9140 Lisp_Object tail, elt, attrs;
9142 if (STRINGP (start))
9144 if (!STRING_MULTIBYTE (start)
9145 || SCHARS (start) == SBYTES (start))
9146 return Qnil;
9147 start_byte = 0;
9148 end_byte = SBYTES (start);
9149 pos = 0;
9151 else
9153 CHECK_NUMBER_COERCE_MARKER (start);
9154 CHECK_NUMBER_COERCE_MARKER (end);
9155 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9156 args_out_of_range (start, end);
9157 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9158 return Qnil;
9159 start_byte = CHAR_TO_BYTE (XINT (start));
9160 end_byte = CHAR_TO_BYTE (XINT (end));
9161 if (XINT (end) - XINT (start) == end_byte - start_byte)
9162 return Qnil;
9164 if (XINT (start) < GPT && XINT (end) > GPT)
9166 if ((GPT - XINT (start)) < (XINT (end) - GPT))
9167 move_gap_both (XINT (start), start_byte);
9168 else
9169 move_gap_both (XINT (end), end_byte);
9171 pos = XINT (start);
9174 list = Qnil;
9175 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9177 elt = XCAR (tail);
9178 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9179 ASET (attrs, coding_attr_trans_tbl,
9180 get_translation_table (attrs, 1, NULL));
9181 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
9184 if (STRINGP (start))
9185 p = pbeg = SDATA (start);
9186 else
9187 p = pbeg = BYTE_POS_ADDR (start_byte);
9188 pend = p + (end_byte - start_byte);
9190 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9191 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9193 while (p < pend)
9195 if (ASCII_BYTE_P (*p))
9196 p++;
9197 else
9199 c = STRING_CHAR_ADVANCE (p);
9201 charset_map_loaded = 0;
9202 for (tail = list; CONSP (tail); tail = XCDR (tail))
9204 elt = XCDR (XCAR (tail));
9205 if (! char_encodable_p (c, XCAR (elt)))
9206 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9208 if (charset_map_loaded)
9210 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9212 if (STRINGP (start))
9213 pbeg = SDATA (start);
9214 else
9215 pbeg = BYTE_POS_ADDR (start_byte);
9216 p = pbeg + p_offset;
9217 pend = pbeg + pend_offset;
9220 pos++;
9223 tail = list;
9224 list = Qnil;
9225 for (; CONSP (tail); tail = XCDR (tail))
9227 elt = XCAR (tail);
9228 if (CONSP (XCDR (XCDR (elt))))
9229 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9230 list);
9233 return list;
9237 static Lisp_Object
9238 code_convert_region (Lisp_Object start, Lisp_Object end,
9239 Lisp_Object coding_system, Lisp_Object dst_object,
9240 bool encodep, bool norecord)
9242 struct coding_system coding;
9243 ptrdiff_t from, from_byte, to, to_byte;
9244 Lisp_Object src_object;
9246 if (NILP (coding_system))
9247 coding_system = Qno_conversion;
9248 else
9249 CHECK_CODING_SYSTEM (coding_system);
9250 src_object = Fcurrent_buffer ();
9251 if (NILP (dst_object))
9252 dst_object = src_object;
9253 else if (! EQ (dst_object, Qt))
9254 CHECK_BUFFER (dst_object);
9256 validate_region (&start, &end);
9257 from = XFASTINT (start);
9258 from_byte = CHAR_TO_BYTE (from);
9259 to = XFASTINT (end);
9260 to_byte = CHAR_TO_BYTE (to);
9262 setup_coding_system (coding_system, &coding);
9263 coding.mode |= CODING_MODE_LAST_BLOCK;
9265 if (encodep)
9266 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9267 dst_object);
9268 else
9269 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9270 dst_object);
9271 if (! norecord)
9272 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9274 return (BUFFERP (dst_object)
9275 ? make_number (coding.produced_char)
9276 : coding.dst_object);
9280 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9281 3, 4, "r\nzCoding system: ",
9282 doc: /* Decode the current region from the specified coding system.
9283 When called from a program, takes four arguments:
9284 START, END, CODING-SYSTEM, and DESTINATION.
9285 START and END are buffer positions.
9287 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9288 If nil, the region between START and END is replaced by the decoded text.
9289 If buffer, the decoded text is inserted in that buffer after point (point
9290 does not move).
9291 In those cases, the length of the decoded text is returned.
9292 If DESTINATION is t, the decoded text is returned.
9294 This function sets `last-coding-system-used' to the precise coding system
9295 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9296 not fully specified.) */)
9297 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9299 return code_convert_region (start, end, coding_system, destination, 0, 0);
9302 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9303 3, 4, "r\nzCoding system: ",
9304 doc: /* Encode the current region by specified coding system.
9305 When called from a program, takes four arguments:
9306 START, END, CODING-SYSTEM and DESTINATION.
9307 START and END are buffer positions.
9309 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9310 If nil, the region between START and END is replace by the encoded text.
9311 If buffer, the encoded text is inserted in that buffer after point (point
9312 does not move).
9313 In those cases, the length of the encoded text is returned.
9314 If DESTINATION is t, the encoded text is returned.
9316 This function sets `last-coding-system-used' to the precise coding system
9317 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9318 not fully specified.) */)
9319 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9321 return code_convert_region (start, end, coding_system, destination, 1, 0);
9324 Lisp_Object
9325 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9326 Lisp_Object dst_object, bool encodep, bool nocopy,
9327 bool norecord)
9329 struct coding_system coding;
9330 ptrdiff_t chars, bytes;
9332 CHECK_STRING (string);
9333 if (NILP (coding_system))
9335 if (! norecord)
9336 Vlast_coding_system_used = Qno_conversion;
9337 if (NILP (dst_object))
9338 return (nocopy ? Fcopy_sequence (string) : string);
9341 if (NILP (coding_system))
9342 coding_system = Qno_conversion;
9343 else
9344 CHECK_CODING_SYSTEM (coding_system);
9345 if (NILP (dst_object))
9346 dst_object = Qt;
9347 else if (! EQ (dst_object, Qt))
9348 CHECK_BUFFER (dst_object);
9350 setup_coding_system (coding_system, &coding);
9351 coding.mode |= CODING_MODE_LAST_BLOCK;
9352 chars = SCHARS (string);
9353 bytes = SBYTES (string);
9354 if (encodep)
9355 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9356 else
9357 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9358 if (! norecord)
9359 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9361 return (BUFFERP (dst_object)
9362 ? make_number (coding.produced_char)
9363 : coding.dst_object);
9367 /* Encode or decode STRING according to CODING_SYSTEM.
9368 Do not set Vlast_coding_system_used.
9370 This function is called only from macros DECODE_FILE and
9371 ENCODE_FILE, thus we ignore character composition. */
9373 Lisp_Object
9374 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9375 bool encodep)
9377 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9381 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9382 2, 4, 0,
9383 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9385 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9386 if the decoding operation is trivial.
9388 Optional fourth arg BUFFER non-nil means that the decoded text is
9389 inserted in that buffer after point (point does not move). In this
9390 case, the return value is the length of the decoded text.
9392 This function sets `last-coding-system-used' to the precise coding system
9393 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9394 not fully specified.) */)
9395 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9397 return code_convert_string (string, coding_system, buffer,
9398 0, ! NILP (nocopy), 0);
9401 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9402 2, 4, 0,
9403 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9405 Optional third arg NOCOPY non-nil means it is OK to return STRING
9406 itself if the encoding operation is trivial.
9408 Optional fourth arg BUFFER non-nil means that the encoded text is
9409 inserted in that buffer after point (point does not move). In this
9410 case, the return value is the length of the encoded text.
9412 This function sets `last-coding-system-used' to the precise coding system
9413 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9414 not fully specified.) */)
9415 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9417 return code_convert_string (string, coding_system, buffer,
9418 1, ! NILP (nocopy), 0);
9422 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9423 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9424 Return the corresponding character. */)
9425 (Lisp_Object code)
9427 Lisp_Object spec, attrs, val;
9428 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9429 EMACS_INT ch;
9430 int c;
9432 CHECK_NATNUM (code);
9433 ch = XFASTINT (code);
9434 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9435 attrs = AREF (spec, 0);
9437 if (ASCII_BYTE_P (ch)
9438 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9439 return code;
9441 val = CODING_ATTR_CHARSET_LIST (attrs);
9442 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9443 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9444 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9446 if (ch <= 0x7F)
9448 c = ch;
9449 charset = charset_roman;
9451 else if (ch >= 0xA0 && ch < 0xDF)
9453 c = ch - 0x80;
9454 charset = charset_kana;
9456 else
9458 EMACS_INT c1 = ch >> 8;
9459 int c2 = ch & 0xFF;
9461 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9462 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9463 error ("Invalid code: %"pI"d", ch);
9464 c = ch;
9465 SJIS_TO_JIS (c);
9466 charset = charset_kanji;
9468 c = DECODE_CHAR (charset, c);
9469 if (c < 0)
9470 error ("Invalid code: %"pI"d", ch);
9471 return make_number (c);
9475 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9476 doc: /* Encode a Japanese character CH to shift_jis encoding.
9477 Return the corresponding code in SJIS. */)
9478 (Lisp_Object ch)
9480 Lisp_Object spec, attrs, charset_list;
9481 int c;
9482 struct charset *charset;
9483 unsigned code;
9485 CHECK_CHARACTER (ch);
9486 c = XFASTINT (ch);
9487 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9488 attrs = AREF (spec, 0);
9490 if (ASCII_CHAR_P (c)
9491 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9492 return ch;
9494 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9495 charset = char_charset (c, charset_list, &code);
9496 if (code == CHARSET_INVALID_CODE (charset))
9497 error ("Can't encode by shift_jis encoding: %c", c);
9498 JIS_TO_SJIS (code);
9500 return make_number (code);
9503 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9504 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9505 Return the corresponding character. */)
9506 (Lisp_Object code)
9508 Lisp_Object spec, attrs, val;
9509 struct charset *charset_roman, *charset_big5, *charset;
9510 EMACS_INT ch;
9511 int c;
9513 CHECK_NATNUM (code);
9514 ch = XFASTINT (code);
9515 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9516 attrs = AREF (spec, 0);
9518 if (ASCII_BYTE_P (ch)
9519 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9520 return code;
9522 val = CODING_ATTR_CHARSET_LIST (attrs);
9523 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9524 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9526 if (ch <= 0x7F)
9528 c = ch;
9529 charset = charset_roman;
9531 else
9533 EMACS_INT b1 = ch >> 8;
9534 int b2 = ch & 0x7F;
9535 if (b1 < 0xA1 || b1 > 0xFE
9536 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9537 error ("Invalid code: %"pI"d", ch);
9538 c = ch;
9539 charset = charset_big5;
9541 c = DECODE_CHAR (charset, c);
9542 if (c < 0)
9543 error ("Invalid code: %"pI"d", ch);
9544 return make_number (c);
9547 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9548 doc: /* Encode the Big5 character CH to BIG5 coding system.
9549 Return the corresponding character code in Big5. */)
9550 (Lisp_Object ch)
9552 Lisp_Object spec, attrs, charset_list;
9553 struct charset *charset;
9554 int c;
9555 unsigned code;
9557 CHECK_CHARACTER (ch);
9558 c = XFASTINT (ch);
9559 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9560 attrs = AREF (spec, 0);
9561 if (ASCII_CHAR_P (c)
9562 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9563 return ch;
9565 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9566 charset = char_charset (c, charset_list, &code);
9567 if (code == CHARSET_INVALID_CODE (charset))
9568 error ("Can't encode by Big5 encoding: %c", c);
9570 return make_number (code);
9574 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9575 Sset_terminal_coding_system_internal, 1, 2, 0,
9576 doc: /* Internal use only. */)
9577 (Lisp_Object coding_system, Lisp_Object terminal)
9579 struct terminal *term = get_terminal (terminal, 1);
9580 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9581 CHECK_SYMBOL (coding_system);
9582 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9583 /* We had better not send unsafe characters to terminal. */
9584 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9585 /* Character composition should be disabled. */
9586 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9587 terminal_coding->src_multibyte = 1;
9588 terminal_coding->dst_multibyte = 0;
9589 tset_charset_list
9590 (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9591 ? coding_charset_list (terminal_coding)
9592 : Fcons (make_number (charset_ascii), Qnil)));
9593 return Qnil;
9596 DEFUN ("set-safe-terminal-coding-system-internal",
9597 Fset_safe_terminal_coding_system_internal,
9598 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9599 doc: /* Internal use only. */)
9600 (Lisp_Object coding_system)
9602 CHECK_SYMBOL (coding_system);
9603 setup_coding_system (Fcheck_coding_system (coding_system),
9604 &safe_terminal_coding);
9605 /* Character composition should be disabled. */
9606 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9607 safe_terminal_coding.src_multibyte = 1;
9608 safe_terminal_coding.dst_multibyte = 0;
9609 return Qnil;
9612 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9613 Sterminal_coding_system, 0, 1, 0,
9614 doc: /* Return coding system specified for terminal output on the given terminal.
9615 TERMINAL may be a terminal object, a frame, or nil for the selected
9616 frame's terminal device. */)
9617 (Lisp_Object terminal)
9619 struct coding_system *terminal_coding
9620 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9621 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9623 /* For backward compatibility, return nil if it is `undecided'. */
9624 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9627 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9628 Sset_keyboard_coding_system_internal, 1, 2, 0,
9629 doc: /* Internal use only. */)
9630 (Lisp_Object coding_system, Lisp_Object terminal)
9632 struct terminal *t = get_terminal (terminal, 1);
9633 CHECK_SYMBOL (coding_system);
9634 if (NILP (coding_system))
9635 coding_system = Qno_conversion;
9636 else
9637 Fcheck_coding_system (coding_system);
9638 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9639 /* Character composition should be disabled. */
9640 TERMINAL_KEYBOARD_CODING (t)->common_flags
9641 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9642 return Qnil;
9645 DEFUN ("keyboard-coding-system",
9646 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9647 doc: /* Return coding system specified for decoding keyboard input. */)
9648 (Lisp_Object terminal)
9650 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9651 (get_terminal (terminal, 1))->id);
9655 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9656 Sfind_operation_coding_system, 1, MANY, 0,
9657 doc: /* Choose a coding system for an operation based on the target name.
9658 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9659 DECODING-SYSTEM is the coding system to use for decoding
9660 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9661 for encoding (in case OPERATION does encoding).
9663 The first argument OPERATION specifies an I/O primitive:
9664 For file I/O, `insert-file-contents' or `write-region'.
9665 For process I/O, `call-process', `call-process-region', or `start-process'.
9666 For network I/O, `open-network-stream'.
9668 The remaining arguments should be the same arguments that were passed
9669 to the primitive. Depending on which primitive, one of those arguments
9670 is selected as the TARGET. For example, if OPERATION does file I/O,
9671 whichever argument specifies the file name is TARGET.
9673 TARGET has a meaning which depends on OPERATION:
9674 For file I/O, TARGET is a file name (except for the special case below).
9675 For process I/O, TARGET is a process name.
9676 For network I/O, TARGET is a service name or a port number.
9678 This function looks up what is specified for TARGET in
9679 `file-coding-system-alist', `process-coding-system-alist',
9680 or `network-coding-system-alist' depending on OPERATION.
9681 They may specify a coding system, a cons of coding systems,
9682 or a function symbol to call.
9683 In the last case, we call the function with one argument,
9684 which is a list of all the arguments given to this function.
9685 If the function can't decide a coding system, it can return
9686 `undecided' so that the normal code-detection is performed.
9688 If OPERATION is `insert-file-contents', the argument corresponding to
9689 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9690 file name to look up, and BUFFER is a buffer that contains the file's
9691 contents (not yet decoded). If `file-coding-system-alist' specifies a
9692 function to call for FILENAME, that function should examine the
9693 contents of BUFFER instead of reading the file.
9695 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9696 (ptrdiff_t nargs, Lisp_Object *args)
9698 Lisp_Object operation, target_idx, target, val;
9699 register Lisp_Object chain;
9701 if (nargs < 2)
9702 error ("Too few arguments");
9703 operation = args[0];
9704 if (!SYMBOLP (operation)
9705 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9706 error ("Invalid first argument");
9707 if (nargs <= 1 + XFASTINT (target_idx))
9708 error ("Too few arguments for operation `%s'",
9709 SDATA (SYMBOL_NAME (operation)));
9710 target = args[XFASTINT (target_idx) + 1];
9711 if (!(STRINGP (target)
9712 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9713 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9714 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9715 error ("Invalid argument %"pI"d of operation `%s'",
9716 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9717 if (CONSP (target))
9718 target = XCAR (target);
9720 chain = ((EQ (operation, Qinsert_file_contents)
9721 || EQ (operation, Qwrite_region))
9722 ? Vfile_coding_system_alist
9723 : (EQ (operation, Qopen_network_stream)
9724 ? Vnetwork_coding_system_alist
9725 : Vprocess_coding_system_alist));
9726 if (NILP (chain))
9727 return Qnil;
9729 for (; CONSP (chain); chain = XCDR (chain))
9731 Lisp_Object elt;
9733 elt = XCAR (chain);
9734 if (CONSP (elt)
9735 && ((STRINGP (target)
9736 && STRINGP (XCAR (elt))
9737 && fast_string_match (XCAR (elt), target) >= 0)
9738 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9740 val = XCDR (elt);
9741 /* Here, if VAL is both a valid coding system and a valid
9742 function symbol, we return VAL as a coding system. */
9743 if (CONSP (val))
9744 return val;
9745 if (! SYMBOLP (val))
9746 return Qnil;
9747 if (! NILP (Fcoding_system_p (val)))
9748 return Fcons (val, val);
9749 if (! NILP (Ffboundp (val)))
9751 /* We use call1 rather than safe_call1
9752 so as to get bug reports about functions called here
9753 which don't handle the current interface. */
9754 val = call1 (val, Flist (nargs, args));
9755 if (CONSP (val))
9756 return val;
9757 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9758 return Fcons (val, val);
9760 return Qnil;
9763 return Qnil;
9766 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9767 Sset_coding_system_priority, 0, MANY, 0,
9768 doc: /* Assign higher priority to the coding systems given as arguments.
9769 If multiple coding systems belong to the same category,
9770 all but the first one are ignored.
9772 usage: (set-coding-system-priority &rest coding-systems) */)
9773 (ptrdiff_t nargs, Lisp_Object *args)
9775 ptrdiff_t i, j;
9776 bool changed[coding_category_max];
9777 enum coding_category priorities[coding_category_max];
9779 memset (changed, 0, sizeof changed);
9781 for (i = j = 0; i < nargs; i++)
9783 enum coding_category category;
9784 Lisp_Object spec, attrs;
9786 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9787 attrs = AREF (spec, 0);
9788 category = XINT (CODING_ATTR_CATEGORY (attrs));
9789 if (changed[category])
9790 /* Ignore this coding system because a coding system of the
9791 same category already had a higher priority. */
9792 continue;
9793 changed[category] = 1;
9794 priorities[j++] = category;
9795 if (coding_categories[category].id >= 0
9796 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9797 setup_coding_system (args[i], &coding_categories[category]);
9798 Fset (AREF (Vcoding_category_table, category), args[i]);
9801 /* Now we have decided top J priorities. Reflect the order of the
9802 original priorities to the remaining priorities. */
9804 for (i = j, j = 0; i < coding_category_max; i++, j++)
9806 while (j < coding_category_max
9807 && changed[coding_priorities[j]])
9808 j++;
9809 if (j == coding_category_max)
9810 emacs_abort ();
9811 priorities[i] = coding_priorities[j];
9814 memcpy (coding_priorities, priorities, sizeof priorities);
9816 /* Update `coding-category-list'. */
9817 Vcoding_category_list = Qnil;
9818 for (i = coding_category_max; i-- > 0; )
9819 Vcoding_category_list
9820 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9821 Vcoding_category_list);
9823 return Qnil;
9826 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9827 Scoding_system_priority_list, 0, 1, 0,
9828 doc: /* Return a list of coding systems ordered by their priorities.
9829 The list contains a subset of coding systems; i.e. coding systems
9830 assigned to each coding category (see `coding-category-list').
9832 HIGHESTP non-nil means just return the highest priority one. */)
9833 (Lisp_Object highestp)
9835 int i;
9836 Lisp_Object val;
9838 for (i = 0, val = Qnil; i < coding_category_max; i++)
9840 enum coding_category category = coding_priorities[i];
9841 int id = coding_categories[category].id;
9842 Lisp_Object attrs;
9844 if (id < 0)
9845 continue;
9846 attrs = CODING_ID_ATTRS (id);
9847 if (! NILP (highestp))
9848 return CODING_ATTR_BASE_NAME (attrs);
9849 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9851 return Fnreverse (val);
9854 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9856 static Lisp_Object
9857 make_subsidiaries (Lisp_Object base)
9859 Lisp_Object subsidiaries;
9860 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9861 char *buf = alloca (base_name_len + 6);
9862 int i;
9864 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9865 subsidiaries = make_uninit_vector (3);
9866 for (i = 0; i < 3; i++)
9868 strcpy (buf + base_name_len, suffixes[i]);
9869 ASET (subsidiaries, i, intern (buf));
9871 return subsidiaries;
9875 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9876 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9877 doc: /* For internal use only.
9878 usage: (define-coding-system-internal ...) */)
9879 (ptrdiff_t nargs, Lisp_Object *args)
9881 Lisp_Object name;
9882 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9883 Lisp_Object attrs; /* Vector of attributes. */
9884 Lisp_Object eol_type;
9885 Lisp_Object aliases;
9886 Lisp_Object coding_type, charset_list, safe_charsets;
9887 enum coding_category category;
9888 Lisp_Object tail, val;
9889 int max_charset_id = 0;
9890 int i;
9892 if (nargs < coding_arg_max)
9893 goto short_args;
9895 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9897 name = args[coding_arg_name];
9898 CHECK_SYMBOL (name);
9899 ASET (attrs, coding_attr_base_name, name);
9901 val = args[coding_arg_mnemonic];
9902 if (! STRINGP (val))
9903 CHECK_CHARACTER (val);
9904 ASET (attrs, coding_attr_mnemonic, val);
9906 coding_type = args[coding_arg_coding_type];
9907 CHECK_SYMBOL (coding_type);
9908 ASET (attrs, coding_attr_type, coding_type);
9910 charset_list = args[coding_arg_charset_list];
9911 if (SYMBOLP (charset_list))
9913 if (EQ (charset_list, Qiso_2022))
9915 if (! EQ (coding_type, Qiso_2022))
9916 error ("Invalid charset-list");
9917 charset_list = Viso_2022_charset_list;
9919 else if (EQ (charset_list, Qemacs_mule))
9921 if (! EQ (coding_type, Qemacs_mule))
9922 error ("Invalid charset-list");
9923 charset_list = Vemacs_mule_charset_list;
9925 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9927 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9928 error ("Invalid charset-list");
9929 if (max_charset_id < XFASTINT (XCAR (tail)))
9930 max_charset_id = XFASTINT (XCAR (tail));
9933 else
9935 charset_list = Fcopy_sequence (charset_list);
9936 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9938 struct charset *charset;
9940 val = XCAR (tail);
9941 CHECK_CHARSET_GET_CHARSET (val, charset);
9942 if (EQ (coding_type, Qiso_2022)
9943 ? CHARSET_ISO_FINAL (charset) < 0
9944 : EQ (coding_type, Qemacs_mule)
9945 ? CHARSET_EMACS_MULE_ID (charset) < 0
9946 : 0)
9947 error ("Can't handle charset `%s'",
9948 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9950 XSETCAR (tail, make_number (charset->id));
9951 if (max_charset_id < charset->id)
9952 max_charset_id = charset->id;
9955 ASET (attrs, coding_attr_charset_list, charset_list);
9957 safe_charsets = make_uninit_string (max_charset_id + 1);
9958 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9959 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9960 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9961 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9963 ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9965 val = args[coding_arg_decode_translation_table];
9966 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9967 CHECK_SYMBOL (val);
9968 ASET (attrs, coding_attr_decode_tbl, val);
9970 val = args[coding_arg_encode_translation_table];
9971 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9972 CHECK_SYMBOL (val);
9973 ASET (attrs, coding_attr_encode_tbl, val);
9975 val = args[coding_arg_post_read_conversion];
9976 CHECK_SYMBOL (val);
9977 ASET (attrs, coding_attr_post_read, val);
9979 val = args[coding_arg_pre_write_conversion];
9980 CHECK_SYMBOL (val);
9981 ASET (attrs, coding_attr_pre_write, val);
9983 val = args[coding_arg_default_char];
9984 if (NILP (val))
9985 ASET (attrs, coding_attr_default_char, make_number (' '));
9986 else
9988 CHECK_CHARACTER (val);
9989 ASET (attrs, coding_attr_default_char, val);
9992 val = args[coding_arg_for_unibyte];
9993 ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9995 val = args[coding_arg_plist];
9996 CHECK_LIST (val);
9997 ASET (attrs, coding_attr_plist, val);
9999 if (EQ (coding_type, Qcharset))
10001 /* Generate a lisp vector of 256 elements. Each element is nil,
10002 integer, or a list of charset IDs.
10004 If Nth element is nil, the byte code N is invalid in this
10005 coding system.
10007 If Nth element is a number NUM, N is the first byte of a
10008 charset whose ID is NUM.
10010 If Nth element is a list of charset IDs, N is the first byte
10011 of one of them. The list is sorted by dimensions of the
10012 charsets. A charset of smaller dimension comes first. */
10013 val = Fmake_vector (make_number (256), Qnil);
10015 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10017 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10018 int dim = CHARSET_DIMENSION (charset);
10019 int idx = (dim - 1) * 4;
10021 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10022 ASET (attrs, coding_attr_ascii_compat, Qt);
10024 for (i = charset->code_space[idx];
10025 i <= charset->code_space[idx + 1]; i++)
10027 Lisp_Object tmp, tmp2;
10028 int dim2;
10030 tmp = AREF (val, i);
10031 if (NILP (tmp))
10032 tmp = XCAR (tail);
10033 else if (NUMBERP (tmp))
10035 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10036 if (dim < dim2)
10037 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
10038 else
10039 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
10041 else
10043 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10045 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10046 if (dim < dim2)
10047 break;
10049 if (NILP (tmp2))
10050 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
10051 else
10053 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10054 XSETCAR (tmp2, XCAR (tail));
10057 ASET (val, i, tmp);
10060 ASET (attrs, coding_attr_charset_valids, val);
10061 category = coding_category_charset;
10063 else if (EQ (coding_type, Qccl))
10065 Lisp_Object valids;
10067 if (nargs < coding_arg_ccl_max)
10068 goto short_args;
10070 val = args[coding_arg_ccl_decoder];
10071 CHECK_CCL_PROGRAM (val);
10072 if (VECTORP (val))
10073 val = Fcopy_sequence (val);
10074 ASET (attrs, coding_attr_ccl_decoder, val);
10076 val = args[coding_arg_ccl_encoder];
10077 CHECK_CCL_PROGRAM (val);
10078 if (VECTORP (val))
10079 val = Fcopy_sequence (val);
10080 ASET (attrs, coding_attr_ccl_encoder, val);
10082 val = args[coding_arg_ccl_valids];
10083 valids = Fmake_string (make_number (256), make_number (0));
10084 for (tail = val; CONSP (tail); tail = XCDR (tail))
10086 int from, to;
10088 val = XCAR (tail);
10089 if (INTEGERP (val))
10091 if (! (0 <= XINT (val) && XINT (val) <= 255))
10092 args_out_of_range_3 (val, make_number (0), make_number (255));
10093 from = to = XINT (val);
10095 else
10097 CHECK_CONS (val);
10098 CHECK_NATNUM_CAR (val);
10099 CHECK_NUMBER_CDR (val);
10100 if (XINT (XCAR (val)) > 255)
10101 args_out_of_range_3 (XCAR (val),
10102 make_number (0), make_number (255));
10103 from = XINT (XCAR (val));
10104 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10105 args_out_of_range_3 (XCDR (val),
10106 XCAR (val), make_number (255));
10107 to = XINT (XCDR (val));
10109 for (i = from; i <= to; i++)
10110 SSET (valids, i, 1);
10112 ASET (attrs, coding_attr_ccl_valids, valids);
10114 category = coding_category_ccl;
10116 else if (EQ (coding_type, Qutf_16))
10118 Lisp_Object bom, endian;
10120 ASET (attrs, coding_attr_ascii_compat, Qnil);
10122 if (nargs < coding_arg_utf16_max)
10123 goto short_args;
10125 bom = args[coding_arg_utf16_bom];
10126 if (! NILP (bom) && ! EQ (bom, Qt))
10128 CHECK_CONS (bom);
10129 val = XCAR (bom);
10130 CHECK_CODING_SYSTEM (val);
10131 val = XCDR (bom);
10132 CHECK_CODING_SYSTEM (val);
10134 ASET (attrs, coding_attr_utf_bom, bom);
10136 endian = args[coding_arg_utf16_endian];
10137 CHECK_SYMBOL (endian);
10138 if (NILP (endian))
10139 endian = Qbig;
10140 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10141 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10142 ASET (attrs, coding_attr_utf_16_endian, endian);
10144 category = (CONSP (bom)
10145 ? coding_category_utf_16_auto
10146 : NILP (bom)
10147 ? (EQ (endian, Qbig)
10148 ? coding_category_utf_16_be_nosig
10149 : coding_category_utf_16_le_nosig)
10150 : (EQ (endian, Qbig)
10151 ? coding_category_utf_16_be
10152 : coding_category_utf_16_le));
10154 else if (EQ (coding_type, Qiso_2022))
10156 Lisp_Object initial, reg_usage, request, flags;
10158 if (nargs < coding_arg_iso2022_max)
10159 goto short_args;
10161 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10162 CHECK_VECTOR (initial);
10163 for (i = 0; i < 4; i++)
10165 val = AREF (initial, i);
10166 if (! NILP (val))
10168 struct charset *charset;
10170 CHECK_CHARSET_GET_CHARSET (val, charset);
10171 ASET (initial, i, make_number (CHARSET_ID (charset)));
10172 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10173 ASET (attrs, coding_attr_ascii_compat, Qt);
10175 else
10176 ASET (initial, i, make_number (-1));
10179 reg_usage = args[coding_arg_iso2022_reg_usage];
10180 CHECK_CONS (reg_usage);
10181 CHECK_NUMBER_CAR (reg_usage);
10182 CHECK_NUMBER_CDR (reg_usage);
10184 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10185 for (tail = request; CONSP (tail); tail = XCDR (tail))
10187 int id;
10188 Lisp_Object tmp1;
10190 val = XCAR (tail);
10191 CHECK_CONS (val);
10192 tmp1 = XCAR (val);
10193 CHECK_CHARSET_GET_ID (tmp1, id);
10194 CHECK_NATNUM_CDR (val);
10195 if (XINT (XCDR (val)) >= 4)
10196 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10197 XSETCAR (val, make_number (id));
10200 flags = args[coding_arg_iso2022_flags];
10201 CHECK_NATNUM (flags);
10202 i = XINT (flags) & INT_MAX;
10203 if (EQ (args[coding_arg_charset_list], Qiso_2022))
10204 i |= CODING_ISO_FLAG_FULL_SUPPORT;
10205 flags = make_number (i);
10207 ASET (attrs, coding_attr_iso_initial, initial);
10208 ASET (attrs, coding_attr_iso_usage, reg_usage);
10209 ASET (attrs, coding_attr_iso_request, request);
10210 ASET (attrs, coding_attr_iso_flags, flags);
10211 setup_iso_safe_charsets (attrs);
10213 if (i & CODING_ISO_FLAG_SEVEN_BITS)
10214 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10215 | CODING_ISO_FLAG_SINGLE_SHIFT))
10216 ? coding_category_iso_7_else
10217 : EQ (args[coding_arg_charset_list], Qiso_2022)
10218 ? coding_category_iso_7
10219 : coding_category_iso_7_tight);
10220 else
10222 int id = XINT (AREF (initial, 1));
10224 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10225 || EQ (args[coding_arg_charset_list], Qiso_2022)
10226 || id < 0)
10227 ? coding_category_iso_8_else
10228 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10229 ? coding_category_iso_8_1
10230 : coding_category_iso_8_2);
10232 if (category != coding_category_iso_8_1
10233 && category != coding_category_iso_8_2)
10234 ASET (attrs, coding_attr_ascii_compat, Qnil);
10236 else if (EQ (coding_type, Qemacs_mule))
10238 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10239 ASET (attrs, coding_attr_emacs_mule_full, Qt);
10240 ASET (attrs, coding_attr_ascii_compat, Qt);
10241 category = coding_category_emacs_mule;
10243 else if (EQ (coding_type, Qshift_jis))
10246 struct charset *charset;
10248 if (XINT (Flength (charset_list)) != 3
10249 && XINT (Flength (charset_list)) != 4)
10250 error ("There should be three or four charsets");
10252 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10253 if (CHARSET_DIMENSION (charset) != 1)
10254 error ("Dimension of charset %s is not one",
10255 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10256 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10257 ASET (attrs, coding_attr_ascii_compat, Qt);
10259 charset_list = XCDR (charset_list);
10260 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10261 if (CHARSET_DIMENSION (charset) != 1)
10262 error ("Dimension of charset %s is not one",
10263 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10265 charset_list = XCDR (charset_list);
10266 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10267 if (CHARSET_DIMENSION (charset) != 2)
10268 error ("Dimension of charset %s is not two",
10269 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10271 charset_list = XCDR (charset_list);
10272 if (! NILP (charset_list))
10274 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10275 if (CHARSET_DIMENSION (charset) != 2)
10276 error ("Dimension of charset %s is not two",
10277 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10280 category = coding_category_sjis;
10281 Vsjis_coding_system = name;
10283 else if (EQ (coding_type, Qbig5))
10285 struct charset *charset;
10287 if (XINT (Flength (charset_list)) != 2)
10288 error ("There should be just two charsets");
10290 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10291 if (CHARSET_DIMENSION (charset) != 1)
10292 error ("Dimension of charset %s is not one",
10293 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10294 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10295 ASET (attrs, coding_attr_ascii_compat, Qt);
10297 charset_list = XCDR (charset_list);
10298 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10299 if (CHARSET_DIMENSION (charset) != 2)
10300 error ("Dimension of charset %s is not two",
10301 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10303 category = coding_category_big5;
10304 Vbig5_coding_system = name;
10306 else if (EQ (coding_type, Qraw_text))
10308 category = coding_category_raw_text;
10309 ASET (attrs, coding_attr_ascii_compat, Qt);
10311 else if (EQ (coding_type, Qutf_8))
10313 Lisp_Object bom;
10315 if (nargs < coding_arg_utf8_max)
10316 goto short_args;
10318 bom = args[coding_arg_utf8_bom];
10319 if (! NILP (bom) && ! EQ (bom, Qt))
10321 CHECK_CONS (bom);
10322 val = XCAR (bom);
10323 CHECK_CODING_SYSTEM (val);
10324 val = XCDR (bom);
10325 CHECK_CODING_SYSTEM (val);
10327 ASET (attrs, coding_attr_utf_bom, bom);
10328 if (NILP (bom))
10329 ASET (attrs, coding_attr_ascii_compat, Qt);
10331 category = (CONSP (bom) ? coding_category_utf_8_auto
10332 : NILP (bom) ? coding_category_utf_8_nosig
10333 : coding_category_utf_8_sig);
10335 else if (EQ (coding_type, Qundecided))
10336 category = coding_category_undecided;
10337 else
10338 error ("Invalid coding system type: %s",
10339 SDATA (SYMBOL_NAME (coding_type)));
10341 ASET (attrs, coding_attr_category, make_number (category));
10342 ASET (attrs, coding_attr_plist,
10343 Fcons (QCcategory,
10344 Fcons (AREF (Vcoding_category_table, category),
10345 CODING_ATTR_PLIST (attrs))));
10346 ASET (attrs, coding_attr_plist,
10347 Fcons (QCascii_compatible_p,
10348 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10349 CODING_ATTR_PLIST (attrs))));
10351 eol_type = args[coding_arg_eol_type];
10352 if (! NILP (eol_type)
10353 && ! EQ (eol_type, Qunix)
10354 && ! EQ (eol_type, Qdos)
10355 && ! EQ (eol_type, Qmac))
10356 error ("Invalid eol-type");
10358 aliases = Fcons (name, Qnil);
10360 if (NILP (eol_type))
10362 eol_type = make_subsidiaries (name);
10363 for (i = 0; i < 3; i++)
10365 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10367 this_name = AREF (eol_type, i);
10368 this_aliases = Fcons (this_name, Qnil);
10369 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10370 this_spec = make_uninit_vector (3);
10371 ASET (this_spec, 0, attrs);
10372 ASET (this_spec, 1, this_aliases);
10373 ASET (this_spec, 2, this_eol_type);
10374 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10375 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10376 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10377 if (NILP (val))
10378 Vcoding_system_alist
10379 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10380 Vcoding_system_alist);
10384 spec_vec = make_uninit_vector (3);
10385 ASET (spec_vec, 0, attrs);
10386 ASET (spec_vec, 1, aliases);
10387 ASET (spec_vec, 2, eol_type);
10389 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10390 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10391 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10392 if (NILP (val))
10393 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10394 Vcoding_system_alist);
10397 int id = coding_categories[category].id;
10399 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10400 setup_coding_system (name, &coding_categories[category]);
10403 return Qnil;
10405 short_args:
10406 return Fsignal (Qwrong_number_of_arguments,
10407 Fcons (intern ("define-coding-system-internal"),
10408 make_number (nargs)));
10412 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10413 3, 3, 0,
10414 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10415 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10417 Lisp_Object spec, attrs;
10419 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10420 attrs = AREF (spec, 0);
10421 if (EQ (prop, QCmnemonic))
10423 if (! STRINGP (val))
10424 CHECK_CHARACTER (val);
10425 ASET (attrs, coding_attr_mnemonic, val);
10427 else if (EQ (prop, QCdefault_char))
10429 if (NILP (val))
10430 val = make_number (' ');
10431 else
10432 CHECK_CHARACTER (val);
10433 ASET (attrs, coding_attr_default_char, val);
10435 else if (EQ (prop, QCdecode_translation_table))
10437 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10438 CHECK_SYMBOL (val);
10439 ASET (attrs, coding_attr_decode_tbl, val);
10441 else if (EQ (prop, QCencode_translation_table))
10443 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10444 CHECK_SYMBOL (val);
10445 ASET (attrs, coding_attr_encode_tbl, val);
10447 else if (EQ (prop, QCpost_read_conversion))
10449 CHECK_SYMBOL (val);
10450 ASET (attrs, coding_attr_post_read, val);
10452 else if (EQ (prop, QCpre_write_conversion))
10454 CHECK_SYMBOL (val);
10455 ASET (attrs, coding_attr_pre_write, val);
10457 else if (EQ (prop, QCascii_compatible_p))
10459 ASET (attrs, coding_attr_ascii_compat, val);
10462 ASET (attrs, coding_attr_plist,
10463 Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10464 return val;
10468 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10469 Sdefine_coding_system_alias, 2, 2, 0,
10470 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10471 (Lisp_Object alias, Lisp_Object coding_system)
10473 Lisp_Object spec, aliases, eol_type, val;
10475 CHECK_SYMBOL (alias);
10476 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10477 aliases = AREF (spec, 1);
10478 /* ALIASES should be a list of length more than zero, and the first
10479 element is a base coding system. Append ALIAS at the tail of the
10480 list. */
10481 while (!NILP (XCDR (aliases)))
10482 aliases = XCDR (aliases);
10483 XSETCDR (aliases, Fcons (alias, Qnil));
10485 eol_type = AREF (spec, 2);
10486 if (VECTORP (eol_type))
10488 Lisp_Object subsidiaries;
10489 int i;
10491 subsidiaries = make_subsidiaries (alias);
10492 for (i = 0; i < 3; i++)
10493 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10494 AREF (eol_type, i));
10497 Fputhash (alias, spec, Vcoding_system_hash_table);
10498 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10499 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10500 if (NILP (val))
10501 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10502 Vcoding_system_alist);
10504 return Qnil;
10507 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10508 1, 1, 0,
10509 doc: /* Return the base of CODING-SYSTEM.
10510 Any alias or subsidiary coding system is not a base coding system. */)
10511 (Lisp_Object coding_system)
10513 Lisp_Object spec, attrs;
10515 if (NILP (coding_system))
10516 return (Qno_conversion);
10517 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10518 attrs = AREF (spec, 0);
10519 return CODING_ATTR_BASE_NAME (attrs);
10522 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10523 1, 1, 0,
10524 doc: "Return the property list of CODING-SYSTEM.")
10525 (Lisp_Object coding_system)
10527 Lisp_Object spec, attrs;
10529 if (NILP (coding_system))
10530 coding_system = Qno_conversion;
10531 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10532 attrs = AREF (spec, 0);
10533 return CODING_ATTR_PLIST (attrs);
10537 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10538 1, 1, 0,
10539 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10540 (Lisp_Object coding_system)
10542 Lisp_Object spec;
10544 if (NILP (coding_system))
10545 coding_system = Qno_conversion;
10546 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10547 return AREF (spec, 1);
10550 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10551 Scoding_system_eol_type, 1, 1, 0,
10552 doc: /* Return eol-type of CODING-SYSTEM.
10553 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10555 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10556 and CR respectively.
10558 A vector value indicates that a format of end-of-line should be
10559 detected automatically. Nth element of the vector is the subsidiary
10560 coding system whose eol-type is N. */)
10561 (Lisp_Object coding_system)
10563 Lisp_Object spec, eol_type;
10564 int n;
10566 if (NILP (coding_system))
10567 coding_system = Qno_conversion;
10568 if (! CODING_SYSTEM_P (coding_system))
10569 return Qnil;
10570 spec = CODING_SYSTEM_SPEC (coding_system);
10571 eol_type = AREF (spec, 2);
10572 if (VECTORP (eol_type))
10573 return Fcopy_sequence (eol_type);
10574 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10575 return make_number (n);
10578 #endif /* emacs */
10581 /*** 9. Post-amble ***/
10583 void
10584 init_coding_once (void)
10586 int i;
10588 for (i = 0; i < coding_category_max; i++)
10590 coding_categories[i].id = -1;
10591 coding_priorities[i] = i;
10594 /* ISO2022 specific initialize routine. */
10595 for (i = 0; i < 0x20; i++)
10596 iso_code_class[i] = ISO_control_0;
10597 for (i = 0x21; i < 0x7F; i++)
10598 iso_code_class[i] = ISO_graphic_plane_0;
10599 for (i = 0x80; i < 0xA0; i++)
10600 iso_code_class[i] = ISO_control_1;
10601 for (i = 0xA1; i < 0xFF; i++)
10602 iso_code_class[i] = ISO_graphic_plane_1;
10603 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10604 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10605 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10606 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10607 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10608 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10609 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10610 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10611 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10613 for (i = 0; i < 256; i++)
10615 emacs_mule_bytes[i] = 1;
10617 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10618 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10619 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10620 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10623 #ifdef emacs
10625 void
10626 syms_of_coding (void)
10628 staticpro (&Vcoding_system_hash_table);
10630 Lisp_Object args[2];
10631 args[0] = QCtest;
10632 args[1] = Qeq;
10633 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10636 staticpro (&Vsjis_coding_system);
10637 Vsjis_coding_system = Qnil;
10639 staticpro (&Vbig5_coding_system);
10640 Vbig5_coding_system = Qnil;
10642 staticpro (&Vcode_conversion_reused_workbuf);
10643 Vcode_conversion_reused_workbuf = Qnil;
10645 staticpro (&Vcode_conversion_workbuf_name);
10646 Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10648 reused_workbuf_in_use = 0;
10650 DEFSYM (Qcharset, "charset");
10651 DEFSYM (Qtarget_idx, "target-idx");
10652 DEFSYM (Qcoding_system_history, "coding-system-history");
10653 Fset (Qcoding_system_history, Qnil);
10655 /* Target FILENAME is the first argument. */
10656 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10657 /* Target FILENAME is the third argument. */
10658 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10660 DEFSYM (Qcall_process, "call-process");
10661 /* Target PROGRAM is the first argument. */
10662 Fput (Qcall_process, Qtarget_idx, make_number (0));
10664 DEFSYM (Qcall_process_region, "call-process-region");
10665 /* Target PROGRAM is the third argument. */
10666 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10668 DEFSYM (Qstart_process, "start-process");
10669 /* Target PROGRAM is the third argument. */
10670 Fput (Qstart_process, Qtarget_idx, make_number (2));
10672 DEFSYM (Qopen_network_stream, "open-network-stream");
10673 /* Target SERVICE is the fourth argument. */
10674 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10676 DEFSYM (Qcoding_system, "coding-system");
10677 DEFSYM (Qcoding_aliases, "coding-aliases");
10679 DEFSYM (Qeol_type, "eol-type");
10680 DEFSYM (Qunix, "unix");
10681 DEFSYM (Qdos, "dos");
10682 DEFSYM (Qmac, "mac");
10684 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10685 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10686 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10687 DEFSYM (Qdefault_char, "default-char");
10688 DEFSYM (Qundecided, "undecided");
10689 DEFSYM (Qno_conversion, "no-conversion");
10690 DEFSYM (Qraw_text, "raw-text");
10692 DEFSYM (Qiso_2022, "iso-2022");
10694 DEFSYM (Qutf_8, "utf-8");
10695 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10697 #if defined (WINDOWSNT) || defined (CYGWIN)
10698 /* No, not utf-16-le: that one has a BOM. */
10699 DEFSYM (Qutf_16le, "utf-16le");
10700 #endif
10702 DEFSYM (Qutf_16, "utf-16");
10703 DEFSYM (Qbig, "big");
10704 DEFSYM (Qlittle, "little");
10706 DEFSYM (Qshift_jis, "shift-jis");
10707 DEFSYM (Qbig5, "big5");
10709 DEFSYM (Qcoding_system_p, "coding-system-p");
10711 DEFSYM (Qcoding_system_error, "coding-system-error");
10712 Fput (Qcoding_system_error, Qerror_conditions,
10713 listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10714 Fput (Qcoding_system_error, Qerror_message,
10715 build_pure_c_string ("Invalid coding system"));
10717 /* Intern this now in case it isn't already done.
10718 Setting this variable twice is harmless.
10719 But don't staticpro it here--that is done in alloc.c. */
10720 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10722 DEFSYM (Qtranslation_table, "translation-table");
10723 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10724 DEFSYM (Qtranslation_table_id, "translation-table-id");
10725 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10726 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10728 DEFSYM (Qvalid_codes, "valid-codes");
10730 DEFSYM (Qemacs_mule, "emacs-mule");
10732 DEFSYM (QCcategory, ":category");
10733 DEFSYM (QCmnemonic, ":mnemonic");
10734 DEFSYM (QCdefault_char, ":default-char");
10735 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10736 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10737 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10738 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10739 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10741 Vcoding_category_table
10742 = Fmake_vector (make_number (coding_category_max), Qnil);
10743 staticpro (&Vcoding_category_table);
10744 /* Followings are target of code detection. */
10745 ASET (Vcoding_category_table, coding_category_iso_7,
10746 intern_c_string ("coding-category-iso-7"));
10747 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10748 intern_c_string ("coding-category-iso-7-tight"));
10749 ASET (Vcoding_category_table, coding_category_iso_8_1,
10750 intern_c_string ("coding-category-iso-8-1"));
10751 ASET (Vcoding_category_table, coding_category_iso_8_2,
10752 intern_c_string ("coding-category-iso-8-2"));
10753 ASET (Vcoding_category_table, coding_category_iso_7_else,
10754 intern_c_string ("coding-category-iso-7-else"));
10755 ASET (Vcoding_category_table, coding_category_iso_8_else,
10756 intern_c_string ("coding-category-iso-8-else"));
10757 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10758 intern_c_string ("coding-category-utf-8-auto"));
10759 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10760 intern_c_string ("coding-category-utf-8"));
10761 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10762 intern_c_string ("coding-category-utf-8-sig"));
10763 ASET (Vcoding_category_table, coding_category_utf_16_be,
10764 intern_c_string ("coding-category-utf-16-be"));
10765 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10766 intern_c_string ("coding-category-utf-16-auto"));
10767 ASET (Vcoding_category_table, coding_category_utf_16_le,
10768 intern_c_string ("coding-category-utf-16-le"));
10769 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10770 intern_c_string ("coding-category-utf-16-be-nosig"));
10771 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10772 intern_c_string ("coding-category-utf-16-le-nosig"));
10773 ASET (Vcoding_category_table, coding_category_charset,
10774 intern_c_string ("coding-category-charset"));
10775 ASET (Vcoding_category_table, coding_category_sjis,
10776 intern_c_string ("coding-category-sjis"));
10777 ASET (Vcoding_category_table, coding_category_big5,
10778 intern_c_string ("coding-category-big5"));
10779 ASET (Vcoding_category_table, coding_category_ccl,
10780 intern_c_string ("coding-category-ccl"));
10781 ASET (Vcoding_category_table, coding_category_emacs_mule,
10782 intern_c_string ("coding-category-emacs-mule"));
10783 /* Followings are NOT target of code detection. */
10784 ASET (Vcoding_category_table, coding_category_raw_text,
10785 intern_c_string ("coding-category-raw-text"));
10786 ASET (Vcoding_category_table, coding_category_undecided,
10787 intern_c_string ("coding-category-undecided"));
10789 DEFSYM (Qinsufficient_source, "insufficient-source");
10790 DEFSYM (Qinvalid_source, "invalid-source");
10791 DEFSYM (Qinterrupted, "interrupted");
10792 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10794 defsubr (&Scoding_system_p);
10795 defsubr (&Sread_coding_system);
10796 defsubr (&Sread_non_nil_coding_system);
10797 defsubr (&Scheck_coding_system);
10798 defsubr (&Sdetect_coding_region);
10799 defsubr (&Sdetect_coding_string);
10800 defsubr (&Sfind_coding_systems_region_internal);
10801 defsubr (&Sunencodable_char_position);
10802 defsubr (&Scheck_coding_systems_region);
10803 defsubr (&Sdecode_coding_region);
10804 defsubr (&Sencode_coding_region);
10805 defsubr (&Sdecode_coding_string);
10806 defsubr (&Sencode_coding_string);
10807 defsubr (&Sdecode_sjis_char);
10808 defsubr (&Sencode_sjis_char);
10809 defsubr (&Sdecode_big5_char);
10810 defsubr (&Sencode_big5_char);
10811 defsubr (&Sset_terminal_coding_system_internal);
10812 defsubr (&Sset_safe_terminal_coding_system_internal);
10813 defsubr (&Sterminal_coding_system);
10814 defsubr (&Sset_keyboard_coding_system_internal);
10815 defsubr (&Skeyboard_coding_system);
10816 defsubr (&Sfind_operation_coding_system);
10817 defsubr (&Sset_coding_system_priority);
10818 defsubr (&Sdefine_coding_system_internal);
10819 defsubr (&Sdefine_coding_system_alias);
10820 defsubr (&Scoding_system_put);
10821 defsubr (&Scoding_system_base);
10822 defsubr (&Scoding_system_plist);
10823 defsubr (&Scoding_system_aliases);
10824 defsubr (&Scoding_system_eol_type);
10825 defsubr (&Scoding_system_priority_list);
10827 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10828 doc: /* List of coding systems.
10830 Do not alter the value of this variable manually. This variable should be
10831 updated by the functions `define-coding-system' and
10832 `define-coding-system-alias'. */);
10833 Vcoding_system_list = Qnil;
10835 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10836 doc: /* Alist of coding system names.
10837 Each element is one element list of coding system name.
10838 This variable is given to `completing-read' as COLLECTION argument.
10840 Do not alter the value of this variable manually. This variable should be
10841 updated by the functions `make-coding-system' and
10842 `define-coding-system-alias'. */);
10843 Vcoding_system_alist = Qnil;
10845 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10846 doc: /* List of coding-categories (symbols) ordered by priority.
10848 On detecting a coding system, Emacs tries code detection algorithms
10849 associated with each coding-category one by one in this order. When
10850 one algorithm agrees with a byte sequence of source text, the coding
10851 system bound to the corresponding coding-category is selected.
10853 Don't modify this variable directly, but use `set-coding-system-priority'. */);
10855 int i;
10857 Vcoding_category_list = Qnil;
10858 for (i = coding_category_max - 1; i >= 0; i--)
10859 Vcoding_category_list
10860 = Fcons (AREF (Vcoding_category_table, i),
10861 Vcoding_category_list);
10864 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10865 doc: /* Specify the coding system for read operations.
10866 It is useful to bind this variable with `let', but do not set it globally.
10867 If the value is a coding system, it is used for decoding on read operation.
10868 If not, an appropriate element is used from one of the coding system alists.
10869 There are three such tables: `file-coding-system-alist',
10870 `process-coding-system-alist', and `network-coding-system-alist'. */);
10871 Vcoding_system_for_read = Qnil;
10873 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10874 doc: /* Specify the coding system for write operations.
10875 Programs bind this variable with `let', but you should not set it globally.
10876 If the value is a coding system, it is used for encoding of output,
10877 when writing it to a file and when sending it to a file or subprocess.
10879 If this does not specify a coding system, an appropriate element
10880 is used from one of the coding system alists.
10881 There are three such tables: `file-coding-system-alist',
10882 `process-coding-system-alist', and `network-coding-system-alist'.
10883 For output to files, if the above procedure does not specify a coding system,
10884 the value of `buffer-file-coding-system' is used. */);
10885 Vcoding_system_for_write = Qnil;
10887 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10888 doc: /*
10889 Coding system used in the latest file or process I/O. */);
10890 Vlast_coding_system_used = Qnil;
10892 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10893 doc: /*
10894 Error status of the last code conversion.
10896 When an error was detected in the last code conversion, this variable
10897 is set to one of the following symbols.
10898 `insufficient-source'
10899 `inconsistent-eol'
10900 `invalid-source'
10901 `interrupted'
10902 `insufficient-memory'
10903 When no error was detected, the value doesn't change. So, to check
10904 the error status of a code conversion by this variable, you must
10905 explicitly set this variable to nil before performing code
10906 conversion. */);
10907 Vlast_code_conversion_error = Qnil;
10909 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10910 doc: /*
10911 *Non-nil means always inhibit code conversion of end-of-line format.
10912 See info node `Coding Systems' and info node `Text and Binary' concerning
10913 such conversion. */);
10914 inhibit_eol_conversion = 0;
10916 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10917 doc: /*
10918 Non-nil means process buffer inherits coding system of process output.
10919 Bind it to t if the process output is to be treated as if it were a file
10920 read from some filesystem. */);
10921 inherit_process_coding_system = 0;
10923 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10924 doc: /*
10925 Alist to decide a coding system to use for a file I/O operation.
10926 The format is ((PATTERN . VAL) ...),
10927 where PATTERN is a regular expression matching a file name,
10928 VAL is a coding system, a cons of coding systems, or a function symbol.
10929 If VAL is a coding system, it is used for both decoding and encoding
10930 the file contents.
10931 If VAL is a cons of coding systems, the car part is used for decoding,
10932 and the cdr part is used for encoding.
10933 If VAL is a function symbol, the function must return a coding system
10934 or a cons of coding systems which are used as above. The function is
10935 called with an argument that is a list of the arguments with which
10936 `find-operation-coding-system' was called. If the function can't decide
10937 a coding system, it can return `undecided' so that the normal
10938 code-detection is performed.
10940 See also the function `find-operation-coding-system'
10941 and the variable `auto-coding-alist'. */);
10942 Vfile_coding_system_alist = Qnil;
10944 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10945 doc: /*
10946 Alist to decide a coding system to use for a process I/O operation.
10947 The format is ((PATTERN . VAL) ...),
10948 where PATTERN is a regular expression matching a program name,
10949 VAL is a coding system, a cons of coding systems, or a function symbol.
10950 If VAL is a coding system, it is used for both decoding what received
10951 from the program and encoding what sent to the program.
10952 If VAL is a cons of coding systems, the car part is used for decoding,
10953 and the cdr part is used for encoding.
10954 If VAL is a function symbol, the function must return a coding system
10955 or a cons of coding systems which are used as above.
10957 See also the function `find-operation-coding-system'. */);
10958 Vprocess_coding_system_alist = Qnil;
10960 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10961 doc: /*
10962 Alist to decide a coding system to use for a network I/O operation.
10963 The format is ((PATTERN . VAL) ...),
10964 where PATTERN is a regular expression matching a network service name
10965 or is a port number to connect to,
10966 VAL is a coding system, a cons of coding systems, or a function symbol.
10967 If VAL is a coding system, it is used for both decoding what received
10968 from the network stream and encoding what sent to the network stream.
10969 If VAL is a cons of coding systems, the car part is used for decoding,
10970 and the cdr part is used for encoding.
10971 If VAL is a function symbol, the function must return a coding system
10972 or a cons of coding systems which are used as above.
10974 See also the function `find-operation-coding-system'. */);
10975 Vnetwork_coding_system_alist = Qnil;
10977 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10978 doc: /* Coding system to use with system messages.
10979 Also used for decoding keyboard input on X Window system. */);
10980 Vlocale_coding_system = Qnil;
10982 /* The eol mnemonics are reset in startup.el system-dependently. */
10983 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10984 doc: /*
10985 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
10986 eol_mnemonic_unix = build_pure_c_string (":");
10988 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10989 doc: /*
10990 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
10991 eol_mnemonic_dos = build_pure_c_string ("\\");
10993 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10994 doc: /*
10995 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
10996 eol_mnemonic_mac = build_pure_c_string ("/");
10998 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10999 doc: /*
11000 *String displayed in mode line when end-of-line format is not yet determined. */);
11001 eol_mnemonic_undecided = build_pure_c_string (":");
11003 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11004 doc: /*
11005 *Non-nil enables character translation while encoding and decoding. */);
11006 Venable_character_translation = Qt;
11008 DEFVAR_LISP ("standard-translation-table-for-decode",
11009 Vstandard_translation_table_for_decode,
11010 doc: /* Table for translating characters while decoding. */);
11011 Vstandard_translation_table_for_decode = Qnil;
11013 DEFVAR_LISP ("standard-translation-table-for-encode",
11014 Vstandard_translation_table_for_encode,
11015 doc: /* Table for translating characters while encoding. */);
11016 Vstandard_translation_table_for_encode = Qnil;
11018 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11019 doc: /* Alist of charsets vs revision numbers.
11020 While encoding, if a charset (car part of an element) is found,
11021 designate it with the escape sequence identifying revision (cdr part
11022 of the element). */);
11023 Vcharset_revision_table = Qnil;
11025 DEFVAR_LISP ("default-process-coding-system",
11026 Vdefault_process_coding_system,
11027 doc: /* Cons of coding systems used for process I/O by default.
11028 The car part is used for decoding a process output,
11029 the cdr part is used for encoding a text to be sent to a process. */);
11030 Vdefault_process_coding_system = Qnil;
11032 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11033 doc: /*
11034 Table of extra Latin codes in the range 128..159 (inclusive).
11035 This is a vector of length 256.
11036 If Nth element is non-nil, the existence of code N in a file
11037 \(or output of subprocess) doesn't prevent it to be detected as
11038 a coding system of ISO 2022 variant which has a flag
11039 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11040 or reading output of a subprocess.
11041 Only 128th through 159th elements have a meaning. */);
11042 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11044 DEFVAR_LISP ("select-safe-coding-system-function",
11045 Vselect_safe_coding_system_function,
11046 doc: /*
11047 Function to call to select safe coding system for encoding a text.
11049 If set, this function is called to force a user to select a proper
11050 coding system which can encode the text in the case that a default
11051 coding system used in each operation can't encode the text. The
11052 function should take care that the buffer is not modified while
11053 the coding system is being selected.
11055 The default value is `select-safe-coding-system' (which see). */);
11056 Vselect_safe_coding_system_function = Qnil;
11058 DEFVAR_BOOL ("coding-system-require-warning",
11059 coding_system_require_warning,
11060 doc: /* Internal use only.
11061 If non-nil, on writing a file, `select-safe-coding-system-function' is
11062 called even if `coding-system-for-write' is non-nil. The command
11063 `universal-coding-system-argument' binds this variable to t temporarily. */);
11064 coding_system_require_warning = 0;
11067 DEFVAR_BOOL ("inhibit-iso-escape-detection",
11068 inhibit_iso_escape_detection,
11069 doc: /*
11070 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11072 When Emacs reads text, it tries to detect how the text is encoded.
11073 This code detection is sensitive to escape sequences. If Emacs sees
11074 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11075 of the ISO2022 encodings, and decodes text by the corresponding coding
11076 system (e.g. `iso-2022-7bit').
11078 However, there may be a case that you want to read escape sequences in
11079 a file as is. In such a case, you can set this variable to non-nil.
11080 Then the code detection will ignore any escape sequences, and no text is
11081 detected as encoded in some ISO-2022 encoding. The result is that all
11082 escape sequences become visible in a buffer.
11084 The default value is nil, and it is strongly recommended not to change
11085 it. That is because many Emacs Lisp source files that contain
11086 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11087 in Emacs's distribution, and they won't be decoded correctly on
11088 reading if you suppress escape sequence detection.
11090 The other way to read escape sequences in a file without decoding is
11091 to explicitly specify some coding system that doesn't use ISO-2022
11092 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument]. */);
11093 inhibit_iso_escape_detection = 0;
11095 DEFVAR_BOOL ("inhibit-null-byte-detection",
11096 inhibit_null_byte_detection,
11097 doc: /* If non-nil, Emacs ignores null bytes on code detection.
11098 By default, Emacs treats it as binary data, and does not attempt to
11099 decode it. The effect is as if you specified `no-conversion' for
11100 reading that text.
11102 Set this to non-nil when a regular text happens to include null bytes.
11103 Examples are Index nodes of Info files and null-byte delimited output
11104 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
11105 decode text as usual. */);
11106 inhibit_null_byte_detection = 0;
11108 DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11109 doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11110 Internal use only. Removed after the experimental optimizer gets stable. */);
11111 disable_ascii_optimization = 0;
11113 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11114 doc: /* Char table for translating self-inserting characters.
11115 This is applied to the result of input methods, not their input.
11116 See also `keyboard-translate-table'.
11118 Use of this variable for character code unification was rendered
11119 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11120 internal character representation. */);
11121 Vtranslation_table_for_input = Qnil;
11124 Lisp_Object args[coding_arg_max];
11125 Lisp_Object plist[16];
11126 int i;
11128 for (i = 0; i < coding_arg_max; i++)
11129 args[i] = Qnil;
11131 plist[0] = intern_c_string (":name");
11132 plist[1] = args[coding_arg_name] = Qno_conversion;
11133 plist[2] = intern_c_string (":mnemonic");
11134 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11135 plist[4] = intern_c_string (":coding-type");
11136 plist[5] = args[coding_arg_coding_type] = Qraw_text;
11137 plist[6] = intern_c_string (":ascii-compatible-p");
11138 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11139 plist[8] = intern_c_string (":default-char");
11140 plist[9] = args[coding_arg_default_char] = make_number (0);
11141 plist[10] = intern_c_string (":for-unibyte");
11142 plist[11] = args[coding_arg_for_unibyte] = Qt;
11143 plist[12] = intern_c_string (":docstring");
11144 plist[13] = build_pure_c_string ("Do no conversion.\n\
11146 When you visit a file with this coding, the file is read into a\n\
11147 unibyte buffer as is, thus each byte of a file is treated as a\n\
11148 character.");
11149 plist[14] = intern_c_string (":eol-type");
11150 plist[15] = args[coding_arg_eol_type] = Qunix;
11151 args[coding_arg_plist] = Flist (16, plist);
11152 Fdefine_coding_system_internal (coding_arg_max, args);
11154 plist[1] = args[coding_arg_name] = Qundecided;
11155 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11156 plist[5] = args[coding_arg_coding_type] = Qundecided;
11157 /* This is already set.
11158 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11159 plist[8] = intern_c_string (":charset-list");
11160 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11161 plist[11] = args[coding_arg_for_unibyte] = Qnil;
11162 plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11163 plist[15] = args[coding_arg_eol_type] = Qnil;
11164 args[coding_arg_plist] = Flist (16, plist);
11165 Fdefine_coding_system_internal (coding_arg_max, args);
11168 setup_coding_system (Qno_conversion, &safe_terminal_coding);
11171 int i;
11173 for (i = 0; i < coding_category_max; i++)
11174 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11176 #if defined (DOS_NT)
11177 system_eol_type = Qdos;
11178 #else
11179 system_eol_type = Qunix;
11180 #endif
11181 staticpro (&system_eol_type);
11184 char *
11185 emacs_strerror (int error_number)
11187 char *str;
11189 synchronize_system_messages_locale ();
11190 str = strerror (error_number);
11192 if (! NILP (Vlocale_coding_system))
11194 Lisp_Object dec = code_convert_string_norecord (build_string (str),
11195 Vlocale_coding_system,
11197 str = SSDATA (dec);
11200 return str;
11203 #endif /* emacs */